
On 05/27/2013 12:52 AM, Marek Marczykowski wrote:
On 16.05.2013 08:07, Chunyan Liu wrote:
Write separate module for hostdev passthrough so that it could be used by all hypervisor drivers and maintain a global hostdev state.
Signed-off-by: Chunyan Liu <cyliu@suse.com> Some comments inline. Besides that seems to be working (with your next patch for libxl) :)
--- po/POTFILES.in | 1 + src/Makefile.am | 1 + src/libvirt.c | 5 + src/libvirt_private.syms | 15 + src/util/virhostdevmanager.c | 1218 ++++++++++++++++++++++++++++++++++++++++++ src/util/virhostdevmanager.h | 91 ++++ src/util/virpci.c | 17 +- src/util/virpci.h | 7 +- src/util/virusb.c | 19 +- src/util/virusb.h | 4 +- 10 files changed, 1362 insertions(+), 16 deletions(-) create mode 100644 src/util/virhostdevmanager.c create mode 100644 src/util/virhostdevmanager.h
diff --git a/po/POTFILES.in b/po/POTFILES.in index f3ea4da..a7c21bf 100644 --- a/po/POTFILES.in +++ b/po/POTFILES.in @@ -188,6 +188,7 @@ src/util/viruri.c src/util/virusb.c src/util/virutil.c src/util/virxml.c +src/util/virhostdevmanager.c src/vbox/vbox_MSCOMGlue.c src/vbox/vbox_XPCOMCGlue.c src/vbox/vbox_driver.c diff --git a/src/Makefile.am b/src/Makefile.am index 4312c3c..a197c2b 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -130,6 +130,7 @@ UTIL_SOURCES = \ util/virutil.c util/virutil.h \ util/viruuid.c util/viruuid.h \ util/virxml.c util/virxml.h \ + util/virhostdevmanager.c util/virhostdevmanager.h \ $(NULL)
diff --git a/src/libvirt.c b/src/libvirt.c index 2b3515e..d9af5a6 100644 --- a/src/libvirt.c +++ b/src/libvirt.c @@ -65,6 +65,7 @@ #include "virthread.h" #include "virstring.h" #include "virutil.h" +#include "virhostdevmanager.h"
#ifdef WITH_TEST # include "test/test_driver.h" @@ -827,6 +828,7 @@ int virStateInitialize(bool privileged, if (virInitialize() < 0) return -1;
+ virHostdevManagerInit(); for (i = 0 ; i < virStateDriverTabCount ; i++) { if (virStateDriverTab[i]->stateInitialize) { VIR_DEBUG("Running global init for %s state driver", @@ -858,6 +860,9 @@ int virStateCleanup(void) { virStateDriverTab[i]->stateCleanup() < 0) ret = -1; } + + virHostdevManagerCleanup(); + return ret; }
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index cdd0b41..824de4e 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1991,6 +1991,21 @@ virXPathULong; virXPathULongHex; virXPathULongLong;
+#util/virhostdevmanager.h +virHostdevManagerGetDefault; +virHostdevManagerInit; +virHostdevManagerCleanup; +virHostdevManagerPrepareHostdevs; +virHostdevManagerReAttachHostdevs; +virHostdevManagerPreparePciHostdevs; +virHostdevManagerPrepareUsbHostdevs; +virHostdevManagerReAttachPciHostdevs; +virHostdevManagerReAttachUsbHostdevs; +virGetActivePciHostdevs; +virGetActiveUsbHostdevs; +virGetDomainActivePciHostdevs; +virGetDomainActiveUsbHostdevs; +virFreeHostdevNameList; "make check" reports wrong symbol order here.
# Let emacs know we want case-insensitive sorting # Local Variables: diff --git a/src/util/virhostdevmanager.c b/src/util/virhostdevmanager.c new file mode 100644 index 0000000..9034212 --- /dev/null +++ b/src/util/virhostdevmanager.c @@ -0,0 +1,1218 @@ +/* + * Copyright (C) 2013 SUSE LINUX Products GmbH, Nuernberg, Germany. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * <http://www.gnu.org/licenses/>. + * + * Author: Chunyan Liu <cyliu@suse.com> + */ +#include <config.h> + +#include "virhostdevmanager.h" + +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> +#include <stdlib.h> +#include <stdio.h> + +#include "viralloc.h" +#include "virstring.h" +#include "virfile.h" +#include "virerror.h" +#include "virlog.h" +#include "virpci.h" +#include "virusb.h" +#include "virnetdev.h" + +/* For virReportOOMError() and virReportSystemError() */ +#define VIR_FROM_THIS VIR_FROM_NONE + +struct _virHostdevManager{ + char *stateDir; + + virPCIDeviceListPtr activePciHostdevs; + virPCIDeviceListPtr inactivePciHostdevs; + virUSBDeviceListPtr activeUsbHostdevs; +}; + +static virHostdevManagerPtr hostdevMgr; + +int virHostdevManagerInit(void) +{ + char ebuf[1024]; + if (VIR_ALLOC(hostdevMgr) < 0) + return -1; + + if ((hostdevMgr->activePciHostdevs = virPCIDeviceListNew()) == NULL) + return -1;
goto out_of_memory ?
+ + if ((hostdevMgr->activeUsbHostdevs = virUSBDeviceListNew()) == NULL) + return -1; goto out_of_memory ?
+ + if ((hostdevMgr->inactivePciHostdevs = virPCIDeviceListNew()) == NULL) + return -1; goto out_of_memory ?
+ + if (virAsprintf(&hostdevMgr->stateDir, + "%s",HOSTDEV_STATE_DIR) == -1) + goto out_of_memory; + + if (virFileMakePath(hostdevMgr->stateDir) < 0) { + VIR_ERROR(_("Failed to create state dir '%s': %s"), + hostdevMgr->stateDir, virStrerror(errno, ebuf, sizeof(ebuf))); + goto error; + } + + return 0; + +out_of_memory: + virReportOOMError(); +error: + return -1; +} + +void virHostdevManagerCleanup(void) +{ + if(!hostdevMgr) + return; + + virObjectUnref(hostdevMgr->activePciHostdevs); + virObjectUnref(hostdevMgr->inactivePciHostdevs); + virObjectUnref(hostdevMgr->activeUsbHostdevs); + + VIR_FREE(hostdevMgr->stateDir); +} + +virHostdevManagerPtr virHostdevManagerGetDefault(void) +{ + return hostdevMgr; +} + +static int +virDomainHostdevPciSysfsPath(virDomainHostdevDefPtr hostdev, char **sysfs_path) +{ + virPCIDeviceAddress config_address; + + config_address.domain = hostdev->source.subsys.u.pci.addr.domain; + config_address.bus = hostdev->source.subsys.u.pci.addr.bus; + config_address.slot = hostdev->source.subsys.u.pci.addr.slot; + config_address.function = hostdev->source.subsys.u.pci.addr.function; + + return virPCIDeviceAddressGetSysfsFile(&config_address, sysfs_path); +} + +static int +virDomainHostdevIsVirtualFunction(virDomainHostdevDefPtr hostdev) +{ + char *sysfs_path = NULL; + int ret = -1; + + if (virDomainHostdevPciSysfsPath(hostdev, &sysfs_path) < 0) + return ret; + + ret = virPCIIsVirtualFunction(sysfs_path); + + VIR_FREE(sysfs_path); + + return ret; +} + +static int +virDomainHostdevNetDevice(virDomainHostdevDefPtr hostdev, char **linkdev, + int *vf) +{ + int ret = -1; + char *sysfs_path = NULL; + + if (virDomainHostdevPciSysfsPath(hostdev, &sysfs_path) < 0) + return ret; + + if (virPCIIsVirtualFunction(sysfs_path) == 1) { + if (virPCIGetVirtualFunctionInfo(sysfs_path, linkdev, + vf) < 0) + goto cleanup; + } else { + if (virPCIGetNetName(sysfs_path, linkdev) < 0) + goto cleanup; + *vf = -1; + } + + ret = 0; + +cleanup: + VIR_FREE(sysfs_path); + + return ret; +} + +static int +virDomainHostdevNetConfigVirtPortProfile(const char *linkdev, int vf, + virNetDevVPortProfilePtr virtPort, + const virMacAddrPtr macaddr, + const unsigned char *uuid, + int associate) +{ + int ret = -1; + + if (!virtPort) + return ret; + + switch (virtPort->virtPortType) { + case VIR_NETDEV_VPORT_PROFILE_NONE: + case VIR_NETDEV_VPORT_PROFILE_OPENVSWITCH: + case VIR_NETDEV_VPORT_PROFILE_8021QBG: + case VIR_NETDEV_VPORT_PROFILE_LAST: + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, + _("virtualport type %s is " + "currently not supported on interfaces of type " + "hostdev"), + virNetDevVPortTypeToString(virtPort->virtPortType)); + break; + + case VIR_NETDEV_VPORT_PROFILE_8021QBH: + if (associate) + ret = virNetDevVPortProfileAssociate(NULL, virtPort, macaddr, + linkdev, vf, uuid, + VIR_NETDEV_VPORT_PROFILE_OP_CREATE, false); + else + ret = virNetDevVPortProfileDisassociate(NULL, virtPort, + macaddr, linkdev, vf, + VIR_NETDEV_VPORT_PROFILE_OP_DESTROY); + break; + } + + return ret; +} + +static int +virDomainHostdevNetConfigReplace(virDomainHostdevDefPtr hostdev, + const unsigned char *uuid, + char *stateDir) +{ + char *linkdev = NULL; + virNetDevVlanPtr vlan; + virNetDevVPortProfilePtr virtPort; + int ret = -1; + int vf = -1; + int vlanid = -1; + int port_profile_associate = 1; + int isvf; + + isvf = virDomainHostdevIsVirtualFunction(hostdev); + if (isvf <= 0) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("Interface type hostdev is currently supported on" + " SR-IOV Virtual Functions only")); + return ret; + } + + if (virDomainHostdevNetDevice(hostdev, &linkdev, &vf) < 0) + return ret; + + vlan = virDomainNetGetActualVlan(hostdev->parent.data.net); + virtPort = virDomainNetGetActualVirtPortProfile( + hostdev->parent.data.net); + if (virtPort) { + if (vlan) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, + _("direct setting of the vlan tag is not allowed " + "for hostdev devices using %s mode"), + virNetDevVPortTypeToString(virtPort->virtPortType)); + goto cleanup; + } + ret = virDomainHostdevNetConfigVirtPortProfile(linkdev, vf, + virtPort, &hostdev->parent.data.net->mac, uuid, + port_profile_associate); + } else { + /* Set only mac and vlan */ + if (vlan) { + if (vlan->nTags != 1 || vlan->trunk) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("vlan trunking is not supported " + "by SR-IOV network devices")); + goto cleanup; + } + if (vf == -1) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, + _("vlan can only be set for SR-IOV VFs, but " + "%s is not a VF"), linkdev); + goto cleanup; + } + vlanid = vlan->tag[0]; + } else if (vf >= 0) { + vlanid = 0; /* assure any current vlan tag is reset */ + } + + ret = virNetDevReplaceNetConfig(linkdev, vf, + &hostdev->parent.data.net->mac, + vlanid, stateDir); + } +cleanup: + VIR_FREE(linkdev); + return ret; +} + +static int +virDomainHostdevNetConfigRestore(virDomainHostdevDefPtr hostdev, + char *stateDir) +{ + char *linkdev = NULL; + virNetDevVPortProfilePtr virtPort; + int ret = -1; + int vf = -1; + int port_profile_associate = 0; + int isvf; + + isvf = virDomainHostdevIsVirtualFunction(hostdev); + if (isvf <= 0) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("Interface type hostdev is currently supported on" + " SR-IOV Virtual Functions only")); + return ret; + } + + if (virDomainHostdevNetDevice(hostdev, &linkdev, &vf) < 0) + return ret; + + virtPort = virDomainNetGetActualVirtPortProfile( + hostdev->parent.data.net); + if (virtPort) + ret = virDomainHostdevNetConfigVirtPortProfile(linkdev, vf, virtPort, + &hostdev->parent.data.net->mac, NULL, + port_profile_associate); + else + ret = virNetDevRestoreNetConfig(linkdev, vf, stateDir); + + VIR_FREE(linkdev); + + return ret; +} + +static virPCIDeviceListPtr +virHostdevManagerGetPciHostDeviceList( + virDomainHostdevDefPtr *hostdevs, + int nhostdevs) +{ + virPCIDeviceListPtr list; + int i; + + if (!(list = virPCIDeviceListNew())) + return NULL; + + for (i = 0 ; i < nhostdevs ; i++) { + virDomainHostdevDefPtr hostdev = hostdevs[i]; + virPCIDevicePtr dev; + + if (hostdev->mode != VIR_DOMAIN_HOSTDEV_MODE_SUBSYS) + continue; + if (hostdev->source.subsys.type != VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI) + continue; + + dev = virPCIDeviceNew(hostdev->source.subsys.u.pci.addr.domain, + hostdev->source.subsys.u.pci.addr.bus, + hostdev->source.subsys.u.pci.addr.slot, + hostdev->source.subsys.u.pci.addr.function); + if (!dev) { + virObjectUnref(list); + return NULL; + } + + if (virPCIDeviceListAdd(list, dev) < 0) { + virPCIDeviceFree(dev); + virObjectUnref(list); + return NULL; + } + + virPCIDeviceSetManaged(dev, hostdev->managed); + } + + return list; +} + +int +virHostdevManagerPreparePciHostdevs(virHostdevManagerPtr mgr, + const char *drv_name, + const char *dom_name, + const unsigned char *uuid, + virDomainHostdevDefPtr *hostdevs, + int nhostdevs, + unsigned int flags) +{ + virPCIDeviceListPtr pcidevs; + int last_processed_hostdev_vf = -1; + int i; + int ret = -1; + + virObjectLock(mgr->activePciHostdevs); + virObjectLock(mgr->inactivePciHostdevs); + + if (!(pcidevs = virHostdevManagerGetPciHostDeviceList(hostdevs, nhostdevs))) + goto cleanup; + + /* We have to use 9 loops here. *All* devices must + * be detached before we reset any of them, because + * in some cases you have to reset the whole PCI, + * which impacts all devices on it. Also, all devices + * must be reset before being marked as active. + */ + + /* Loop 1: validate that non-managed device isn't in use, eg + * by checking that device is either un-bound, or bound + * to pci-stub.ko + */ + + for (i = 0; i < virPCIDeviceListCount(pcidevs); i++) { + virPCIDevicePtr dev = virPCIDeviceListGet(pcidevs, i); + virPCIDevicePtr other; + bool strict_acs_check = (flags & VIR_STRICT_ACS_CHECK)?true:false; + + if (!virPCIDeviceIsAssignable(dev, strict_acs_check)) { + virReportError(VIR_ERR_OPERATION_INVALID, + _("PCI device %s is not assignable"), + virPCIDeviceGetName(dev)); + goto cleanup; + } + /* The device is in use by other active domain if + * the dev is in list activePciHostdevs. + */ + if ((other = virPCIDeviceListFind(mgr->activePciHostdevs, dev))) { + char *other_drvname = NULL; + char *other_domname = NULL; + virPCIDeviceGetUsedBy(other, &other_drvname, &other_domname); + + if (other_drvname && other_domname) + virReportError(VIR_ERR_OPERATION_INVALID, + _("PCI device %s is in use by driver %s,domain %s"), + virPCIDeviceGetName(dev), other_drvname, + other_domname); + else + virReportError(VIR_ERR_OPERATION_INVALID, + _("PCI device %s is already in use"), + virPCIDeviceGetName(dev)); + VIR_FREE(other_drvname); + VIR_FREE(other_domname); + goto cleanup; + } + } + + /* Loop 2: detach managed devices */ + for (i = 0; i < virPCIDeviceListCount(pcidevs); i++) { + virPCIDevicePtr dev = virPCIDeviceListGet(pcidevs, i); + const char *stub_driver; + if (STREQ_NULLABLE(drv_name, "xenlight")) + stub_driver = "pciback"; + else + stub_driver = "pci_stub"; Really need hardcode driver name here?
Actually, no, it's not needed, and the code this is replacing has changed, so this all needs a rebase. The way that it works now (with the code that's in qemu) is that qemuGetPciHostDeviceList() (which builds a list of virPCIDevice from a list of virDomainHostDevDef) checks for a <driver name='xxx'/> setting, and if driver name is set to "vfio", it calls virPCIDeviceSetStubDriver(dev, "vfio-pci"), otherwise it calls virPCIDeviceSetStubDriver(dev, "pci-stub"). I guess your new virHostdevManagerGetPciHostDeviceList() needs to be made aware of which hypervisor is being used, and set the stub driver accordingly. Everything else in the device detach/reattach should be made to either use the stub driver name stored in the virPCIDevice, or to auto-detect it (virPCIDeviceUnbindFromStub() shows how to do that).
BTW "xen" driver missing here. Perhaps this can be passes as parameter (with default to "pci_stub" if NULL)? Or set by driver before calling this function (virPCIDeviceSetSubDriver?).
Right, that function is newer than the code this patch is based from. [...]
+ +/* + * Pre-condition: mgr->inactivePciHostdevs & mgr->activePciHostdevs + * are locked + */ +static void +virReattachPciDevice(virHostdevManagerPtr mgr, virPCIDevicePtr dev, const char *driver) +{ + /* If the device is not managed and was attached to guest + * successfully, it must have been inactive. + */ + if (!virPCIDeviceGetManaged(dev)) { + if (virPCIDeviceListAdd(mgr->inactivePciHostdevs, dev) < 0) + virPCIDeviceFree(dev); + return; + } + + if (STREQ_NULLABLE(driver, "QEMU")) { Same as earlier, perhaps the better approach will be checking for current stub driver here?
Hmm, yes. I wonder what this does for vfio. Is this needed for something in the pci-stub driver? Or is it common to any device assignment in qemu? (Fortunately it's harmless (although a bit wasteful of CPU time) when the device hasn't been used by qemu/kvm.
+ int retries = 100; + + while (virPCIDeviceWaitForCleanup(dev, "kvm_assigned_device") + && retries) { + usleep(100*1000); + retries--; + } + } + + if (virPCIDeviceReattach(dev, mgr->activePciHostdevs, + mgr->inactivePciHostdevs) < 0) { + virErrorPtr err = virGetLastError(); + VIR_ERROR(_("Failed to re-attach PCI device: %s"), + err ? err->message : _("unknown error")); + virResetError(err); + } + virPCIDeviceFree(dev); +} + +/*
Generally, please check for recent changes to the functions you're basing this on. We don't want any of that to mysteriously disappear when we switch from the qemu-specific functions to these general functions.