[libvirt] [PATCH v3 0/9] Few VFIO related fixes

The following series implements... --- Addressed comments on V2 and V1 Fixed the while loop exit in P8 virPCIDeviceUnbindFromStub() Shivaprasad G Bhat (9): Implement virPCIIsKnownStub function Initialize the stubDriver of pci devices if bound to a valid one Add iommu group number info to virPCIDevice Refuse to reattach from vfio if the iommu group is in use by any domain Wait for vfio-pci device cleanups before reassinging the device to host driver Split reprobe action from the virPCIUnbindFromStub into a new function Pass activeDevs and inactiveDevs to virPCIDeviceUnbindFromStub and virPCIDeviceBindToStub Postpone reprobing till all the devices in iommu group are unbound from vfio Wait for iommmu device to go away before reprobing the host driver src/libvirt_private.syms | 1 src/util/virhostdev.c | 18 +++ src/util/virpci.c | 300 +++++++++++++++++++++++++++++++++++++++------- src/util/virpci.h | 2 tests/virpcimock.c | 187 ++++++++++++++++++++++++++--- tests/virpcitest.c | 152 +++++++++++++++++++---- 6 files changed, 566 insertions(+), 94 deletions(-) -- Signature

The checks to knwon stubs can be easily done having this implementation. Signed-off-by: Shivaprasad G Bhat <sbhat@linux.vnet.ibm.com> --- src/util/virpci.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/src/util/virpci.c b/src/util/virpci.c index 35b1459..bff37d7 100644 --- a/src/util/virpci.c +++ b/src/util/virpci.c @@ -1080,6 +1080,23 @@ static const char *virPCIKnownStubs[] = { NULL }; +static bool +virPCIIsKnownStub(char *driver) +{ + const char **stubTest; + bool ret = false; + + for (stubTest = virPCIKnownStubs; *stubTest != NULL; stubTest++) { + if (STREQ_NULLABLE(driver, *stubTest)) { + ret = true; + VIR_DEBUG("Found stub driver %s", *stubTest); + break; + } + } + + return ret; +} + static int virPCIDeviceUnbindFromStub(virPCIDevicePtr dev) { @@ -1087,8 +1104,6 @@ virPCIDeviceUnbindFromStub(virPCIDevicePtr dev) char *drvdir = NULL; char *path = NULL; char *driver = NULL; - const char **stubTest; - bool isStub = false; /* If the device is currently bound to one of the "well known" * stub drivers, then unbind it, otherwise ignore it. @@ -1105,14 +1120,7 @@ virPCIDeviceUnbindFromStub(virPCIDevicePtr dev) goto remove_slot; /* If the device isn't bound to a known stub, skip the unbind. */ - for (stubTest = virPCIKnownStubs; *stubTest != NULL; stubTest++) { - if (STREQ(driver, *stubTest)) { - isStub = true; - VIR_DEBUG("Found stub driver %s", *stubTest); - break; - } - } - if (!isStub) + if (!virPCIIsKnownStub(driver)) goto remove_slot; if (virPCIDeviceUnbind(dev, dev->reprobe) < 0)

The stubDriver name can be used to make useful decisions if readily available. Set it if bound to a valid one during initialisation. Signed-off-by: Shivaprasad G Bhat <sbhat@linux.vnet.ibm.com> --- src/util/virpci.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/util/virpci.c b/src/util/virpci.c index bff37d7..876da70 100644 --- a/src/util/virpci.c +++ b/src/util/virpci.c @@ -1564,6 +1564,8 @@ virPCIDeviceNew(unsigned int domain, virPCIDevicePtr dev; char *vendor = NULL; char *product = NULL; + char *drvpath = NULL; + char *driver = NULL; if (VIR_ALLOC(dev) < 0) return NULL; @@ -1611,9 +1613,19 @@ virPCIDeviceNew(unsigned int domain, goto error; } + if (virPCIDeviceGetDriverPathAndName(dev, &drvpath, &driver) < 0) + goto cleanup; + VIR_DEBUG("%s %s: initialized", dev->id, dev->name); + if (!virPCIIsKnownStub(driver)) { + VIR_FREE(driver); + goto cleanup; + } + dev->stubDriver = driver; + cleanup: + VIR_FREE(drvpath); VIR_FREE(product); VIR_FREE(vendor); return dev;

The iommu group number need not be fetched from the sysfs everytime as it remains constant. Fetch it once during allocation Signed-off-by: Shivaprasad G Bhat <sbhat@linux.vnet.ibm.com> --- src/libvirt_private.syms | 1 + src/util/virpci.c | 11 +++++++++++ src/util/virpci.h | 2 ++ 3 files changed, 14 insertions(+) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index a835f18..1e624fe 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1971,6 +1971,7 @@ virPCIDeviceNew; virPCIDeviceReattach; virPCIDeviceReattachInit; virPCIDeviceReset; +virPCIDeviceSetIommuGroup; virPCIDeviceSetManaged; virPCIDeviceSetRemoveSlot; virPCIDeviceSetReprobe; diff --git a/src/util/virpci.c b/src/util/virpci.c index 876da70..b03521c 100644 --- a/src/util/virpci.c +++ b/src/util/virpci.c @@ -75,6 +75,7 @@ struct _virPCIDevice { bool has_pm_reset; bool managed; char *stubDriver; + int iommuGroup; /* used by reattach function */ bool unbind_from_stub; @@ -1566,6 +1567,8 @@ virPCIDeviceNew(unsigned int domain, char *product = NULL; char *drvpath = NULL; char *driver = NULL; + virPCIDeviceAddress devAddr = { domain, bus, + slot, function }; if (VIR_ALLOC(dev) < 0) return NULL; @@ -1613,6 +1616,8 @@ virPCIDeviceNew(unsigned int domain, goto error; } + dev->iommuGroup = virPCIDeviceAddressGetIOMMUGroupNum(&devAddr); + if (virPCIDeviceGetDriverPathAndName(dev, &drvpath, &driver) < 0) goto cleanup; @@ -1726,6 +1731,12 @@ virPCIDeviceSetStubDriver(virPCIDevicePtr dev, const char *driver) return VIR_STRDUP(dev->stubDriver, driver); } +void +virPCIDeviceSetIommuGroup(virPCIDevicePtr dev, int iommu) +{ + dev->iommuGroup = iommu; +} + const char * virPCIDeviceGetStubDriver(virPCIDevicePtr dev) { diff --git a/src/util/virpci.h b/src/util/virpci.h index 64b9e96..a6c3628 100644 --- a/src/util/virpci.h +++ b/src/util/virpci.h @@ -87,6 +87,8 @@ int virPCIDeviceReset(virPCIDevicePtr dev, virPCIDeviceListPtr activeDevs, virPCIDeviceListPtr inactiveDevs); +void virPCIDeviceSetIommuGroup(virPCIDevice *dev, + int iommu); void virPCIDeviceSetManaged(virPCIDevice *dev, bool managed); unsigned int virPCIDeviceGetManaged(virPCIDevice *dev);

It is incorrect to attempt the device reattach of a function, when some other domain is using some functions belonging to the same iommu group. Signed-off-by: Shivaprasad G Bhat <sbhat@linux.vnet.ibm.com> --- src/util/virhostdev.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/util/virhostdev.c b/src/util/virhostdev.c index de029a0..4c441b9 100644 --- a/src/util/virhostdev.c +++ b/src/util/virhostdev.c @@ -1590,6 +1590,7 @@ virHostdevPCINodeDeviceReAttach(virHostdevManagerPtr hostdev_mgr, virPCIDevicePtr pci) { virPCIDeviceAddressPtr devAddr = NULL; + bool usesVfio = STREQ_NULLABLE(virPCIDeviceGetStubDriver(pci), "vfio-pci"); struct virHostdevIsPCINodeDeviceUsedData data = {hostdev_mgr, NULL, false}; int ret = -1; @@ -1600,8 +1601,16 @@ virHostdevPCINodeDeviceReAttach(virHostdevManagerPtr hostdev_mgr, if (!(devAddr = virPCIDeviceGetAddress(pci))) goto out; - if (virHostdevIsPCINodeDeviceUsed(devAddr, &data)) + if (usesVfio) { + /* Doesn't matter which function. If any domain is actively using the + * iommu group, refuse to reattach */ + if (virPCIDeviceAddressIOMMUGroupIterate(devAddr, + virHostdevIsPCINodeDeviceUsed, + &data) < 0) + goto out; + } else if (virHostdevIsPCINodeDeviceUsed(devAddr, &data)) { goto out; + } virPCIDeviceReattachInit(pci);

Before unbind from stub driver libvirt should be sure the guest is not using the device anymore. The libvirt today waits for pci-stub driver alone in /proc/iommu. The same wait is needed for vfio devices too. Signed-off-by: Shivaprasad G Bhat <sbhat@linux.vnet.ibm.com> --- src/util/virhostdev.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/util/virhostdev.c b/src/util/virhostdev.c index 4c441b9..c5a6553 100644 --- a/src/util/virhostdev.c +++ b/src/util/virhostdev.c @@ -756,6 +756,13 @@ virHostdevReattachPCIDevice(virPCIDevicePtr dev, virHostdevManagerPtr mgr) usleep(100*1000); retries--; } + } else if (STREQ(virPCIDeviceGetStubDriver(dev), "vfio-pci")) { + int retries = 100; + while (virPCIDeviceWaitForCleanup(dev, "vfio-pci") + && retries) { + usleep(100*1000); + retries--; + } } if (virPCIDeviceReattach(dev, mgr->activePCIHostdevs,

The reprobe needs to be called multiple times for vfio devices for each device in the iommu group in future patch. So split the reprobe into a new function. No functional change. Signed-off-by: Shivaprasad G Bhat <sbhat@linux.vnet.ibm.com> --- src/util/virpci.c | 47 +++++++++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/src/util/virpci.c b/src/util/virpci.c index b03521c..f395fdd 100644 --- a/src/util/virpci.c +++ b/src/util/virpci.c @@ -1099,6 +1099,36 @@ virPCIIsKnownStub(char *driver) } static int +virPCIDeviceReprobeHostDriver(virPCIDevicePtr dev, + const char *driver, + const char *drvdir) +{ + char *path = NULL; + int ret = -1; + + /* Trigger a re-probe of the device is not in the stub's dynamic + * ID table. If the stub is available, but 'remove_id' isn't + * available, then re-probing would just cause the device to be + * re-bound to the stub. + */ + if (driver && !(path = virPCIDriverFile(driver, "remove_id"))) + goto cleanup; + + if (!driver || !virFileExists(drvdir) || virFileExists(path)) { + if (virFileWriteStr(PCI_SYSFS "drivers_probe", dev->name, 0) < 0) { + virReportSystemError(errno, + _("Failed to trigger a re-probe for PCI device '%s'"), + dev->name); + goto cleanup; + } + } + ret = 0; + cleanup: + VIR_FREE(path); + return ret; +} + +static int virPCIDeviceUnbindFromStub(virPCIDevicePtr dev) { int result = -1; @@ -1150,24 +1180,9 @@ virPCIDeviceUnbindFromStub(virPCIDevicePtr dev) goto cleanup; } - /* Trigger a re-probe of the device is not in the stub's dynamic - * ID table. If the stub is available, but 'remove_id' isn't - * available, then re-probing would just cause the device to be - * re-bound to the stub. - */ - VIR_FREE(path); - if (driver && !(path = virPCIDriverFile(driver, "remove_id"))) + if (virPCIDeviceReprobeHostDriver(dev, driver, drvdir) < 0) goto cleanup; - if (!driver || !virFileExists(drvdir) || virFileExists(path)) { - if (virFileWriteStr(PCI_SYSFS "drivers_probe", dev->name, 0) < 0) { - virReportSystemError(errno, - _("Failed to trigger a re-probe for PCI device '%s'"), - dev->name); - goto cleanup; - } - } - result = 0; cleanup:

The inactiveDevs need to be selectively altered for more than one device in case of vfio devices. So, pass the whole list. Signed-off-by: Shivaprasad G Bhat <sbhat@linux.vnet.ibm.com> --- src/util/virpci.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/src/util/virpci.c b/src/util/virpci.c index f395fdd..0c20053 100644 --- a/src/util/virpci.c +++ b/src/util/virpci.c @@ -1129,7 +1129,9 @@ virPCIDeviceReprobeHostDriver(virPCIDevicePtr dev, } static int -virPCIDeviceUnbindFromStub(virPCIDevicePtr dev) +virPCIDeviceUnbindFromStub(virPCIDevicePtr dev, + virPCIDeviceListPtr activeDevs ATTRIBUTE_UNUSED, + virPCIDeviceListPtr inactiveDevs) { int result = -1; char *drvdir = NULL; @@ -1186,6 +1188,9 @@ virPCIDeviceUnbindFromStub(virPCIDevicePtr dev) result = 0; cleanup: + if ((result == 0) && inactiveDevs) + virPCIDeviceListDel(inactiveDevs, dev); + /* do not do it again */ dev->unbind_from_stub = false; dev->remove_slot = false; @@ -1201,7 +1206,9 @@ virPCIDeviceUnbindFromStub(virPCIDevicePtr dev) static int virPCIDeviceBindToStub(virPCIDevicePtr dev, - const char *stubDriverName) + const char *stubDriverName, + virPCIDeviceListPtr activeDevs, + virPCIDeviceListPtr inactiveDevs) { int result = -1; bool reprobe = false; @@ -1328,9 +1335,15 @@ virPCIDeviceBindToStub(virPCIDevicePtr dev, VIR_FREE(driverLink); VIR_FREE(path); + /* Add *a copy of* the dev into list inactiveDevs, if + * it's not already there. */ + if ((result == 0) && inactiveDevs && !virPCIDeviceListFind(inactiveDevs, dev) && + virPCIDeviceListAddCopy(inactiveDevs, dev) < 0) { + result = -1; + } if (result < 0) { VIR_FREE(newDriverName); - virPCIDeviceUnbindFromStub(dev); + virPCIDeviceUnbindFromStub(dev, activeDevs, inactiveDevs); } else { VIR_FREE(dev->stubDriver); dev->stubDriver = newDriverName; @@ -1377,16 +1390,9 @@ virPCIDeviceDetach(virPCIDevicePtr dev, return -1; } - if (virPCIDeviceBindToStub(dev, dev->stubDriver) < 0) - return -1; - - /* Add *a copy of* the dev into list inactiveDevs, if - * it's not already there. - */ - if (inactiveDevs && !virPCIDeviceListFind(inactiveDevs, dev) && - virPCIDeviceListAddCopy(inactiveDevs, dev) < 0) { + if (virPCIDeviceBindToStub(dev, dev->stubDriver, + activeDevs, inactiveDevs) < 0) return -1; - } return 0; } @@ -1402,13 +1408,9 @@ virPCIDeviceReattach(virPCIDevicePtr dev, return -1; } - if (virPCIDeviceUnbindFromStub(dev) < 0) + if (virPCIDeviceUnbindFromStub(dev, activeDevs, inactiveDevs) < 0) return -1; - /* Steal the dev from list inactiveDevs */ - if (inactiveDevs) - virPCIDeviceListDel(inactiveDevs, dev); - return 0; }

The host will crash if a device is bound to host driver when the device belonging to same iommu group is in use by any of the guests. So, do the host driver probe only after all the devices in the iommu group have unbound from the vfio. The patch also adds the test cases. The negative test case is removed. We need to add a new one with a new driver. The patch fixes https://bugzilla.redhat.com/show_bug.cgi?id=1272300 Signed-off-by: Shivaprasad G Bhat <sbhat@linux.vnet.ibm.com> --- src/util/virpci.c | 141 ++++++++++++++++++++++++++++++++++++++---- tests/virpcimock.c | 173 +++++++++++++++++++++++++++++++++++++++++++++++----- tests/virpcitest.c | 152 ++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 410 insertions(+), 56 deletions(-) diff --git a/src/util/virpci.c b/src/util/virpci.c index 0c20053..caa2458 100644 --- a/src/util/virpci.c +++ b/src/util/virpci.c @@ -1129,6 +1129,24 @@ virPCIDeviceReprobeHostDriver(virPCIDevicePtr dev, } static int +virPCIDeviceBoundToVFIODriver(virPCIDeviceAddressPtr devAddr, void *opaque ATTRIBUTE_UNUSED) +{ + int ret = 0; + virPCIDevicePtr pci = NULL; + + if (!(pci = virPCIDeviceNew(devAddr->domain, devAddr->bus, + devAddr->slot, devAddr->function))) + goto cleanup; + + if (STREQ_NULLABLE(pci->stubDriver, "vfio-pci")) + ret = -1; + + cleanup: + virPCIDeviceFree(pci); + return ret; +} + +static int virPCIDeviceUnbindFromStub(virPCIDevicePtr dev, virPCIDeviceListPtr activeDevs ATTRIBUTE_UNUSED, virPCIDeviceListPtr inactiveDevs) @@ -1145,7 +1163,12 @@ virPCIDeviceUnbindFromStub(virPCIDevicePtr dev, goto cleanup; if (!driver) { - /* The device is not bound to any driver and we are almost done. */ + /* This device was probably unbound before and libvirt restared. + * Add to the inactive list if not already there so that we + * reprobe it if needed. + */ + if (inactiveDevs && !virPCIDeviceListFind(inactiveDevs, dev) && + virPCIDeviceListAddCopy(inactiveDevs, dev) < 0) goto reprobe; } @@ -1156,10 +1179,69 @@ virPCIDeviceUnbindFromStub(virPCIDevicePtr dev, if (!virPCIIsKnownStub(driver)) goto remove_slot; - if (virPCIDeviceUnbind(dev, dev->reprobe) < 0) - goto cleanup; - dev->unbind_from_stub = false; + if (STREQ_NULLABLE(dev->stubDriver, "vfio-pci")) { + size_t i = 0; + bool inInactiveList = false; + virPCIDevicePtr temp; + while (activeDevs && (i < virPCIDeviceListCount(activeDevs))) { + virPCIDevicePtr pcidev = virPCIDeviceListGet(activeDevs, i); + if (dev->iommuGroup == pcidev->iommuGroup) { + /* If Part of activelist, deal with current device later */ + if (inactiveDevs && !virPCIDeviceListFind(inactiveDevs, dev) && + virPCIDeviceListAddCopy(inactiveDevs, dev) < 0) + goto cleanup; + result = 0; + goto revisit; + } + i++; + } + /* No guests are using any of the devices in the iommu. + * This is the first device after guest shutdown/unplug + * of all devices. So, rest of the devices are in inactiveDevs. + * OR This is the first device after libvirt start and nothing is in + * inactiveDevs. + * If the device is in inactiveList(User can issue reattach multiple times + * for the same device), mark it as already unbound. + */ + if (virPCIDeviceUnbind(dev, dev->reprobe) < 0) + goto cleanup; + dev->unbind_from_stub = false; + if (inactiveDevs && (temp = virPCIDeviceListFind(inactiveDevs, dev))) { + temp->unbind_from_stub = false; + inInactiveList = true; + } + /* Unbind rest of the devices in the same iommu group. + * After the fresh libvirt start, nothing is detached in the step + * as inactiveDevs is empty.*/ + i = 0; + while (inactiveDevs && (i < virPCIDeviceListCount(inactiveDevs))) { + virPCIDevicePtr pcidev = virPCIDeviceListGet(inactiveDevs, i); + if (dev->iommuGroup == pcidev->iommuGroup) { + if (pcidev->unbind_from_stub && + virPCIDeviceUnbind(pcidev, pcidev->reprobe) < 0) { + goto cleanup; + } + pcidev->unbind_from_stub = false; + } + i++; + } + /* Libvirt just restarted and inactive list is empty or yet to get into + * the list. But no devices used actively from same iommu group. + * Basically this could be the first one to unbind from vfio but + * could be the last function to be bound to vfio if this is after + * libvirt restart. + */ + if (!inInactiveList) { + if (inactiveDevs && virPCIDeviceListAddCopy(inactiveDevs, dev) < 0) + goto cleanup; + } + } else { + if (virPCIDeviceUnbind(dev, dev->reprobe) < 0) + goto cleanup; + dev->unbind_from_stub = false; + } + /* The remove_slot means nothing for VFIO. Dont manage it in active/inactiveList */ remove_slot: if (!dev->remove_slot) goto reprobe; @@ -1177,25 +1259,58 @@ virPCIDeviceUnbindFromStub(virPCIDevicePtr dev, dev->remove_slot = false; reprobe: - if (!dev->reprobe) { - result = 0; - goto cleanup; - } - if (virPCIDeviceReprobeHostDriver(dev, driver, drvdir) < 0) - goto cleanup; + /* For VFIO devices if there is a pending reprobe, the new reattach + * request would come with device->stubDriver set to null as the + * device is actually unbound. Dont bind to host driver if the + * iommu group is in use. */ + if (!dev->stubDriver || STREQ_NULLABLE(dev->stubDriver, "vfio-pci")) { + size_t i = 0; + virPCIDeviceAddress devAddr = { dev->domain, dev->bus, + dev->slot, dev->function }; + if (virPCIDeviceAddressIOMMUGroupIterate(&devAddr, virPCIDeviceBoundToVFIODriver, NULL) < 0) { + result = 0; + goto cleanup; + } + /* This device is the last to unbind from vfio. As we explicitly + * add a missing device in the list to inactiveList, we will just + * go through the list. */ + while (inactiveDevs && (i < virPCIDeviceListCount(inactiveDevs))) { + virPCIDevicePtr pcidev = virPCIDeviceListGet(inactiveDevs, i); + if (dev->iommuGroup == pcidev->iommuGroup) { + if (pcidev->reprobe && + virPCIDeviceReprobeHostDriver(pcidev, driver, drvdir) < 0) + goto cleanup; + /* Steal the dev from list inactiveDevs */ + virPCIDeviceListDel(inactiveDevs, pcidev); + continue; + } + i++; + } + /* If the list was null, we failed to add to the list before. + * Reprobe the current device explicitly. */ + if (!inactiveDevs) { + if (dev->reprobe && + virPCIDeviceReprobeHostDriver(dev, driver, drvdir) < 0) + goto cleanup; + } + } else { + if (virPCIDeviceReprobeHostDriver(dev, driver, drvdir) < 0) + goto cleanup; + + if (inactiveDevs) + virPCIDeviceListDel(inactiveDevs, dev); + } result = 0; cleanup: - if ((result == 0) && inactiveDevs) - virPCIDeviceListDel(inactiveDevs, dev); - /* do not do it again */ dev->unbind_from_stub = false; dev->remove_slot = false; dev->reprobe = false; + revisit: VIR_FREE(drvdir); VIR_FREE(path); VIR_FREE(driver); diff --git a/tests/virpcimock.c b/tests/virpcimock.c index 0b49290..926a548 100644 --- a/tests/virpcimock.c +++ b/tests/virpcimock.c @@ -127,9 +127,15 @@ struct pciDevice { int vendor; int device; int class; + int iommu; struct pciDriver *driver; /* Driver attached. NULL if attached to no driver */ }; +struct pciIommuGroup { + int iommu; + size_t nDevices; /* @len is used for both @vendor and @device */ +}; + struct fdCallback { int fd; char *path; @@ -141,6 +147,9 @@ size_t nPCIDevices = 0; struct pciDriver **pciDrivers = NULL; size_t nPCIDrivers = 0; +struct pciIommuGroup **pciIommuGroups = NULL; +size_t npciIommuGroups = 0; + struct fdCallback *callbacks = NULL; size_t nCallbacks = 0; @@ -325,6 +334,7 @@ pci_device_new_from_stub(const struct pciDevice *data) char *configSrc; char tmp[32]; struct stat sb; + char *iommugrouppath, *deviommupath, *iommugroupdevs = NULL; if (VIR_STRDUP_QUIET(id, data->id) < 0) ABORT_OOM(); @@ -387,6 +397,25 @@ pci_device_new_from_stub(const struct pciDevice *data) ABORT("@tmp overflow"); make_file(devpath, "class", tmp, -1); + if (virAsprintfQuiet(&deviommupath, "%s/iommu_group", devpath) < 0 || + virAsprintfQuiet(&iommugrouppath, "%s/iommu_groups/%d", + fakesysfsdir, dev->iommu) < 0) + ABORT("@deviommupath overflow"); + + if (symlink(iommugrouppath, deviommupath) < 0) + ABORT("Unable to link device to iommu group"); + + VIR_FREE(deviommupath); + if (virAsprintfQuiet(&iommugroupdevs, "%s/devices/%s", + iommugrouppath, dev->id) < 0) + ABORT("@iommugroupdevs overflow"); + + if (symlink(devpath, iommugroupdevs) < 0) + ABORT("Unable to link iommu group devices to current device"); + + VIR_FREE(iommugrouppath); + VIR_FREE(iommugroupdevs); + if (pci_device_autobind(dev) < 0) ABORT("Unable to bind: %s", data->id); @@ -435,7 +464,89 @@ pci_device_autobind(struct pciDevice *dev) return pci_driver_bind(driver, dev); } +static void +pci_iommu_new(int num) +{ + char *iommupath; + struct pciIommuGroup *iommuG; + + if (VIR_ALLOC_QUIET(iommuG) < 0) + ABORT_OOM(); + iommuG->iommu = num; + + if (virAsprintfQuiet(&iommupath, "%s/iommu_groups/%d/devices", fakesysfsdir, num) < 0) + ABORT_OOM(); + + if (virFileMakePath(iommupath) < 0) + ABORT("Unable to create: %s", iommupath); + + if (VIR_APPEND_ELEMENT_QUIET(pciIommuGroups, npciIommuGroups, iommuG) < 0) + ABORT_OOM(); +} + +static int +pci_iommu_release(struct pciDevice *device) +{ + char *vfiopath = NULL; + int ret = -1; + size_t i = 0; + + for (i = 0; i < npciIommuGroups; i++) { + if (pciIommuGroups[i]->iommu == device->iommu) + break; + } + + if (i != npciIommuGroups) { + pciIommuGroups[i]->nDevices--; + if (!pciIommuGroups[i]->nDevices) { + if (virAsprintfQuiet(&vfiopath, "%s/dev/vfio/%d", + fakesysfsdir, device->iommu) < 0) { + errno = ENOMEM; + goto cleanup; + } + if (unlink(vfiopath) < 0) + goto cleanup; + } + } + ret = 0; + cleanup: + VIR_FREE(vfiopath); + return ret; +} + +static int +pci_iommu_lock(struct pciDevice *device) +{ + char *vfiopath = NULL; + int ret = -1; + size_t i = 0; + int fd = -1; + + for (i = 0; i < npciIommuGroups; i++) { + if (pciIommuGroups[i]->iommu == device->iommu) + break; + } + + if (i != npciIommuGroups) { + if (!pciIommuGroups[i]->nDevices) { + if (virAsprintfQuiet(&vfiopath, "%s/dev/vfio/%d", + fakesysfsdir, device->iommu) < 0) { + errno = ENOMEM; + goto cleanup; + } + if ((fd = realopen(vfiopath, O_CREAT)) < 0) + goto cleanup; + } + pciIommuGroups[i]->nDevices++; + } + + ret = 0; + cleanup: + realclose(fd); + VIR_FREE(vfiopath); + return ret; +} /* * PCI Driver functions */ @@ -556,6 +667,10 @@ pci_driver_bind(struct pciDriver *driver, if (symlink(devpath, driverpath) < 0) goto cleanup; + if (STREQ(driver->name, "vfio-pci")) + if (pci_iommu_lock(dev) < 0) + goto cleanup; + dev->driver = driver; ret = 0; cleanup: @@ -590,6 +705,10 @@ pci_driver_unbind(struct pciDriver *driver, unlink(driverpath) < 0) goto cleanup; + if (STREQ(driver->name, "vfio-pci")) + if (pci_iommu_release(dev) < 0) + goto cleanup; + dev->driver = NULL; ret = 0; cleanup: @@ -801,6 +920,8 @@ init_syms(void) static void init_env(void) { + char *devvfio; + if (fakesysfsdir) return; @@ -809,13 +930,33 @@ init_env(void) make_file(fakesysfsdir, "drivers_probe", NULL, -1); + if (virAsprintfQuiet(&devvfio, "%s/dev/vfio", fakesysfsdir) < 0) + ABORT_OOM(); + + if (virFileMakePath(devvfio) < 0) + ABORT("Unable to create: %s", devvfio); + VIR_FREE(devvfio); + + pci_iommu_new(1); + pci_iommu_new(2); + pci_iommu_new(3); + pci_iommu_new(4); + pci_iommu_new(5); + pci_iommu_new(6); + pci_iommu_new(7); + pci_iommu_new(8); + pci_iommu_new(9); + pci_iommu_new(10); + pci_iommu_new(11); + + # define MAKE_PCI_DRIVER(name, ...) \ pci_driver_new(name, 0, __VA_ARGS__, -1, -1) MAKE_PCI_DRIVER("iwlwifi", 0x8086, 0x0044); - MAKE_PCI_DRIVER("i915", 0x8086, 0x0046, 0x8086, 0x0047); + MAKE_PCI_DRIVER("i915", 0x8086, 0x0046, 0x8086, 0x0047, 0x8086, 0x0048, 0x1033, 0x0035, 0x1033, 0x00e0); MAKE_PCI_DRIVER("pci-stub", -1, -1); - pci_driver_new("vfio-pci", PCI_ACTION_BIND, -1, -1); + MAKE_PCI_DRIVER("vfio-pci", -1, -1); # define MAKE_PCI_DEVICE(Id, Vendor, Device, ...) \ do { \ @@ -824,20 +965,20 @@ init_env(void) pci_device_new_from_stub(&dev); \ } while (0) - MAKE_PCI_DEVICE("0000:00:00.0", 0x8086, 0x0044); - MAKE_PCI_DEVICE("0000:00:01.0", 0x8086, 0x0044); - MAKE_PCI_DEVICE("0000:00:02.0", 0x8086, 0x0046); - MAKE_PCI_DEVICE("0000:00:03.0", 0x8086, 0x0048); - MAKE_PCI_DEVICE("0001:00:00.0", 0x1014, 0x03b9, .class = 0x060400); - MAKE_PCI_DEVICE("0001:01:00.0", 0x8086, 0x105e); - MAKE_PCI_DEVICE("0001:01:00.1", 0x8086, 0x105e); - MAKE_PCI_DEVICE("0005:80:00.0", 0x10b5, 0x8112, .class = 0x060400); - MAKE_PCI_DEVICE("0005:90:01.0", 0x1033, 0x0035); - MAKE_PCI_DEVICE("0005:90:01.1", 0x1033, 0x0035); - MAKE_PCI_DEVICE("0005:90:01.2", 0x1033, 0x00e0); - MAKE_PCI_DEVICE("0000:0a:01.0", 0x8086, 0x0047); - MAKE_PCI_DEVICE("0000:0a:02.0", 0x8286, 0x0048); - MAKE_PCI_DEVICE("0000:0a:03.0", 0x8386, 0x0048); + MAKE_PCI_DEVICE("0000:00:00.0", 0x8086, 0x0044, .iommu = 1); + MAKE_PCI_DEVICE("0000:00:01.0", 0x8086, 0x0044, .iommu = 2); + MAKE_PCI_DEVICE("0000:00:02.0", 0x8086, 0x0046, .iommu = 3); + MAKE_PCI_DEVICE("0000:00:03.0", 0x8086, 0x0048, .iommu = 4); + MAKE_PCI_DEVICE("0001:00:00.0", 0x1014, 0x03b9, .iommu = 5, .class = 0x060400); + MAKE_PCI_DEVICE("0001:01:00.0", 0x8086, 0x105e, .iommu = 6); + MAKE_PCI_DEVICE("0001:01:00.1", 0x8086, 0x105e, .iommu = 6); + MAKE_PCI_DEVICE("0005:80:00.0", 0x10b5, 0x8112, .iommu = 7, .class = 0x060400); + MAKE_PCI_DEVICE("0005:90:01.0", 0x1033, 0x0035, .iommu = 8); + MAKE_PCI_DEVICE("0005:90:01.1", 0x1033, 0x0035, .iommu = 8); + MAKE_PCI_DEVICE("0005:90:01.2", 0x1033, 0x00e0, .iommu = 8); + MAKE_PCI_DEVICE("0000:0a:01.0", 0x8086, 0x0047, .iommu = 9); + MAKE_PCI_DEVICE("0000:0a:02.0", 0x8286, 0x0048, .iommu = 10); + MAKE_PCI_DEVICE("0000:0a:03.0", 0x8386, 0x0048, .iommu = 11); } diff --git a/tests/virpcitest.c b/tests/virpcitest.c index d4d3253..15cf7d2 100644 --- a/tests/virpcitest.c +++ b/tests/virpcitest.c @@ -111,6 +111,8 @@ testVirPCIDeviceDetach(const void *oaque ATTRIBUTE_UNUSED) virPCIDeviceSetStubDriver(dev[i], "pci-stub") < 0) goto cleanup; + virPCIDeviceSetIommuGroup(dev[i], 1); + if (virPCIDeviceDetach(dev[i], activeDevs, inactiveDevs) < 0) goto cleanup; @@ -191,6 +193,8 @@ testVirPCIDeviceReattach(const void *opaque ATTRIBUTE_UNUSED) if (virPCIDeviceSetStubDriver(dev[i], "pci-stub") < 0) goto cleanup; + + virPCIDeviceSetIommuGroup(dev[i], 1); } CHECK_LIST_COUNT(activeDevs, 0); @@ -211,11 +215,89 @@ testVirPCIDeviceReattach(const void *opaque ATTRIBUTE_UNUSED) return ret; } +static int +testVirPCIDeviceVFIOReattach(const void *opaque ATTRIBUTE_UNUSED) +{ + int ret = -1; + virPCIDevicePtr dev[] = {NULL, NULL, NULL}; + size_t i, nDev = ARRAY_CARDINALITY(dev); + virPCIDeviceListPtr activeDevs = NULL, inactiveDevs = NULL; + int count; + + if (!(activeDevs = virPCIDeviceListNew()) || + !(inactiveDevs = virPCIDeviceListNew())) + goto cleanup; + + for (i = 0; i < nDev; i++) { + if (!(dev[i] = virPCIDeviceNew(5, 0x90, 1, i))) + goto cleanup; + /* + if (virPCIDeviceListAdd(inactiveDevs, dev[i]) < 0) { + virPCIDeviceFree(dev[i]); + goto cleanup; + } + */ + if (virPCIDeviceSetStubDriver(dev[i], "vfio-pci") < 0) + goto cleanup; + + virPCIDeviceSetIommuGroup(dev[i], 8); + + if (virPCIDeviceDetach(dev[i], activeDevs, inactiveDevs) < 0) + goto cleanup; + } + + CHECK_LIST_COUNT(activeDevs, 0); + CHECK_LIST_COUNT(inactiveDevs, nDev); + + /* Check that the reattach fails when any domain is actively using any of + * the devices in the iommu. */ + + if (virPCIDeviceListAddCopy(activeDevs, dev[0]) < 0) { + virPCIDeviceFree(dev[0]); + goto cleanup; + } + + if (virPCIDeviceReattach(dev[1], activeDevs, inactiveDevs) < 0) + goto cleanup; + + if (virPCIDeviceReattach(dev[2], activeDevs, inactiveDevs) < 0) + goto cleanup; + + virPCIDeviceListDel(activeDevs, dev[0]); + /* The devices should get queued in inactiveDevs */ + CHECK_LIST_COUNT(activeDevs, 0); + CHECK_LIST_COUNT(inactiveDevs, 3); + + if (virPCIDeviceReattach(dev[0], activeDevs, inactiveDevs) < 0) + goto cleanup; + + /* The queued devices should all be Reattached */ + CHECK_LIST_COUNT(activeDevs, 0); + CHECK_LIST_COUNT(inactiveDevs, 0); + + /* Already reattached. No need to queue */ + for (i = 0; i < nDev; i++) { + if (virPCIDeviceReattach(dev[i], activeDevs, inactiveDevs) < 0) + goto cleanup; + } + + /* Already reattached. No need to queue */ + CHECK_LIST_COUNT(activeDevs, 0); + CHECK_LIST_COUNT(inactiveDevs, 0); + + ret = 0; + cleanup: + virObjectUnref(activeDevs); + virObjectUnref(inactiveDevs); + return ret; +} + struct testPCIDevData { unsigned int domain; unsigned int bus; unsigned int slot; unsigned int function; + int iommu; const char *driver; }; @@ -248,6 +330,8 @@ testVirPCIDeviceDetachSingle(const void *opaque) if (!dev) goto cleanup; + virPCIDeviceSetIommuGroup(dev, data->iommu); + if (virPCIDeviceSetStubDriver(dev, "pci-stub") < 0 || virPCIDeviceDetach(dev, NULL, NULL) < 0) goto cleanup; @@ -259,7 +343,7 @@ testVirPCIDeviceDetachSingle(const void *opaque) } static int -testVirPCIDeviceDetachFail(const void *opaque) +testVirPCIDeviceDetachSuccess(const void *opaque) { const struct testPCIDevData *data = opaque; int ret = -1; @@ -269,6 +353,8 @@ testVirPCIDeviceDetachFail(const void *opaque) if (!dev) goto cleanup; + virPCIDeviceSetIommuGroup(dev, data->iommu); + if (virPCIDeviceSetStubDriver(dev, "vfio-pci") < 0) goto cleanup; @@ -276,14 +362,17 @@ testVirPCIDeviceDetachFail(const void *opaque) if (virTestGetVerbose() || virTestGetDebug()) virDispatchError(NULL); virResetLastError(); - ret = 0; - } else { - virReportError(VIR_ERR_INTERNAL_ERROR, - "Attaching device %s to %s should have failed", - virPCIDeviceGetName(dev), - virPCIDeviceGetStubDriver(dev)); + ret = -1; + } + + if (virPCIDeviceReattach(dev, NULL, NULL) < 0) { + if (virTestGetVerbose() || virTestGetDebug()) + virDispatchError(NULL); + virResetLastError(); + ret = -1; } + ret = 0; cleanup: virPCIDeviceFree(dev); return ret; @@ -300,6 +389,8 @@ testVirPCIDeviceReattachSingle(const void *opaque) if (!dev) goto cleanup; + virPCIDeviceSetIommuGroup(dev, data->iommu); + virPCIDeviceReattachInit(dev); if (virPCIDeviceReattach(dev, NULL, NULL) < 0) goto cleanup; @@ -321,6 +412,8 @@ testVirPCIDeviceCheckDriverTest(const void *opaque) if (!dev) goto cleanup; + virPCIDeviceSetIommuGroup(dev, data->iommu); + if (testVirPCIDeviceCheckDriver(dev, data->driver) < 0) goto cleanup; @@ -341,6 +434,8 @@ testVirPCIDeviceUnbind(const void *opaque) if (!dev) goto cleanup; + virPCIDeviceSetIommuGroup(dev, data->iommu); + if (virPCIDeviceUnbind(dev, false) < 0) goto cleanup; @@ -376,10 +471,10 @@ mymain(void) ret = -1; \ } while (0) -# define DO_TEST_PCI(fnc, domain, bus, slot, function) \ +# define DO_TEST_PCI(fnc, domain, bus, slot, function, iommu) \ do { \ struct testPCIDevData data = { \ - domain, bus, slot, function, NULL \ + domain, bus, slot, function, iommu, NULL \ }; \ char *label = NULL; \ if (virAsprintf(&label, "%s(%04x:%02x:%02x.%x)", \ @@ -392,10 +487,10 @@ mymain(void) VIR_FREE(label); \ } while (0) -# define DO_TEST_PCI_DRIVER(domain, bus, slot, function, driver) \ +# define DO_TEST_PCI_DRIVER(domain, bus, slot, function, iommu, driver) \ do { \ struct testPCIDevData data = { \ - domain, bus, slot, function, driver \ + domain, bus, slot, function, iommu, driver \ }; \ char *label = NULL; \ if (virAsprintf(&label, "PCI driver %04x:%02x:%02x.%x is %s", \ @@ -418,31 +513,34 @@ mymain(void) DO_TEST(testVirPCIDeviceDetach); DO_TEST(testVirPCIDeviceReset); DO_TEST(testVirPCIDeviceReattach); - DO_TEST_PCI(testVirPCIDeviceIsAssignable, 5, 0x90, 1, 0); - DO_TEST_PCI(testVirPCIDeviceIsAssignable, 1, 1, 0, 0); + DO_TEST_PCI(testVirPCIDeviceIsAssignable, 5, 0x90, 1, 0, 8); + DO_TEST_PCI(testVirPCIDeviceIsAssignable, 1, 1, 0, 0, 6); - DO_TEST_PCI(testVirPCIDeviceDetachFail, 0, 0x0a, 1, 0); + DO_TEST_PCI(testVirPCIDeviceDetachSuccess, 0, 0x0a, 1, 0, 9); /* Reattach a device already bound to non-stub a driver */ - DO_TEST_PCI_DRIVER(0, 0x0a, 1, 0, "i915"); - DO_TEST_PCI(testVirPCIDeviceReattachSingle, 0, 0x0a, 1, 0); - DO_TEST_PCI_DRIVER(0, 0x0a, 1, 0, "i915"); + DO_TEST_PCI_DRIVER(0, 0x0a, 1, 0, 9, "i915"); + DO_TEST_PCI(testVirPCIDeviceReattachSingle, 0, 0x0a, 1, 0, 9); + DO_TEST_PCI_DRIVER(0, 0x0a, 1, 0, 9, "i915"); /* Reattach an unbound device */ - DO_TEST_PCI(testVirPCIDeviceUnbind, 0, 0x0a, 1, 0); - DO_TEST_PCI_DRIVER(0, 0x0a, 1, 0, NULL); - DO_TEST_PCI(testVirPCIDeviceReattachSingle, 0, 0x0a, 1, 0); - DO_TEST_PCI_DRIVER(0, 0x0a, 1, 0, "i915"); + DO_TEST_PCI(testVirPCIDeviceUnbind, 0, 0x0a, 1, 0, 9); + DO_TEST_PCI_DRIVER(0, 0x0a, 1, 0, 9, NULL); + DO_TEST_PCI(testVirPCIDeviceReattachSingle, 0, 0x0a, 1, 0, 9); + DO_TEST_PCI_DRIVER(0, 0x0a, 1, 0, 9, "i915"); /* Detach an unbound device */ - DO_TEST_PCI_DRIVER(0, 0x0a, 2, 0, NULL); - DO_TEST_PCI(testVirPCIDeviceDetachSingle, 0, 0x0a, 2, 0); - DO_TEST_PCI_DRIVER(0, 0x0a, 2, 0, "pci-stub"); + DO_TEST_PCI_DRIVER(0, 0x0a, 2, 0, 10, NULL); + DO_TEST_PCI(testVirPCIDeviceDetachSingle, 0, 0x0a, 2, 0, 10); + DO_TEST_PCI_DRIVER(0, 0x0a, 2, 0, 10, "pci-stub"); + + /* Reattach an unknown unbound device */ + DO_TEST_PCI_DRIVER(0, 0x0a, 3, 0, 11, NULL); + DO_TEST_PCI(testVirPCIDeviceReattachSingle, 0, 0x0a, 3, 0, 11); + DO_TEST_PCI_DRIVER(0, 0x0a, 3, 0, 11, NULL); /* Reattach an unknown unbound device */ - DO_TEST_PCI_DRIVER(0, 0x0a, 3, 0, NULL); - DO_TEST_PCI(testVirPCIDeviceReattachSingle, 0, 0x0a, 3, 0); - DO_TEST_PCI_DRIVER(0, 0x0a, 3, 0, NULL); + DO_TEST(testVirPCIDeviceVFIOReattach); if (getenv("LIBVIRT_SKIP_CLEANUP") == NULL) virFileDeleteTree(fakesysfsdir);

There could be a delay of 1 or 2 seconds before the vfio-pci driver is unbound and the device file /dev/vfio/<iommu> is actually removed. If the file exists, the host driver probing the device can lead to crash. So, wait and avoid the crash. Signed-off-by: Shivaprasad G Bhat <sbhat@linux.vnet.ibm.com> --- src/util/virpci.c | 41 +++++++++++++++++++++++++++++++++++++++++ tests/virpcimock.c | 14 ++++++++++++-- 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/src/util/virpci.c b/src/util/virpci.c index caa2458..157327f 100644 --- a/src/util/virpci.c +++ b/src/util/virpci.c @@ -1098,6 +1098,43 @@ virPCIIsKnownStub(char *driver) return ret; } +#define VFIO_UNBIND_TIMEOUT 100 + +/* It is not safe to initiate host driver probe if the vfio driver has not + * completely unbound the device. Usual wait time is 1 to 2 seconds. + * So, return if the unbind didn't complete in 10 seconds. + */ +static int +virPCIWaitForVFIOUnbindCompletion(virPCIDevicePtr dev) +{ + int retry = 0; + int ret = -1; + char *path = NULL; + + if (!(path = virPCIDeviceGetIOMMUGroupDev(dev))) + goto cleanup; + + while (retry++ < VFIO_UNBIND_TIMEOUT) { + if (!virFileExists(path)) + break; + usleep(100000); /* Sleep for 1/10th of a second */ + } + + if (virFileExists(path)) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("VFIO unbind not completed even after %d seconds" + " for device %s"), retry, dev->name); + goto cleanup; + } + + ret = 0; + cleanup: + VIR_FREE(path); + return ret; + +} + + static int virPCIDeviceReprobeHostDriver(virPCIDevicePtr dev, const char *driver, @@ -1275,6 +1312,10 @@ virPCIDeviceUnbindFromStub(virPCIDevicePtr dev, /* This device is the last to unbind from vfio. As we explicitly * add a missing device in the list to inactiveList, we will just * go through the list. */ + + if (virPCIWaitForVFIOUnbindCompletion(dev) < 0) + goto cleanup; + while (inactiveDevs && (i < virPCIDeviceListCount(inactiveDevs))) { virPCIDevicePtr pcidev = virPCIDeviceListGet(inactiveDevs, i); if (dev->iommuGroup == pcidev->iommuGroup) { diff --git a/tests/virpcimock.c b/tests/virpcimock.c index 926a548..9ee46d9 100644 --- a/tests/virpcimock.c +++ b/tests/virpcimock.c @@ -52,6 +52,7 @@ static DIR * (*realopendir)(const char *name); char *fakesysfsdir; # define PCI_SYSFS_PREFIX "/sys/bus/pci/" +# define ROOT_DEV_PREFIX "/dev/" # define STDERR(...) \ fprintf(stderr, "%s %zu: ", __FUNCTION__, (size_t) __LINE__); \ @@ -248,6 +249,13 @@ getrealpath(char **newpath, errno = ENOMEM; return -1; } + } else if (STRPREFIX(path, ROOT_DEV_PREFIX)) { + if (virAsprintfQuiet(newpath, "%s/%s", + fakesysfsdir, + path) < 0) { + errno = ENOMEM; + return -1; + } } else { if (VIR_STRDUP_QUIET(*newpath, path) < 0) return -1; @@ -995,7 +1003,8 @@ access(const char *path, int mode) init_syms(); - if (STRPREFIX(path, PCI_SYSFS_PREFIX)) { + if (STRPREFIX(path, PCI_SYSFS_PREFIX) || + STRPREFIX(path, ROOT_DEV_PREFIX)) { char *newpath; if (getrealpath(&newpath, path) < 0) return -1; @@ -1014,7 +1023,8 @@ __lxstat(int ver, const char *path, struct stat *sb) init_syms(); - if (STRPREFIX(path, PCI_SYSFS_PREFIX)) { + if (STRPREFIX(path, PCI_SYSFS_PREFIX) || + STRPREFIX(path, ROOT_DEV_PREFIX)) { char *newpath; if (getrealpath(&newpath, path) < 0) return -1;
participants (1)
-
Shivaprasad G Bhat