Currently we bind a managed hostdev back to the host driver (or
"unbind" from the perspective of the stub driver) immediately upon
receiving a DEVICE_DELETED event from QEMU. In cases where we have
more one device from the group attached to a guest, this runs the risk
of putting the group in a "non-viable" state where both a guest and
host are using devices from a group simultaneously.
This patch addresses this by deferring the unbind step until all
hostdevs from a group have been detached from the guest. In the
meantime, they are left on the drvManager's inactiveList, in a similar
state as they would be if they were unmanaged devices that were bound
to VFIO via nodedev-detach but not yet plugged into a guest.
Signed-off-by: Michael Roth <mdroth(a)linux.vnet.ibm.com>
---
src/libvirt_private.syms | 3 ++
src/qemu/qemu_hostdev.c | 16 +++++++++
src/qemu/qemu_hostdev.h | 4 +++
src/qemu/qemu_hotplug.c | 16 ++++++++-
src/util/virhostdev.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++
src/util/virhostdev.h | 8 +++++
src/util/virpci.c | 27 +++++++++++++++
src/util/virpci.h | 1 +
8 files changed, 164 insertions(+), 1 deletion(-)
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
index 2bd3581..ba7fa39 100644
--- a/src/libvirt_private.syms
+++ b/src/libvirt_private.syms
@@ -1783,6 +1783,8 @@ virHostCPUStatsAssign;
virHostdevFindUSBDevice;
virHostdevIsSCSIDevice;
virHostdevManagerGetDefault;
+virHostdevPCIDeviceGroupUnbind;
+virHostdevPCIDeviceGroupUnbindable;
virHostdevPCINodeDeviceDetach;
virHostdevPCINodeDeviceReAttach;
virHostdevPCINodeDeviceReset;
@@ -2342,6 +2344,7 @@ virPCIDeviceWaitForCleanup;
virPCIEDeviceInfoFree;
virPCIGetDeviceAddressFromSysfsLink;
virPCIGetHeaderType;
+virPCIGetIOMMUGroupList;
virPCIGetNetName;
virPCIGetPhysicalFunction;
virPCIGetVirtualFunctionIndex;
diff --git a/src/qemu/qemu_hostdev.c b/src/qemu/qemu_hostdev.c
index 73d26f4..fdc52fe 100644
--- a/src/qemu/qemu_hostdev.c
+++ b/src/qemu/qemu_hostdev.c
@@ -384,6 +384,22 @@ qemuHostdevPrepareDomainDevices(virQEMUDriverPtr driver,
}
void
+qemuHostdevReleasePCIDevices(virQEMUDriverPtr driver,
+ const char *name,
+ virDomainHostdevDefPtr *hostdevs,
+ int nhostdevs)
+{
+ virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);
+ const char *oldStateDir = cfg->stateDir;
+ virHostdevManagerPtr hostdev_mgr = driver->hostdevMgr;
+
+ virHostdevReleasePCIDevices(hostdev_mgr, QEMU_DRIVER_NAME, name,
+ hostdevs, nhostdevs, oldStateDir);
+
+ virObjectUnref(cfg);
+}
+
+void
qemuHostdevReAttachPCIDevices(virQEMUDriverPtr driver,
const char *name,
virDomainHostdevDefPtr *hostdevs,
diff --git a/src/qemu/qemu_hostdev.h b/src/qemu/qemu_hostdev.h
index 9a7c7f1..b010085 100644
--- a/src/qemu/qemu_hostdev.h
+++ b/src/qemu/qemu_hostdev.h
@@ -74,6 +74,10 @@ void qemuHostdevReAttachPCIDevices(virQEMUDriverPtr driver,
const char *name,
virDomainHostdevDefPtr *hostdevs,
int nhostdevs);
+void qemuHostdevReleasePCIDevices(virQEMUDriverPtr driver,
+ const char *name,
+ virDomainHostdevDefPtr *hostdevs,
+ int nhostdevs);
void qemuHostdevReAttachUSBDevices(virQEMUDriverPtr driver,
const char *name,
virDomainHostdevDefPtr *hostdevs,
diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c
index b557e82..af5ee6f 100644
--- a/src/qemu/qemu_hotplug.c
+++ b/src/qemu/qemu_hotplug.c
@@ -3896,7 +3896,10 @@ qemuDomainRemoveHostDevice(virQEMUDriverPtr driver,
switch ((virDomainHostdevSubsysType) hostdev->source.subsys.type) {
case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI:
- qemuHostdevReAttachPCIDevices(driver, vm->def->name, &hostdev, 1);
+ if (is_vfio)
+ qemuHostdevReleasePCIDevices(driver, vm->def->name, &hostdev, 1);
+ else
+ qemuHostdevReAttachPCIDevices(driver, vm->def->name, &hostdev, 1);
qemuDomainReleaseDeviceAddress(vm, hostdev->info, NULL);
/* QEMU might no longer need to lock as much memory, eg. we just
* detached the last VFIO device, so adjust the limit here */
@@ -3925,6 +3928,17 @@ qemuDomainRemoveHostDevice(virQEMUDriverPtr driver,
virDomainNetDefFree(net);
}
+ if (is_vfio) {
+ int iommu_group =
+
virPCIDeviceAddressGetIOMMUGroupNum(&hostdev->source.subsys.u.pci.addr);
+ if (virHostdevPCIDeviceGroupUnbindable(driver->hostdevMgr,
+ iommu_group)) {
+ virHostdevPCIDeviceGroupUnbind(driver->hostdevMgr,
+ iommu_group);
+ }
+ }
+
+
ret = 0;
cleanup:
diff --git a/src/util/virhostdev.c b/src/util/virhostdev.c
index 2cd3f34..a7f04fe 100644
--- a/src/util/virhostdev.c
+++ b/src/util/virhostdev.c
@@ -905,6 +905,96 @@ virHostdevPreparePCIDevices(virHostdevManagerPtr mgr,
return ret;
}
+static bool
+virHostdevPCIDeviceUnbindableInternal(virHostdevManagerPtr mgr,
+ int iommu_group)
+{
+ struct virHostdevIsPCINodeDeviceUsedData data = { mgr, NULL, true };
+
+ if (virPCIIOMMUGroupIterate(iommu_group,
+ virHostdevIsPCINodeDeviceUsed,
+ &data) < 0) {
+ VIR_DEBUG("IOMMU group %d is not unbindable", iommu_group);
+ return false;
+ }
+
+ VIR_DEBUG("IOMMU group %d is unbindable", iommu_group);
+ return true;
+}
+
+/*
+ * Check if devices within IOMMU group are in use by any domains
+ */
+bool
+virHostdevPCIDeviceGroupUnbindable(virHostdevManagerPtr mgr,
+ int iommu_group)
+{
+ bool result;
+
+ virObjectLock(mgr->activePCIHostdevs);
+ result = virHostdevPCIDeviceUnbindableInternal(mgr, iommu_group);
+ virObjectUnlock(mgr->activePCIHostdevs);
+
+ return result;
+}
+
+/*
+ * Confirm all devices in IOMMU group are in inactiveList
+ * before attempting to reattach to host driver. Devices in IOMMU
+ * group that aren't in either activeList or inactiveList are considered
+ * outside our control, so we treat them as inactive as well.
+ *
+ * Callers can check virHostdevPCIDeviceGroupUnbindable() beforehand
+ * for some indication that the group is ready for reattach to the
+ * host, but since it's possible for a hostdev from the group to get
+ * re-attached to a guest prior to subsequently calling this function
+ * there is no guarantee of this, which should be fine since it would
+ * only be immediately rebound to the stub driver anyway.
+ */
+void
+virHostdevPCIDeviceGroupUnbind(virHostdevManagerPtr mgr,
+ int iommu_group)
+{
+ virPCIDeviceListPtr pcidevs = NULL;
+ size_t i;
+
+ virObjectLock(mgr->activePCIHostdevs);
+ virObjectLock(mgr->inactivePCIHostdevs);
+
+ if (!virHostdevPCIDeviceUnbindableInternal(mgr, iommu_group)) {
+ VIR_DEBUG("IOMMU group %d still in use, deferring reattach "
+ "of PCI devices to host", iommu_group);
+ goto cleanup;
+ }
+
+ pcidevs = virPCIGetIOMMUGroupList(iommu_group);
+ for (i = 0; i < virPCIDeviceListCount(pcidevs); i++) {
+ virPCIDevicePtr actual, pci = virPCIDeviceListGet(pcidevs, i);
+ virPCIDeviceAddressPtr devAddr = virPCIDeviceGetAddress(pci);
+
+ actual = virPCIDeviceListFindByIDs(mgr->inactivePCIHostdevs,
+ devAddr->domain,
+ devAddr->bus,
+ devAddr->slot,
+ devAddr->function);
+ if (actual) {
+ VIR_DEBUG("Reattaching PCI device %s",
virPCIDeviceGetName(actual));
+ if (virPCIDeviceGetManaged(actual))
+ if (virPCIDeviceReattach(actual, mgr->activePCIHostdevs,
+ mgr->inactivePCIHostdevs) < 0) {
+ VIR_ERROR(_("Failed to re-attach PCI device: %s"),
+ virGetLastErrorMessage());
+ virResetLastError();
+ }
+ }
+ }
+
+ cleanup:
+ virObjectUnref(pcidevs);
+ virObjectUnlock(mgr->activePCIHostdevs);
+ virObjectUnlock(mgr->inactivePCIHostdevs);
+}
+
/*
* Pre-condition: inactivePCIHostdevs & activePCIHostdevs
* are locked
diff --git a/src/util/virhostdev.h b/src/util/virhostdev.h
index fbc7fbd..2ab8101 100644
--- a/src/util/virhostdev.h
+++ b/src/util/virhostdev.h
@@ -122,6 +122,14 @@ virHostdevReleasePCIDevices(virHostdevManagerPtr mgr,
const char *oldStateDir)
ATTRIBUTE_NONNULL(1);
void
+virHostdevPCIDeviceGroupUnbind(virHostdevManagerPtr mgr,
+ int iommu_group)
+ ATTRIBUTE_NONNULL(1);
+bool
+virHostdevPCIDeviceGroupUnbindable(virHostdevManagerPtr mgr,
+ int iommu_group)
+ ATTRIBUTE_NONNULL(1);
+void
virHostdevReAttachUSBDevices(virHostdevManagerPtr hostdev_mgr,
const char *drv_name,
const char *dom_name,
diff --git a/src/util/virpci.c b/src/util/virpci.c
index b842f44..a8e5190 100644
--- a/src/util/virpci.c
+++ b/src/util/virpci.c
@@ -2298,6 +2298,33 @@ virPCIDeviceGetIOMMUGroupList(virPCIDevicePtr dev)
}
+/*
+ * virPCIGetIOMMUGroupList - return a virPCIDeviceList containing
+ * all of the devices in @iommu_group.
+ *
+ * Return the new list, or NULL on failure
+ */
+virPCIDeviceListPtr
+virPCIGetIOMMUGroupList(int iommu_group)
+{
+ virPCIDeviceListPtr groupList = virPCIDeviceListNew();
+
+ if (!groupList)
+ goto error;
+
+ if (virPCIIOMMUGroupIterate(iommu_group,
+ virPCIDeviceGetIOMMUGroupAddOne,
+ groupList) < 0)
+ goto error;
+
+ return groupList;
+
+ error:
+ virObjectUnref(groupList);
+ return NULL;
+}
+
+
typedef struct {
virPCIDeviceAddressPtr **iommuGroupDevices;
size_t *nIommuGroupDevices;
diff --git a/src/util/virpci.h b/src/util/virpci.h
index 5ec1306..5bcacb2 100644
--- a/src/util/virpci.h
+++ b/src/util/virpci.h
@@ -180,6 +180,7 @@ int virPCIIOMMUGroupIterate(int iommu_group,
virPCIDeviceAddressActor actor,
void *opaque);
virPCIDeviceListPtr virPCIDeviceGetIOMMUGroupList(virPCIDevicePtr dev);
+virPCIDeviceListPtr virPCIGetIOMMUGroupList(int iommu_group);
int virPCIDeviceAddressGetIOMMUGroupAddresses(virPCIDeviceAddressPtr devAddr,
virPCIDeviceAddressPtr
**iommuGroupDevices,
size_t *nIommuGroupDevices);
--
2.7.4