QEMU emits DEVICE_DELETED events during a device's "unparent"
callback, but some additional cleanup occurs afterward via
"finalize". In most cases libvirt can ignore the latter, but in the
case of VFIO the closing of a device's group FD happens here, which
is something libvirt needs to wait for before attempting to bind a
hostdev back to a host driver. In the case of powernv, and possibly
other host archs as well, failing to do this can lead to the host
device driver crashing due to necessary setup (like restoring default
DMA windows for the IOMMU group) not being completed yet.
We attempt to avoid this here by polling the QEMU process for open
FDs referencing /dev/vfio/<group num> and waiting for a certain period
of time. In practice the delay between the DEVICE_DELETED
event and closing of the group FD seems to be around 6 seconds, so we
set the max wait time at 15 seconds. If we time out we leave the
device in the inactiveList and bound to VFIO. We only attempt the
wait if the last hostdev from an IOMMU group is being detached and
there's reasonable expectation that the group FD will be closed soon.
There are alternatives to this approach, like adding a specific
group delete event to QEMU and handling this cleanup via and
asynchronous event handler, nut since we do a similar poll-wait for
things like KVM device passthrough this simple approach is hopefully
a reasonable starting point at least.
Signed-off-by: Michael Roth <mdroth(a)linux.vnet.ibm.com>
---
src/libvirt_private.syms | 1 +
src/qemu/qemu_hotplug.c | 34 +++++++++++++++++++++++++++++--
src/util/virfile.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++
src/util/virfile.h | 1 +
4 files changed, 86 insertions(+), 2 deletions(-)
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
index ba7fa39..787267c 100644
--- a/src/libvirt_private.syms
+++ b/src/libvirt_private.syms
@@ -1657,6 +1657,7 @@ virFileIsDir;
virFileIsExecutable;
virFileIsLink;
virFileIsMountPoint;
+virFileIsOpenByPid;
virFileIsSharedFS;
virFileIsSharedFSType;
virFileLength;
diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c
index af5ee6f..d200bab 100644
--- a/src/qemu/qemu_hotplug.c
+++ b/src/qemu/qemu_hotplug.c
@@ -68,6 +68,8 @@ VIR_LOG_INIT("qemu.qemu_hotplug");
/* Wait up to 5 seconds for device removal to finish. */
unsigned long long qemuDomainRemoveDeviceWaitTime = 1000ull * 5;
+/* Wait up to 15 seconds for iommu group close */
+unsigned long long qemuDomainRemoveDeviceGroupWaitTime = 1000ull * 15;
/**
* qemuDomainPrepareDisk:
@@ -3830,6 +3832,32 @@ qemuDomainRemoveSCSIVHostDevice(virQEMUDriverPtr driver,
}
static int
+qemuDomainWaitForDeviceGroupClose(virDomainObjPtr vm, int iommu_group)
+{
+ char *group_path;
+ unsigned long long remaining_ms = qemuDomainRemoveDeviceGroupWaitTime;
+ int rc = -1;
+
+ if (virAsprintf(&group_path, "/dev/vfio/%d", iommu_group) < 0)
+ return -1;
+
+ while ((rc = virFileIsOpenByPid(group_path, vm->pid)) == 1) {
+ if (remaining_ms <= 0)
+ break;
+ usleep(100*1000);
+ remaining_ms -= 100;
+ }
+
+ VIR_DEBUG("IOMMU group %d FD status: %d, wait time: %llu ms",
+ iommu_group, rc,
+ qemuDomainRemoveDeviceGroupWaitTime - remaining_ms);
+
+ VIR_FREE(group_path);
+ return rc;
+}
+
+
+static int
qemuDomainRemoveHostDevice(virQEMUDriverPtr driver,
virDomainObjPtr vm,
virDomainHostdevDefPtr hostdev)
@@ -3933,8 +3961,10 @@ qemuDomainRemoveHostDevice(virQEMUDriverPtr driver,
virPCIDeviceAddressGetIOMMUGroupNum(&hostdev->source.subsys.u.pci.addr);
if (virHostdevPCIDeviceGroupUnbindable(driver->hostdevMgr,
iommu_group)) {
- virHostdevPCIDeviceGroupUnbind(driver->hostdevMgr,
- iommu_group);
+ if (qemuDomainWaitForDeviceGroupClose(vm, iommu_group) == 0) {
+ virHostdevPCIDeviceGroupUnbind(driver->hostdevMgr,
+ iommu_group);
+ }
}
}
diff --git a/src/util/virfile.c b/src/util/virfile.c
index d444b32..29b762f 100644
--- a/src/util/virfile.c
+++ b/src/util/virfile.c
@@ -4162,3 +4162,55 @@ virFileReadValueString(char **value, const char *format, ...)
VIR_FREE(str);
return ret;
}
+
+int
+virFileIsOpenByPid(const char *path, pid_t pid)
+{
+ struct dirent *ent;
+ DIR *filelist_dir;
+ char *filelist_path;
+ bool found = false;
+ int rc = -1;
+
+ if (!path || !IS_ABSOLUTE_FILE_NAME(path)) {
+ virReportError(VIR_ERR_INTERNAL_ERROR,
+ _("invalid path: %s"), path ? path : "null");
+ goto error;
+ }
+
+ if (virAsprintf(&filelist_path, "/proc/%d/fd", pid) < 0)
+ goto error;
+
+ if (virDirOpen(&filelist_dir, filelist_path) < 0) {
+ virReportError(VIR_ERR_INTERNAL_ERROR,
+ _("unable to open directory: %s"), filelist_path);
+ goto error;
+ }
+
+ while (!found &&
+ (rc = virDirRead(filelist_dir, &ent, filelist_path)) == 1) {
+ char *resolved_path = NULL;
+ char *link_path;
+ if ((rc = virAsprintf(&link_path, "%s/%s", filelist_path,
ent->d_name)) < 0)
+ break;
+ if (virFileResolveLink(link_path, &resolved_path) == 0) {
+ if (resolved_path) {
+ VIR_DEBUG("checking absolute path for match (need: %s, got:
%s)",
+ path, resolved_path);
+ if (STREQ(resolved_path, path))
+ found = true;
+ VIR_FREE(resolved_path);
+ }
+ }
+ }
+
+ VIR_DIR_CLOSE(filelist_dir);
+ error:
+ VIR_FREE(filelist_path);
+
+ VIR_DEBUG("returning, rc: %d, found: %d", rc, found);
+ if (rc < 0)
+ return rc;
+
+ return found ? 1 : 0;
+}
diff --git a/src/util/virfile.h b/src/util/virfile.h
index 57ceb80..fb86786 100644
--- a/src/util/virfile.h
+++ b/src/util/virfile.h
@@ -347,6 +347,7 @@ int virFileReadValueScaledInt(unsigned long long *value, const char
*format, ...
int virFileReadValueString(char **value, const char *format, ...)
ATTRIBUTE_FMT_PRINTF(2, 3);
+int virFileIsOpenByPid(const char *path, pid_t pid);
int virFileInData(int fd,
int *inData,
--
2.7.4