On a Monday in 2025, Nathan Chen via Devel wrote:
Open iommufd FDs from libvirt backend without exposing these FDs to XML users, i.e. one per domain for /dev/iommu and one per iommufd hostdev for /dev/vfio/devices/vfioX, and pass the FD to qemu command line.
The part formatting the object and the part formatting the device should be split.
Signed-off-by: Nathan Chen <nathanc@nvidia.com> --- src/qemu/qemu_command.c | 43 +++++++- src/qemu/qemu_command.h | 3 +- src/qemu/qemu_domain.c | 8 ++ src/qemu/qemu_domain.h | 7 ++ src/qemu/qemu_hotplug.c | 2 +- src/qemu/qemu_process.c | 232 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 289 insertions(+), 6 deletions(-)
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index 8fd7527645..740a6970f2 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -4730,7 +4730,8 @@ qemuBuildVideoCommandLine(virCommand *cmd,
virJSONValue * qemuBuildPCIHostdevDevProps(const virDomainDef *def, - virDomainHostdevDef *dev) + virDomainHostdevDef *dev, + virDomainObj *vm)
Hmm, perhaps exposing the iommufd object in the XML would save us from having to pass this.
{ g_autoptr(virJSONValue) props = NULL; virDomainHostdevSubsysPCI *pcisrc = &dev->source.subsys.u.pci; @@ -4741,6 +4742,13 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def, const char *iommufdId = NULL; /* 'ramfb' property must be omitted unless it's to be enabled */ bool ramfb = pcisrc->ramfb == VIR_TRISTATE_SWITCH_ON; + bool useIommufd = false; + qemuDomainObjPrivate *priv = vm ? vm->privateData : NULL; + + if (pcisrc->driver.name == VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO && + pcisrc->driver.iommufd) { + useIommufd = true; + }
/* caller has to assign proper passthrough driver name */ switch (pcisrc->driver.name) { @@ -4787,6 +4795,18 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def, NULL) < 0) return NULL;
+ if (useIommufd && priv) { + g_autofree char *vfioFdName = g_strdup_printf("vfio-%04x:%02x:%02x.%d", + pcisrc->addr.domain, pcisrc->addr.bus, + pcisrc->addr.slot, pcisrc->addr.function); +
There's no need to duplicate the list of hostdevs which use iommufd in a per-domain hash table. For storing per-device file descriptors, we have per-device private data.
+ int vfiofd = GPOINTER_TO_INT(g_hash_table_lookup(priv->vfioDeviceFds, vfioFdName)); + if (virJSONValueObjectAdd(&props, + "S:fd", g_strdup_printf("%d", vfiofd), + NULL) < 0) + return NULL; + } + if (qemuBuildDeviceAddressProps(props, def, dev->info) < 0) return NULL;
@@ -5197,12 +5217,14 @@ qemuBuildAcpiNodesetProps(virCommand *cmd, static int qemuBuildHostdevCommandLine(virCommand *cmd, const virDomainDef *def, - virQEMUCaps *qemuCaps) + virQEMUCaps *qemuCaps, + virDomainObj *vm) { size_t i; g_autoptr(virJSONValue) props = NULL; int iommufd = 0; const char * iommufdId = "iommufd0"; + qemuDomainObjPrivate *priv = vm->privateData;
for (i = 0; i < def->nhostdevs; i++) { virDomainHostdevDef *hostdev = def->hostdevs[i]; @@ -5233,8 +5255,10 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
if (subsys->u.pci.driver.iommufd && iommufd == 0) { iommufd = 1; + virCommandPassFD(cmd, priv->iommufd, VIR_COMMAND_PASS_FD_CLOSE_PARENT); if (qemuMonitorCreateObjectProps(&props, "iommufd", iommufdId, + "S:fd", g_strdup_printf("%d", priv->iommufd), NULL) < 0) return -1;
@@ -5245,7 +5269,18 @@ qemuBuildHostdevCommandLine(virCommand *cmd, if (qemuCommandAddExtDevice(cmd, hostdev->info, def, qemuCaps) < 0) return -1;
- if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev))) + if (subsys->u.pci.driver.iommufd) { + virDomainHostdevSubsysPCI *pcisrc = &hostdev->source.subsys.u.pci; + g_autofree char *vfioFdName = g_strdup_printf("vfio-%04x:%02x:%02x.%d", + pcisrc->addr.domain, pcisrc->addr.bus, + pcisrc->addr.slot, pcisrc->addr.function); + + int vfiofd = GPOINTER_TO_INT(g_hash_table_lookup(priv->vfioDeviceFds, vfioFdName)); + + virCommandPassFD(cmd, vfiofd, VIR_COMMAND_PASS_FD_CLOSE_PARENT);
This would become just: qemuDomainHostdevPrivate *priv = (qemuDomainHostdevPrivate *)vsock->privateData; virCommandPassFD(cmd, priv->vfiofd, VIR_COMMAND_PASS_FD_CLOSE_PARENT);
+ } + + if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev, vm))) return -1;
if (qemuBuildDeviceCommandlineFromJSON(cmd, devprops, def, qemuCaps) < 0) @@ -10893,7 +10928,7 @@ qemuBuildCommandLine(virDomainObj *vm, if (qemuBuildRedirdevCommandLine(cmd, def, qemuCaps) < 0) return NULL;
- if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps) < 0) + if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps, vm) < 0) return NULL;
if (migrateURI) diff --git a/src/qemu/qemu_command.h b/src/qemu/qemu_command.h index ad068f1f16..380aac261f 100644 --- a/src/qemu/qemu_command.h +++ b/src/qemu/qemu_command.h @@ -180,7 +180,8 @@ qemuBuildThreadContextProps(virJSONValue **tcProps, /* Current, best practice */ virJSONValue * qemuBuildPCIHostdevDevProps(const virDomainDef *def, - virDomainHostdevDef *dev); + virDomainHostdevDef *dev, + virDomainObj *vm);
virJSONValue * qemuBuildRNGDevProps(const virDomainDef *def, diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index a42721efad..86640aa3e3 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -1953,6 +1953,11 @@ qemuDomainObjPrivateFree(void *data)
virChrdevFree(priv->devs);
+ if (priv->iommufd >= 0) { + virEventRemoveHandle(priv->iommufd);
There is no handle to remove (and none is needed). So no need for the condition either.
+ priv->iommufd = -1; + } + if (priv->pidMonitored >= 0) { virEventRemoveHandle(priv->pidMonitored); priv->pidMonitored = -1; diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 45fc32a663..cecfed94a7 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -25,6 +25,7 @@ #include <unistd.h> #include <signal.h> #include <sys/stat.h> +#include <dirent.h>
We should not need this in qemu_process.
#if WITH_SYS_SYSCALL_H # include <sys/syscall.h> #endif @@ -8091,6 +8092,9 @@ qemuProcessLaunch(virConnectPtr conn, if (qemuExtDevicesStart(driver, vm, incomingMigrationExtDevices) < 0) goto cleanup;
+ if (qemuProcessOpenVfioFds(vm) < 0) + goto cleanup; + if (!(cmd = qemuBuildCommandLine(vm, incoming ? "defer" : NULL, vmop, @@ -10267,3 +10271,231 @@ qemuProcessHandleNbdkitExit(qemuNbdkitProcess *nbdkit, qemuProcessEventSubmit(vm, QEMU_PROCESS_EVENT_NBDKIT_EXITED, 0, 0, nbdkit); virObjectUnlock(vm); } + +/** + * qemuProcessOpenIommuFd: + * @vm: domain object + * @iommuFd: returned file descriptor + * + * Opens /dev/iommu file descriptor for the VM. + * + * Returns: 0 on success, -1 on failure + */ +static int +qemuProcessOpenIommuFd(virDomainObj *vm, int *iommuFd) +{ + int fd = -1; + + VIR_DEBUG("Opening IOMMU FD for domain %s", vm->def->name); + + if ((fd = open("/dev/iommu", O_RDWR | O_CLOEXEC)) < 0) { + if (errno == ENOENT) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("IOMMU FD support requires /dev/iommu device")); + } else { + virReportSystemError(errno, "%s", + _("cannot open /dev/iommu")); + } + return -1; + } + + *iommuFd = fd; + VIR_DEBUG("Opened IOMMU FD %d for domain %s", fd, vm->def->name); + return 0; +} + +/** + * qemuProcessGetVfioDevicePath: + * @hostdev: host device definition + * @vfioPath: returned VFIO device path + * + * Constructs the VFIO device path for a PCI hostdev. + * + * Returns: 0 on success, -1 on failure + */ +static int +qemuProcessGetVfioDevicePath(virDomainHostdevDef *hostdev,
No need to pass the whole hostdev here. Then this function can live in virpci.c
+ char **vfioPath) +{ + virPCIDeviceAddress *addr; + g_autofree char *sysfsPath = NULL; + DIR *dir = NULL; + struct dirent *entry = NULL; + int ret = -1; +
Jano