Open the iommufd-related FDs in the libvirt QEMU backend without
exposing them to XML users: one FD per domain for /dev/iommu and
one FD per iommufd hostdev for /dev/vfio/devices/vfioX. Pass these
FDs to QEMU on its command line.
Signed-off-by: Nathan Chen <nathanc(a)nvidia.com>
---
src/qemu/qemu_command.c | 44 +++++++-
src/qemu/qemu_command.h | 3 +-
src/qemu/qemu_domain.c | 8 ++
src/qemu/qemu_domain.h | 7 ++
src/qemu/qemu_hotplug.c | 2 +-
src/qemu/qemu_process.c | 232 ++++++++++++++++++++++++++++++++++++++++
6 files changed, 290 insertions(+), 6 deletions(-)
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
index 6b3e2ffd0d..359dbb2621 100644
--- a/src/qemu/qemu_command.c
+++ b/src/qemu/qemu_command.c
@@ -4797,7 +4797,8 @@ qemuBuildVideoCommandLine(virCommand *cmd,
virJSONValue *
qemuBuildPCIHostdevDevProps(const virDomainDef *def,
- virDomainHostdevDef *dev)
+ virDomainHostdevDef *dev,
+ virDomainObj *vm)
{
g_autoptr(virJSONValue) props = NULL;
virDomainHostdevSubsysPCI *pcisrc = &dev->source.subsys.u.pci;
@@ -4807,6 +4808,13 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def,
const char *driver = NULL;
/* 'ramfb' property must be omitted unless it's to be enabled */
bool ramfb = pcisrc->ramfb == VIR_TRISTATE_SWITCH_ON;
+ bool useIommufd = false;
+ qemuDomainObjPrivate *priv = vm ? vm->privateData : NULL;
+
+ if (pcisrc->driver.name == VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO &&
+ dev->iommufdId) {
+ useIommufd = true;
+ }
/* caller has to assign proper passthrough driver name */
switch (pcisrc->driver.name) {
@@ -4850,6 +4858,18 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def,
NULL) < 0)
return NULL;
+ if (useIommufd && priv) {
+ g_autofree char *vfioFdName =
g_strdup_printf("vfio-%04x:%02x:%02x.%d",
+ pcisrc->addr.domain,
pcisrc->addr.bus,
+ pcisrc->addr.slot,
pcisrc->addr.function);
+
+ int vfiofd = GPOINTER_TO_INT(g_hash_table_lookup(priv->vfioDeviceFds,
vfioFdName));
+ if (virJSONValueObjectAdd(&props,
+ "S:fd", g_strdup_printf("%d",
vfiofd),
+ NULL) < 0)
+ return NULL;
+ }
+
if (qemuBuildDeviceAddressProps(props, def, dev->info) < 0)
return NULL;
@@ -5223,11 +5243,13 @@ qemuBuildHostdevSCSICommandLine(virCommand *cmd,
static int
qemuBuildHostdevCommandLine(virCommand *cmd,
const virDomainDef *def,
- virQEMUCaps *qemuCaps)
+ virQEMUCaps *qemuCaps,
+ virDomainObj *vm)
{
size_t i;
g_autoptr(virJSONValue) props = NULL;
int iommufd = 0;
+ qemuDomainObjPrivate *priv = vm->privateData;
for (i = 0; i < def->nhostdevs; i++) {
virDomainHostdevDef *hostdev = def->hostdevs[i];
@@ -5239,8 +5261,11 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
if (hostdev->iommufdId && iommufd == 0) {
iommufd = 1;
+ virCommandPassFD(cmd, priv->iommufd, VIR_COMMAND_PASS_FD_CLOSE_PARENT);
+
if (qemuMonitorCreateObjectProps(&props, "iommufd",
hostdev->iommufdId,
+ "S:fd",
g_strdup_printf("%d", priv->iommufd),
NULL) < 0)
return -1;
@@ -5270,7 +5295,18 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
if (qemuCommandAddExtDevice(cmd, hostdev->info, def, qemuCaps) < 0)
return -1;
- if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev)))
+ if (hostdev->iommufdId) {
+ virDomainHostdevSubsysPCI *pcisrc =
&hostdev->source.subsys.u.pci;
+ g_autofree char *vfioFdName =
g_strdup_printf("vfio-%04x:%02x:%02x.%d",
+ pcisrc->addr.domain,
pcisrc->addr.bus,
+ pcisrc->addr.slot,
pcisrc->addr.function);
+
+ int vfiofd = GPOINTER_TO_INT(g_hash_table_lookup(priv->vfioDeviceFds,
vfioFdName));
+
+ virCommandPassFD(cmd, vfiofd, VIR_COMMAND_PASS_FD_CLOSE_PARENT);
+ }
+
+ if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev, vm)))
return -1;
if (qemuBuildDeviceCommandlineFromJSON(cmd, devprops, def, qemuCaps) < 0)
@@ -10960,7 +10996,7 @@ qemuBuildCommandLine(virDomainObj *vm,
if (qemuBuildRedirdevCommandLine(cmd, def, qemuCaps) < 0)
return NULL;
- if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps) < 0)
+ if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps, vm) < 0)
return NULL;
if (migrateURI)
diff --git a/src/qemu/qemu_command.h b/src/qemu/qemu_command.h
index ad068f1f16..380aac261f 100644
--- a/src/qemu/qemu_command.h
+++ b/src/qemu/qemu_command.h
@@ -180,7 +180,8 @@ qemuBuildThreadContextProps(virJSONValue **tcProps,
/* Current, best practice */
virJSONValue *
qemuBuildPCIHostdevDevProps(const virDomainDef *def,
- virDomainHostdevDef *dev);
+ virDomainHostdevDef *dev,
+ virDomainObj *vm);
virJSONValue *
qemuBuildRNGDevProps(const virDomainDef *def,
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index a2c7c88a7e..2086dbb575 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -1954,6 +1954,11 @@ qemuDomainObjPrivateFree(void *data)
virChrdevFree(priv->devs);
+ if (priv->iommufd >= 0) {
+ virEventRemoveHandle(priv->iommufd);
+ priv->iommufd = -1;
+ }
+
if (priv->pidMonitored >= 0) {
virEventRemoveHandle(priv->pidMonitored);
priv->pidMonitored = -1;
@@ -1975,6 +1980,7 @@ qemuDomainObjPrivateFree(void *data)
g_clear_pointer(&priv->blockjobs, g_hash_table_unref);
g_clear_pointer(&priv->fds, g_hash_table_unref);
+ g_clear_pointer(&priv->vfioDeviceFds, g_hash_table_unref);
/* This should never be non-NULL if we get here, but just in case... */
if (priv->eventThread) {
@@ -2003,7 +2009,9 @@ qemuDomainObjPrivateAlloc(void *opaque)
priv->blockjobs = virHashNew(virObjectUnref);
priv->fds = virHashNew(g_object_unref);
+ priv->vfioDeviceFds = g_hash_table_new(g_str_hash, g_str_equal);
+ priv->iommufd = -1;
priv->pidMonitored = -1;
/* agent commands block by default, user can choose different behavior */
diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h
index 1afd932764..6460323554 100644
--- a/src/qemu/qemu_domain.h
+++ b/src/qemu/qemu_domain.h
@@ -266,6 +266,10 @@ struct _qemuDomainObjPrivate {
/* named file descriptor groups associated with the VM */
GHashTable *fds;
+ int iommufd;
+
+ GHashTable *vfioDeviceFds;
+
char *memoryBackingDir;
};
@@ -1172,3 +1176,6 @@ qemuDomainCheckCPU(virArch arch,
bool
qemuDomainMachineSupportsFloppy(const char *machine,
virQEMUCaps *qemuCaps);
+
+int qemuProcessOpenVfioFds(virDomainObj *vm);
+void qemuProcessCloseVfioFds(virDomainObj *vm);
diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c
index e9568af125..e0e693e251 100644
--- a/src/qemu/qemu_hotplug.c
+++ b/src/qemu/qemu_hotplug.c
@@ -1633,7 +1633,7 @@ qemuDomainAttachHostPCIDevice(virQEMUDriver *driver,
goto error;
}
- if (!(devprops = qemuBuildPCIHostdevDevProps(vm->def, hostdev)))
+ if (!(devprops = qemuBuildPCIHostdevDevProps(vm->def, hostdev, vm)))
goto error;
qemuDomainObjEnterMonitor(vm);
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index a81c02c9d5..1bc779c6aa 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -25,6 +25,7 @@
#include <unistd.h>
#include <signal.h>
#include <sys/stat.h>
+#include <dirent.h>
#if WITH_SYS_SYSCALL_H
# include <sys/syscall.h>
#endif
@@ -8025,6 +8026,9 @@ qemuProcessLaunch(virConnectPtr conn,
if (qemuExtDevicesStart(driver, vm, incomingMigrationExtDevices) < 0)
goto cleanup;
+ if (qemuProcessOpenVfioFds(vm) < 0)
+ goto cleanup;
+
if (!(cmd = qemuBuildCommandLine(vm,
incoming ? "defer" : NULL,
vmop,
@@ -10206,3 +10210,231 @@ qemuProcessHandleNbdkitExit(qemuNbdkitProcess *nbdkit,
qemuProcessEventSubmit(vm, QEMU_PROCESS_EVENT_NBDKIT_EXITED, 0, 0, nbdkit);
virObjectUnlock(vm);
}
+
+/**
+ * qemuProcessOpenIommuFd:
+ * @vm: domain object (used here only for its name in debug messages)
+ * @iommuFd: where the opened descriptor is stored on success
+ *
+ * Opens /dev/iommu read-write with close-on-exec set. A missing device
+ * node (ENOENT) is reported as an unsupported configuration; any other
+ * open() failure is reported as a system error. @iommuFd is not written
+ * on failure.
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+static int
+qemuProcessOpenIommuFd(virDomainObj *vm, int *iommuFd)
+{
+    const char *domName = vm->def->name;
+    int newFd;
+
+    VIR_DEBUG("Opening IOMMU FD for domain %s", domName);
+
+    newFd = open("/dev/iommu", O_RDWR | O_CLOEXEC);
+    if (newFd >= 0) {
+        *iommuFd = newFd;
+        VIR_DEBUG("Opened IOMMU FD %d for domain %s", newFd, domName);
+        return 0;
+    }
+
+    /* Distinguish "kernel/host lacks iommufd" from other open() errors */
+    if (errno != ENOENT) {
+        virReportSystemError(errno, "%s",
+                             _("cannot open /dev/iommu"));
+    } else {
+        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                       _("IOMMU FD support requires /dev/iommu device"));
+    }
+
+    return -1;
+}
+
+/**
+ * qemuProcessGetVfioDevicePath:
+ * @hostdev: host device definition; must be a PCI subsystem hostdev
+ * @vfioPath: returned VFIO device path, allocated with g_strdup_printf()
+ *            (caller frees); written only on success
+ *
+ * Scans /sys/bus/pci/devices/DDDD:BB:DD.F/vfio-dev/ for the first entry
+ * whose name starts with "vfio" and builds the corresponding
+ * /dev/vfio/devices/<name> path.
+ *
+ * Returns: 0 on success, -1 on failure with an error reported
+ */
+static int
+qemuProcessGetVfioDevicePath(virDomainHostdevDef *hostdev,
+                             char **vfioPath)
+{
+    virPCIDeviceAddress *addr;
+    g_autofree char *sysfsPath = NULL;
+    g_autoptr(DIR) dir = NULL;
+    struct dirent *entry = NULL;
+    int rc;
+
+    if (hostdev->mode != VIR_DOMAIN_HOSTDEV_MODE_SUBSYS ||
+        hostdev->source.subsys.type != VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI) {
+        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                       _("VFIO FD only supported for PCI hostdevs"));
+        return -1;
+    }
+
+    addr = &hostdev->source.subsys.u.pci.addr;
+
+    /* Build sysfs path: /sys/bus/pci/devices/DDDD:BB:DD.F/vfio-dev/ */
+    sysfsPath = g_strdup_printf("/sys/bus/pci/devices/"
+                                "%04x:%02x:%02x.%d/vfio-dev/",
+                                addr->domain, addr->bus,
+                                addr->slot, addr->function);
+
+    /* virDirOpen() reports its own error on failure; reporting a second
+     * one here would only duplicate it. */
+    if (virDirOpen(&dir, sysfsPath) < 0)
+        return -1;
+
+    /* Find the vfio device name in the directory */
+    while ((rc = virDirRead(dir, &entry, sysfsPath)) > 0) {
+        if (STRPREFIX(entry->d_name, "vfio")) {
+            *vfioPath = g_strdup_printf("/dev/vfio/devices/%s", entry->d_name);
+            return 0;
+        }
+    }
+
+    /* A read failure was already reported by virDirRead(); do not replace
+     * it with the less specific "cannot find" message below. */
+    if (rc < 0)
+        return -1;
+
+    virReportError(VIR_ERR_INTERNAL_ERROR,
+                   _("cannot find VFIO device for PCI device %1$04x:%2$02x:%3$02x.%4$d"),
+                   addr->domain, addr->bus, addr->slot, addr->function);
+    return -1;
+}
+
+/**
+ * qemuProcessOpenVfioDeviceFd:
+ * @hostdev: host device definition
+ * @vfioFd: returned file descriptor; written only on success
+ *
+ * Resolves the VFIO device node for @hostdev through
+ * qemuProcessGetVfioDevicePath() and opens it read-write with
+ * close-on-exec set. ENOENT from open() is reported as an unsupported
+ * configuration, any other failure as a system error.
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+static int
+qemuProcessOpenVfioDeviceFd(virDomainHostdevDef *hostdev,
+                            int *vfioFd)
+{
+    g_autofree char *vfioPath = NULL;
+    int fd = -1;
+
+    if (qemuProcessGetVfioDevicePath(hostdev, &vfioPath) < 0)
+        return -1;
+
+    VIR_DEBUG("Opening VFIO device %s", vfioPath);
+
+    if ((fd = open(vfioPath, O_RDWR | O_CLOEXEC)) < 0) {
+        if (errno == ENOENT) {
+            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                           _("VFIO device %1$s not found - ensure device is bound to vfio-pci driver"),
+                           vfioPath);
+        } else {
+            virReportSystemError(errno,
+                                 _("cannot open VFIO device %1$s"), vfioPath);
+        }
+        return -1;
+    }
+
+    *vfioFd = fd;
+    VIR_DEBUG("Opened VFIO device FD %d for %s", *vfioFd, vfioPath);
+    return 0;
+}
+
+/**
+ * qemuProcessOpenVfioFds:
+ * @vm: domain object
+ *
+ * For every PCI hostdev that uses the VFIO driver and has an iommufdId,
+ * opens its VFIO device node and records the descriptor in
+ * priv->vfioDeviceFds under the key "vfio-DDDD:BB:SS.F". If at least one
+ * such hostdev exists, /dev/iommu is opened as well and stored in
+ * priv->iommufd.
+ *
+ * On any failure everything opened so far is released through
+ * qemuProcessCloseVfioFds().
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+int
+qemuProcessOpenVfioFds(virDomainObj *vm)
+{
+    qemuDomainObjPrivate *priv = vm->privateData;
+    bool needsIommuFd = false;
+    size_t i;
+
+    /* Check if we have any hostdevs that need VFIO FDs */
+    for (i = 0; i < vm->def->nhostdevs; i++) {
+        virDomainHostdevDef *hostdev = vm->def->hostdevs[i];
+        virDomainHostdevSubsysPCI *pcisrc = NULL;
+        g_autofree char *fdname = NULL;
+        int vfioFd = -1;
+
+        if (hostdev->mode != VIR_DOMAIN_HOSTDEV_MODE_SUBSYS ||
+            hostdev->source.subsys.type != VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI)
+            continue;
+
+        pcisrc = &hostdev->source.subsys.u.pci;
+
+        /* Only hostdevs using VFIO together with an IOMMU FD are relevant */
+        if (pcisrc->driver.name != VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO ||
+            !hostdev->iommufdId)
+            continue;
+
+        needsIommuFd = true;
+
+        /* Open VFIO device FD */
+        if (qemuProcessOpenVfioDeviceFd(hostdev, &vfioFd) < 0)
+            goto error;
+
+        fdname = g_strdup_printf("vfio-%04x:%02x:%02x.%d",
+                                 pcisrc->addr.domain,
+                                 pcisrc->addr.bus,
+                                 pcisrc->addr.slot,
+                                 pcisrc->addr.function);
+
+        /* Log before the insert: g_steal_pointer() resets fdname to NULL,
+         * so it must not be dereferenced afterwards. */
+        VIR_DEBUG("Storing VFIO FD %d for device %s", vfioFd, fdname);
+
+        /* The table takes over the key string */
+        g_hash_table_insert(priv->vfioDeviceFds,
+                            g_steal_pointer(&fdname),
+                            GINT_TO_POINTER(vfioFd));
+    }
+
+    /* Open IOMMU FD if needed */
+    if (needsIommuFd) {
+        int iommuFd = -1;
+
+        if (qemuProcessOpenIommuFd(vm, &iommuFd) < 0)
+            goto error;
+
+        priv->iommufd = iommuFd;
+
+        VIR_DEBUG("Stored IOMMU FD");
+    }
+
+    return 0;
+
+ error:
+    qemuProcessCloseVfioFds(vm);
+    return -1;
+}
+
+/**
+ * qemuProcessCloseVfioFds:
+ * @vm: domain object
+ *
+ * Closes every descriptor recorded in priv->vfioDeviceFds, empties the
+ * table (releasing the key strings), and closes priv->iommufd if it is
+ * set.
+ *
+ * NOTE(review): descriptors that were handed to virCommandPassFD() with
+ * VIR_COMMAND_PASS_FD_CLOSE_PARENT are presumably closed by virCommand
+ * itself; confirm this function is never reached with such stale values.
+ */
+void
+qemuProcessCloseVfioFds(virDomainObj *vm)
+{
+    qemuDomainObjPrivate *priv = vm->privateData;
+
+    /* Close all VFIO device FDs */
+    if (priv->vfioDeviceFds) {
+        GHashTableIter iter;
+        gpointer key;
+        gpointer value;
+
+        g_hash_table_iter_init(&iter, priv->vfioDeviceFds);
+        while (g_hash_table_iter_next(&iter, &key, &value)) {
+            int fd = GPOINTER_TO_INT(value);
+
+            VIR_DEBUG("Closing VFIO device FD %d for %s", fd, (const char *)key);
+            VIR_FORCE_CLOSE(fd);
+
+            /* Keys are heap strings inserted by qemuProcessOpenVfioFds().
+             * Stealing the entry bypasses any destroy functions, so the
+             * key is freed exactly once here whether or not the table was
+             * created with a key destroy function. */
+            g_hash_table_iter_steal(&iter);
+            g_free(key);
+        }
+    }
+
+    /* Close IOMMU FD */
+    if (priv->iommufd >= 0) {
+        VIR_DEBUG("Closing IOMMU FD %d", priv->iommufd);
+        VIR_FORCE_CLOSE(priv->iommufd);
+    }
+}
--
2.43.0