[PATCH v5 0/7] qemu: Implement support for iommufd
Hi, This is a follow up to the fourth patch series [0] for using iommufd to propagate DMA mappings to the kernel for VM-assigned host devices in a qemu VM. We add a new 'iommufd' attribute for hostdev devices to be associated with the iommufd object. For instance, specifying the iommufd object and associated hostdev in a VM definition: <devices> ... <hostdev mode='subsystem' type='pci' managed='no'> <driver iommufd='yes'/> <source> <address domain='0x0009' bus='0x01' slot='0x00' function='0x0'/> </source> <address type='pci' domain='0x0000' bus='0x15' slot='0x00' function='0x0'/> </hostdev> <hostdev mode='subsystem' type='pci' managed='no'> <driver iommufd='yes'/> <source> <address domain='0x0019' bus='0x01' slot='0x00' function='0x0'/> </source> <address type='pci' domain='0x0000' bus='0x16' slot='0x00' function='0x0'/> </hostdev> ... </devices> This would get translated to a qemu command line with the arguments below. Note that libvirt will open the /dev/iommu and VFIO cdev, passing the associated fd number to qemu: -object '{"qom-type":"iommufd","id":"iommufd0","fd":"24"}' \ -device '{"driver":"vfio-pci","host":"0009:01:00.0","id":"hostdev0","iommufd":"iommufd0","fd":"22","bus":"pci.21","addr":"0x0"}' \ -device '{"driver":"vfio-pci","host":"0019:01:00.0","id":"hostdev1","iommufd":"iommufd0","fd":"25","bus":"pci.22","addr":"0x0"}' \ Changes from v4: - Move qemuProcessOpenVfioFds() to qemu_process.h in patch 4/7 - Add `linux/iommufd.h` into headers list in the meson.build file - Correct viriommufd.c pre-processor definition indentations - Restore call to virIOMMUFDSetRLimitMode() in patch 5/7 - Fix indentations for various print function calls - Move the qemuProcessOpenVfioFds() call to qemuProcessPrepareHost() - Make qemuProcessOpenVfioFds() static - Remove unnecessary formatting and comment in virPCIDeviceGetVfioPath() - Clean up namespace and cgroup changes with conditional check for iommufd - Remove virIOMMUFDSupported() - Fix seclabel return logic - In Apparmor seclabel logic, use g_autoptr for virPCIDevice pointer Changes from v3: - Resolved issue from v2 where stale FD from previous VM boot was in use - Remove second approach for retrieving VFIO device path in virPCIDeviceGetVfioPath() - Resolve broken build of libvirt on non-Linux platforms - Conditionally define iommufd headers and use system headers where possible - Add non-fatal handling + warning print for EPERM for the IOMMU_OPTION_RLIMIT_MODE ioctl - Replace references to /dev/iommu with VIR_IOMMU_DEV_PATH - Implement virIOMMUFDSupported(void) to check for existence of /dev/iommu on host - Include tests for multiple hostdevs Changes from v2: - Set per-process memory accounting mode for iommufd - Separated out formatting of iommufd object from qemuBuildHostdevCommandLine - Placed hostdev private data implementation in a separate commit - Allocate hostdev private data unconditionally - Compare FDs against -1 - Integrated callback function in virQEMUDriverPrivateDataCallbacks for qemuDomainHostdevPrivateNew - Dropped qemuProcessCloseVfioFds - Addressed other feedback from v2 (formatting, includes, etc.) - Revised seclabel logic to be device-specific for AppArmor and to allow paths for SELinux/DAC This series is on Github: https://github.com/NathanChenNVIDIA/libvirt/commits/iommufd-v5-01-26 Thanks, Nathan [0] https://lists.libvirt.org/archives/list/devel@lists.libvirt.org/thread/4BEU6... Nathan Chen (7): qemu: Implement support for associating iommufd to hostdev qemu: Introduce privateData for hostdevs qemu: Support per-process memory accounting for iommufd qemu: open VFIO FDs from libvirt backend qemu: open iommufd FD from libvirt backend qemu: Update Cgroup, namespace, and seclabel for iommufd tests: qemuxmlconfdata: provide iommufd sample XML and CLI args docs/formatdomain.rst | 7 + meson.build | 1 + po/POTFILES | 1 + src/bhyve/bhyve_parse_command.c | 2 +- src/conf/device_conf.c | 11 ++ src/conf/device_conf.h | 1 + src/conf/domain_conf.c | 13 +- src/conf/domain_conf.h | 5 +- src/conf/schemas/basictypes.rng | 5 + src/libvirt_private.syms | 4 + src/libxl/xen_common.c | 2 +- src/libxl/xen_xl.c | 2 +- src/lxc/lxc_native.c | 2 +- src/qemu/qemu_cgroup.c | 3 + src/qemu/qemu_command.c | 76 +++++++++++ src/qemu/qemu_domain.c | 41 ++++++ src/qemu/qemu_domain.h | 17 +++ src/qemu/qemu_namespace.c | 3 + src/qemu/qemu_process.c | 121 ++++++++++++++++++ src/security/security_apparmor.c | 31 +++-- src/security/security_dac.c | 49 +++++-- src/security/security_selinux.c | 47 +++++-- src/security/virt-aa-helper.c | 33 ++++- src/util/meson.build | 1 + src/util/viriommufd.c | 111 ++++++++++++++++ src/util/viriommufd.h | 25 ++++ src/util/virpci.c | 39 ++++++ src/util/virpci.h | 2 + src/vbox/vbox_common.c | 2 +- .../iommufd-q35.x86_64-latest.args | 41 ++++++ .../iommufd-q35.x86_64-latest.xml | 60 +++++++++ tests/qemuxmlconfdata/iommufd-q35.xml | 38 ++++++ ...fd-virt-pci-bus-single.aarch64-latest.args | 32 +++++ ...ufd-virt-pci-bus-single.aarch64-latest.xml | 31 +++++ .../iommufd-virt-pci-bus-single.xml | 22 ++++ .../iommufd-virt.aarch64-latest.args | 36 ++++++ .../iommufd-virt.aarch64-latest.xml | 53 ++++++++ tests/qemuxmlconfdata/iommufd-virt.xml | 29 +++++ .../iommufd.x86_64-latest.args | 35 +++++ .../qemuxmlconfdata/iommufd.x86_64-latest.xml | 38 ++++++ tests/qemuxmlconfdata/iommufd.xml | 30 +++++ tests/qemuxmlconftest.c | 34 +++++ tests/virhostdevtest.c | 2 +- 43 files changed, 1093 insertions(+), 45 deletions(-) create mode 100644 src/util/viriommufd.c create mode 100644 src/util/viriommufd.h create mode 100644 tests/qemuxmlconfdata/iommufd-q35.x86_64-latest.args create mode 100644 tests/qemuxmlconfdata/iommufd-q35.x86_64-latest.xml create mode 100644 tests/qemuxmlconfdata/iommufd-q35.xml create mode 100644 tests/qemuxmlconfdata/iommufd-virt-pci-bus-single.aarch64-latest.args create mode 100644 tests/qemuxmlconfdata/iommufd-virt-pci-bus-single.aarch64-latest.xml create mode 100644 tests/qemuxmlconfdata/iommufd-virt-pci-bus-single.xml create mode 100644 tests/qemuxmlconfdata/iommufd-virt.aarch64-latest.args create mode 100644 tests/qemuxmlconfdata/iommufd-virt.aarch64-latest.xml create mode 100644 tests/qemuxmlconfdata/iommufd-virt.xml create mode 100644 tests/qemuxmlconfdata/iommufd.x86_64-latest.args create mode 100644 tests/qemuxmlconfdata/iommufd.x86_64-latest.xml create mode 100644 tests/qemuxmlconfdata/iommufd.xml -- 2.43.0
From: Nathan Chen <nathanc@nvidia.com> Implement a new iommufd attribute under hostdevs' PCI subsystem driver that can be used to specify associated iommufd object when launching a qemu VM. Signed-off-by: Ján Tomko <jtomko@redhat.com> Signed-off-by: Nathan Chen <nathanc@nvidia.com> --- docs/formatdomain.rst | 7 +++++ src/conf/device_conf.c | 11 ++++++++ src/conf/device_conf.h | 1 + src/conf/schemas/basictypes.rng | 5 ++++ src/qemu/qemu_command.c | 46 +++++++++++++++++++++++++++++++++ 5 files changed, 70 insertions(+) diff --git a/docs/formatdomain.rst b/docs/formatdomain.rst index 04ef319a73..08fc3f3944 100644 --- a/docs/formatdomain.rst +++ b/docs/formatdomain.rst @@ -4920,6 +4920,13 @@ or: found is "problematic" in some way, the generic vfio-pci driver similarly be forced. + The ``<driver>`` element's ``iommufd`` attribute is used to specify + using the iommufd interface to propagate DMA mappings to the kernel, + instead of VFIO alone. When the attribute is present, an iommufd + object will be created by the resulting qemu command. Libvirt will + open the /dev/iommu and VFIO device cdev, passing the associated + file descriptor numbers to the qemu command. + (Note: :since:`Since 1.0.5`, the ``name`` attribute has been described to be used to select the type of PCI device assignment ("vfio", "kvm", or "xen"), but those values have been mostly diff --git a/src/conf/device_conf.c b/src/conf/device_conf.c index c278b81652..d68232a4f4 100644 --- a/src/conf/device_conf.c +++ b/src/conf/device_conf.c @@ -67,6 +67,11 @@ virDeviceHostdevPCIDriverInfoParseXML(xmlNodePtr node, return -1; } + if (virXMLPropTristateBool(node, "iommufd", + VIR_XML_PROP_NONE, + &driver->iommufd) < 0) + return -1; + driver->model = virXMLPropString(node, "model"); return 0; } @@ -93,6 +98,12 @@ virDeviceHostdevPCIDriverInfoFormat(virBuffer *buf, virBufferEscapeString(&driverAttrBuf, " model='%s'", driver->model); + if (driver->iommufd == VIR_TRISTATE_BOOL_YES) { + virBufferAddLit(&driverAttrBuf, " iommufd='yes'"); + } else if (driver->iommufd == VIR_TRISTATE_BOOL_NO) { + virBufferAddLit(&driverAttrBuf, " iommufd='no'"); + } + virXMLFormatElement(buf, "driver", &driverAttrBuf, NULL); return 0; } diff --git a/src/conf/device_conf.h b/src/conf/device_conf.h index e570f51824..116b959143 100644 --- a/src/conf/device_conf.h +++ b/src/conf/device_conf.h @@ -47,6 +47,7 @@ VIR_ENUM_DECL(virDeviceHostdevPCIDriverName); struct _virDeviceHostdevPCIDriverInfo { virDeviceHostdevPCIDriverName name; char *model; + virTristateBool iommufd; }; typedef enum { diff --git a/src/conf/schemas/basictypes.rng b/src/conf/schemas/basictypes.rng index 5689170fad..381e0ac24f 100644 --- a/src/conf/schemas/basictypes.rng +++ b/src/conf/schemas/basictypes.rng @@ -673,6 +673,11 @@ <ref name="genericName"/> </attribute> </optional> + <optional> + <attribute name="iommufd"> + <ref name="virYesNo"/> + </attribute> + </optional> <empty/> </element> </define> diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index 0de0a79b46..5274b33d17 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -4761,6 +4761,7 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def, g_autofree char *host = virPCIDeviceAddressAsString(&pcisrc->addr); const char *failover_pair_id = NULL; const char *driver = NULL; + const char *iommufdId = NULL; /* 'ramfb' property must be omitted unless it's to be enabled */ bool ramfb = pcisrc->ramfb == VIR_TRISTATE_SWITCH_ON; @@ -4794,6 +4795,9 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def, teaming->persistent) failover_pair_id = teaming->persistent; + if (pcisrc->driver.iommufd == VIR_TRISTATE_BOOL_YES) + iommufdId = "iommufd0"; + if (virJSONValueObjectAdd(&props, "s:driver", driver, "s:host", host, @@ -4802,6 +4806,7 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def, "S:failover_pair_id", failover_pair_id, "S:display", qemuOnOffAuto(pcisrc->display), "B:ramfb", ramfb, + "S:iommufd", iommufdId, NULL) < 0) return NULL; @@ -5321,6 +5326,44 @@ qemuBuildHostdevCommandLine(virCommand *cmd, } +static int +qemuBuildIOMMUFDCommandLine(virCommand *cmd, + const virDomainDef *def) +{ + size_t i; + + for (i = 0; i < def->nhostdevs; i++) { + virDomainHostdevDef *hostdev = def->hostdevs[i]; + virDomainHostdevSubsys *subsys = &hostdev->source.subsys; + g_autoptr(virJSONValue) props = NULL; + + if (hostdev->mode != VIR_DOMAIN_HOSTDEV_MODE_SUBSYS) + continue; + + if (subsys->type != VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI) + continue; + + if (hostdev->info->type == VIR_DOMAIN_DEVICE_ADDRESS_TYPE_UNASSIGNED) + continue; + + if (subsys->u.pci.driver.iommufd != VIR_TRISTATE_BOOL_YES) + continue; + + if (qemuMonitorCreateObjectProps(&props, "iommufd", + "iommufd0", + NULL) < 0) + return -1; + + if (qemuBuildObjectCommandlineFromJSON(cmd, props) < 0) + return -1; + + break; + } + + return 0; +} + + static int qemuBuildMonitorCommandLine(virCommand *cmd, qemuDomainObjPrivate *priv) @@ -10933,6 +10976,9 @@ qemuBuildCommandLine(virDomainObj *vm, if (qemuBuildRedirdevCommandLine(cmd, def, qemuCaps) < 0) return NULL; + if (qemuBuildIOMMUFDCommandLine(cmd, def) < 0) + return NULL; + if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps) < 0) return NULL; -- 2.43.0
On Fri, Jan 16, 2026 at 05:39:31PM -0800, Nathan Chen via Devel wrote:
From: Nathan Chen <nathanc@nvidia.com>
Implement a new iommufd attribute under hostdevs' PCI subsystem driver that can be used to specify associated iommufd object when launching a qemu VM.
Signed-off-by: Ján Tomko <jtomko@redhat.com> Signed-off-by: Nathan Chen <nathanc@nvidia.com> --- docs/formatdomain.rst | 7 +++++ src/conf/device_conf.c | 11 ++++++++ src/conf/device_conf.h | 1 + src/conf/schemas/basictypes.rng | 5 ++++ src/qemu/qemu_command.c | 46 +++++++++++++++++++++++++++++++++ 5 files changed, 70 insertions(+)
diff --git a/docs/formatdomain.rst b/docs/formatdomain.rst index 04ef319a73..08fc3f3944 100644 --- a/docs/formatdomain.rst +++ b/docs/formatdomain.rst @@ -4920,6 +4920,13 @@ or: found is "problematic" in some way, the generic vfio-pci driver similarly be forced.
We should add :since:`Since 12.1.0 (QEMU and KVM only)`
+ The ``<driver>`` element's ``iommufd`` attribute is used to specify + using the iommufd interface to propagate DMA mappings to the kernel, + instead of VFIO alone. When the attribute is present, an iommufd + object will be created by the resulting qemu command. Libvirt will + open the /dev/iommu and VFIO device cdev, passing the associated + file descriptor numbers to the qemu command.
I would rephrase this paragraph to something like this: :since:`Since 12.1.0 (QEMU and KVM only), the ``iommufd`` element can be used to enable IOMMUFD backend for VFIO device. This provides interface to propagate DMA mappings to kernel for assigned devices. Libvirt will open the /dev/iommu and VFIO device cdev and pass associated file descriptors to QEMU. Pavel
On 1/19/2026 5:17 AM, Pavel Hrdina wrote:
On Fri, Jan 16, 2026 at 05:39:31PM -0800, Nathan Chen via Devel wrote:
From: Nathan Chen<nathanc@nvidia.com>
Implement a new iommufd attribute under hostdevs' PCI subsystem driver that can be used to specify associated iommufd object when launching a qemu VM.
Signed-off-by: Ján Tomko<jtomko@redhat.com> Signed-off-by: Nathan Chen<nathanc@nvidia.com> --- docs/formatdomain.rst | 7 +++++ src/conf/device_conf.c | 11 ++++++++ src/conf/device_conf.h | 1 + src/conf/schemas/basictypes.rng | 5 ++++ src/qemu/qemu_command.c | 46 +++++++++++++++++++++++++++++++++ 5 files changed, 70 insertions(+)
diff --git a/docs/formatdomain.rst b/docs/formatdomain.rst index 04ef319a73..08fc3f3944 100644 --- a/docs/formatdomain.rst +++ b/docs/formatdomain.rst @@ -4920,6 +4920,13 @@ or: found is "problematic" in some way, the generic vfio-pci driver similarly be forced. We should add :since:`Since 12.1.0 (QEMU and KVM only)`
+ The ``<driver>`` element's ``iommufd`` attribute is used to specify + using the iommufd interface to propagate DMA mappings to the kernel, + instead of VFIO alone. When the attribute is present, an iommufd + object will be created by the resulting qemu command. Libvirt will + open the /dev/iommu and VFIO device cdev, passing the associated + file descriptor numbers to the qemu command. I would rephrase this paragraph to something like this:
:since:`Since 12.1.0 (QEMU and KVM only), the ``iommufd`` element can be used to enable IOMMUFD backend for VFIO device. This provides interface to propagate DMA mappings to kernel for assigned devices. Libvirt will open the /dev/iommu and VFIO device cdev and pass associated file descriptors to QEMU.
Got it, thanks for the feedback. Nathan
From: Nathan Chen <nathanc@nvidia.com> Introduce private data for hostdevs and allocate hostdev private data by default. Signed-off-by: Ján Tomko <jtomko@redhat.com> Signed-off-by: Nathan Chen <nathanc@nvidia.com> --- src/bhyve/bhyve_parse_command.c | 2 +- src/conf/domain_conf.c | 13 +++++++++-- src/conf/domain_conf.h | 5 ++++- src/libxl/xen_common.c | 2 +- src/libxl/xen_xl.c | 2 +- src/lxc/lxc_native.c | 2 +- src/qemu/qemu_domain.c | 40 +++++++++++++++++++++++++++++++++ src/qemu/qemu_domain.h | 15 +++++++++++++ src/vbox/vbox_common.c | 2 +- tests/virhostdevtest.c | 2 +- 10 files changed, 76 insertions(+), 9 deletions(-) diff --git a/src/bhyve/bhyve_parse_command.c b/src/bhyve/bhyve_parse_command.c index d62ea64beb..8b405206bd 100644 --- a/src/bhyve/bhyve_parse_command.c +++ b/src/bhyve/bhyve_parse_command.c @@ -687,7 +687,7 @@ bhyveParsePassthru(virDomainDef *def G_GNUC_UNUSED, return -1; } - hostdev = virDomainHostdevDefNew(); + hostdev = virDomainHostdevDefNew(NULL); hostdev->mode = VIR_DOMAIN_HOSTDEV_MODE_SUBSYS; hostdev->source.subsys.type = VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI; diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index d00a43e969..17cad7e8b0 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -2733,6 +2733,8 @@ virDomainHostdevDefClear(virDomainHostdevDef *def) case VIR_DOMAIN_HOSTDEV_MODE_LAST: break; } + + g_clear_pointer(&def->privateData, virObjectUnref); } @@ -3483,7 +3485,7 @@ void virDomainVideoDefFree(virDomainVideoDef *def) virDomainHostdevDef * -virDomainHostdevDefNew(void) +virDomainHostdevDefNew(virDomainXMLOption *xmlopt) { virDomainHostdevDef *def; @@ -3491,6 +3493,13 @@ virDomainHostdevDefNew(void) def->info = g_new0(virDomainDeviceInfo, 1); + if (xmlopt && xmlopt->privateData.hostdevNew && + !(def->privateData = xmlopt->privateData.hostdevNew())) { + VIR_FREE(def->info); + VIR_FREE(def); + return NULL; + } + return def; } @@ -13681,7 +13690,7 @@ virDomainHostdevDefParseXML(virDomainXMLOption *xmlopt, ctxt->node = node; - def = virDomainHostdevDefNew(); + def = virDomainHostdevDefNew(xmlopt); if (virXMLPropEnumDefault(node, "mode", virDomainHostdevModeTypeFromString, VIR_XML_PROP_NONE, diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index 83d49969d3..dee1e85a35 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -364,6 +364,8 @@ struct _virDomainHostdevDef { */ virDomainNetDef *parentnet; + virObject *privateData; + virDomainHostdevMode mode; virDomainStartupPolicy startupPolicy; bool managed; @@ -3588,6 +3590,7 @@ struct _virDomainXMLPrivateDataCallbacks { virDomainXMLPrivateDataNewFunc vsockNew; virDomainXMLPrivateDataNewFunc cryptoNew; virDomainXMLPrivateDataNewFunc graphicsNew; + virDomainXMLPrivateDataNewFunc hostdevNew; virDomainXMLPrivateDataNewFunc networkNew; virDomainXMLPrivateDataNetParseFunc networkParse; virDomainXMLPrivateDataNetFormatFunc networkFormat; @@ -3797,7 +3800,7 @@ virDomainVideoDef *virDomainVideoDefNew(virDomainXMLOption *xmlopt); void virDomainVideoDefFree(virDomainVideoDef *def); G_DEFINE_AUTOPTR_CLEANUP_FUNC(virDomainVideoDef, virDomainVideoDefFree); void virDomainVideoDefClear(virDomainVideoDef *def); -virDomainHostdevDef *virDomainHostdevDefNew(void); +virDomainHostdevDef *virDomainHostdevDefNew(virDomainXMLOption *xmlopt); void virDomainHostdevDefFree(virDomainHostdevDef *def); void virDomainHubDefFree(virDomainHubDef *def); void virDomainRedirdevDefFree(virDomainRedirdevDef *def); diff --git a/src/libxl/xen_common.c b/src/libxl/xen_common.c index 890ef11723..e6a372e078 100644 --- a/src/libxl/xen_common.c +++ b/src/libxl/xen_common.c @@ -445,7 +445,7 @@ xenParsePCI(char *entry) } } - hostdev = virDomainHostdevDefNew(); + hostdev = virDomainHostdevDefNew(NULL); hostdev->managed = false; hostdev->writeFiltering = filtered; hostdev->source.subsys.type = VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI; diff --git a/src/libxl/xen_xl.c b/src/libxl/xen_xl.c index e72e7d7f44..42e1408cf3 100644 --- a/src/libxl/xen_xl.c +++ b/src/libxl/xen_xl.c @@ -930,7 +930,7 @@ xenParseXLUSB(virConf *conf, virDomainDef *def) key = nextkey; } - hostdev = virDomainHostdevDefNew(); + hostdev = virDomainHostdevDefNew(NULL); hostdev->managed = false; hostdev->source.subsys.type = VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_USB; hostdev->source.subsys.u.usb.bus = busNum; diff --git a/src/lxc/lxc_native.c b/src/lxc/lxc_native.c index 7700804429..a94427b027 100644 --- a/src/lxc/lxc_native.c +++ b/src/lxc/lxc_native.c @@ -376,7 +376,7 @@ lxcCreateNetDef(const char *type, static virDomainHostdevDef * lxcCreateHostdevDef(const char *data) { - virDomainHostdevDef *hostdev = virDomainHostdevDefNew(); + virDomainHostdevDef *hostdev = virDomainHostdevDefNew(NULL); hostdev->mode = VIR_DOMAIN_HOSTDEV_MODE_CAPABILITIES; hostdev->source.caps.type = VIR_DOMAIN_HOSTDEV_CAPS_TYPE_NET; hostdev->source.caps.u.net.ifname = g_strdup(data); diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index ac56fc7cb4..85eea1801f 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -1238,6 +1238,45 @@ qemuDomainNetworkPrivateFormat(const virDomainNetDef *net, } +static virClass *qemuDomainHostdevPrivateClass; + +static void +qemuDomainHostdevPrivateDispose(void *obj) +{ + qemuDomainHostdevPrivate *priv = obj; + + VIR_FORCE_CLOSE(priv->vfioDeviceFd); +} + + +static int +qemuDomainHostdevPrivateOnceInit(void) +{ + if (!VIR_CLASS_NEW(qemuDomainHostdevPrivate, virClassForObject())) + return -1; + + return 0; +} + +VIR_ONCE_GLOBAL_INIT(qemuDomainHostdevPrivate); + +virObject * +qemuDomainHostdevPrivateNew(void) +{ + qemuDomainHostdevPrivate *priv; + + if (qemuDomainHostdevPrivateInitialize() < 0) + return NULL; + + if (!(priv = virObjectNew(qemuDomainHostdevPrivateClass))) + return NULL; + + priv->vfioDeviceFd = -1; + + return (virObject *) priv; +} + + /* qemuDomainSecretInfoSetup: * @priv: pointer to domain private object * @alias: alias of the secret @@ -3563,6 +3602,7 @@ virDomainXMLPrivateDataCallbacks virQEMUDriverPrivateDataCallbacks = { .chrSourceNew = qemuDomainChrSourcePrivateNew, .vsockNew = qemuDomainVsockPrivateNew, .graphicsNew = qemuDomainGraphicsPrivateNew, + .hostdevNew = qemuDomainHostdevPrivateNew, .networkNew = qemuDomainNetworkPrivateNew, .networkParse = qemuDomainNetworkPrivateParse, .networkFormat = qemuDomainNetworkPrivateFormat, diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index 3396f929fd..3aac743875 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -461,6 +461,18 @@ struct _qemuDomainTPMPrivate { }; +#define QEMU_DOMAIN_HOSTDEV_PRIVATE(hostdev) \ + ((qemuDomainHostdevPrivate *) (hostdev)->privateData) + +typedef struct _qemuDomainHostdevPrivate qemuDomainHostdevPrivate; +struct _qemuDomainHostdevPrivate { + virObject parent; + + /* VFIO device file descriptor for iommufd passthrough */ + int vfioDeviceFd; +}; + + void qemuDomainNetworkPrivateClearFDs(qemuDomainNetworkPrivate *priv); @@ -1174,3 +1186,6 @@ qemuDomainCheckCPU(virArch arch, bool qemuDomainMachineSupportsFloppy(const char *machine, virQEMUCaps *qemuCaps); + +virObject * +qemuDomainHostdevPrivateNew(void); diff --git a/src/vbox/vbox_common.c b/src/vbox/vbox_common.c index 26c5fdfef6..d2a8cf8da4 100644 --- a/src/vbox/vbox_common.c +++ b/src/vbox/vbox_common.c @@ -3090,7 +3090,7 @@ vboxHostDeviceGetXMLDesc(struct _vboxDriver *data, virDomainDef *def, IMachine * def->hostdevs = g_new0(virDomainHostdevDef *, def->nhostdevs); for (i = 0; i < def->nhostdevs; i++) - def->hostdevs[i] = virDomainHostdevDefNew(); + def->hostdevs[i] = virDomainHostdevDefNew(NULL); for (i = 0; i < deviceFilters.count; i++) { PRBool active = PR_FALSE; diff --git a/tests/virhostdevtest.c b/tests/virhostdevtest.c index aec474a148..a35c1d9402 100644 --- a/tests/virhostdevtest.c +++ b/tests/virhostdevtest.c @@ -124,7 +124,7 @@ myInit(void) for (i = 0; i < nhostdevs; i++) { virDomainHostdevSubsys *subsys; - hostdevs[i] = virDomainHostdevDefNew(); + hostdevs[i] = virDomainHostdevDefNew(NULL); if (!hostdevs[i]) goto cleanup; hostdevs[i]->mode = VIR_DOMAIN_HOSTDEV_MODE_SUBSYS; -- 2.43.0
From: Nathan Chen <nathanc@nvidia.com> Implement the IOMMU_OPTION_RLIMIT_MODE ioctl to set per-process memory accounting for iommufd. This prevents ENOMEM errors from the default per-user memory accounting when multiple VMs under the libvirt-qemu user have their pinned memory summed and checked against a per-process RLIMIT_MEMLOCK limit. Signed-off-by: Nathan Chen <nathanc@nvidia.com> --- meson.build | 1 + po/POTFILES | 1 + src/libvirt_private.syms | 3 ++ src/util/meson.build | 1 + src/util/viriommufd.c | 111 +++++++++++++++++++++++++++++++++++++++ src/util/viriommufd.h | 25 +++++++++ 6 files changed, 142 insertions(+) create mode 100644 src/util/viriommufd.c create mode 100644 src/util/viriommufd.h diff --git a/meson.build b/meson.build index 964d1fa4e1..a6db70f13e 100644 --- a/meson.build +++ b/meson.build @@ -732,6 +732,7 @@ headers = [ 'ifaddrs.h', 'libtasn1.h', 'linux/kvm.h', + 'linux/iommufd.h', 'mntent.h', 'net/ethernet.h', 'net/if.h', diff --git a/po/POTFILES b/po/POTFILES index f0aad35c8c..c78d2b8000 100644 --- a/po/POTFILES +++ b/po/POTFILES @@ -303,6 +303,7 @@ src/util/virhostuptime.c src/util/viridentity.c src/util/virinhibitor.c src/util/virinitctl.c +src/util/viriommufd.c src/util/viriscsi.c src/util/virjson.c src/util/virlease.c diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 6bffd2eb6d..7fa76a1ec3 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -2646,6 +2646,9 @@ virInhibitorRelease; virInitctlFifos; virInitctlSetRunLevel; +# util/viriommufd.h +virIOMMUFDSetRLimitMode; + # util/viriscsi.h virISCSIConnectionLogin; virISCSIConnectionLogout; diff --git a/src/util/meson.build b/src/util/meson.build index 4950a795cc..9fb0aa0fe7 100644 --- a/src/util/meson.build +++ b/src/util/meson.build @@ -46,6 +46,7 @@ util_sources = [ 'viridentity.c', 'virinhibitor.c', 'virinitctl.c', + 'viriommufd.c', 'viriscsi.c', 'virjson.c', 'virkeycode.c', diff --git a/src/util/viriommufd.c b/src/util/viriommufd.c new file mode 100644 index 0000000000..225c76f4b2 --- /dev/null +++ b/src/util/viriommufd.c @@ -0,0 +1,111 @@ +#include <config.h> + +#include "viriommufd.h" +#include "virlog.h" +#include "virerror.h" +#include "virfile.h" + +#ifdef __linux__ + +# include <sys/ioctl.h> +# include <linux/types.h> + +# ifdef HAVE_LINUX_IOMMUFD_H +# include <linux/iommufd.h> +# endif + +# define VIR_FROM_THIS VIR_FROM_NONE + +VIR_LOG_INIT("util.iommufd"); + +# ifndef IOMMU_OPTION + +enum iommufd_option { + IOMMU_OPTION_RLIMIT_MODE = 0, + IOMMU_OPTION_HUGE_PAGES = 1, +}; + +enum iommufd_option_ops { + IOMMU_OPTION_OP_SET = 0, + IOMMU_OPTION_OP_GET = 1, +}; + +struct iommu_option { + __u32 size; + __u32 option_id; + __u16 op; + __u16 __reserved; + __u32 object_id; + __aligned_u64 val64; +}; + +# define IOMMUFD_TYPE (';') +# define IOMMUFD_CMD_OPTION 0x87 +# define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION) + +# endif + +/** + * virIOMMUFDSetRLimitMode: + * @fd: iommufd file descriptor + * @processAccounting: true for per-process, false for per-user + * + * Set RLIMIT_MEMLOCK accounting mode for the iommufd. + * + * Returns: 0 on success, -1 on error + */ +int +virIOMMUFDSetRLimitMode(int fd, bool processAccounting) +{ + struct iommu_option option = { + .size = sizeof(struct iommu_option), + .option_id = IOMMU_OPTION_RLIMIT_MODE, + .op = IOMMU_OPTION_OP_SET, + .__reserved = 0, + .object_id = 0, + .val64 = processAccounting ? 1 : 0, + }; + + if (ioctl(fd, IOMMU_OPTION, &option) < 0) { + switch (errno) { + case ENOTTY: + VIR_WARN("IOMMU_OPTION ioctl not supported"); + return 0; + + case EOPNOTSUPP: + VIR_WARN("IOMMU_OPTION_RLIMIT_MODE not supported by kernel"); + return 0; + + case EINVAL: + virReportSystemError(errno, "%s", + _("invalid iommufd option parameters")); + return -1; + + case EPERM: + VIR_WARN("Permission denied for IOMMU_OPTION ioctl. " + "Per-user-based memory accounting to be used by default."); + return 0; + + default: + virReportSystemError(errno, "%s", + _("failed to set iommufd option")); + return -1; + } + } + + VIR_DEBUG("Set iommufd rlimit mode to %s-based accounting", + processAccounting ? "process" : "user"); + return 0; +} + +#else + +int virIOMMUFDSetRLimitMode(int fd G_GNUC_UNUSED, + bool processAccounting G_GNUC_UNUSED) +{ + virReportError(VIR_ERR_NO_SUPPORT, "%s", + _("IOMMUFD is not supported on this platform")); + return -1; +} + +#endif diff --git a/src/util/viriommufd.h b/src/util/viriommufd.h new file mode 100644 index 0000000000..ebecfe3633 --- /dev/null +++ b/src/util/viriommufd.h @@ -0,0 +1,25 @@ +/* + * viriommufd.h: iommufd helpers + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "internal.h" + +#define VIR_IOMMU_DEV_PATH "/dev/iommu" + +int virIOMMUFDSetRLimitMode(int fd, bool processAccounting); -- 2.43.0
On Fri, Jan 16, 2026 at 05:39:33PM -0800, Nathan Chen via Devel wrote:
From: Nathan Chen <nathanc@nvidia.com>
Implement the IOMMU_OPTION_RLIMIT_MODE ioctl to set per-process memory accounting for iommufd. This prevents ENOMEM errors from the default per-user memory accounting when multiple VMs under the libvirt-qemu user have their pinned memory summed and checked against a per-process RLIMIT_MEMLOCK limit.
Signed-off-by: Nathan Chen <nathanc@nvidia.com> --- meson.build | 1 + po/POTFILES | 1 + src/libvirt_private.syms | 3 ++ src/util/meson.build | 1 + src/util/viriommufd.c | 111 +++++++++++++++++++++++++++++++++++++++ src/util/viriommufd.h | 25 +++++++++ 6 files changed, 142 insertions(+) create mode 100644 src/util/viriommufd.c create mode 100644 src/util/viriommufd.h
diff --git a/meson.build b/meson.build index 964d1fa4e1..a6db70f13e 100644 --- a/meson.build +++ b/meson.build @@ -732,6 +732,7 @@ headers = [ 'ifaddrs.h', 'libtasn1.h', 'linux/kvm.h', + 'linux/iommufd.h', 'mntent.h', 'net/ethernet.h', 'net/if.h', diff --git a/po/POTFILES b/po/POTFILES index f0aad35c8c..c78d2b8000 100644 --- a/po/POTFILES +++ b/po/POTFILES @@ -303,6 +303,7 @@ src/util/virhostuptime.c src/util/viridentity.c src/util/virinhibitor.c src/util/virinitctl.c +src/util/viriommufd.c src/util/viriscsi.c src/util/virjson.c src/util/virlease.c diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 6bffd2eb6d..7fa76a1ec3 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -2646,6 +2646,9 @@ virInhibitorRelease; virInitctlFifos; virInitctlSetRunLevel;
+# util/viriommufd.h +virIOMMUFDSetRLimitMode; + # util/viriscsi.h virISCSIConnectionLogin; virISCSIConnectionLogout; diff --git a/src/util/meson.build b/src/util/meson.build index 4950a795cc..9fb0aa0fe7 100644 --- a/src/util/meson.build +++ b/src/util/meson.build @@ -46,6 +46,7 @@ util_sources = [ 'viridentity.c', 'virinhibitor.c', 'virinitctl.c', + 'viriommufd.c', 'viriscsi.c', 'virjson.c', 'virkeycode.c', diff --git a/src/util/viriommufd.c b/src/util/viriommufd.c new file mode 100644 index 0000000000..225c76f4b2 --- /dev/null +++ b/src/util/viriommufd.c @@ -0,0 +1,111 @@ +#include <config.h> + +#include "viriommufd.h" +#include "virlog.h" +#include "virerror.h" +#include "virfile.h" + +#ifdef __linux__ + +# include <sys/ioctl.h> +# include <linux/types.h> + +# ifdef HAVE_LINUX_IOMMUFD_H +# include <linux/iommufd.h> +# endif + +# define VIR_FROM_THIS VIR_FROM_NONE + +VIR_LOG_INIT("util.iommufd");
Move these two before #ifdef __linux__ as they don't depend on linux and not having VIR_FROM_THIS defines breaks compilation on non-linux systems because the else branch calls virReportError(). Pavel
On 1/19/2026 5:41 AM, Pavel Hrdina wrote:
On Fri, Jan 16, 2026 at 05:39:33PM -0800, Nathan Chen via Devel wrote:
From: Nathan Chen<nathanc@nvidia.com>
Implement the IOMMU_OPTION_RLIMIT_MODE ioctl to set per-process memory accounting for iommufd. This prevents ENOMEM errors from the default per-user memory accounting when multiple VMs under the libvirt-qemu user have their pinned memory summed and checked against a per-process RLIMIT_MEMLOCK limit.
Signed-off-by: Nathan Chen<nathanc@nvidia.com> --- meson.build | 1 + po/POTFILES | 1 + src/libvirt_private.syms | 3 ++ src/util/meson.build | 1 + src/util/viriommufd.c | 111 +++++++++++++++++++++++++++++++++++++++ src/util/viriommufd.h | 25 +++++++++ 6 files changed, 142 insertions(+) create mode 100644 src/util/viriommufd.c create mode 100644 src/util/viriommufd.h
diff --git a/meson.build b/meson.build index 964d1fa4e1..a6db70f13e 100644 --- a/meson.build +++ b/meson.build @@ -732,6 +732,7 @@ headers = [ 'ifaddrs.h', 'libtasn1.h', 'linux/kvm.h', + 'linux/iommufd.h', 'mntent.h', 'net/ethernet.h', 'net/if.h', diff --git a/po/POTFILES b/po/POTFILES index f0aad35c8c..c78d2b8000 100644 --- a/po/POTFILES +++ b/po/POTFILES @@ -303,6 +303,7 @@ src/util/virhostuptime.c src/util/viridentity.c src/util/virinhibitor.c src/util/virinitctl.c +src/util/viriommufd.c src/util/viriscsi.c src/util/virjson.c src/util/virlease.c diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 6bffd2eb6d..7fa76a1ec3 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -2646,6 +2646,9 @@ virInhibitorRelease; virInitctlFifos; virInitctlSetRunLevel;
+# util/viriommufd.h +virIOMMUFDSetRLimitMode; + # util/viriscsi.h virISCSIConnectionLogin; virISCSIConnectionLogout; diff --git a/src/util/meson.build b/src/util/meson.build index 4950a795cc..9fb0aa0fe7 100644 --- a/src/util/meson.build +++ b/src/util/meson.build @@ -46,6 +46,7 @@ util_sources = [ 'viridentity.c', 'virinhibitor.c', 'virinitctl.c', + 'viriommufd.c', 'viriscsi.c', 'virjson.c', 'virkeycode.c', diff --git a/src/util/viriommufd.c b/src/util/viriommufd.c new file mode 100644 index 0000000000..225c76f4b2 --- /dev/null +++ b/src/util/viriommufd.c @@ -0,0 +1,111 @@ +#include <config.h> + +#include "viriommufd.h" +#include "virlog.h" +#include "virerror.h" +#include "virfile.h" + +#ifdef __linux__ + +# include <sys/ioctl.h> +# include <linux/types.h> + +# ifdef HAVE_LINUX_IOMMUFD_H +# include <linux/iommufd.h> +# endif + +# define VIR_FROM_THIS VIR_FROM_NONE + +VIR_LOG_INIT("util.iommufd"); Move these two before #ifdef __linux__ as they don't depend on linux and not having VIR_FROM_THIS defines breaks compilation on non-linux systems because the else branch calls virReportError().
Ok that makes sense, I will move these out in the next revision. Nathan
On Fri, Jan 16, 2026 at 05:39:33PM -0800, Nathan Chen via Devel wrote:
From: Nathan Chen <nathanc@nvidia.com>
Implement the IOMMU_OPTION_RLIMIT_MODE ioctl to set per-process memory accounting for iommufd. This prevents ENOMEM errors from the default per-user memory accounting when multiple VMs under the libvirt-qemu user have their pinned memory summed and checked against a per-process RLIMIT_MEMLOCK limit.
Signed-off-by: Nathan Chen <nathanc@nvidia.com> --- meson.build | 1 + po/POTFILES | 1 + src/libvirt_private.syms | 3 ++ src/util/meson.build | 1 + src/util/viriommufd.c | 111 +++++++++++++++++++++++++++++++++++++++ src/util/viriommufd.h | 25 +++++++++ 6 files changed, 142 insertions(+) create mode 100644 src/util/viriommufd.c create mode 100644 src/util/viriommufd.h
diff --git a/meson.build b/meson.build index 964d1fa4e1..a6db70f13e 100644 --- a/meson.build +++ b/meson.build @@ -732,6 +732,7 @@ headers = [ 'ifaddrs.h', 'libtasn1.h', 'linux/kvm.h', + 'linux/iommufd.h', 'mntent.h', 'net/ethernet.h', 'net/if.h', diff --git a/po/POTFILES b/po/POTFILES index f0aad35c8c..c78d2b8000 100644 --- a/po/POTFILES +++ b/po/POTFILES @@ -303,6 +303,7 @@ src/util/virhostuptime.c src/util/viridentity.c src/util/virinhibitor.c src/util/virinitctl.c +src/util/viriommufd.c src/util/viriscsi.c src/util/virjson.c src/util/virlease.c diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 6bffd2eb6d..7fa76a1ec3 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -2646,6 +2646,9 @@ virInhibitorRelease; virInitctlFifos; virInitctlSetRunLevel;
+# util/viriommufd.h +virIOMMUFDSetRLimitMode; + # util/viriscsi.h virISCSIConnectionLogin; virISCSIConnectionLogout; diff --git a/src/util/meson.build b/src/util/meson.build index 4950a795cc..9fb0aa0fe7 100644 --- a/src/util/meson.build +++ b/src/util/meson.build @@ -46,6 +46,7 @@ util_sources = [ 'viridentity.c', 'virinhibitor.c', 'virinitctl.c', + 'viriommufd.c', 'viriscsi.c', 'virjson.c', 'virkeycode.c', diff --git a/src/util/viriommufd.c b/src/util/viriommufd.c new file mode 100644 index 0000000000..225c76f4b2 --- /dev/null +++ b/src/util/viriommufd.c @@ -0,0 +1,111 @@ +#include <config.h> + +#include "viriommufd.h" +#include "virlog.h" +#include "virerror.h" +#include "virfile.h" + +#ifdef __linux__ + +# include <sys/ioctl.h> +# include <linux/types.h> + +# ifdef HAVE_LINUX_IOMMUFD_H +# include <linux/iommufd.h> +# endif + +# define VIR_FROM_THIS VIR_FROM_NONE + +VIR_LOG_INIT("util.iommufd"); + +# ifndef IOMMU_OPTION + +enum iommufd_option { + IOMMU_OPTION_RLIMIT_MODE = 0, + IOMMU_OPTION_HUGE_PAGES = 1, +}; + +enum iommufd_option_ops { + IOMMU_OPTION_OP_SET = 0, + IOMMU_OPTION_OP_GET = 1, +}; + +struct iommu_option { + __u32 size; + __u32 option_id; + __u16 op; + __u16 __reserved; + __u32 object_id; + __aligned_u64 val64; +}; + +# define IOMMUFD_TYPE (';') +# define IOMMUFD_CMD_OPTION 0x87 +# define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION) + +# endif + +/** + * virIOMMUFDSetRLimitMode: + * @fd: iommufd file descriptor + * @processAccounting: true for per-process, false for per-user + * + * Set RLIMIT_MEMLOCK accounting mode for the iommufd. + * + * Returns: 0 on success, -1 on error + */ +int +virIOMMUFDSetRLimitMode(int fd, bool processAccounting) +{ + struct iommu_option option = { + .size = sizeof(struct iommu_option), + .option_id = IOMMU_OPTION_RLIMIT_MODE, + .op = IOMMU_OPTION_OP_SET, + .__reserved = 0, + .object_id = 0, + .val64 = processAccounting ? 1 : 0, + }; + + if (ioctl(fd, IOMMU_OPTION, &option) < 0) { + switch (errno) { + case ENOTTY: + VIR_WARN("IOMMU_OPTION ioctl not supported"); + return 0; + + case EOPNOTSUPP: + VIR_WARN("IOMMU_OPTION_RLIMIT_MODE not supported by kernel"); + return 0; + + case EINVAL: + virReportSystemError(errno, "%s", + _("invalid iommufd option parameters")); + return -1; + + case EPERM: + VIR_WARN("Permission denied for IOMMU_OPTION ioctl. " + "Per-user-based memory accounting to be used by default."); + return 0; + + default: + virReportSystemError(errno, "%s", + _("failed to set iommufd option")); + return -1; + } + }
In my previous testing this part of code was not used so no rlimit was configured for the grace hopper GPU that was assigned to a VM. The VM OS was able to see the GPU and I was able to run cuda-samples with most of them passing. This setup didn't use vCMDQ or EGM. When I tried patches that add support for vCMDQ I was no longer able to use the GPU inside the VM until this code was called or setting "setcap cap_ipc_lock=ep" on the qemu binary but it was still detected inside the VM and the VM was started successfully. So is this required for all devices that want to use iommufd in order for them to work correctly inside the VM? Or is it necessary only when specific features are used? I wonder if we should allow to start a VM if we know the device will not actually work correctly. Basically if IOMMU_OPTION ioctl, IOMMU_OPTION_RLIMIT_MODE are not supported or we get permission denied we return 0 and we let the VM start. Pavel
+ + VIR_DEBUG("Set iommufd rlimit mode to %s-based accounting", + processAccounting ? "process" : "user"); + return 0; +} + +#else + +int virIOMMUFDSetRLimitMode(int fd G_GNUC_UNUSED, + bool processAccounting G_GNUC_UNUSED) +{ + virReportError(VIR_ERR_NO_SUPPORT, "%s", + _("IOMMUFD is not supported on this platform")); + return -1; +} + +#endif diff --git a/src/util/viriommufd.h b/src/util/viriommufd.h new file mode 100644 index 0000000000..ebecfe3633 --- /dev/null +++ b/src/util/viriommufd.h @@ -0,0 +1,25 @@ +/* + * viriommufd.h: iommufd helpers + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "internal.h" + +#define VIR_IOMMU_DEV_PATH "/dev/iommu" + +int virIOMMUFDSetRLimitMode(int fd, bool processAccounting); -- 2.43.0
On 1/20/2026 10:24 AM, Pavel Hrdina wrote:
On Fri, Jan 16, 2026 at 05:39:33PM -0800, Nathan Chen via Devel wrote:
From: Nathan Chen<nathanc@nvidia.com>
Implement the IOMMU_OPTION_RLIMIT_MODE ioctl to set per-process memory accounting for iommufd. This prevents ENOMEM errors from the default per-user memory accounting when multiple VMs under the libvirt-qemu user have their pinned memory summed and checked against a per-process RLIMIT_MEMLOCK limit.
Signed-off-by: Nathan Chen<nathanc@nvidia.com> --- meson.build | 1 + po/POTFILES | 1 + src/libvirt_private.syms | 3 ++ src/util/meson.build | 1 + src/util/viriommufd.c | 111 +++++++++++++++++++++++++++++++++++++++ src/util/viriommufd.h | 25 +++++++++ 6 files changed, 142 insertions(+) create mode 100644 src/util/viriommufd.c create mode 100644 src/util/viriommufd.h
diff --git a/meson.build b/meson.build index 964d1fa4e1..a6db70f13e 100644 --- a/meson.build +++ b/meson.build @@ -732,6 +732,7 @@ headers = [ 'ifaddrs.h', 'libtasn1.h', 'linux/kvm.h', + 'linux/iommufd.h', 'mntent.h', 'net/ethernet.h', 'net/if.h', diff --git a/po/POTFILES b/po/POTFILES index f0aad35c8c..c78d2b8000 100644 --- a/po/POTFILES +++ b/po/POTFILES @@ -303,6 +303,7 @@ src/util/virhostuptime.c src/util/viridentity.c src/util/virinhibitor.c src/util/virinitctl.c +src/util/viriommufd.c src/util/viriscsi.c src/util/virjson.c src/util/virlease.c diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 6bffd2eb6d..7fa76a1ec3 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -2646,6 +2646,9 @@ virInhibitorRelease; virInitctlFifos; virInitctlSetRunLevel;
+# util/viriommufd.h +virIOMMUFDSetRLimitMode; + # util/viriscsi.h virISCSIConnectionLogin; virISCSIConnectionLogout; diff --git a/src/util/meson.build b/src/util/meson.build index 4950a795cc..9fb0aa0fe7 100644 --- a/src/util/meson.build +++ b/src/util/meson.build @@ -46,6 +46,7 @@ util_sources = [ 'viridentity.c', 'virinhibitor.c', 'virinitctl.c', + 'viriommufd.c', 'viriscsi.c', 'virjson.c', 'virkeycode.c', diff --git a/src/util/viriommufd.c b/src/util/viriommufd.c new file mode 100644 index 0000000000..225c76f4b2 --- /dev/null +++ b/src/util/viriommufd.c @@ -0,0 +1,111 @@ +#include <config.h> + +#include "viriommufd.h" +#include "virlog.h" +#include "virerror.h" +#include "virfile.h" + +#ifdef __linux__ + +# include <sys/ioctl.h> +# include <linux/types.h> + +# ifdef HAVE_LINUX_IOMMUFD_H +# include <linux/iommufd.h> +# endif + +# define VIR_FROM_THIS VIR_FROM_NONE + +VIR_LOG_INIT("util.iommufd"); + +# ifndef IOMMU_OPTION + +enum iommufd_option { + IOMMU_OPTION_RLIMIT_MODE = 0, + IOMMU_OPTION_HUGE_PAGES = 1, +}; + +enum iommufd_option_ops { + IOMMU_OPTION_OP_SET = 0, + IOMMU_OPTION_OP_GET = 1, +}; + +struct iommu_option { + __u32 size; + __u32 option_id; + __u16 op; + __u16 __reserved; + __u32 object_id; + __aligned_u64 val64; +}; + +# define IOMMUFD_TYPE (';') +# define IOMMUFD_CMD_OPTION 0x87 +# define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION) + +# endif + +/** + * virIOMMUFDSetRLimitMode: + * @fd: iommufd file descriptor + * @processAccounting: true for per-process, false for per-user + * + * Set RLIMIT_MEMLOCK accounting mode for the iommufd. + * + * Returns: 0 on success, -1 on error + */ +int +virIOMMUFDSetRLimitMode(int fd, bool processAccounting) +{ + struct iommu_option option = { + .size = sizeof(struct iommu_option), + .option_id = IOMMU_OPTION_RLIMIT_MODE, + .op = IOMMU_OPTION_OP_SET, + .__reserved = 0, + .object_id = 0, + .val64 = processAccounting ? 1 : 0, + }; + + if (ioctl(fd, IOMMU_OPTION, &option) < 0) { + switch (errno) { + case ENOTTY: + VIR_WARN("IOMMU_OPTION ioctl not supported"); + return 0; + + case EOPNOTSUPP: + VIR_WARN("IOMMU_OPTION_RLIMIT_MODE not supported by kernel"); + return 0; + + case EINVAL: + virReportSystemError(errno, "%s", + _("invalid iommufd option parameters")); + return -1; + + case EPERM: + VIR_WARN("Permission denied for IOMMU_OPTION ioctl. " + "Per-user-based memory accounting to be used by default."); + return 0; + + default: + virReportSystemError(errno, "%s", + _("failed to set iommufd option")); + return -1; + } + } In my previous testing this part of code was not used so no rlimit was configured for the grace hopper GPU that was assigned to a VM.
The VM OS was able to see the GPU and I was able to run cuda-samples with most of them passing. This setup didn't use vCMDQ or EGM. When I tried patches that add support for vCMDQ I was no longer able to use the GPU inside the VM until this code was called or setting "setcap cap_ipc_lock=ep" on the qemu binary but it was still detected inside the VM and the VM was started successfully.
So is this required for all devices that want to use iommufd in order for them to work correctly inside the VM? Or is it necessary only when specific features are used?
I don’t think the ioctl is required for all devices, but vCMDQ can increase accounted pinned memory over the per‑user memory locking limit. vCMDQ introduces additional guest‑RAM backed queues that could be the extra pinned/accounted memory pushing over the memory locking limit. Additionally, attempting to launch a second iommufd VM could increase accounted memory over the per-user memory locking limit. For the case you observed, if it were truly a single isolated QEMU process with no other memlocked usage under the same uid, per‑process vs per‑user should be identical. The fact that switching to per‑process memory accounting fixes the issue suggests there is additional memlocked usage being charged to the libvirt‑qemu uid (e.g. other processes, helper daemons, or device‑related accounting). vCMDQ just pushes the summed memory over the limit.
I wonder if we should allow to start a VM if we know the device will not actually work correctly.
Basically if IOMMU_OPTION ioctl, IOMMU_OPTION_RLIMIT_MODE are not supported or we get permission denied we return 0 and we let the VM start.
I’m open to returning -1 for these cases if you feel it's safer to fail early. What are your thoughts? Nathan
On Fri, Jan 23, 2026 at 12:30:28PM -0800, Nathan Chen wrote:
On 1/20/2026 10:24 AM, Pavel Hrdina wrote:
On Fri, Jan 16, 2026 at 05:39:33PM -0800, Nathan Chen via Devel wrote:
From: Nathan Chen<nathanc@nvidia.com>
Implement the IOMMU_OPTION_RLIMIT_MODE ioctl to set per-process memory accounting for iommufd. This prevents ENOMEM errors from the default per-user memory accounting when multiple VMs under the libvirt-qemu user have their pinned memory summed and checked against a per-process RLIMIT_MEMLOCK limit.
Signed-off-by: Nathan Chen<nathanc@nvidia.com> --- meson.build | 1 + po/POTFILES | 1 + src/libvirt_private.syms | 3 ++ src/util/meson.build | 1 + src/util/viriommufd.c | 111 +++++++++++++++++++++++++++++++++++++++ src/util/viriommufd.h | 25 +++++++++ 6 files changed, 142 insertions(+) create mode 100644 src/util/viriommufd.c create mode 100644 src/util/viriommufd.h
diff --git a/meson.build b/meson.build index 964d1fa4e1..a6db70f13e 100644 --- a/meson.build +++ b/meson.build @@ -732,6 +732,7 @@ headers = [ 'ifaddrs.h', 'libtasn1.h', 'linux/kvm.h', + 'linux/iommufd.h', 'mntent.h', 'net/ethernet.h', 'net/if.h', diff --git a/po/POTFILES b/po/POTFILES index f0aad35c8c..c78d2b8000 100644 --- a/po/POTFILES +++ b/po/POTFILES @@ -303,6 +303,7 @@ src/util/virhostuptime.c src/util/viridentity.c src/util/virinhibitor.c src/util/virinitctl.c +src/util/viriommufd.c src/util/viriscsi.c src/util/virjson.c src/util/virlease.c diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 6bffd2eb6d..7fa76a1ec3 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -2646,6 +2646,9 @@ virInhibitorRelease; virInitctlFifos; virInitctlSetRunLevel; +# util/viriommufd.h +virIOMMUFDSetRLimitMode; + # util/viriscsi.h virISCSIConnectionLogin; virISCSIConnectionLogout; diff --git a/src/util/meson.build b/src/util/meson.build index 4950a795cc..9fb0aa0fe7 100644 --- a/src/util/meson.build +++ b/src/util/meson.build @@ -46,6 +46,7 @@ util_sources = [ 'viridentity.c', 'virinhibitor.c', 'virinitctl.c', + 'viriommufd.c', 'viriscsi.c', 'virjson.c', 'virkeycode.c', diff --git a/src/util/viriommufd.c b/src/util/viriommufd.c new file mode 100644 index 0000000000..225c76f4b2 --- /dev/null +++ b/src/util/viriommufd.c @@ -0,0 +1,111 @@ +#include <config.h> + +#include "viriommufd.h" +#include "virlog.h" +#include "virerror.h" +#include "virfile.h" + +#ifdef __linux__ + +# include <sys/ioctl.h> +# include <linux/types.h> + +# ifdef HAVE_LINUX_IOMMUFD_H +# include <linux/iommufd.h> +# endif + +# define VIR_FROM_THIS VIR_FROM_NONE + +VIR_LOG_INIT("util.iommufd"); + +# ifndef IOMMU_OPTION + +enum iommufd_option { + IOMMU_OPTION_RLIMIT_MODE = 0, + IOMMU_OPTION_HUGE_PAGES = 1, +}; + +enum iommufd_option_ops { + IOMMU_OPTION_OP_SET = 0, + IOMMU_OPTION_OP_GET = 1, +}; + +struct iommu_option { + __u32 size; + __u32 option_id; + __u16 op; + __u16 __reserved; + __u32 object_id; + __aligned_u64 val64; +}; + +# define IOMMUFD_TYPE (';') +# define IOMMUFD_CMD_OPTION 0x87 +# define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION) + +# endif + +/** + * virIOMMUFDSetRLimitMode: + * @fd: iommufd file descriptor + * @processAccounting: true for per-process, false for per-user + * + * Set RLIMIT_MEMLOCK accounting mode for the iommufd. + * + * Returns: 0 on success, -1 on error + */ +int +virIOMMUFDSetRLimitMode(int fd, bool processAccounting) +{ + struct iommu_option option = { + .size = sizeof(struct iommu_option), + .option_id = IOMMU_OPTION_RLIMIT_MODE, + .op = IOMMU_OPTION_OP_SET, + .__reserved = 0, + .object_id = 0, + .val64 = processAccounting ? 1 : 0, + }; + + if (ioctl(fd, IOMMU_OPTION, &option) < 0) { + switch (errno) { + case ENOTTY: + VIR_WARN("IOMMU_OPTION ioctl not supported"); + return 0; + + case EOPNOTSUPP: + VIR_WARN("IOMMU_OPTION_RLIMIT_MODE not supported by kernel"); + return 0; + + case EINVAL: + virReportSystemError(errno, "%s", + _("invalid iommufd option parameters")); + return -1; + + case EPERM: + VIR_WARN("Permission denied for IOMMU_OPTION ioctl. " + "Per-user-based memory accounting to be used by default."); + return 0; + + default: + virReportSystemError(errno, "%s", + _("failed to set iommufd option")); + return -1; + } + } In my previous testing this part of code was not used so no rlimit was configured for the grace hopper GPU that was assigned to a VM.
The VM OS was able to see the GPU and I was able to run cuda-samples with most of them passing. This setup didn't use vCMDQ or EGM. When I tried patches that add support for vCMDQ I was no longer able to use the GPU inside the VM until this code was called or setting "setcap cap_ipc_lock=ep" on the qemu binary but it was still detected inside the VM and the VM was started successfully.
So is this required for all devices that want to use iommufd in order for them to work correctly inside the VM? Or is it necessary only when specific features are used?
I don’t think the ioctl is required for all devices, but vCMDQ can increase accounted pinned memory over the per‑user memory locking limit. vCMDQ introduces additional guest‑RAM backed queues that could be the extra pinned/accounted memory pushing over the memory locking limit. Additionally, attempting to launch a second iommufd VM could increase accounted memory over the per-user memory locking limit.
If that ioctl call is not required for all devices we should not call it unconditionally for all VMs that will try to use iommufd with any device. Libvirt tries to guess correct memory limit for specific cases, see function qemuDomainGetMemLockLimitBytes() . If I manually set 64G hard_limit for VM with 32G ram everything works even without calling tha ioctl: <memtune> <hard_limit unit='GiB'>64</hard_limit> </memtune> So if we can figure out some reasonable overhead when vCMDQ is used that would be better solution.
For the case you observed, if it were truly a single isolated QEMU process with no other memlocked usage under the same uid, per‑process vs per‑user should be identical. The fact that switching to per‑process memory accounting fixes the issue suggests there is additional memlocked usage being charged to the libvirt‑qemu uid (e.g. other processes, helper daemons, or device‑related accounting). vCMDQ just pushes the summed memory over the limit.
When the limit was not high enough I got the following errors in host dmesg: [30507.848263] acpi NVDA200C:03: tegra241_cmdqv: unexpected error reported. vintf_map: 0000000000000002, vcmdq_map 00000000:00000000:00000000:0000000c I think this needs additional work in QEMU, starting VM should error out if it hits the memory limit instead of silently starting broken VM configuration. Pavel
I wonder if we should allow to start a VM if we know the device will not actually work correctly.
Basically if IOMMU_OPTION ioctl, IOMMU_OPTION_RLIMIT_MODE are not supported or we get permission denied we return 0 and we let the VM start.
I’m open to returning -1 for these cases if you feel it's safer to fail early. What are your thoughts?
Nathan
On 1/26/2026 1:07 PM, Pavel Hrdina wrote:
On Fri, Jan 23, 2026 at 12:30:28PM -0800, Nathan Chen wrote:
On 1/20/2026 10:24 AM, Pavel Hrdina wrote:
On Fri, Jan 16, 2026 at 05:39:33PM -0800, Nathan Chen via Devel wrote:
From: Nathan Chen<nathanc@nvidia.com>
Implement the IOMMU_OPTION_RLIMIT_MODE ioctl to set per-process memory accounting for iommufd. This prevents ENOMEM errors from the default per-user memory accounting when multiple VMs under the libvirt-qemu user have their pinned memory summed and checked against a per-process RLIMIT_MEMLOCK limit.
Signed-off-by: Nathan Chen<nathanc@nvidia.com> --- meson.build | 1 + po/POTFILES | 1 + src/libvirt_private.syms | 3 ++ src/util/meson.build | 1 + src/util/viriommufd.c | 111 +++++++++++++++++++++++++++++++++++++++ src/util/viriommufd.h | 25 +++++++++ 6 files changed, 142 insertions(+) create mode 100644 src/util/viriommufd.c create mode 100644 src/util/viriommufd.h
diff --git a/meson.build b/meson.build index 964d1fa4e1..a6db70f13e 100644 --- a/meson.build +++ b/meson.build @@ -732,6 +732,7 @@ headers = [ 'ifaddrs.h', 'libtasn1.h', 'linux/kvm.h', + 'linux/iommufd.h', 'mntent.h', 'net/ethernet.h', 'net/if.h', diff --git a/po/POTFILES b/po/POTFILES index f0aad35c8c..c78d2b8000 100644 --- a/po/POTFILES +++ b/po/POTFILES @@ -303,6 +303,7 @@ src/util/virhostuptime.c src/util/viridentity.c src/util/virinhibitor.c src/util/virinitctl.c +src/util/viriommufd.c src/util/viriscsi.c src/util/virjson.c src/util/virlease.c diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 6bffd2eb6d..7fa76a1ec3 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -2646,6 +2646,9 @@ virInhibitorRelease; virInitctlFifos; virInitctlSetRunLevel; +# util/viriommufd.h +virIOMMUFDSetRLimitMode; + # util/viriscsi.h virISCSIConnectionLogin; virISCSIConnectionLogout; diff --git a/src/util/meson.build b/src/util/meson.build index 4950a795cc..9fb0aa0fe7 100644 --- a/src/util/meson.build +++ b/src/util/meson.build @@ -46,6 +46,7 @@ util_sources = [ 'viridentity.c', 'virinhibitor.c', 'virinitctl.c', + 'viriommufd.c', 'viriscsi.c', 'virjson.c', 'virkeycode.c', diff --git a/src/util/viriommufd.c b/src/util/viriommufd.c new file mode 100644 index 0000000000..225c76f4b2 --- /dev/null +++ b/src/util/viriommufd.c @@ -0,0 +1,111 @@ +#include <config.h> + +#include "viriommufd.h" +#include "virlog.h" +#include "virerror.h" +#include "virfile.h" + +#ifdef __linux__ + +# include <sys/ioctl.h> +# include <linux/types.h> + +# ifdef HAVE_LINUX_IOMMUFD_H +# include <linux/iommufd.h> +# endif + +# define VIR_FROM_THIS VIR_FROM_NONE + +VIR_LOG_INIT("util.iommufd"); + +# ifndef IOMMU_OPTION + +enum iommufd_option { + IOMMU_OPTION_RLIMIT_MODE = 0, + IOMMU_OPTION_HUGE_PAGES = 1, +}; + +enum iommufd_option_ops { + IOMMU_OPTION_OP_SET = 0, + IOMMU_OPTION_OP_GET = 1, +}; + +struct iommu_option { + __u32 size; + __u32 option_id; + __u16 op; + __u16 __reserved; + __u32 object_id; + __aligned_u64 val64; +}; + +# define IOMMUFD_TYPE (';') +# define IOMMUFD_CMD_OPTION 0x87 +# define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION) + +# endif + +/** + * virIOMMUFDSetRLimitMode: + * @fd: iommufd file descriptor + * @processAccounting: true for per-process, false for per-user + * + * Set RLIMIT_MEMLOCK accounting mode for the iommufd. + * + * Returns: 0 on success, -1 on error + */ +int +virIOMMUFDSetRLimitMode(int fd, bool processAccounting) +{ + struct iommu_option option = { + .size = sizeof(struct iommu_option), + .option_id = IOMMU_OPTION_RLIMIT_MODE, + .op = IOMMU_OPTION_OP_SET, + .__reserved = 0, + .object_id = 0, + .val64 = processAccounting ? 1 : 0, + }; + + if (ioctl(fd, IOMMU_OPTION, &option) < 0) { + switch (errno) { + case ENOTTY: + VIR_WARN("IOMMU_OPTION ioctl not supported"); + return 0; + + case EOPNOTSUPP: + VIR_WARN("IOMMU_OPTION_RLIMIT_MODE not supported by kernel"); + return 0; + + case EINVAL: + virReportSystemError(errno, "%s", + _("invalid iommufd option parameters")); + return -1; + + case EPERM: + VIR_WARN("Permission denied for IOMMU_OPTION ioctl. " + "Per-user-based memory accounting to be used by default."); + return 0; + + default: + virReportSystemError(errno, "%s", + _("failed to set iommufd option")); + return -1; + } + } In my previous testing this part of code was not used so no rlimit was configured for the grace hopper GPU that was assigned to a VM.
The VM OS was able to see the GPU and I was able to run cuda-samples with most of them passing. This setup didn't use vCMDQ or EGM. When I tried patches that add support for vCMDQ I was no longer able to use the GPU inside the VM until this code was called or setting "setcap cap_ipc_lock=ep" on the qemu binary but it was still detected inside the VM and the VM was started successfully.
So is this required for all devices that want to use iommufd in order for them to work correctly inside the VM? Or is it necessary only when specific features are used?
I don’t think the ioctl is required for all devices, but vCMDQ can increase accounted pinned memory over the per‑user memory locking limit. vCMDQ introduces additional guest‑RAM backed queues that could be the extra pinned/accounted memory pushing over the memory locking limit. Additionally, attempting to launch a second iommufd VM could increase accounted memory over the per-user memory locking limit.
If that ioctl call is not required for all devices we should not call it unconditionally for all VMs that will try to use iommufd with any device.
Libvirt tries to guess correct memory limit for specific cases, see function qemuDomainGetMemLockLimitBytes() .
If I manually set 64G hard_limit for VM with 32G ram everything works even without calling tha ioctl:
<memtune> <hard_limit unit='GiB'>64</hard_limit> </memtune>
So if we can figure out some reasonable overhead when vCMDQ is used that would be better solution.
It makes sense that the ioctl should not be used blindly for every iommufd VM. Would you be open to gating the per-process accounting behind a config setting (e.g. iommufd_rlimit_mode=process in libvirtd.conf)? That keeps the default behavior unchanged while accounting for the multi-VM failure case. Separately, I'd be happy to add memlock limit adjustments in the vCMDQ Libvirt patch series under qemuDomainGetMemLockLimitBytes() when vCMDQ is enabled.
For the case you observed, if it were truly a single isolated QEMU process with no other memlocked usage under the same uid, per‑process vs per‑user should be identical. The fact that switching to per‑process memory accounting fixes the issue suggests there is additional memlocked usage being charged to the libvirt‑qemu uid (e.g. other processes, helper daemons, or device‑related accounting). vCMDQ just pushes the summed memory over the limit. When the limit was not high enough I got the following errors in host dmesg:
[30507.848263] acpi NVDA200C:03: tegra241_cmdqv: unexpected error reported. vintf_map: 0000000000000002, vcmdq_map 00000000:00000000:00000000:0000000c
I think this needs additional work in QEMU, starting VM should error out if it hits the memory limit instead of silently starting broken VM configuration.
Ok, I will discuss with Shameer about erroring out if it hits the memory limit. Thank you for testing and providing this detailed feedback. Nathan
From: Nathan Chen <nathanc@nvidia.com> Open VFIO FDs from libvirt backend without exposing these FDs to XML users, i.e. one per iommufd hostdev for /dev/vfio/devices/vfioX, and pass the FD to qemu command line. Suggested-by: Ján Tomko <jtomko@redhat.com> Signed-off-by: Nathan Chen <nathanc@nvidia.com> --- src/libvirt_private.syms | 1 + src/qemu/qemu_command.c | 21 +++++++++++ src/qemu/qemu_process.c | 78 ++++++++++++++++++++++++++++++++++++++++ src/util/virpci.c | 39 ++++++++++++++++++++ src/util/virpci.h | 2 ++ 5 files changed, 141 insertions(+) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 7fa76a1ec3..d81b30f0b6 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -3156,6 +3156,7 @@ virPCIDeviceGetStubDriverName; virPCIDeviceGetStubDriverType; virPCIDeviceGetUnbindFromStub; virPCIDeviceGetUsedBy; +virPCIDeviceGetVfioPath; virPCIDeviceGetVPD; virPCIDeviceHasPCIExpressLink; virPCIDeviceIsAssignable; diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index 5274b33d17..406f14831c 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -4810,6 +4810,18 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def, NULL) < 0) return NULL; + if (pcisrc->driver.name == VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO && + pcisrc->driver.iommufd == VIR_TRISTATE_BOOL_YES) { + qemuDomainHostdevPrivate *hostdevPriv = QEMU_DOMAIN_HOSTDEV_PRIVATE(dev); + + if (hostdevPriv->vfioDeviceFd != -1) { + g_autofree char *fdstr = g_strdup_printf("%d", hostdevPriv->vfioDeviceFd); + if (virJSONValueObjectAdd(&props, "S:fd", fdstr, NULL) < 0) + return NULL; + hostdevPriv->vfioDeviceFd = -1; + } + } + if (qemuBuildDeviceAddressProps(props, def, dev->info) < 0) return NULL; @@ -5254,6 +5266,15 @@ qemuBuildHostdevCommandLine(virCommand *cmd, if (qemuCommandAddExtDevice(cmd, hostdev->info, def, qemuCaps) < 0) return -1; + if (subsys->u.pci.driver.iommufd == VIR_TRISTATE_BOOL_YES) { + qemuDomainHostdevPrivate *hostdevPriv = QEMU_DOMAIN_HOSTDEV_PRIVATE(hostdev); + + if (hostdevPriv->vfioDeviceFd != -1) { + virCommandPassFD(cmd, hostdevPriv->vfioDeviceFd, + VIR_COMMAND_PASS_FD_CLOSE_PARENT); + } + } + if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev))) return -1; diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index a53bb40783..2841856454 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -103,6 +103,7 @@ #include "storage_source.h" #include "backup_conf.h" #include "storage_file_probe.h" +#include "virpci.h" #include "logging/log_manager.h" #include "logging/log_protocol.h" @@ -7670,6 +7671,81 @@ qemuProcessPrepareHostBackendChardevHotplug(virDomainObj *vm, return 0; } +/** + * qemuProcessOpenVfioDeviceFd: + * @hostdev: host device definition + * @vfioFd: returned file descriptor + * + * Opens the VFIO device file descriptor for a hostdev. + * + * Returns: FD on success, -1 on failure + */ +static int +qemuProcessOpenVfioDeviceFd(virDomainHostdevDef *hostdev) +{ + g_autofree char *vfioPath = NULL; + int fd = -1; + + if (hostdev->mode != VIR_DOMAIN_HOSTDEV_MODE_SUBSYS || + hostdev->source.subsys.type != VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("VFIO FD only supported for PCI hostdevs")); + return -1; + } + + if (virPCIDeviceGetVfioPath(&hostdev->source.subsys.u.pci.addr, &vfioPath) < 0) + return -1; + + VIR_DEBUG("Opening VFIO device %s", vfioPath); + + if ((fd = open(vfioPath, O_RDWR | O_CLOEXEC)) < 0) { + if (errno == ENOENT) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, + _("VFIO device %1$s not found - ensure device is bound to vfio-pci driver"), + vfioPath); + } else { + virReportSystemError(errno, + _("cannot open VFIO device %1$s"), vfioPath); + } + return -1; + } + + VIR_DEBUG("Opened VFIO device FD %d for %s", fd, vfioPath); + return fd; +} + +/** + * qemuProcessOpenVfioFds: + * @vm: domain object + * + * Opens all necessary VFIO file descriptors for the domain. + * + * Returns: 0 on success, -1 on failure + */ +static int +qemuProcessOpenVfioFds(virDomainObj *vm) +{ + size_t i; + + /* Check if we have any hostdevs that need VFIO FDs */ + for (i = 0; i < vm->def->nhostdevs; i++) { + virDomainHostdevDef *hostdev = vm->def->hostdevs[i]; + qemuDomainHostdevPrivate *hostdevPriv = QEMU_DOMAIN_HOSTDEV_PRIVATE(hostdev); + + if (hostdev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS && + hostdev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI && + hostdev->source.subsys.u.pci.driver.name == VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO && + hostdev->source.subsys.u.pci.driver.iommufd == VIR_TRISTATE_BOOL_YES) { + /* Open VFIO device FD */ + hostdevPriv->vfioDeviceFd = qemuProcessOpenVfioDeviceFd(hostdev); + if (hostdevPriv->vfioDeviceFd == -1) + return -1; + } + } + + return 0; +} + /** * qemuProcessPrepareHost: * @driver: qemu driver @@ -7725,6 +7801,8 @@ qemuProcessPrepareHost(virQEMUDriver *driver, hostdev_flags |= VIR_HOSTDEV_COLD_BOOT; if (qemuHostdevPrepareDomainDevices(driver, vm->def, hostdev_flags) < 0) return -1; + if (qemuProcessOpenVfioFds(vm) < 0) + return -1; VIR_DEBUG("Preparing chr device backends"); if (qemuProcessPrepareHostBackendChardev(vm) < 0) diff --git a/src/util/virpci.c b/src/util/virpci.c index 90617e69c6..2348a98003 100644 --- a/src/util/virpci.c +++ b/src/util/virpci.c @@ -3320,3 +3320,42 @@ virPCIDeviceAddressFree(virPCIDeviceAddress *address) { g_free(address); } + +/** + * virPCIDeviceGetVfioPath: + * @addr: host device PCI address + * @vfioPath: returned VFIO device path + * + * Constructs the VFIO device path for a PCI hostdev. + * + * Returns: 0 on success, -1 on failure + */ +int +virPCIDeviceGetVfioPath(virPCIDeviceAddress *addr, + char **vfioPath) +{ + g_autofree char *addrStr = NULL; + g_autofree char *sysfsPath = NULL; + g_autoptr(DIR) dir = NULL; + struct dirent *entry = NULL; + + *vfioPath = NULL; + addrStr = virPCIDeviceAddressAsString(addr); + + /* Look in device's vfio-dev subdirectory */ + sysfsPath = g_strdup_printf("/sys/bus/pci/devices/%s/vfio-dev/", addrStr); + + if (virDirOpen(&dir, sysfsPath) == 1) { + while (virDirRead(dir, &entry, sysfsPath) > 0) { + if (STRPREFIX(entry->d_name, "vfio")) { + *vfioPath = g_strdup_printf("/dev/vfio/devices/%s", entry->d_name); + return 0; + } + } + } + + virReportError(VIR_ERR_INTERNAL_ERROR, + _("cannot find VFIO device for PCI device %1$s"), + addrStr); + return -1; +} diff --git a/src/util/virpci.h b/src/util/virpci.h index fc538566e1..24ede10755 100644 --- a/src/util/virpci.h +++ b/src/util/virpci.h @@ -296,6 +296,8 @@ void virPCIEDeviceInfoFree(virPCIEDeviceInfo *dev); void virPCIDeviceAddressFree(virPCIDeviceAddress *address); +int virPCIDeviceGetVfioPath(virPCIDeviceAddress *addr, char **vfioPath); + G_DEFINE_AUTOPTR_CLEANUP_FUNC(virPCIDevice, virPCIDeviceFree); G_DEFINE_AUTOPTR_CLEANUP_FUNC(virPCIDeviceAddress, virPCIDeviceAddressFree); G_DEFINE_AUTOPTR_CLEANUP_FUNC(virPCIEDeviceInfo, virPCIEDeviceInfoFree); -- 2.43.0
From: Nathan Chen <nathanc@nvidia.com> Open iommufd FD from libvirt backend without exposing these FDs to XML users, i.e. one per domain for /dev/iommu, and pass the FD to qemu command line. Set per-process memory accounting for iommufd instead of the default per-user memory accounting. Suggested-by: Ján Tomko <jtomko@redhat.com> Signed-off-by: Nathan Chen <nathanc@nvidia.com> --- src/qemu/qemu_command.c | 13 +++++++++++-- src/qemu/qemu_domain.c | 1 + src/qemu/qemu_domain.h | 2 ++ src/qemu/qemu_process.c | 43 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 57 insertions(+), 2 deletions(-) diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index 406f14831c..85c368f0f9 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -5349,9 +5349,13 @@ qemuBuildHostdevCommandLine(virCommand *cmd, static int qemuBuildIOMMUFDCommandLine(virCommand *cmd, - const virDomainDef *def) + const virDomainDef *def, + virDomainObj *vm) { size_t i; + qemuDomainObjPrivate *priv = vm->privateData; + g_autofree char *fdstr = g_strdup_printf("%d", priv->iommufd); + for (i = 0; i < def->nhostdevs; i++) { virDomainHostdevDef *hostdev = def->hostdevs[i]; @@ -5370,8 +5374,13 @@ qemuBuildIOMMUFDCommandLine(virCommand *cmd, if (subsys->u.pci.driver.iommufd != VIR_TRISTATE_BOOL_YES) continue; + virCommandPassFD(cmd, priv->iommufd, VIR_COMMAND_PASS_FD_CLOSE_PARENT); + + priv->iommufd = -1; + if (qemuMonitorCreateObjectProps(&props, "iommufd", "iommufd0", + "S:fd", fdstr, NULL) < 0) return -1; @@ -10997,7 +11006,7 @@ qemuBuildCommandLine(virDomainObj *vm, if (qemuBuildRedirdevCommandLine(cmd, def, qemuCaps) < 0) return NULL; - if (qemuBuildIOMMUFDCommandLine(cmd, def) < 0) + if (qemuBuildIOMMUFDCommandLine(cmd, def, vm) < 0) return NULL; if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps) < 0) diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index 85eea1801f..c5e1cd5279 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -2042,6 +2042,7 @@ qemuDomainObjPrivateAlloc(void *opaque) priv->blockjobs = virHashNew(virObjectUnref); priv->fds = virHashNew(g_object_unref); + priv->iommufd = -1; priv->pidMonitored = -1; /* agent commands block by default, user can choose different behavior */ diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index 3aac743875..fabfe265bf 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -264,6 +264,8 @@ struct _qemuDomainObjPrivate { /* named file descriptor groups associated with the VM */ GHashTable *fds; + int iommufd; + char *memoryBackingDir; }; diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 2841856454..c5b2a5fda8 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -104,6 +104,7 @@ #include "backup_conf.h" #include "storage_file_probe.h" #include "virpci.h" +#include "viriommufd.h" #include "logging/log_manager.h" #include "logging/log_protocol.h" @@ -7671,6 +7672,42 @@ qemuProcessPrepareHostBackendChardevHotplug(virDomainObj *vm, return 0; } +/** + * qemuProcessOpenIommuFd: + * @vm: domain object + * @iommuFd: returned file descriptor + * + * Opens /dev/iommu file descriptor for the VM. + * + * Returns: FD on success, -1 on failure + */ +static int +qemuProcessOpenIommuFd(virDomainObj *vm) +{ + int fd = -1; + + VIR_DEBUG("Opening IOMMU FD for domain %s", vm->def->name); + + if ((fd = open(VIR_IOMMU_DEV_PATH, O_RDWR | O_CLOEXEC)) < 0) { + if (errno == ENOENT) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("IOMMU FD support requires /dev/iommu device")); + } else { + virReportSystemError(errno, "%s", + _("cannot open /dev/iommu")); + } + return -1; + } + + if (virIOMMUFDSetRLimitMode(fd, true) < 0) { + VIR_FORCE_CLOSE(fd); + return -1; + } + + VIR_DEBUG("Opened IOMMU FD %d for domain %s", fd, vm->def->name); + return fd; +} + /** * qemuProcessOpenVfioDeviceFd: * @hostdev: host device definition @@ -7725,6 +7762,7 @@ qemuProcessOpenVfioDeviceFd(virDomainHostdevDef *hostdev) static int qemuProcessOpenVfioFds(virDomainObj *vm) { + qemuDomainObjPrivate *priv = vm->privateData; size_t i; /* Check if we have any hostdevs that need VFIO FDs */ @@ -7740,6 +7778,11 @@ qemuProcessOpenVfioFds(virDomainObj *vm) hostdevPriv->vfioDeviceFd = qemuProcessOpenVfioDeviceFd(hostdev); if (hostdevPriv->vfioDeviceFd == -1) return -1; + + /* Open IOMMU FD */ + priv->iommufd = qemuProcessOpenIommuFd(vm); + if (priv->iommufd == -1) + return -1; } } -- 2.43.0
From: Nathan Chen <nathanc@nvidia.com> When launching a qemu VM with the iommufd feature enabled for VFIO hostdevs: - Do not allow cgroup, namespace, and seclabel access to VFIO paths (/dev/vfio/vfio and /dev/vfio/<iommugroup>) - Allow access to iommufd paths (/dev/iommu and /dev/vfio/devices/vfio*) for AppArmor, SELinux, and DAC Signed-off-by: Nathan Chen <nathanc@nvidia.com> --- src/qemu/qemu_cgroup.c | 3 ++ src/qemu/qemu_namespace.c | 3 ++ src/security/security_apparmor.c | 31 ++++++++++++++------ src/security/security_dac.c | 49 +++++++++++++++++++++++++------- src/security/security_selinux.c | 47 +++++++++++++++++++++++------- src/security/virt-aa-helper.c | 33 ++++++++++++++++----- 6 files changed, 130 insertions(+), 36 deletions(-) diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index 7dadef0739..6148990f19 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -479,6 +479,9 @@ qemuSetupHostdevCgroup(virDomainObj *vm, g_autofree char *path = NULL; int perms; + if (dev->source.subsys.u.pci.driver.iommufd == VIR_TRISTATE_BOOL_YES) + return 0; + if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES)) return 0; diff --git a/src/qemu/qemu_namespace.c b/src/qemu/qemu_namespace.c index c689cc3e40..fb0734193d 100644 --- a/src/qemu/qemu_namespace.c +++ b/src/qemu/qemu_namespace.c @@ -345,6 +345,9 @@ qemuDomainSetupHostdev(virDomainObj *vm, { g_autofree char *path = NULL; + if (hostdev->source.subsys.u.pci.driver.iommufd == VIR_TRISTATE_BOOL_YES) + return 0; + if (qemuDomainGetHostdevPath(hostdev, &path, NULL) < 0) return -1; diff --git a/src/security/security_apparmor.c b/src/security/security_apparmor.c index 68ac39611f..e7987b54b4 100644 --- a/src/security/security_apparmor.c +++ b/src/security/security_apparmor.c @@ -45,6 +45,7 @@ #include "virstring.h" #include "virscsi.h" #include "virmdev.h" +#include "viriommufd.h" #define VIR_FROM_THIS VIR_FROM_SECURITY @@ -841,25 +842,37 @@ AppArmorSetSecurityHostdevLabel(virSecurityManager *mgr, } case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI: { - virPCIDevice *pci = + g_autoptr(virPCIDevice) pci = virPCIDeviceNew(&pcisrc->addr); if (!pci) goto done; if (pcisrc->driver.name == VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO) { - char *vfioGroupDev = virPCIDeviceGetIOMMUGroupDev(pci); - - if (!vfioGroupDev) { - virPCIDeviceFree(pci); - goto done; + if (dev->source.subsys.u.pci.driver.iommufd != VIR_TRISTATE_BOOL_YES) { + char *vfioGroupDev = virPCIDeviceGetIOMMUGroupDev(pci); + + if (!vfioGroupDev) { + virPCIDeviceFree(pci); + goto done; + } + ret = AppArmorSetSecurityPCILabel(pci, vfioGroupDev, ptr); + VIR_FREE(vfioGroupDev); + } else { + g_autofree char *vfiofdDev = NULL; + + if (virPCIDeviceGetVfioPath(&dev->source.subsys.u.pci.addr, &vfiofdDev) < 0) + goto done; + + ret = AppArmorSetSecurityPCILabel(pci, vfiofdDev, ptr); + if (ret < 0) + goto done; + + ret = AppArmorSetSecurityPCILabel(pci, VIR_IOMMU_DEV_PATH, ptr); } - ret = AppArmorSetSecurityPCILabel(pci, vfioGroupDev, ptr); - VIR_FREE(vfioGroupDev); } else { ret = virPCIDeviceFileIterate(pci, AppArmorSetSecurityPCILabel, ptr); } - virPCIDeviceFree(pci); break; } diff --git a/src/security/security_dac.c b/src/security/security_dac.c index 2f788b872a..d0ed22db2d 100644 --- a/src/security/security_dac.c +++ b/src/security/security_dac.c @@ -41,6 +41,7 @@ #include "virscsivhost.h" #include "virstring.h" #include "virutil.h" +#include "viriommufd.h" #define VIR_FROM_THIS VIR_FROM_SECURITY @@ -1282,14 +1283,27 @@ virSecurityDACSetHostdevLabel(virSecurityManager *mgr, return -1; if (pcisrc->driver.name == VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO) { - g_autofree char *vfioGroupDev = virPCIDeviceGetIOMMUGroupDev(pci); + if (dev->source.subsys.u.pci.driver.iommufd != VIR_TRISTATE_BOOL_YES) { + g_autofree char *vfioGroupDev = virPCIDeviceGetIOMMUGroupDev(pci); - if (!vfioGroupDev) - return -1; + if (!vfioGroupDev) + return -1; + + ret = virSecurityDACSetHostdevLabelHelper(vfioGroupDev, + false, + &cbdata); + } else { + g_autofree char *vfiofdDev = NULL; + + if (virPCIDeviceGetVfioPath(&dev->source.subsys.u.pci.addr, &vfiofdDev) < 0) + return -1; - ret = virSecurityDACSetHostdevLabelHelper(vfioGroupDev, - false, - &cbdata); + ret = virSecurityDACSetHostdevLabelHelper(vfiofdDev, false, &cbdata); + if (ret < 0) + break; + + ret = virSecurityDACSetHostdevLabelHelper(VIR_IOMMU_DEV_PATH, false, &cbdata); + } } else { ret = virPCIDeviceFileIterate(pci, virSecurityDACSetPCILabel, @@ -1443,13 +1457,28 @@ virSecurityDACRestoreHostdevLabel(virSecurityManager *mgr, return -1; if (pcisrc->driver.name == VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO) { - g_autofree char *vfioGroupDev = virPCIDeviceGetIOMMUGroupDev(pci); + if (dev->source.subsys.u.pci.driver.iommufd != VIR_TRISTATE_BOOL_YES) { + g_autofree char *vfioGroupDev = virPCIDeviceGetIOMMUGroupDev(pci); - if (!vfioGroupDev) - return -1; + if (!vfioGroupDev) + return -1; - ret = virSecurityDACRestoreFileLabelInternal(mgr, NULL, + ret = virSecurityDACRestoreFileLabelInternal(mgr, NULL, vfioGroupDev, false); + } else { + g_autofree char *vfiofdDev = NULL; + + if (virPCIDeviceGetVfioPath(&dev->source.subsys.u.pci.addr, &vfiofdDev) < 0) + return -1; + + ret = virSecurityDACRestoreFileLabelInternal(mgr, NULL, + vfiofdDev, false); + if (ret < 0) + break; + + ret = virSecurityDACRestoreFileLabelInternal(mgr, NULL, + VIR_IOMMU_DEV_PATH, false); + } } else { ret = virPCIDeviceFileIterate(pci, virSecurityDACRestorePCILabel, mgr); } diff --git a/src/security/security_selinux.c b/src/security/security_selinux.c index 2f3cc274a5..834383a7de 100644 --- a/src/security/security_selinux.c +++ b/src/security/security_selinux.c @@ -41,6 +41,7 @@ #include "virconf.h" #include "virtpm.h" #include "virstring.h" +#include "viriommufd.h" #define VIR_FROM_THIS VIR_FROM_SECURITY @@ -2256,14 +2257,27 @@ virSecuritySELinuxSetHostdevSubsysLabel(virSecurityManager *mgr, return -1; if (pcisrc->driver.name == VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO) { - g_autofree char *vfioGroupDev = virPCIDeviceGetIOMMUGroupDev(pci); + if (dev->source.subsys.u.pci.driver.iommufd != VIR_TRISTATE_BOOL_YES) { + g_autofree char *vfioGroupDev = virPCIDeviceGetIOMMUGroupDev(pci); - if (!vfioGroupDev) - return -1; + if (!vfioGroupDev) + return -1; + + ret = virSecuritySELinuxSetHostdevLabelHelper(vfioGroupDev, + false, + &data); + } else { + g_autofree char *vfiofdDev = NULL; + + if (virPCIDeviceGetVfioPath(&dev->source.subsys.u.pci.addr, &vfiofdDev) < 0) + return -1; - ret = virSecuritySELinuxSetHostdevLabelHelper(vfioGroupDev, - false, - &data); + ret = virSecuritySELinuxSetHostdevLabelHelper(vfiofdDev, false, &data); + if (ret) + break; + + ret = virSecuritySELinuxSetHostdevLabelHelper(VIR_IOMMU_DEV_PATH, false, &data); + } } else { ret = virPCIDeviceFileIterate(pci, virSecuritySELinuxSetPCILabel, &data); } @@ -2491,12 +2505,25 @@ virSecuritySELinuxRestoreHostdevSubsysLabel(virSecurityManager *mgr, return -1; if (pcisrc->driver.name == VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO) { - g_autofree char *vfioGroupDev = virPCIDeviceGetIOMMUGroupDev(pci); + if (dev->source.subsys.u.pci.driver.iommufd != VIR_TRISTATE_BOOL_YES) { + g_autofree char *vfioGroupDev = virPCIDeviceGetIOMMUGroupDev(pci); - if (!vfioGroupDev) - return -1; + if (!vfioGroupDev) + return -1; + + ret = virSecuritySELinuxRestoreFileLabel(mgr, vfioGroupDev, false, false); + } else { + g_autofree char *vfiofdDev = NULL; + + if (virPCIDeviceGetVfioPath(&dev->source.subsys.u.pci.addr, &vfiofdDev) < 0) + return -1; - ret = virSecuritySELinuxRestoreFileLabel(mgr, vfioGroupDev, false, false); + ret = virSecuritySELinuxRestoreFileLabel(mgr, vfiofdDev, false, false); + if (ret < 0) + break; + + ret = virSecuritySELinuxRestoreFileLabel(mgr, VIR_IOMMU_DEV_PATH, false, false); + } } else { ret = virPCIDeviceFileIterate(pci, virSecuritySELinuxRestorePCILabel, mgr); } diff --git a/src/security/virt-aa-helper.c b/src/security/virt-aa-helper.c index e365d02af4..82dfb3d3af 100644 --- a/src/security/virt-aa-helper.c +++ b/src/security/virt-aa-helper.c @@ -50,6 +50,7 @@ #include "virstring.h" #include "virgettext.h" #include "virhostdev.h" +#include "viriommufd.h" #define VIR_FROM_THIS VIR_FROM_SECURITY @@ -1114,8 +1115,9 @@ get_files(vahControl * ctl) virDeviceHostdevPCIDriverName driverName = dev->source.subsys.u.pci.driver.name; - if (driverName == VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO || - driverName == VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_DEFAULT) { + if ((driverName == VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO || + driverName == VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_DEFAULT) && + dev->source.subsys.u.pci.driver.iommufd != VIR_TRISTATE_BOOL_YES) { needsVfio = true; } @@ -1348,6 +1350,7 @@ get_files(vahControl * ctl) virBufferAddLit(&buf, " \"/dev/vfio/vfio\" rw,\n"); virBufferAddLit(&buf, " \"/dev/vfio/[0-9]*\" rw,\n"); } + if (needsgl) { /* if using gl all sorts of further dri related paths will be needed */ virBufferAddLit(&buf, " # DRI/Mesa/(e)GL config and driver paths\n"); @@ -1385,9 +1388,18 @@ get_files(vahControl * ctl) } } - if (ctl->newfile && - vah_add_file(&buf, ctl->newfile, "rwk") != 0) { - return -1; + if (ctl->newfile) { + const char *perms = "rwk"; + + /* VFIO and iommufd devices need mmap permission */ + if (STRPREFIX(ctl->newfile, "/dev/vfio/devices/vfio") || + STREQ(ctl->newfile, VIR_IOMMU_DEV_PATH)) { + perms = "rwm"; + } + + if (vah_add_file(&buf, ctl->newfile, perms) != 0) { + return -1; + } } ctl->files = virBufferContentAndReset(&buf); @@ -1561,8 +1573,15 @@ main(int argc, char **argv) } } if (ctl->append && ctl->newfile) { - if (vah_add_file(&buf, ctl->newfile, "rwk") != 0) - goto cleanup; + const char *perms = "rwk"; + + if (STRPREFIX(ctl->newfile, "/dev/vfio/devices/vfio") || + STREQ(ctl->newfile, VIR_IOMMU_DEV_PATH)) { + perms = "rwm"; + } + + if (vah_add_file(&buf, ctl->newfile, perms) != 0) + return -1; } else { if (ctl->def->virtType == VIR_DOMAIN_VIRT_QEMU || ctl->def->virtType == VIR_DOMAIN_VIRT_KQEMU || -- 2.43.0
On Fri, Jan 16, 2026 at 05:39:36PM -0800, Nathan Chen via Devel wrote:
From: Nathan Chen <nathanc@nvidia.com>
When launching a qemu VM with the iommufd feature enabled for VFIO hostdevs: - Do not allow cgroup, namespace, and seclabel access to VFIO paths (/dev/vfio/vfio and /dev/vfio/<iommugroup>) - Allow access to iommufd paths (/dev/iommu and /dev/vfio/devices/vfio*) for AppArmor, SELinux, and DAC
Signed-off-by: Nathan Chen <nathanc@nvidia.com> --- src/qemu/qemu_cgroup.c | 3 ++ src/qemu/qemu_namespace.c | 3 ++ src/security/security_apparmor.c | 31 ++++++++++++++------ src/security/security_dac.c | 49 +++++++++++++++++++++++++------- src/security/security_selinux.c | 47 +++++++++++++++++++++++------- src/security/virt-aa-helper.c | 33 ++++++++++++++++----- 6 files changed, 130 insertions(+), 36 deletions(-)
[...]
diff --git a/src/security/security_apparmor.c b/src/security/security_apparmor.c index 68ac39611f..e7987b54b4 100644 --- a/src/security/security_apparmor.c +++ b/src/security/security_apparmor.c @@ -45,6 +45,7 @@ #include "virstring.h" #include "virscsi.h" #include "virmdev.h" +#include "viriommufd.h"
#define VIR_FROM_THIS VIR_FROM_SECURITY
@@ -841,25 +842,37 @@ AppArmorSetSecurityHostdevLabel(virSecurityManager *mgr, }
case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI: { - virPCIDevice *pci = + g_autoptr(virPCIDevice) pci = virPCIDeviceNew(&pcisrc->addr);
if (!pci) goto done;
if (pcisrc->driver.name == VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO) { - char *vfioGroupDev = virPCIDeviceGetIOMMUGroupDev(pci); - - if (!vfioGroupDev) { - virPCIDeviceFree(pci); - goto done; + if (dev->source.subsys.u.pci.driver.iommufd != VIR_TRISTATE_BOOL_YES) { + char *vfioGroupDev = virPCIDeviceGetIOMMUGroupDev(pci); + + if (!vfioGroupDev) { + virPCIDeviceFree(pci);
This virPCIDeviceFree should be removed as the pci was converted to g_autoptr().
+ goto done; + } + ret = AppArmorSetSecurityPCILabel(pci, vfioGroupDev, ptr); + VIR_FREE(vfioGroupDev); + } else { + g_autofree char *vfiofdDev = NULL; + + if (virPCIDeviceGetVfioPath(&dev->source.subsys.u.pci.addr, &vfiofdDev) < 0) + goto done; + + ret = AppArmorSetSecurityPCILabel(pci, vfiofdDev, ptr); + if (ret < 0) + goto done; + + ret = AppArmorSetSecurityPCILabel(pci, VIR_IOMMU_DEV_PATH, ptr); } - ret = AppArmorSetSecurityPCILabel(pci, vfioGroupDev, ptr); - VIR_FREE(vfioGroupDev); } else { ret = virPCIDeviceFileIterate(pci, AppArmorSetSecurityPCILabel, ptr); } - virPCIDeviceFree(pci); break; }
Pavel
On 1/19/2026 5:52 AM, Pavel Hrdina wrote:
On Fri, Jan 16, 2026 at 05:39:36PM -0800, Nathan Chen via Devel wrote:
From: Nathan Chen<nathanc@nvidia.com>
When launching a qemu VM with the iommufd feature enabled for VFIO hostdevs: - Do not allow cgroup, namespace, and seclabel access to VFIO paths (/dev/vfio/vfio and /dev/vfio/<iommugroup>) - Allow access to iommufd paths (/dev/iommu and /dev/vfio/devices/vfio*) for AppArmor, SELinux, and DAC
Signed-off-by: Nathan Chen<nathanc@nvidia.com> --- src/qemu/qemu_cgroup.c | 3 ++ src/qemu/qemu_namespace.c | 3 ++ src/security/security_apparmor.c | 31 ++++++++++++++------ src/security/security_dac.c | 49 +++++++++++++++++++++++++------- src/security/security_selinux.c | 47 +++++++++++++++++++++++------- src/security/virt-aa-helper.c | 33 ++++++++++++++++----- 6 files changed, 130 insertions(+), 36 deletions(-) [...]
diff --git a/src/security/security_apparmor.c b/src/security/security_apparmor.c index 68ac39611f..e7987b54b4 100644 --- a/src/security/security_apparmor.c +++ b/src/security/security_apparmor.c @@ -45,6 +45,7 @@ #include "virstring.h" #include "virscsi.h" #include "virmdev.h" +#include "viriommufd.h"
#define VIR_FROM_THIS VIR_FROM_SECURITY
@@ -841,25 +842,37 @@ AppArmorSetSecurityHostdevLabel(virSecurityManager *mgr, }
case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI: { - virPCIDevice *pci = + g_autoptr(virPCIDevice) pci = virPCIDeviceNew(&pcisrc->addr);
if (!pci) goto done;
if (pcisrc->driver.name == VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO) { - char *vfioGroupDev = virPCIDeviceGetIOMMUGroupDev(pci); - - if (!vfioGroupDev) { - virPCIDeviceFree(pci); - goto done; + if (dev->source.subsys.u.pci.driver.iommufd != VIR_TRISTATE_BOOL_YES) { + char *vfioGroupDev = virPCIDeviceGetIOMMUGroupDev(pci); + + if (!vfioGroupDev) { + virPCIDeviceFree(pci); This virPCIDeviceFree should be removed as the pci was converted to g_autoptr().
I will remove this in the next revision, thanks for catching this. Nathan
From: Nathan Chen <nathanc@nvidia.com> Provide sample XML and CLI args for the iommufd XML schema for pc, q35, and virt machine types. Signed-off-by: Nathan Chen <nathanc@nvidia.com> --- .../iommufd-q35.x86_64-latest.args | 41 +++++++++++++ .../iommufd-q35.x86_64-latest.xml | 60 +++++++++++++++++++ tests/qemuxmlconfdata/iommufd-q35.xml | 38 ++++++++++++ ...fd-virt-pci-bus-single.aarch64-latest.args | 32 ++++++++++ ...ufd-virt-pci-bus-single.aarch64-latest.xml | 31 ++++++++++ .../iommufd-virt-pci-bus-single.xml | 22 +++++++ .../iommufd-virt.aarch64-latest.args | 36 +++++++++++ .../iommufd-virt.aarch64-latest.xml | 53 ++++++++++++++++ tests/qemuxmlconfdata/iommufd-virt.xml | 29 +++++++++ .../iommufd.x86_64-latest.args | 35 +++++++++++ .../qemuxmlconfdata/iommufd.x86_64-latest.xml | 38 ++++++++++++ tests/qemuxmlconfdata/iommufd.xml | 30 ++++++++++ tests/qemuxmlconftest.c | 34 +++++++++++ 13 files changed, 479 insertions(+) create mode 100644 tests/qemuxmlconfdata/iommufd-q35.x86_64-latest.args create mode 100644 tests/qemuxmlconfdata/iommufd-q35.x86_64-latest.xml create mode 100644 tests/qemuxmlconfdata/iommufd-q35.xml create mode 100644 tests/qemuxmlconfdata/iommufd-virt-pci-bus-single.aarch64-latest.args create mode 100644 tests/qemuxmlconfdata/iommufd-virt-pci-bus-single.aarch64-latest.xml create mode 100644 tests/qemuxmlconfdata/iommufd-virt-pci-bus-single.xml create mode 100644 tests/qemuxmlconfdata/iommufd-virt.aarch64-latest.args create mode 100644 tests/qemuxmlconfdata/iommufd-virt.aarch64-latest.xml create mode 100644 tests/qemuxmlconfdata/iommufd-virt.xml create mode 100644 tests/qemuxmlconfdata/iommufd.x86_64-latest.args create mode 100644 tests/qemuxmlconfdata/iommufd.x86_64-latest.xml create mode 100644 tests/qemuxmlconfdata/iommufd.xml diff --git a/tests/qemuxmlconfdata/iommufd-q35.x86_64-latest.args b/tests/qemuxmlconfdata/iommufd-q35.x86_64-latest.args new file mode 100644 index 0000000000..7d819e141b --- /dev/null +++ b/tests/qemuxmlconfdata/iommufd-q35.x86_64-latest.args @@ -0,0 +1,41 @@ +LC_ALL=C \ +PATH=/bin \ +HOME=/var/lib/libvirt/qemu/domain--1-q35-test \ +USER=test \ +LOGNAME=test \ +XDG_DATA_HOME=/var/lib/libvirt/qemu/domain--1-q35-test/.local/share \ +XDG_CACHE_HOME=/var/lib/libvirt/qemu/domain--1-q35-test/.cache \ +XDG_CONFIG_HOME=/var/lib/libvirt/qemu/domain--1-q35-test/.config \ +/usr/bin/qemu-system-x86_64 \ +-name guest=q35-test,debug-threads=on \ +-S \ +-object '{"qom-type":"secret","id":"masterKey0","format":"raw","file":"/var/lib/libvirt/qemu/domain--1-q35-test/master-key.aes"}' \ +-machine q35,usb=off,dump-guest-core=off,memory-backend=pc.ram,acpi=off \ +-accel tcg \ +-cpu qemu64 \ +-m size=2097152k \ +-object '{"qom-type":"memory-backend-ram","id":"pc.ram","size":2147483648}' \ +-overcommit mem-lock=off \ +-smp 2,sockets=2,cores=1,threads=1 \ +-uuid 11dbdcdd-4c3b-482b-8903-9bdb8c0a2774 \ +-display none \ +-no-user-config \ +-nodefaults \ +-chardev socket,id=charmonitor,fd=1729,server=on,wait=off \ +-mon chardev=charmonitor,id=monitor,mode=control \ +-rtc base=utc \ +-no-shutdown \ +-boot strict=on \ +-device '{"driver":"pcie-root-port","port":16,"chassis":1,"id":"pci.1","bus":"pcie.0","multifunction":true,"addr":"0x2"}' \ +-device '{"driver":"pcie-root-port","port":17,"chassis":2,"id":"pci.2","bus":"pcie.0","addr":"0x2.0x1"}' \ +-device '{"driver":"qemu-xhci","id":"usb","bus":"pci.1","addr":"0x0"}' \ +-blockdev '{"driver":"host_device","filename":"/dev/HostVG/QEMUGuest1","node-name":"libvirt-1-storage","read-only":false}' \ +-device '{"driver":"ide-hd","bus":"ide.0","drive":"libvirt-1-storage","id":"sata0-0-0","bootindex":1}' \ +-audiodev '{"id":"audio1","driver":"none"}' \ +-device '{"driver":"qxl-vga","id":"video0","max_outputs":1,"ram_size":67108864,"vram_size":33554432,"vram64_size_mb":0,"vgamem_mb":8,"bus":"pcie.0","addr":"0x1"}' \ +-global ICH9-LPC.noreboot=off \ +-watchdog-action reset \ +-object '{"qom-type":"iommufd","id":"iommufd0","fd":"-1"}' \ +-device '{"driver":"vfio-pci","host":"0000:06:12.5","id":"hostdev0","iommufd":"iommufd0","fd":"0","bus":"pcie.0","addr":"0x3"}' \ +-sandbox on,obsolete=deny,elevateprivileges=deny,spawn=deny,resourcecontrol=deny \ +-msg timestamp=on diff --git a/tests/qemuxmlconfdata/iommufd-q35.x86_64-latest.xml b/tests/qemuxmlconfdata/iommufd-q35.x86_64-latest.xml new file mode 100644 index 0000000000..bb76252b61 --- /dev/null +++ b/tests/qemuxmlconfdata/iommufd-q35.x86_64-latest.xml @@ -0,0 +1,60 @@ +<domain type='qemu'> + <name>q35-test</name> + <uuid>11dbdcdd-4c3b-482b-8903-9bdb8c0a2774</uuid> + <memory unit='KiB'>2097152</memory> + <currentMemory unit='KiB'>2097152</currentMemory> + <vcpu placement='static' cpuset='0-1'>2</vcpu> + <os> + <type arch='x86_64' machine='q35'>hvm</type> + <boot dev='hd'/> + </os> + <cpu mode='custom' match='exact' check='none'> + <model fallback='forbid'>qemu64</model> + </cpu> + <clock offset='utc'/> + <on_poweroff>destroy</on_poweroff> + <on_reboot>restart</on_reboot> + <on_crash>destroy</on_crash> + <devices> + <emulator>/usr/bin/qemu-system-x86_64</emulator> + <disk type='block' device='disk'> + <driver name='qemu' type='raw'/> + <source dev='/dev/HostVG/QEMUGuest1'/> + <target dev='sda' bus='sata'/> + <address type='drive' controller='0' bus='0' target='0' unit='0'/> + </disk> + <controller type='pci' index='0' model='pcie-root'/> + <controller type='pci' index='1' model='pcie-root-port'> + <model name='pcie-root-port'/> + <target chassis='1' port='0x10'/> + <address type='pci' domain='0x0000' bus='0x00' slot='0x02' function='0x0' multifunction='on'/> + </controller> + <controller type='pci' index='2' model='pcie-root-port'> + <model name='pcie-root-port'/> + <target chassis='2' port='0x11'/> + <address type='pci' domain='0x0000' bus='0x00' slot='0x02' function='0x1'/> + </controller> + <controller type='sata' index='0'> + <address type='pci' domain='0x0000' bus='0x00' slot='0x1f' function='0x2'/> + </controller> + <controller type='usb' index='0' model='qemu-xhci'> + <address type='pci' domain='0x0000' bus='0x01' slot='0x00' function='0x0'/> + </controller> + <input type='mouse' bus='ps2'/> + <input type='keyboard' bus='ps2'/> + <audio id='1' type='none'/> + <video> + <model type='qxl' ram='65536' vram='32768' vgamem='8192' heads='1' primary='yes'/> + <address type='pci' domain='0x0000' bus='0x00' slot='0x01' function='0x0'/> + </video> + <hostdev mode='subsystem' type='pci' managed='yes'> + <driver iommufd='yes'/> + <source> + <address domain='0x0000' bus='0x06' slot='0x12' function='0x5'/> + </source> + <address type='pci' domain='0x0000' bus='0x00' slot='0x03' function='0x0'/> + </hostdev> + <watchdog model='itco' action='reset'/> + <memballoon model='none'/> + </devices> +</domain> diff --git a/tests/qemuxmlconfdata/iommufd-q35.xml b/tests/qemuxmlconfdata/iommufd-q35.xml new file mode 100644 index 0000000000..f3c2269fb1 --- /dev/null +++ b/tests/qemuxmlconfdata/iommufd-q35.xml @@ -0,0 +1,38 @@ +<domain type='qemu'> + <name>q35-test</name> + <uuid>11dbdcdd-4c3b-482b-8903-9bdb8c0a2774</uuid> + <memory unit='KiB'>2097152</memory> + <currentMemory unit='KiB'>2097152</currentMemory> + <vcpu placement='static' cpuset='0-1'>2</vcpu> + <os> + <type arch='x86_64' machine='q35'>hvm</type> + <boot dev='hd'/> + </os> + <clock offset='utc'/> + <on_poweroff>destroy</on_poweroff> + <on_reboot>restart</on_reboot> + <on_crash>destroy</on_crash> + <devices> + <emulator>/usr/bin/qemu-system-x86_64</emulator> + <disk type='block' device='disk'> + <source dev='/dev/HostVG/QEMUGuest1'/> + <target dev='sda' bus='sata'/> + <address type='drive' controller='0' bus='0' target='0' unit='0'/> + </disk> + <controller type='pci' index='0' model='pcie-root'/> + <hostdev mode='subsystem' type='pci' managed='yes'> + <driver iommufd='yes'/> + <source> + <address domain='0x0000' bus='0x06' slot='0x12' function='0x5'/> + </source> + <address type='pci' domain='0x0000' bus='0x00' slot='0x03' function='0x0'/> + </hostdev> + <controller type='sata' index='0'/> + <input type='mouse' bus='ps2'/> + <input type='keyboard' bus='ps2'/> + <video> + <model type='qxl' ram='65536' vram='32768' vgamem='8192' heads='1'/> + </video> + <memballoon model='none'/> + </devices> +</domain> diff --git a/tests/qemuxmlconfdata/iommufd-virt-pci-bus-single.aarch64-latest.args b/tests/qemuxmlconfdata/iommufd-virt-pci-bus-single.aarch64-latest.args new file mode 100644 index 0000000000..9a04e97351 --- /dev/null +++ b/tests/qemuxmlconfdata/iommufd-virt-pci-bus-single.aarch64-latest.args @@ -0,0 +1,32 @@ +LC_ALL=C \ +PATH=/bin \ +HOME=/var/lib/libvirt/qemu/domain--1-foo \ +USER=test \ +LOGNAME=test \ +XDG_DATA_HOME=/var/lib/libvirt/qemu/domain--1-foo/.local/share \ +XDG_CACHE_HOME=/var/lib/libvirt/qemu/domain--1-foo/.cache \ +XDG_CONFIG_HOME=/var/lib/libvirt/qemu/domain--1-foo/.config \ +/usr/bin/qemu-system-aarch64 \ +-name guest=foo,debug-threads=on \ +-S \ +-object '{"qom-type":"secret","id":"masterKey0","format":"raw","file":"/var/lib/libvirt/qemu/domain--1-foo/master-key.aes"}' \ +-machine virt,usb=off,gic-version=2,dump-guest-core=off,memory-backend=mach-virt.ram,acpi=off \ +-accel tcg \ +-m size=1048576k \ +-object '{"qom-type":"memory-backend-ram","id":"mach-virt.ram","size":1073741824}' \ +-overcommit mem-lock=off \ +-smp 1,sockets=1,cores=1,threads=1 \ +-uuid 6ba7b810-9dad-11d1-80b4-00c04fd430c8 \ +-display none \ +-no-user-config \ +-nodefaults \ +-chardev socket,id=charmonitor,fd=1729,server=on,wait=off \ +-mon chardev=charmonitor,id=monitor,mode=control \ +-rtc base=utc \ +-no-shutdown \ +-boot strict=on \ +-audiodev '{"id":"audio1","driver":"none"}' \ +-object '{"qom-type":"iommufd","id":"iommufd0","fd":"-1"}' \ +-device '{"driver":"vfio-pci","host":"0000:06:12.5","id":"hostdev0","iommufd":"iommufd0","fd":"0","bus":"pcie.0","addr":"0x1"}' \ +-sandbox on,obsolete=deny,elevateprivileges=deny,spawn=deny,resourcecontrol=deny \ +-msg timestamp=on diff --git a/tests/qemuxmlconfdata/iommufd-virt-pci-bus-single.aarch64-latest.xml b/tests/qemuxmlconfdata/iommufd-virt-pci-bus-single.aarch64-latest.xml new file mode 100644 index 0000000000..9541591c28 --- /dev/null +++ b/tests/qemuxmlconfdata/iommufd-virt-pci-bus-single.aarch64-latest.xml @@ -0,0 +1,31 @@ +<domain type='qemu'> + <name>foo</name> + <uuid>6ba7b810-9dad-11d1-80b4-00c04fd430c8</uuid> + <memory unit='KiB'>1048576</memory> + <currentMemory unit='KiB'>1048576</currentMemory> + <vcpu placement='static'>1</vcpu> + <os> + <type arch='aarch64' machine='virt'>hvm</type> + <boot dev='hd'/> + </os> + <features> + <gic version='2'/> + </features> + <clock offset='utc'/> + <on_poweroff>destroy</on_poweroff> + <on_reboot>restart</on_reboot> + <on_crash>destroy</on_crash> + <devices> + <emulator>/usr/bin/qemu-system-aarch64</emulator> + <controller type='pci' index='0' model='pcie-root'/> + <audio id='1' type='none'/> + <hostdev mode='subsystem' type='pci' managed='yes'> + <driver iommufd='yes'/> + <source> + <address domain='0x0000' bus='0x06' slot='0x12' function='0x5'/> + </source> + <address type='pci' domain='0x0000' bus='0x00' slot='0x01' function='0x0'/> + </hostdev> + <memballoon model='none'/> + </devices> +</domain> diff --git a/tests/qemuxmlconfdata/iommufd-virt-pci-bus-single.xml b/tests/qemuxmlconfdata/iommufd-virt-pci-bus-single.xml new file mode 100644 index 0000000000..c0b9d643b4 --- /dev/null +++ b/tests/qemuxmlconfdata/iommufd-virt-pci-bus-single.xml @@ -0,0 +1,22 @@ +<domain type='qemu'> + <name>foo</name> + <uuid>6ba7b810-9dad-11d1-80b4-00c04fd430c8</uuid> + <memory unit='KiB'>1048576</memory> + <currentMemory unit='KiB'>1048576</currentMemory> + <vcpu placement='static'>1</vcpu> + <os> + <type arch='aarch64' machine='virt'>hvm</type> + </os> + <devices> + <emulator>/usr/bin/qemu-system-aarch64</emulator> + <controller type='pci' index='0' model='pcie-root'/> + <hostdev mode='subsystem' type='pci' managed='yes'> + <driver iommufd='yes'/> + <source> + <address domain='0x0000' bus='0x06' slot='0x12' function='0x5'/> + </source> + <address type='pci' domain='0x0000' bus='0x00' slot='0x01' function='0x0'/> + </hostdev> + <memballoon model='none'/> + </devices> +</domain> diff --git a/tests/qemuxmlconfdata/iommufd-virt.aarch64-latest.args b/tests/qemuxmlconfdata/iommufd-virt.aarch64-latest.args new file mode 100644 index 0000000000..039ae4477c --- /dev/null +++ b/tests/qemuxmlconfdata/iommufd-virt.aarch64-latest.args @@ -0,0 +1,36 @@ +LC_ALL=C \ +PATH=/bin \ +HOME=/var/lib/libvirt/qemu/domain--1-foo \ +USER=test \ +LOGNAME=test \ +XDG_DATA_HOME=/var/lib/libvirt/qemu/domain--1-foo/.local/share \ +XDG_CACHE_HOME=/var/lib/libvirt/qemu/domain--1-foo/.cache \ +XDG_CONFIG_HOME=/var/lib/libvirt/qemu/domain--1-foo/.config \ +/usr/bin/qemu-system-aarch64 \ +-name guest=foo,debug-threads=on \ +-S \ +-object '{"qom-type":"secret","id":"masterKey0","format":"raw","file":"/var/lib/libvirt/qemu/domain--1-foo/master-key.aes"}' \ +-machine virt,usb=off,gic-version=2,dump-guest-core=off,memory-backend=mach-virt.ram,acpi=off \ +-accel tcg \ +-m size=1048576k \ +-object '{"qom-type":"memory-backend-ram","id":"mach-virt.ram","size":1073741824}' \ +-overcommit mem-lock=off \ +-smp 1,sockets=1,cores=1,threads=1 \ +-uuid 6ba7b810-9dad-11d1-80b4-00c04fd430c8 \ +-display none \ +-no-user-config \ +-nodefaults \ +-chardev socket,id=charmonitor,fd=1729,server=on,wait=off \ +-mon chardev=charmonitor,id=monitor,mode=control \ +-rtc base=utc \ +-no-shutdown \ +-boot strict=on \ +-device '{"driver":"pcie-root-port","port":8,"chassis":1,"id":"pci.1","bus":"pcie.0","multifunction":true,"addr":"0x1"}' \ +-device '{"driver":"pcie-root-port","port":9,"chassis":2,"id":"pci.2","bus":"pcie.0","addr":"0x1.0x1"}' \ +-device '{"driver":"pcie-root-port","port":10,"chassis":3,"id":"pci.3","bus":"pcie.0","addr":"0x1.0x2"}' \ +-audiodev '{"id":"audio1","driver":"none"}' \ +-object '{"qom-type":"iommufd","id":"iommufd0","fd":"-1"}' \ +-device '{"driver":"vfio-pci","host":"0000:06:12.5","id":"hostdev0","iommufd":"iommufd0","fd":"0","bus":"pci.1","addr":"0x0"}' \ +-device '{"driver":"vfio-pci","host":"0000:07:12.5","id":"hostdev1","iommufd":"iommufd0","fd":"0","bus":"pci.2","addr":"0x0"}' \ +-sandbox on,obsolete=deny,elevateprivileges=deny,spawn=deny,resourcecontrol=deny \ +-msg timestamp=on diff --git a/tests/qemuxmlconfdata/iommufd-virt.aarch64-latest.xml b/tests/qemuxmlconfdata/iommufd-virt.aarch64-latest.xml new file mode 100644 index 0000000000..83d14a489a --- /dev/null +++ b/tests/qemuxmlconfdata/iommufd-virt.aarch64-latest.xml @@ -0,0 +1,53 @@ +<domain type='qemu'> + <name>foo</name> + <uuid>6ba7b810-9dad-11d1-80b4-00c04fd430c8</uuid> + <memory unit='KiB'>1048576</memory> + <currentMemory unit='KiB'>1048576</currentMemory> + <vcpu placement='static'>1</vcpu> + <os> + <type arch='aarch64' machine='virt'>hvm</type> + <boot dev='hd'/> + </os> + <features> + <gic version='2'/> + </features> + <clock offset='utc'/> + <on_poweroff>destroy</on_poweroff> + <on_reboot>restart</on_reboot> + <on_crash>destroy</on_crash> + <devices> + <emulator>/usr/bin/qemu-system-aarch64</emulator> + <controller type='pci' index='0' model='pcie-root'/> + <controller type='pci' index='1' model='pcie-root-port'> + <model name='pcie-root-port'/> + <target chassis='1' port='0x8'/> + <address type='pci' domain='0x0000' bus='0x00' slot='0x01' function='0x0' multifunction='on'/> + </controller> + <controller type='pci' index='2' model='pcie-root-port'> + <model name='pcie-root-port'/> + <target chassis='2' port='0x9'/> + <address type='pci' domain='0x0000' bus='0x00' slot='0x01' function='0x1'/> + </controller> + <controller type='pci' index='3' model='pcie-root-port'> + <model name='pcie-root-port'/> + <target chassis='3' port='0xa'/> + <address type='pci' domain='0x0000' bus='0x00' slot='0x01' function='0x2'/> + </controller> + <audio id='1' type='none'/> + <hostdev mode='subsystem' type='pci' managed='yes'> + <driver iommufd='yes'/> + <source> + <address domain='0x0000' bus='0x06' slot='0x12' function='0x5'/> + </source> + <address type='pci' domain='0x0000' bus='0x01' slot='0x00' function='0x0'/> + </hostdev> + <hostdev mode='subsystem' type='pci' managed='yes'> + <driver iommufd='yes'/> + <source> + <address domain='0x0000' bus='0x07' slot='0x12' function='0x5'/> + </source> + <address type='pci' domain='0x0000' bus='0x02' slot='0x00' function='0x0'/> + </hostdev> + <memballoon model='none'/> + </devices> +</domain> diff --git a/tests/qemuxmlconfdata/iommufd-virt.xml b/tests/qemuxmlconfdata/iommufd-virt.xml new file mode 100644 index 0000000000..a75c1cc13b --- /dev/null +++ b/tests/qemuxmlconfdata/iommufd-virt.xml @@ -0,0 +1,29 @@ +<domain type='qemu'> + <name>foo</name> + <uuid>6ba7b810-9dad-11d1-80b4-00c04fd430c8</uuid> + <memory unit='KiB'>1048576</memory> + <currentMemory unit='KiB'>1048576</currentMemory> + <vcpu placement='static'>1</vcpu> + <os> + <type arch='aarch64' machine='virt'>hvm</type> + </os> + <devices> + <emulator>/usr/bin/qemu-system-aarch64</emulator> + <controller type='pci' index='0' model='pcie-root'/> + <hostdev mode='subsystem' type='pci' managed='yes'> + <driver iommufd='yes'/> + <source> + <address domain='0x0000' bus='0x06' slot='0x12' function='0x5'/> + </source> + <address type='pci' domain='0x0000' bus='0x01' slot='0x00' function='0x0'/> + </hostdev> + <hostdev mode='subsystem' type='pci' managed='yes'> + <driver iommufd='yes'/> + <source> + <address domain='0x0000' bus='0x07' slot='0x12' function='0x5'/> + </source> + <address type='pci' domain='0x0000' bus='0x02' slot='0x00' function='0x0'/> + </hostdev> + <memballoon model='none'/> + </devices> +</domain> diff --git a/tests/qemuxmlconfdata/iommufd.x86_64-latest.args b/tests/qemuxmlconfdata/iommufd.x86_64-latest.args new file mode 100644 index 0000000000..3130ba2e3a --- /dev/null +++ b/tests/qemuxmlconfdata/iommufd.x86_64-latest.args @@ -0,0 +1,35 @@ +LC_ALL=C \ +PATH=/bin \ +HOME=/var/lib/libvirt/qemu/domain--1-foo \ +USER=test \ +LOGNAME=test \ +XDG_DATA_HOME=/var/lib/libvirt/qemu/domain--1-foo/.local/share \ +XDG_CACHE_HOME=/var/lib/libvirt/qemu/domain--1-foo/.cache \ +XDG_CONFIG_HOME=/var/lib/libvirt/qemu/domain--1-foo/.config \ +/usr/bin/qemu-system-x86_64 \ +-name guest=foo,debug-threads=on \ +-S \ +-object '{"qom-type":"secret","id":"masterKey0","format":"raw","file":"/var/lib/libvirt/qemu/domain--1-foo/master-key.aes"}' \ +-machine pc,usb=off,dump-guest-core=off,memory-backend=pc.ram,acpi=off \ +-accel tcg \ +-cpu qemu64 \ +-m size=2097152k \ +-object '{"qom-type":"memory-backend-ram","id":"pc.ram","size":2147483648}' \ +-overcommit mem-lock=off \ +-smp 2,sockets=2,cores=1,threads=1 \ +-uuid 3c7c30b5-7866-4b05-8a29-efebccba52a0 \ +-display none \ +-no-user-config \ +-nodefaults \ +-chardev socket,id=charmonitor,fd=1729,server=on,wait=off \ +-mon chardev=charmonitor,id=monitor,mode=control \ +-rtc base=utc \ +-no-shutdown \ +-boot strict=on \ +-device '{"driver":"piix3-usb-uhci","id":"usb","bus":"pci.0","addr":"0x1.0x2"}' \ +-audiodev '{"id":"audio1","driver":"none"}' \ +-object '{"qom-type":"iommufd","id":"iommufd0","fd":"-1"}' \ +-device '{"driver":"vfio-pci","host":"0000:06:12.5","id":"hostdev0","iommufd":"iommufd0","fd":"0","bus":"pci.0","addr":"0x3"}' \ +-device '{"driver":"virtio-balloon-pci","id":"balloon0","bus":"pci.0","addr":"0x2"}' \ +-sandbox on,obsolete=deny,elevateprivileges=deny,spawn=deny,resourcecontrol=deny \ +-msg timestamp=on diff --git a/tests/qemuxmlconfdata/iommufd.x86_64-latest.xml b/tests/qemuxmlconfdata/iommufd.x86_64-latest.xml new file mode 100644 index 0000000000..2e8951aaf6 --- /dev/null +++ b/tests/qemuxmlconfdata/iommufd.x86_64-latest.xml @@ -0,0 +1,38 @@ +<domain type='qemu'> + <name>foo</name> + <uuid>3c7c30b5-7866-4b05-8a29-efebccba52a0</uuid> + <memory unit='KiB'>2097152</memory> + <currentMemory unit='KiB'>2097152</currentMemory> + <vcpu placement='static' cpuset='0-1'>2</vcpu> + <os> + <type arch='x86_64' machine='pc'>hvm</type> + <boot dev='hd'/> + </os> + <cpu mode='custom' match='exact' check='none'> + <model fallback='forbid'>qemu64</model> + </cpu> + <clock offset='utc'/> + <on_poweroff>destroy</on_poweroff> + <on_reboot>restart</on_reboot> + <on_crash>destroy</on_crash> + <devices> + <emulator>/usr/bin/qemu-system-x86_64</emulator> + <controller type='pci' index='0' model='pci-root'/> + <controller type='usb' index='0' model='piix3-uhci'> + <address type='pci' domain='0x0000' bus='0x00' slot='0x01' function='0x2'/> + </controller> + <input type='mouse' bus='ps2'/> + <input type='keyboard' bus='ps2'/> + <audio id='1' type='none'/> + <hostdev mode='subsystem' type='pci' managed='yes'> + <driver iommufd='yes'/> + <source> + <address domain='0x0000' bus='0x06' slot='0x12' function='0x5'/> + </source> + <address type='pci' domain='0x0000' bus='0x00' slot='0x03' function='0x0'/> + </hostdev> + <memballoon model='virtio'> + <address type='pci' domain='0x0000' bus='0x00' slot='0x02' function='0x0'/> + </memballoon> + </devices> +</domain> diff --git a/tests/qemuxmlconfdata/iommufd.xml b/tests/qemuxmlconfdata/iommufd.xml new file mode 100644 index 0000000000..eb278414d2 --- /dev/null +++ b/tests/qemuxmlconfdata/iommufd.xml @@ -0,0 +1,30 @@ +<domain type='qemu'> + <name>foo</name> + <uuid>3c7c30b5-7866-4b05-8a29-efebccba52a0</uuid> + <memory unit='KiB'>2097152</memory> + <currentMemory unit='KiB'>2097152</currentMemory> + <vcpu placement='static' cpuset='0-1'>2</vcpu> + <os> + <type arch='x86_64' machine='pc'>hvm</type> + <boot dev='hd'/> + </os> + <clock offset='utc'/> + <on_poweroff>destroy</on_poweroff> + <on_reboot>restart</on_reboot> + <on_crash>destroy</on_crash> + <devices> + <emulator>/usr/bin/qemu-system-x86_64</emulator> + <controller type='pci' index='0' model='pci-root'/> + <hostdev mode='subsystem' type='pci' managed='yes'> + <driver iommufd='yes'/> + <source> + <address domain='0x0000' bus='0x06' slot='0x12' function='0x5'/> + </source> + <address type='pci' domain='0x0000' bus='0x00' slot='0x03' function='0x0'/> + </hostdev> + <controller type='usb' index='0'/> + <input type='mouse' bus='ps2'/> + <input type='keyboard' bus='ps2'/> + <memballoon model='virtio'/> + </devices> +</domain> diff --git a/tests/qemuxmlconftest.c b/tests/qemuxmlconftest.c index 89b8ad1a35..1e8752a645 100644 --- a/tests/qemuxmlconftest.c +++ b/tests/qemuxmlconftest.c @@ -351,6 +351,33 @@ fakeNetworkPortGetXMLDesc(virNetworkPortPtr port, } +static void +testSetupHostdevPrivateData(virDomainDef *def) +{ + size_t i; + + for (i = 0; i < def->nhostdevs; i++) { + virDomainHostdevDef *hostdev = def->hostdevs[i]; + + if (hostdev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS && + hostdev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI && + hostdev->source.subsys.u.pci.driver.name == VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO && + hostdev->source.subsys.u.pci.driver.iommufd == VIR_TRISTATE_BOOL_YES) { + + qemuDomainHostdevPrivate *priv; + + if (!hostdev->privateData) { + hostdev->privateData = qemuDomainHostdevPrivateNew(); + } + + priv = QEMU_DOMAIN_HOSTDEV_PRIVATE(hostdev); + /* Use a placeholder FD value for tests */ + priv->vfioDeviceFd = 0; + } + } +} + + static virNetworkDriver fakeNetworkDriver = { .networkLookupByName = fakeNetworkLookupByName, .networkGetXMLDesc = fakeNetworkGetXMLDesc, @@ -404,6 +431,8 @@ testCompareXMLToArgvCreateArgs(virQEMUDriver *drv, if (testQemuPrepareHostBackendChardevOne(NULL, priv->monConfig, vm) < 0) return NULL; + testSetupHostdevPrivateData(vm->def); + for (i = 0; i < vm->def->ndisks; i++) { virDomainDiskDef *disk = vm->def->disks[i]; virStorageSource *src; @@ -3069,6 +3098,11 @@ mymain(void) DO_TEST_CAPS_LATEST_PARSE_ERROR("virtio-iommu-dma-translation"); DO_TEST_CAPS_LATEST("acpi-generic-initiator"); + DO_TEST_CAPS_LATEST("iommufd"); + DO_TEST_CAPS_LATEST("iommufd-q35"); + DO_TEST_CAPS_ARCH_LATEST("iommufd-virt", "aarch64"); + DO_TEST_CAPS_ARCH_LATEST("iommufd-virt-pci-bus-single", "aarch64"); + DO_TEST_CAPS_LATEST("cpu-hotplug-startup"); DO_TEST_CAPS_ARCH_LATEST_PARSE_ERROR("cpu-hotplug-granularity", "ppc64"); -- 2.43.0
participants (2)
-
Nathan Chen -
Pavel Hrdina