Before a PCI device can be assigned to a guest with VFIO, that device
must be bound to the vfio-pci driver rather than to the device's
normal host driver. The vfio-pci driver provides APIs that permit QEMU
to perform all the necessary operations to make the device accessible
to the guest.
In the past vfio-pci was the only driver that supplied these APIs, but
there are now vendor/device-specific "VFIO variant" drivers that
provide the basic vfio-pci driver functionality/API while adding
support for device-specific operations (for example these
device-specific drivers may support live migration of certain
devices). All that is needed to make this functionality available is
to bind the vendor-specific "VFIO variant" driver to the device
(rather than the generic vfio-pci driver, which will continue to work,
just without the extra functionality).
But until now libvirt has required that all PCI devices being assigned
to a guest with VFIO specifically have the "vfio-pci" driver bound to
the device. So even if the user manually binds a shiny new
vendor-specific VFIO variant driver to the device (and puts
"managed='no'" in the config to prevent libvirt from changing the
binding), libvirt will just fail during startup of the guest (or
during hotplug) because the driver bound to the device isn't exactly
"vfio-pci".
Beginning with kernel 6.1, it's possible to determine from the sysfs
directory for a device whether the currently-bound driver is the
vfio-pci driver or a VFIO variant - the device directory will have a
subdirectory called "vfio-dev". We can use that to appropriately widen
the list of drivers that libvirt will allow for VFIO device
assignment.
This patch doesn't remove the explicit check for the exact "vfio-pci"
driver (since that would cause systems with pre-6.1 kernels to behave
incorrectly), but adds an additional check for the vfio-dev directory,
so that any VFIO variant driver is acceptable for libvirt to continue
setting up for VFIO device assignment.
Signed-off-by: Laine Stump <laine(a)redhat.com>
---
src/hypervisor/virhostdev.c | 28 +++++--------
src/libvirt_private.syms | 1 +
src/util/virpci.c | 78 ++++++++++++++++++++++++++++++++++---
src/util/virpci.h | 3 ++
4 files changed, 87 insertions(+), 23 deletions(-)
diff --git a/src/hypervisor/virhostdev.c b/src/hypervisor/virhostdev.c
index 244f057c6c..b95d6bf3d6 100644
--- a/src/hypervisor/virhostdev.c
+++ b/src/hypervisor/virhostdev.c
@@ -743,9 +743,8 @@ virHostdevPreparePCIDevicesImpl(virHostdevManager *mgr,
mgr->inactivePCIHostdevs) < 0)
goto reattachdevs;
} else {
- g_autofree char *driverPath = NULL;
- g_autofree char *driverName = NULL;
- int stub;
+ g_autofree char *drvName = NULL;
+ virPCIStubDriver drvType;
/* Unmanaged devices should already have been marked as
* inactive: if that's the case, we can simply move on */
@@ -765,19 +764,17 @@ virHostdevPreparePCIDevicesImpl(virHostdevManager *mgr,
* information about active / inactive device across
* daemon restarts has been implemented */
- if (virPCIDeviceGetCurrentDriverPathAndName(pci, &driverPath,
- &driverName) < 0) {
+ if (virPCIDeviceGetCurrentDriverNameAndType(pci, &drvName,
+ &drvType) < 0) {
goto reattachdevs;
}
- stub = virPCIStubDriverTypeFromString(driverName);
-
- if (stub > VIR_PCI_STUB_DRIVER_NONE &&
- stub < VIR_PCI_STUB_DRIVER_LAST) {
+ if (drvType > VIR_PCI_STUB_DRIVER_NONE) {
/* The device is bound to a known stub driver: store this
* information and add a copy to the inactive list */
- virPCIDeviceSetStubDriverType(pci, stub);
+ virPCIDeviceSetStubDriverType(pci, drvType);
+ virPCIDeviceSetStubDriverName(pci, drvName);
VIR_DEBUG("Adding PCI device %s to inactive list",
virPCIDeviceGetName(pci));
@@ -2291,18 +2288,13 @@ virHostdevPrepareOneNVMeDevice(virHostdevManager *hostdev_mgr,
/* Let's check if all PCI devices are NVMe disks. */
for (i = 0; i < virPCIDeviceListCount(pciDevices); i++) {
virPCIDevice *pci = virPCIDeviceListGet(pciDevices, i);
- g_autofree char *drvPath = NULL;
g_autofree char *drvName = NULL;
- int stub = VIR_PCI_STUB_DRIVER_NONE;
+ virPCIStubDriver drvType;
- if (virPCIDeviceGetCurrentDriverPathAndName(pci, &drvPath, &drvName) <
0)
+ if (virPCIDeviceGetCurrentDriverNameAndType(pci, &drvName, &drvType) <
0)
goto cleanup;
- if (drvName)
- stub = virPCIStubDriverTypeFromString(drvName);
-
- if (stub == VIR_PCI_STUB_DRIVER_VFIO ||
- STREQ_NULLABLE(drvName, "nvme"))
+ if (drvType == VIR_PCI_STUB_DRIVER_VFIO || STREQ_NULLABLE(drvName,
"nvme"))
continue;
VIR_WARN("Suspicious NVMe disk assignment. PCI device "
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
index 2b577c4e2d..413985d34c 100644
--- a/src/libvirt_private.syms
+++ b/src/libvirt_private.syms
@@ -3064,6 +3064,7 @@ virPCIDeviceFileIterate;
virPCIDeviceFree;
virPCIDeviceGetAddress;
virPCIDeviceGetConfigPath;
+virPCIDeviceGetCurrentDriverNameAndType;
virPCIDeviceGetCurrentDriverPathAndName;
virPCIDeviceGetIOMMUGroupDev;
virPCIDeviceGetIOMMUGroupList;
diff --git a/src/util/virpci.c b/src/util/virpci.c
index 2ec0dc2053..e165725cd9 100644
--- a/src/util/virpci.c
+++ b/src/util/virpci.c
@@ -280,6 +280,73 @@ virPCIDeviceGetCurrentDriverPathAndName(virPCIDevice *dev,
}
+/**
+ * virPCIDeviceGetCurrentDriverNameAndType:
+ * @dev: virPCIDevice object to examine
+ * @drvName: returns name of driver bound to this device (if any)
+ * @drvType: returns type of driver if it is a known stub driver type
+ *
+ * Find the name of the driver bound to @dev (if any) and the type of
+ * the driver if it is a known/recognized "stub" driver (based on the
+ * driver name).
+ *
+ * There are vfio "variant" drivers that provide all the basic
+ * functionality of the standard vfio-pci driver as well as additional
+ * stuff. As of kernel 6.1, the vfio-pci driver and all vfio variant
+ * drivers can be identified (once the driver has been bound to a
+ * device) by looking for the subdirectory "vfio-dev" in the device's
+ * sysfs directory; for example, if the directory
+ * /sys/bus/pci/devices/0000:04:11.4/vfio-dev exists, then the driver
+ * that is currently bound to PCI device 0000:04:11.4 is either
+ * vfio-pci, or a vfio-pci variant driver.
+ *
+ * Return 0 on success, -1 on failure. If -1 is returned, then an error
+ * message has been logged.
+ */
+int
+virPCIDeviceGetCurrentDriverNameAndType(virPCIDevice *dev,
+ char **drvName,
+ virPCIStubDriver *drvType)
+{
+ g_autofree char *drvPath = NULL;
+ g_autofree char *vfioDevDir = NULL;
+ int tmpType;
+
+ if (virPCIDeviceGetCurrentDriverPathAndName(dev, &drvPath, drvName) < 0)
+ return -1;
+
+ if (!*drvName) {
+ *drvType = VIR_PCI_STUB_DRIVER_NONE;
+ return 0;
+ }
+
+ tmpType = virPCIStubDriverTypeFromString(*drvName);
+
+ if (tmpType > VIR_PCI_STUB_DRIVER_NONE) {
+ *drvType = tmpType;
+ return 0; /* exact match of a known driver name (or no name) */
+ }
+
+ /* If the sysfs directory of this device contains a directory
+ * named "vfio-dev" then the currently-bound driver is a vfio
+ * variant driver.
+ */
+
+ vfioDevDir = virPCIFile(dev->name, "vfio-dev");
+
+ if (virFileIsDir(vfioDevDir)) {
+ VIR_DEBUG("Driver %s is a vfio_pci driver", *drvName);
+ *drvType = VIR_PCI_STUB_DRIVER_VFIO;
+ } else {
+ VIR_DEBUG("Driver %s is NOT a vfio_pci driver, or kernel is too old",
+ *drvName);
+ *drvType = VIR_PCI_STUB_DRIVER_NONE;
+ }
+
+ return 0;
+}
+
+
static int
virPCIDeviceConfigOpenInternal(virPCIDevice *dev, bool readonly, bool fatal)
{
@@ -1007,8 +1074,8 @@ virPCIDeviceReset(virPCIDevice *dev,
virPCIDeviceList *activeDevs,
virPCIDeviceList *inactiveDevs)
{
- g_autofree char *drvPath = NULL;
g_autofree char *drvName = NULL;
+ virPCIStubDriver drvType;
int ret = -1;
int fd = -1;
int hdrType = -1;
@@ -1034,15 +1101,16 @@ virPCIDeviceReset(virPCIDevice *dev,
* reset it whenever appropriate, so doing it ourselves would just
* be redundant.
*/
- if (virPCIDeviceGetCurrentDriverPathAndName(dev, &drvPath, &drvName) < 0)
+ if (virPCIDeviceGetCurrentDriverNameAndType(dev, &drvName, &drvType) < 0)
goto cleanup;
- if (virPCIStubDriverTypeFromString(drvName) == VIR_PCI_STUB_DRIVER_VFIO) {
- VIR_DEBUG("Device %s is bound to vfio-pci - skip reset",
- dev->name);
+ if (drvType == VIR_PCI_STUB_DRIVER_VFIO) {
+
+ VIR_DEBUG("Device %s is bound to %s - skip reset", dev->name,
drvName);
ret = 0;
goto cleanup;
}
+
VIR_DEBUG("Resetting device %s", dev->name);
if ((fd = virPCIDeviceConfigOpenWrite(dev)) < 0)
diff --git a/src/util/virpci.h b/src/util/virpci.h
index 19c910202a..faca6cf6f9 100644
--- a/src/util/virpci.h
+++ b/src/util/virpci.h
@@ -283,6 +283,9 @@ int virPCIDeviceRebind(virPCIDevice *dev);
int virPCIDeviceGetCurrentDriverPathAndName(virPCIDevice *dev,
char **path,
char **name);
+int virPCIDeviceGetCurrentDriverNameAndType(virPCIDevice *dev,
+ char **drvName,
+ virPCIStubDriver *drvType);
int virPCIDeviceIsPCIExpress(virPCIDevice *dev);
int virPCIDeviceHasPCIExpressLink(virPCIDevice *dev);
--
2.41.0