[libvirt] [PATCHv3 0/3] Change preference of default PCI passthrough type to VFIO

Version 3 incorporates fixes for issues I found while testing this series a bit better than the previous one. The main objective of this series is to add checks if VFIO is enabled on a host and if it's the case use it as a default PCI passthrough type instead of the legacy KVM passthrough type. Peter Krempa (3): qemu: hostdev: Refactor PCI passhrough handling qemu: hostdev: Add checks if PCI passthrough is availabe in the host qemu: Prefer VFIO for PCI device passthrough docs/formatdomain.html.in | 9 ++- src/conf/domain_conf.h | 2 +- src/qemu/qemu_command.c | 28 ++++++--- src/qemu/qemu_hostdev.c | 143 ++++++++++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_hostdev.h | 5 ++ src/qemu/qemu_hotplug.c | 29 ++++++---- src/qemu/qemu_process.c | 18 ++++-- tests/qemuxml2argvtest.c | 11 ++++ 8 files changed, 214 insertions(+), 31 deletions(-) -- 1.8.3.2

To simplify future patches dealing with this code, simplify and refactor some conditions to switch statements. --- src/qemu/qemu_command.c | 27 ++++++++++++++++++--------- src/qemu/qemu_hotplug.c | 27 ++++++++++++++++----------- 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index e976466..ecf26cc 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -5485,14 +5485,25 @@ qemuBuildPCIHostdevDevStr(virDomainDefPtr def, { virBuffer buf = VIR_BUFFER_INITIALIZER; - if (dev->source.subsys.u.pci.backend - == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) { - virBufferAddLit(&buf, "vfio-pci"); - } else { + switch ((virDomainHostdevSubsysPciBackendType) + dev->source.subsys.u.pci.backend) { + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_DEFAULT: + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_KVM: virBufferAddLit(&buf, "pci-assign"); if (configfd && *configfd) virBufferAsprintf(&buf, ",configfd=%s", configfd); + break; + + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO: + virBufferAddLit(&buf, "vfio-pci"); + break; + + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_TYPE_LAST: + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("PCI passhthrough type needs to be specified")); + break; } + virBufferAsprintf(&buf, ",host=%.2x:%.2x.%.1x", dev->source.subsys.u.pci.addr.bus, dev->source.subsys.u.pci.addr.slot, @@ -9232,7 +9243,6 @@ qemuBuildCommandLine(virConnectPtr conn, VIR_FREE(devstr); } - /* Add host passthrough hardware */ for (i = 0; i < def->nhostdevs; i++) { virDomainHostdevDefPtr hostdev = def->hostdevs[i]; @@ -9305,9 +9315,9 @@ qemuBuildCommandLine(virConnectPtr conn, /* PCI */ if (hostdev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS && hostdev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI) { + int backend = hostdev->source.subsys.u.pci.backend; - if (hostdev->source.subsys.u.pci.backend - == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) { + if (backend == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) { if (!virQEMUCapsGet(qemuCaps, QEMU_CAPS_DEVICE_VFIO_PCI)) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", _("VFIO PCI device assignment is not " @@ -9321,8 +9331,7 @@ qemuBuildCommandLine(virConnectPtr conn, if (virQEMUCapsGet(qemuCaps, QEMU_CAPS_DEVICE)) { char *configfd_name = NULL; - if ((hostdev->source.subsys.u.pci.backend - != VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) && + if ((backend != VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) && virQEMUCapsGet(qemuCaps, QEMU_CAPS_PCI_CONFIGFD)) { int configfd = qemuOpenPCIConfig(hostdev); diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c index 818c726..ae2cbc0 100644 --- a/src/qemu/qemu_hotplug.c +++ b/src/qemu/qemu_hotplug.c @@ -1134,6 +1134,7 @@ int qemuDomainAttachHostPciDevice(virQEMUDriverPtr driver, int configfd = -1; char *configfd_name = NULL; bool releaseaddr = false; + int *backend = &hostdev->source.subsys.u.pci.backend; if (VIR_REALLOC_N(vm->def->hostdevs, vm->def->nhostdevs + 1) < 0) return -1; @@ -1142,10 +1143,8 @@ int qemuDomainAttachHostPciDevice(virQEMUDriverPtr driver, &hostdev, 1) < 0) return -1; - if (hostdev->source.subsys.u.pci.backend - == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) { - unsigned long long memKB; - + switch ((virDomainHostdevSubsysPciBackendType) *backend) { + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO: if (!virQEMUCapsGet(priv->qemuCaps, QEMU_CAPS_DEVICE_VFIO_PCI)) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", _("VFIO PCI device assignment is not " @@ -1157,11 +1156,18 @@ int qemuDomainAttachHostPciDevice(virQEMUDriverPtr driver, * In this case, the guest's memory may already be locked, but it * doesn't hurt to "change" the limit to the same value. */ - vm->def->hostdevs[vm->def->nhostdevs++] = hostdev; - memKB = vm->def->mem.hard_limit ? - vm->def->mem.hard_limit : vm->def->mem.max_balloon + 1024 * 1024; - virProcessSetMaxMemLock(vm->pid, memKB); - vm->def->hostdevs[vm->def->nhostdevs--] = NULL; + if (vm->def->mem.hard_limit) + virProcessSetMaxMemLock(vm->pid, vm->def->mem.hard_limit); + else + virProcessSetMaxMemLock(vm->pid, + vm->def->mem.max_balloon + (1024 * 1024)); + + break; + + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_DEFAULT: + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_KVM: + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_TYPE_LAST: + break; } if (virQEMUCapsGet(priv->qemuCaps, QEMU_CAPS_DEVICE)) { @@ -1170,8 +1176,7 @@ int qemuDomainAttachHostPciDevice(virQEMUDriverPtr driver, if (qemuDomainPCIAddressEnsureAddr(priv->pciaddrs, hostdev->info) < 0) goto error; releaseaddr = true; - if ((hostdev->source.subsys.u.pci.backend - != VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) && + if ((*backend != VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) && virQEMUCapsGet(priv->qemuCaps, QEMU_CAPS_PCI_CONFIGFD)) { configfd = qemuOpenPCIConfig(hostdev); if (configfd >= 0) { -- 1.8.3.2

On 10/04/2013 08:55 AM, Peter Krempa wrote:
To simplify future patches dealing with this code, simplify and refactor some conditions to switch statements. --- src/qemu/qemu_command.c | 27 ++++++++++++++++++--------- src/qemu/qemu_hotplug.c | 27 ++++++++++++++++----------- 2 files changed, 34 insertions(+), 20 deletions(-)
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index e976466..ecf26cc 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -5485,14 +5485,25 @@ qemuBuildPCIHostdevDevStr(virDomainDefPtr def, { virBuffer buf = VIR_BUFFER_INITIALIZER;
- if (dev->source.subsys.u.pci.backend - == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) { - virBufferAddLit(&buf, "vfio-pci"); - } else { + switch ((virDomainHostdevSubsysPciBackendType) + dev->source.subsys.u.pci.backend) {
I'm assuming you're doing the typecast so that you're forced to have a case for every enum value. Do we have an official policy on that? I see similar typecasts for switch statements in a few places, but I don't know that I'm so fond of it...
+ case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_DEFAULT: + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_KVM: virBufferAddLit(&buf, "pci-assign"); if (configfd && *configfd) virBufferAsprintf(&buf, ",configfd=%s", configfd); + break; + + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO: + virBufferAddLit(&buf, "vfio-pci"); + break; + + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_TYPE_LAST: + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("PCI passhthrough type needs to be specified"));
And this is one reason why - this value will *never* happen. If nothing is specified, backend will be set to "...DEFAULT", not "...TYPE_LAST". As a matter of fact, by the time you get as low as this function, backend will always be default, kvm, or vfio - the higher level functions see to that. There is a point where double checking values down through all the levels of a call chain becomes pointless, and I think we've reached it here. (Personally, I think if we're going to do enforce explicit listing of all cases in switch statements for an attribute that has an enum associated with it, it would be better to define the field with the actual enum type rather than int - these are internal data structures that are never passed from one machine to another, so it's not like there would ever be any compatibility problem.)
+ break; } + virBufferAsprintf(&buf, ",host=%.2x:%.2x.%.1x", dev->source.subsys.u.pci.addr.bus, dev->source.subsys.u.pci.addr.slot, @@ -9232,7 +9243,6 @@ qemuBuildCommandLine(virConnectPtr conn, VIR_FREE(devstr); }
- /* Add host passthrough hardware */ for (i = 0; i < def->nhostdevs; i++) { virDomainHostdevDefPtr hostdev = def->hostdevs[i]; @@ -9305,9 +9315,9 @@ qemuBuildCommandLine(virConnectPtr conn, /* PCI */ if (hostdev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS && hostdev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI) { + int backend = hostdev->source.subsys.u.pci.backend;
- if (hostdev->source.subsys.u.pci.backend - == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) { + if (backend == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) { if (!virQEMUCapsGet(qemuCaps, QEMU_CAPS_DEVICE_VFIO_PCI)) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", _("VFIO PCI device assignment is not " @@ -9321,8 +9331,7 @@ qemuBuildCommandLine(virConnectPtr conn,
if (virQEMUCapsGet(qemuCaps, QEMU_CAPS_DEVICE)) { char *configfd_name = NULL; - if ((hostdev->source.subsys.u.pci.backend - != VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) && + if ((backend != VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) && virQEMUCapsGet(qemuCaps, QEMU_CAPS_PCI_CONFIGFD)) { int configfd = qemuOpenPCIConfig(hostdev);
diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c index 818c726..ae2cbc0 100644 --- a/src/qemu/qemu_hotplug.c +++ b/src/qemu/qemu_hotplug.c @@ -1134,6 +1134,7 @@ int qemuDomainAttachHostPciDevice(virQEMUDriverPtr driver, int configfd = -1; char *configfd_name = NULL; bool releaseaddr = false; + int *backend = &hostdev->source.subsys.u.pci.backend;
if (VIR_REALLOC_N(vm->def->hostdevs, vm->def->nhostdevs + 1) < 0) return -1; @@ -1142,10 +1143,8 @@ int qemuDomainAttachHostPciDevice(virQEMUDriverPtr driver, &hostdev, 1) < 0) return -1;
- if (hostdev->source.subsys.u.pci.backend - == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) { - unsigned long long memKB; - + switch ((virDomainHostdevSubsysPciBackendType) *backend) { + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO: if (!virQEMUCapsGet(priv->qemuCaps, QEMU_CAPS_DEVICE_VFIO_PCI)) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", _("VFIO PCI device assignment is not " @@ -1157,11 +1156,18 @@ int qemuDomainAttachHostPciDevice(virQEMUDriverPtr driver, * In this case, the guest's memory may already be locked, but it * doesn't hurt to "change" the limit to the same value. */ - vm->def->hostdevs[vm->def->nhostdevs++] = hostdev; - memKB = vm->def->mem.hard_limit ? - vm->def->mem.hard_limit : vm->def->mem.max_balloon + 1024 * 1024; - virProcessSetMaxMemLock(vm->pid, memKB); - vm->def->hostdevs[vm->def->nhostdevs--] = NULL; + if (vm->def->mem.hard_limit) + virProcessSetMaxMemLock(vm->pid, vm->def->mem.hard_limit); + else + virProcessSetMaxMemLock(vm->pid, + vm->def->mem.max_balloon + (1024 * 1024)); + + break; + + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_DEFAULT: + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_KVM: + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_TYPE_LAST:
Again - I dislike having these sentinel values showing up in switch statements (searching the source I see it a lot), as it could lead someone to believe that those are actually valid values. I may be outvoted on this though :-)
+ break; }
if (virQEMUCapsGet(priv->qemuCaps, QEMU_CAPS_DEVICE)) { @@ -1170,8 +1176,7 @@ int qemuDomainAttachHostPciDevice(virQEMUDriverPtr driver, if (qemuDomainPCIAddressEnsureAddr(priv->pciaddrs, hostdev->info) < 0) goto error; releaseaddr = true; - if ((hostdev->source.subsys.u.pci.backend - != VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) && + if ((*backend != VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) && virQEMUCapsGet(priv->qemuCaps, QEMU_CAPS_PCI_CONFIGFD)) { configfd = qemuOpenPCIConfig(hostdev); if (configfd >= 0) {
Other than my dislike of typecasts and unreachable switch cases (which may also be fine, but I want to see someone else say so too), this looks fine, so ACK.

On 10/07/13 15:57, Laine Stump wrote:
On 10/04/2013 08:55 AM, Peter Krempa wrote:
To simplify future patches dealing with this code, simplify and refactor some conditions to switch statements. --- src/qemu/qemu_command.c | 27 ++++++++++++++++++--------- src/qemu/qemu_hotplug.c | 27 ++++++++++++++++----------- 2 files changed, 34 insertions(+), 20 deletions(-)
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index e976466..ecf26cc 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -5485,14 +5485,25 @@ qemuBuildPCIHostdevDevStr(virDomainDefPtr def, { virBuffer buf = VIR_BUFFER_INITIALIZER;
- if (dev->source.subsys.u.pci.backend - == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) { - virBufferAddLit(&buf, "vfio-pci"); - } else { + switch ((virDomainHostdevSubsysPciBackendType) + dev->source.subsys.u.pci.backend) {
I'm assuming you're doing the typecast so that you're forced to have a case for every enum value. Do we have an official policy on that? I see similar typecasts for switch statements in a few places, but I don't know that I'm so fond of it...
I started adding this on multiple places earlier. I don't know if there's a official strategy, but I tend to prefer it as I was bitten a few times by adding a new enum value and missed a few switches where I actually should handle those. That's why I'm adding the typecasts where possible.
+ case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_DEFAULT: + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_KVM: virBufferAddLit(&buf, "pci-assign"); if (configfd && *configfd) virBufferAsprintf(&buf, ",configfd=%s", configfd); + break; + + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO: + virBufferAddLit(&buf, "vfio-pci"); + break; + + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_TYPE_LAST: + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("PCI passhthrough type needs to be specified"));
And this is one reason why - this value will *never* happen. If nothing is specified, backend will be set to "...DEFAULT", not "...TYPE_LAST". As a matter of fact, by the time you get as low as this function, backend will always be default, kvm, or vfio - the higher level functions see to that. There is a point where double checking values down through all the levels of a call chain becomes pointless, and I think we've reached it here.
Yes, the _TYPE_LAST will never happen here. I usually leave out any action on the _TYPE_LAST variants, but in this particular case, patch 3/3 changes the default and code in this function shouldn't ever be reached with _DEFAULT any more. danpb suggested adding this error.
(Personally, I think if we're going to do enforce explicit listing of all cases in switch statements for an attribute that has an enum associated with it, it would be better to define the field with the actual enum type rather than int - these are internal data structures that are never passed from one machine to another, so it's not like there would ever be any compatibility problem.)
That would be a ideal global solution. Unfortunately as the SOMETypeFromString macroed functions return -1 on failure almost all variables holding the value got by this functions are declared as int rather than the appropriate enum value. To avoid the need for a separate intermediate variable we tend to declare the holding vars as int.
+ break; } + virBufferAsprintf(&buf, ",host=%.2x:%.2x.%.1x", dev->source.subsys.u.pci.addr.bus, dev->source.subsys.u.pci.addr.slot, @@ -9232,7 +9243,6 @@ qemuBuildCommandLine(virConnectPtr conn, VIR_FREE(devstr); }
- /* Add host passthrough hardware */ for (i = 0; i < def->nhostdevs; i++) { virDomainHostdevDefPtr hostdev = def->hostdevs[i]; @@ -9305,9 +9315,9 @@ qemuBuildCommandLine(virConnectPtr conn, /* PCI */ if (hostdev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS && hostdev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI) { + int backend = hostdev->source.subsys.u.pci.backend;
- if (hostdev->source.subsys.u.pci.backend - == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) { + if (backend == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) { if (!virQEMUCapsGet(qemuCaps, QEMU_CAPS_DEVICE_VFIO_PCI)) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", _("VFIO PCI device assignment is not " @@ -9321,8 +9331,7 @@ qemuBuildCommandLine(virConnectPtr conn,
if (virQEMUCapsGet(qemuCaps, QEMU_CAPS_DEVICE)) { char *configfd_name = NULL; - if ((hostdev->source.subsys.u.pci.backend - != VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) && + if ((backend != VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) && virQEMUCapsGet(qemuCaps, QEMU_CAPS_PCI_CONFIGFD)) { int configfd = qemuOpenPCIConfig(hostdev);
diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c index 818c726..ae2cbc0 100644 --- a/src/qemu/qemu_hotplug.c +++ b/src/qemu/qemu_hotplug.c @@ -1134,6 +1134,7 @@ int qemuDomainAttachHostPciDevice(virQEMUDriverPtr driver, int configfd = -1; char *configfd_name = NULL; bool releaseaddr = false; + int *backend = &hostdev->source.subsys.u.pci.backend;
if (VIR_REALLOC_N(vm->def->hostdevs, vm->def->nhostdevs + 1) < 0) return -1; @@ -1142,10 +1143,8 @@ int qemuDomainAttachHostPciDevice(virQEMUDriverPtr driver, &hostdev, 1) < 0) return -1;
- if (hostdev->source.subsys.u.pci.backend - == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) { - unsigned long long memKB; - + switch ((virDomainHostdevSubsysPciBackendType) *backend) { + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO: if (!virQEMUCapsGet(priv->qemuCaps, QEMU_CAPS_DEVICE_VFIO_PCI)) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", _("VFIO PCI device assignment is not " @@ -1157,11 +1156,18 @@ int qemuDomainAttachHostPciDevice(virQEMUDriverPtr driver, * In this case, the guest's memory may already be locked, but it * doesn't hurt to "change" the limit to the same value. */ - vm->def->hostdevs[vm->def->nhostdevs++] = hostdev; - memKB = vm->def->mem.hard_limit ? - vm->def->mem.hard_limit : vm->def->mem.max_balloon + 1024 * 1024; - virProcessSetMaxMemLock(vm->pid, memKB); - vm->def->hostdevs[vm->def->nhostdevs--] = NULL; + if (vm->def->mem.hard_limit) + virProcessSetMaxMemLock(vm->pid, vm->def->mem.hard_limit); + else + virProcessSetMaxMemLock(vm->pid, + vm->def->mem.max_balloon + (1024 * 1024)); + + break; + + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_DEFAULT: + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_KVM: + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_TYPE_LAST:
Again - I dislike having these sentinel values showing up in switch statements (searching the source I see it a lot), as it could lead someone to believe that those are actually valid values. I may be outvoted on this though :-)
I think that the benefits of the compiler warning in case you add a new variable on places you might have forgotten outweigh a few occurences of unused type in the code. ... Peter

On 10/07/2013 08:16 AM, Peter Krempa wrote:
+ switch ((virDomainHostdevSubsysPciBackendType) + dev->source.subsys.u.pci.backend) {
I'm assuming you're doing the typecast so that you're forced to have a case for every enum value. Do we have an official policy on that? I see similar typecasts for switch statements in a few places, but I don't know that I'm so fond of it...
I started adding this on multiple places earlier. I don't know if there's a official strategy, but I tend to prefer it as I was bitten a few times by adding a new enum value and missed a few switches where I actually should handle those. That's why I'm adding the typecasts where possible.
I actually like it as well - any time you can make the compiler catch your mistakes at compile time, rather than waiting until an obscure error at runtime, is a win, even if the syntax is a bit awkward (casting to force the correct type checking, and listing dead case labels to appease the compiler).
(Personally, I think if we're going to do enforce explicit listing of all cases in switch statements for an attribute that has an enum associated with it, it would be better to define the field with the actual enum type rather than int - these are internal data structures that are never passed from one machine to another, so it's not like there would ever be any compatibility problem.)
That would be a ideal global solution. Unfortunately as the SOMETypeFromString macroed functions return -1 on failure almost all variables holding the value got by this functions are declared as int rather than the appropriate enum value. To avoid the need for a separate intermediate variable we tend to declare the holding vars as int.
The cast inside the switch is a reasonable compromise, since it is often easier to pass int around (and in public APIs, we _have_ to pass int around - not all enums are private-use only).
+ break; + + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_DEFAULT: + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_KVM: + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_TYPE_LAST:
Again - I dislike having these sentinel values showing up in switch statements (searching the source I see it a lot), as it could lead someone to believe that those are actually valid values. I may be outvoted on this though :-)
I think that the benefits of the compiler warning in case you add a new variable on places you might have forgotten outweigh a few occurences of unused type in the code.
Agreed. Maybe we should patch HACKING to capture the outcome of this discussion. -- Eric Blake eblake redhat com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On 10/07/13 16:59, Eric Blake wrote:
On 10/07/2013 08:16 AM, Peter Krempa wrote:
...
Again - I dislike having these sentinel values showing up in switch statements (searching the source I see it a lot), as it could lead someone to believe that those are actually valid values. I may be outvoted on this though :-)
I think that the benefits of the compiler warning in case you add a new variable on places you might have forgotten outweigh a few occurences of unused type in the code.
Agreed. Maybe we should patch HACKING to capture the outcome of this discussion.
I pushed this one with two trivial fixes according to ACK from Laine. Fixes done were: Removed double space in one of the conditions and changed the second addition of backend variable to non-pointer. Peter

Add code to check availability of PCI passhthrough using VFIO and the legacy KVM passthrough and use it when starting VMs and hotplugging devices to live machine. --- src/qemu/qemu_hostdev.c | 126 ++++++++++++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_hostdev.h | 4 ++ src/qemu/qemu_hotplug.c | 4 ++ src/qemu/qemu_process.c | 5 ++ 4 files changed, 139 insertions(+) diff --git a/src/qemu/qemu_hostdev.c b/src/qemu/qemu_hostdev.c index 21fe47f..dbbc2b4 100644 --- a/src/qemu/qemu_hostdev.c +++ b/src/qemu/qemu_hostdev.c @@ -23,6 +23,11 @@ #include <config.h> +#include <dirent.h> +#include <fcntl.h> +#include <sys/ioctl.h> +#include <errno.h> + #include "qemu_hostdev.h" #include "virlog.h" #include "virerror.h" @@ -31,6 +36,7 @@ #include "virusb.h" #include "virscsi.h" #include "virnetdev.h" +#include "virfile.h" #define VIR_FROM_THIS VIR_FROM_QEMU @@ -1287,3 +1293,123 @@ void qemuDomainReAttachHostDevices(virQEMUDriverPtr driver, qemuDomainReAttachHostScsiDevices(driver, def->name, def->hostdevs, def->nhostdevs); } + + +static bool +qemuHostdevHostSupportsPassthroughVFIO(void) +{ + DIR *iommuDir = NULL; + struct dirent *iommuGroup = NULL; + bool ret = false; + + /* condition 1 - /sys/kernel/iommu_groups/ contains entries */ + if (!(iommuDir = opendir("/sys/kernel/iommu_groups/"))) + goto cleanup; + + while ((iommuGroup = readdir(iommuDir))) { + /* skip ./ ../ */ + if (STRPREFIX(iommuGroup->d_name, ".")) + continue; + + /* assume we found a group */ + break; + } + + if (!iommuGroup) + goto cleanup; + /* okay, iommu is on and recognizes groups */ + + /* condition 2 - /dev/vfio/vfio exists */ + if (!virFileExists("/dev/vfio/vfio")) + goto cleanup; + + ret = true; + +cleanup: + if (iommuDir) + closedir(iommuDir); + + return ret; +} + + +#if HAVE_LINUX_KVM_H +# include <linux/kvm.h> +static bool +qemuHostdevHostSupportsPassthroughLegacy(void) +{ + int kvmfd = -1; + bool ret = false; + + if ((kvmfd = open("/dev/kvm", O_RDONLY)) < 0) + goto cleanup; + +# ifdef KVM_CAP_IOMMU + if ((ioctl(kvmfd, KVM_CHECK_EXTENSION, KVM_CAP_IOMMU)) <= 0) + goto cleanup; + + ret = true; +# endif + +cleanup: + VIR_FORCE_CLOSE(kvmfd); + + return ret; +} +#else +static bool +qemuHostdevHostSupportsPassthroughLegacy(void) +{ + return false; +} +#endif + +bool +qemuHostdevHostVerifySupport(virDomainHostdevDefPtr *hostdevs, + size_t nhostdevs) +{ + int supportsPassthroughKVM = -1; + int supportsPassthroughVFIO = -1; + size_t i; + + /* assign defaults for hostdev passthrough */ + for (i = 0; i < nhostdevs; i++) { + virDomainHostdevDefPtr hostdev = hostdevs[i]; + + if (hostdev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS && + hostdev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI) { + int *backend = &hostdev->source.subsys.u.pci.backend; + + /* cache host state of passthrough support */ + if (supportsPassthroughKVM == -1 || supportsPassthroughVFIO == -1) { + supportsPassthroughKVM = qemuHostdevHostSupportsPassthroughLegacy(); + supportsPassthroughVFIO = qemuHostdevHostSupportsPassthroughVFIO(); + } + + switch ((virDomainHostdevSubsysPciBackendType) *backend) { + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO: + if (!supportsPassthroughVFIO) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("host doesn't support VFIO PCI passthrough")); + return false; + } + break; + + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_DEFAULT: + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_KVM: + if (!supportsPassthroughKVM) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("host doesn't support legacy PCI passthrough")); + return false; + } + + break; + + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_TYPE_LAST: + break; + } + } + } + + return true; +} diff --git a/src/qemu/qemu_hostdev.h b/src/qemu/qemu_hostdev.h index 327d4d5..6d88830 100644 --- a/src/qemu/qemu_hostdev.h +++ b/src/qemu/qemu_hostdev.h @@ -69,4 +69,8 @@ int qemuDomainHostdevNetConfigReplace(virDomainHostdevDefPtr hostdev, int qemuDomainHostdevNetConfigRestore(virDomainHostdevDefPtr hostdev, char *stateDir); +bool qemuHostdevHostVerifySupport(virDomainHostdevDefPtr *hostdevs, + size_t nhostdevs); + + #endif /* __QEMU_HOSTDEV_H__ */ diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c index ae2cbc0..c72fdc3 100644 --- a/src/qemu/qemu_hotplug.c +++ b/src/qemu/qemu_hotplug.c @@ -1143,6 +1143,10 @@ int qemuDomainAttachHostPciDevice(virQEMUDriverPtr driver, &hostdev, 1) < 0) return -1; + /* verify the availability of passthrough support */ + if (!qemuHostdevHostVerifySupport(&hostdev, 1)) + goto error; + switch ((virDomainHostdevSubsysPciBackendType) *backend) { case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO: if (!virQEMUCapsGet(priv->qemuCaps, QEMU_CAPS_DEVICE_VFIO_PCI)) { diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 7a30a5e..4aa9582 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3567,6 +3567,11 @@ int qemuProcessStart(virConnectPtr conn, goto cleanup; } + /* check and assign device assignment settings */ + if (!qemuHostdevHostVerifySupport(vm->def->hostdevs, + vm->def->nhostdevs)) + goto cleanup; + /* network devices must be "prepared" before hostdevs, because * setting up a network device might create a new hostdev that * will need to be setup. -- 1.8.3.2

On 10/04/2013 08:55 AM, Peter Krempa wrote:
Add code to check availability of PCI passhthrough using VFIO and the legacy KVM passthrough and use it when starting VMs and hotplugging devices to live machine. --- src/qemu/qemu_hostdev.c | 126 ++++++++++++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_hostdev.h | 4 ++ src/qemu/qemu_hotplug.c | 4 ++ src/qemu/qemu_process.c | 5 ++ 4 files changed, 139 insertions(+)
diff --git a/src/qemu/qemu_hostdev.c b/src/qemu/qemu_hostdev.c index 21fe47f..dbbc2b4 100644 --- a/src/qemu/qemu_hostdev.c +++ b/src/qemu/qemu_hostdev.c @@ -23,6 +23,11 @@
#include <config.h>
+#include <dirent.h> +#include <fcntl.h> +#include <sys/ioctl.h> +#include <errno.h> + #include "qemu_hostdev.h" #include "virlog.h" #include "virerror.h" @@ -31,6 +36,7 @@ #include "virusb.h" #include "virscsi.h" #include "virnetdev.h" +#include "virfile.h"
#define VIR_FROM_THIS VIR_FROM_QEMU
@@ -1287,3 +1293,123 @@ void qemuDomainReAttachHostDevices(virQEMUDriverPtr driver, qemuDomainReAttachHostScsiDevices(driver, def->name, def->hostdevs, def->nhostdevs); } + + +static bool +qemuHostdevHostSupportsPassthroughVFIO(void)
If you're concerned about name length, you could probably remove the "Passthrough" and it would still make sense...
+{ + DIR *iommuDir = NULL; + struct dirent *iommuGroup = NULL; + bool ret = false; + + /* condition 1 - /sys/kernel/iommu_groups/ contains entries */ + if (!(iommuDir = opendir("/sys/kernel/iommu_groups/"))) + goto cleanup; + + while ((iommuGroup = readdir(iommuDir))) { + /* skip ./ ../ */ + if (STRPREFIX(iommuGroup->d_name, ".")) + continue; + + /* assume we found a group */ + break; + } + + if (!iommuGroup) + goto cleanup; + /* okay, iommu is on and recognizes groups */ + + /* condition 2 - /dev/vfio/vfio exists */ + if (!virFileExists("/dev/vfio/vfio")) + goto cleanup;
Was this all that was suggested? At any rate it's a good start even if there are other checks that can be added later.
+ + ret = true; + +cleanup: + if (iommuDir) + closedir(iommuDir); + + return ret; +} + + +#if HAVE_LINUX_KVM_H +# include <linux/kvm.h> +static bool +qemuHostdevHostSupportsPassthroughLegacy(void) +{ + int kvmfd = -1; + bool ret = false; + + if ((kvmfd = open("/dev/kvm", O_RDONLY)) < 0) + goto cleanup; + +# ifdef KVM_CAP_IOMMU + if ((ioctl(kvmfd, KVM_CHECK_EXTENSION, KVM_CAP_IOMMU)) <= 0) + goto cleanup; + + ret = true; +# endif + +cleanup: + VIR_FORCE_CLOSE(kvmfd); + + return ret; +} +#else +static bool +qemuHostdevHostSupportsPassthroughLegacy(void) +{ + return false; +} +#endif + +bool +qemuHostdevHostVerifySupport(virDomainHostdevDefPtr *hostdevs, + size_t nhostdevs) +{ + int supportsPassthroughKVM = -1; + int supportsPassthroughVFIO = -1; + size_t i; + + /* assign defaults for hostdev passthrough */ + for (i = 0; i < nhostdevs; i++) { + virDomainHostdevDefPtr hostdev = hostdevs[i]; + + if (hostdev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS && + hostdev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI) { + int *backend = &hostdev->source.subsys.u.pci.backend; + + /* cache host state of passthrough support */ + if (supportsPassthroughKVM == -1 || supportsPassthroughVFIO == -1) { + supportsPassthroughKVM = qemuHostdevHostSupportsPassthroughLegacy(); + supportsPassthroughVFIO = qemuHostdevHostSupportsPassthroughVFIO();
These should be checked once at the top of the function and their return values cached.
+ } + + switch ((virDomainHostdevSubsysPciBackendType) *backend) {
You know my opinion on these typecasts :-) (but don't let that stop you).
+ case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO: + if (!supportsPassthroughVFIO) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("host doesn't support VFIO PCI passthrough")); + return false; + } + break; + + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_DEFAULT: + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_KVM: + if (!supportsPassthroughKVM) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("host doesn't support legacy PCI passthrough")); + return false; + } + + break;
In the case of DEFAULT, we don't want to require support for legacy PCI passthrough, we instead want to make sure that one or the other of the methods is supported on the current host. So DEFAULT should have a separate case that says "if (!(supportsPassthroughKVM || supportsPassthroughVFIO) { error }"
+ + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_TYPE_LAST: + break; + } + } + } + + return true; +} diff --git a/src/qemu/qemu_hostdev.h b/src/qemu/qemu_hostdev.h index 327d4d5..6d88830 100644 --- a/src/qemu/qemu_hostdev.h +++ b/src/qemu/qemu_hostdev.h @@ -69,4 +69,8 @@ int qemuDomainHostdevNetConfigReplace(virDomainHostdevDefPtr hostdev, int qemuDomainHostdevNetConfigRestore(virDomainHostdevDefPtr hostdev, char *stateDir);
+bool qemuHostdevHostVerifySupport(virDomainHostdevDefPtr *hostdevs, + size_t nhostdevs); + + #endif /* __QEMU_HOSTDEV_H__ */ diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c index ae2cbc0..c72fdc3 100644 --- a/src/qemu/qemu_hotplug.c +++ b/src/qemu/qemu_hotplug.c @@ -1143,6 +1143,10 @@ int qemuDomainAttachHostPciDevice(virQEMUDriverPtr driver, &hostdev, 1) < 0) return -1;
+ /* verify the availability of passthrough support */ + if (!qemuHostdevHostVerifySupport(&hostdev, 1)) + goto error; + switch ((virDomainHostdevSubsysPciBackendType) *backend) { case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO: if (!virQEMUCapsGet(priv->qemuCaps, QEMU_CAPS_DEVICE_VFIO_PCI)) { diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 7a30a5e..4aa9582 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3567,6 +3567,11 @@ int qemuProcessStart(virConnectPtr conn, goto cleanup; }
+ /* check and assign device assignment settings */ + if (!qemuHostdevHostVerifySupport(vm->def->hostdevs, + vm->def->nhostdevs)) + goto cleanup; +
In one case you're calling this very near to a call to qemuPrepareHostdevPCIDevices(), in the other very near to a call to qemuPrepareHostDevices, which itself calls qemuPrepareHostdevPCIDevices(); I think this check should be made inside qemuPrepareHostdevPCIDevices() so that the higher level code isn't polluted with the detail.
/* network devices must be "prepared" before hostdevs, because * setting up a network device might create a new hostdev that * will need to be setup.

On 10/07/13 16:10, Laine Stump wrote:
On 10/04/2013 08:55 AM, Peter Krempa wrote:
Add code to check availability of PCI passhthrough using VFIO and the legacy KVM passthrough and use it when starting VMs and hotplugging devices to live machine. --- src/qemu/qemu_hostdev.c | 126 ++++++++++++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_hostdev.h | 4 ++ src/qemu/qemu_hotplug.c | 4 ++ src/qemu/qemu_process.c | 5 ++ 4 files changed, 139 insertions(+)
diff --git a/src/qemu/qemu_hostdev.c b/src/qemu/qemu_hostdev.c index 21fe47f..dbbc2b4 100644 --- a/src/qemu/qemu_hostdev.c +++ b/src/qemu/qemu_hostdev.c
...
@@ -1287,3 +1293,123 @@ void qemuDomainReAttachHostDevices(virQEMUDriverPtr driver, qemuDomainReAttachHostScsiDevices(driver, def->name, def->hostdevs, def->nhostdevs); } + + +static bool +qemuHostdevHostSupportsPassthroughVFIO(void) +{ + DIR *iommuDir = NULL; + struct dirent *iommuGroup = NULL; + bool ret = false; + + /* condition 1 - /sys/kernel/iommu_groups/ contains entries */ + if (!(iommuDir = opendir("/sys/kernel/iommu_groups/"))) + goto cleanup; + + while ((iommuGroup = readdir(iommuDir))) { + /* skip ./ ../ */ + if (STRPREFIX(iommuGroup->d_name, ".")) + continue; + + /* assume we found a group */ + break; + } + + if (!iommuGroup) + goto cleanup; + /* okay, iommu is on and recognizes groups */ + + /* condition 2 - /dev/vfio/vfio exists */ + if (!virFileExists("/dev/vfio/vfio")) + goto cleanup;
Was this all that was suggested? At any rate it's a good start even if there are other checks that can be added later.
Yes that was all. While I was testing this I found out, that this doesn't necesarily mean everything will go well. For example on crappy hardware you might need to enable unsafe interrupt routing (which may kill your host in most cases). These are then caught by the improved error reporting code in the qemu driver so you get a more useful error message. (saying that operation was not permitted and few entries in the kernel log).
+
...
+bool +qemuHostdevHostVerifySupport(virDomainHostdevDefPtr *hostdevs, + size_t nhostdevs) +{ + int supportsPassthroughKVM = -1; + int supportsPassthroughVFIO = -1; + size_t i; + + /* assign defaults for hostdev passthrough */ + for (i = 0; i < nhostdevs; i++) { + virDomainHostdevDefPtr hostdev = hostdevs[i]; + + if (hostdev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS && + hostdev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI) { + int *backend = &hostdev->source.subsys.u.pci.backend; + + /* cache host state of passthrough support */ + if (supportsPassthroughKVM == -1 || supportsPassthroughVFIO == -1) { + supportsPassthroughKVM = qemuHostdevHostSupportsPassthroughLegacy(); + supportsPassthroughVFIO = qemuHostdevHostSupportsPassthroughVFIO();
These should be checked once at the top of the function and their return values cached.
Hmm; I originally didn't want to call the code in case no PCI passthrough device was present in the guest. The main reason was that the first version was erroring out in the functions directly. I think we can safely call those every time we see non-zero count of hostdevs. I will adapt the code in the next version.
+ } + + switch ((virDomainHostdevSubsysPciBackendType) *backend) { + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO: + if (!supportsPassthroughVFIO) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("host doesn't support VFIO PCI passthrough")); + return false; + } + break; +
...
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 7a30a5e..4aa9582 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3567,6 +3567,11 @@ int qemuProcessStart(virConnectPtr conn, goto cleanup; }
+ /* check and assign device assignment settings */ + if (!qemuHostdevHostVerifySupport(vm->def->hostdevs, + vm->def->nhostdevs)) + goto cleanup; +
In one case you're calling this very near to a call to qemuPrepareHostdevPCIDevices(), in the other very near to a call to qemuPrepareHostDevices, which itself calls qemuPrepareHostdevPCIDevices(); I think this check should be made inside qemuPrepareHostdevPCIDevices() so that the higher level code isn't polluted with the detail.
Seems like that will be an okay place too. At first I was looking for a place that won't be called when running the tests, but qemuPrepareHostdevPCIDevices() seems like the right place to do it. I will adapt the code and re-send. Peter

Prefer using VFIO (if available) to the legacy KVM device passthrough. With this patch a PCI passthrough device without the driver configured will be started with VFIO if it's available on the host. If not legacy KVM passthrough is checked and error is reported if it's not available. --- docs/formatdomain.html.in | 9 ++++----- src/conf/domain_conf.h | 2 +- src/qemu/qemu_command.c | 3 ++- src/qemu/qemu_hostdev.c | 21 +++++++++++++++++++-- src/qemu/qemu_hostdev.h | 3 ++- src/qemu/qemu_hotplug.c | 2 +- src/qemu/qemu_process.c | 15 ++++++++------- tests/qemuxml2argvtest.c | 11 +++++++++++ 8 files changed, 48 insertions(+), 18 deletions(-) diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in index 3689399..6f3f7cf 100644 --- a/docs/formatdomain.html.in +++ b/docs/formatdomain.html.in @@ -2755,11 +2755,10 @@ backend, which is compatible with UEFI SecureBoot) or "kvm" (for the legacy device assignment handled directly by the KVM kernel module)<span class="since">Since 1.0.5 (QEMU and KVM - only, requires kernel 3.6 or newer)</span>. Currently, "kvm" - is the default used by libvirt when not explicitly provided, - but since the two are functionally equivalent, this default - could be changed in the future with no impact to domains that - don't specify anything. + only, requires kernel 3.6 or newer)</span>. The default, when + the driver name is not explicitly specified, is to check wether + VFIO is available and use it if it's the case. If VFIO is not + available, the legacy "kvm" assignment is attempted. </dd> <dt><code>readonly</code></dt> <dd>Indicates that the device is readonly, only supported by SCSI host diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index f20a916..6b825d8 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -399,7 +399,7 @@ enum virDomainHostdevSubsysType { /* the backend driver used for PCI hostdev devices */ typedef enum { - VIR_DOMAIN_HOSTDEV_PCI_BACKEND_DEFAULT, /* currently kvm, could change */ + VIR_DOMAIN_HOSTDEV_PCI_BACKEND_DEFAULT, /* detect automaticaly, prefer VFIO */ VIR_DOMAIN_HOSTDEV_PCI_BACKEND_KVM, /* force legacy kvm style */ VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO, /* force vfio */ diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index ecf26cc..a4742fa 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -5487,7 +5487,6 @@ qemuBuildPCIHostdevDevStr(virDomainDefPtr def, switch ((virDomainHostdevSubsysPciBackendType) dev->source.subsys.u.pci.backend) { - case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_DEFAULT: case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_KVM: virBufferAddLit(&buf, "pci-assign"); if (configfd && *configfd) @@ -5498,6 +5497,8 @@ qemuBuildPCIHostdevDevStr(virDomainDefPtr def, virBufferAddLit(&buf, "vfio-pci"); break; + + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_DEFAULT: case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_TYPE_LAST: virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("PCI passhthrough type needs to be specified")); diff --git a/src/qemu/qemu_hostdev.c b/src/qemu/qemu_hostdev.c index dbbc2b4..ad408d8 100644 --- a/src/qemu/qemu_hostdev.c +++ b/src/qemu/qemu_hostdev.c @@ -1366,7 +1366,8 @@ qemuHostdevHostSupportsPassthroughLegacy(void) bool qemuHostdevHostVerifySupport(virDomainHostdevDefPtr *hostdevs, - size_t nhostdevs) + size_t nhostdevs, + virQEMUCapsPtr qemuCaps) { int supportsPassthroughKVM = -1; int supportsPassthroughVFIO = -1; @@ -1387,6 +1388,23 @@ qemuHostdevHostVerifySupport(virDomainHostdevDefPtr *hostdevs, } switch ((virDomainHostdevSubsysPciBackendType) *backend) { + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_DEFAULT: + if (supportsPassthroughVFIO && + virQEMUCapsGet(qemuCaps, QEMU_CAPS_DEVICE_VFIO_PCI)) { + *backend = VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO; + } else if (supportsPassthroughKVM && + (virQEMUCapsGet(qemuCaps, QEMU_CAPS_PCIDEVICE) || + virQEMUCapsGet(qemuCaps, QEMU_CAPS_DEVICE))) { + *backend = VIR_DOMAIN_HOSTDEV_PCI_BACKEND_KVM; + } else { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("host doesn't support passthrough of " + "host PCI devices")); + return false; + } + + break; + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO: if (!supportsPassthroughVFIO) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", @@ -1395,7 +1413,6 @@ qemuHostdevHostVerifySupport(virDomainHostdevDefPtr *hostdevs, } break; - case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_DEFAULT: case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_KVM: if (!supportsPassthroughKVM) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", diff --git a/src/qemu/qemu_hostdev.h b/src/qemu/qemu_hostdev.h index 6d88830..afe67a5 100644 --- a/src/qemu/qemu_hostdev.h +++ b/src/qemu/qemu_hostdev.h @@ -70,7 +70,8 @@ int qemuDomainHostdevNetConfigRestore(virDomainHostdevDefPtr hostdev, char *stateDir); bool qemuHostdevHostVerifySupport(virDomainHostdevDefPtr *hostdevs, - size_t nhostdevs); + size_t nhostdevs, + virQEMUCapsPtr qemuCaps); #endif /* __QEMU_HOSTDEV_H__ */ diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c index c72fdc3..0bbbf6e 100644 --- a/src/qemu/qemu_hotplug.c +++ b/src/qemu/qemu_hotplug.c @@ -1144,7 +1144,7 @@ int qemuDomainAttachHostPciDevice(virQEMUDriverPtr driver, return -1; /* verify the availability of passthrough support */ - if (!qemuHostdevHostVerifySupport(&hostdev, 1)) + if (!qemuHostdevHostVerifySupport(&hostdev, 1, priv->qemuCaps)) goto error; switch ((virDomainHostdevSubsysPciBackendType) *backend) { diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 4aa9582..64d9326 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3567,9 +3567,16 @@ int qemuProcessStart(virConnectPtr conn, goto cleanup; } + VIR_DEBUG("Determining emulator version"); + virObjectUnref(priv->qemuCaps); + if (!(priv->qemuCaps = virQEMUCapsCacheLookupCopy(driver->qemuCapsCache, + vm->def->emulator))) + goto cleanup; + /* check and assign device assignment settings */ if (!qemuHostdevHostVerifySupport(vm->def->hostdevs, - vm->def->nhostdevs)) + vm->def->nhostdevs, + priv->qemuCaps)) goto cleanup; /* network devices must be "prepared" before hostdevs, because @@ -3664,12 +3671,6 @@ int qemuProcessStart(virConnectPtr conn, } } - VIR_DEBUG("Determining emulator version"); - virObjectUnref(priv->qemuCaps); - if (!(priv->qemuCaps = virQEMUCapsCacheLookupCopy(driver->qemuCapsCache, - vm->def->emulator))) - goto cleanup; - if (!qemuValidateCpuMax(vm->def, priv->qemuCaps)) goto cleanup; diff --git a/tests/qemuxml2argvtest.c b/tests/qemuxml2argvtest.c index 38319e5..6fe8c6a 100644 --- a/tests/qemuxml2argvtest.c +++ b/tests/qemuxml2argvtest.c @@ -98,6 +98,7 @@ static int testCompareXMLToArgvFiles(const char *xml, virConnectPtr conn; char *log = NULL; virCommandPtr cmd = NULL; + size_t i; if (!(conn = virGetConnect())) goto out; @@ -154,6 +155,16 @@ static int testCompareXMLToArgvFiles(const char *xml, if (qemuAssignDeviceAliases(vmdef, extraFlags) < 0) goto out; + for (i = 0; i < vmdef->nhostdevs; i++) { + virDomainHostdevDefPtr hostdev = vmdef->hostdevs[i]; + + if (hostdev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS && + hostdev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI && + hostdev->source.subsys.u.pci.backend == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_DEFAULT) { + hostdev->source.subsys.u.pci.backend = VIR_DOMAIN_HOSTDEV_PCI_BACKEND_KVM; + } + } + if (!(cmd = qemuBuildCommandLine(conn, &driver, vmdef, &monitor_chr, (flags & FLAG_JSON), extraFlags, migrateFrom, migrateFd, NULL, -- 1.8.3.2

On 10/04/2013 08:55 AM, Peter Krempa wrote:
Prefer using VFIO (if available) to the legacy KVM device passthrough.
With this patch a PCI passthrough device without the driver configured will be started with VFIO if it's available on the host. If not legacy KVM passthrough is checked and error is reported if it's not available. --- docs/formatdomain.html.in | 9 ++++----- src/conf/domain_conf.h | 2 +- src/qemu/qemu_command.c | 3 ++- src/qemu/qemu_hostdev.c | 21 +++++++++++++++++++-- src/qemu/qemu_hostdev.h | 3 ++- src/qemu/qemu_hotplug.c | 2 +- src/qemu/qemu_process.c | 15 ++++++++------- tests/qemuxml2argvtest.c | 11 +++++++++++ 8 files changed, 48 insertions(+), 18 deletions(-)
diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in index 3689399..6f3f7cf 100644 --- a/docs/formatdomain.html.in +++ b/docs/formatdomain.html.in @@ -2755,11 +2755,10 @@ backend, which is compatible with UEFI SecureBoot) or "kvm" (for the legacy device assignment handled directly by the KVM kernel module)<span class="since">Since 1.0.5 (QEMU and KVM - only, requires kernel 3.6 or newer)</span>. Currently, "kvm" - is the default used by libvirt when not explicitly provided, - but since the two are functionally equivalent, this default - could be changed in the future with no impact to domains that - don't specify anything. + only, requires kernel 3.6 or newer)</span>. The default, when + the driver name is not explicitly specified, is to check wether + VFIO is available and use it if it's the case. If VFIO is not + available, the legacy "kvm" assignment is attempted. </dd> <dt><code>readonly</code></dt> <dd>Indicates that the device is readonly, only supported by SCSI host diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index f20a916..6b825d8 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -399,7 +399,7 @@ enum virDomainHostdevSubsysType {
/* the backend driver used for PCI hostdev devices */ typedef enum { - VIR_DOMAIN_HOSTDEV_PCI_BACKEND_DEFAULT, /* currently kvm, could change */ + VIR_DOMAIN_HOSTDEV_PCI_BACKEND_DEFAULT, /* detect automaticaly, prefer VFIO */ VIR_DOMAIN_HOSTDEV_PCI_BACKEND_KVM, /* force legacy kvm style */ VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO, /* force vfio */
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index ecf26cc..a4742fa 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -5487,7 +5487,6 @@ qemuBuildPCIHostdevDevStr(virDomainDefPtr def,
switch ((virDomainHostdevSubsysPciBackendType) dev->source.subsys.u.pci.backend) { - case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_DEFAULT: case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_KVM: virBufferAddLit(&buf, "pci-assign"); if (configfd && *configfd) @@ -5498,6 +5497,8 @@ qemuBuildPCIHostdevDevStr(virDomainDefPtr def, virBufferAddLit(&buf, "vfio-pci"); break;
+
extra blank line.
+ case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_DEFAULT: case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_TYPE_LAST: virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("PCI passhthrough type needs to be specified")); diff --git a/src/qemu/qemu_hostdev.c b/src/qemu/qemu_hostdev.c index dbbc2b4..ad408d8 100644 --- a/src/qemu/qemu_hostdev.c +++ b/src/qemu/qemu_hostdev.c @@ -1366,7 +1366,8 @@ qemuHostdevHostSupportsPassthroughLegacy(void)
bool qemuHostdevHostVerifySupport(virDomainHostdevDefPtr *hostdevs, - size_t nhostdevs) + size_t nhostdevs, + virQEMUCapsPtr qemuCaps)
Aha. I guess if you follow my recommendation in 2/3, you'll need to pass qemuCaps down through the qemuPrepareHostDevices() call chain (and I don't think that's a bad thing).
{ int supportsPassthroughKVM = -1; int supportsPassthroughVFIO = -1; @@ -1387,6 +1388,23 @@ qemuHostdevHostVerifySupport(virDomainHostdevDefPtr *hostdevs, }
switch ((virDomainHostdevSubsysPciBackendType) *backend) { + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_DEFAULT: + if (supportsPassthroughVFIO && + virQEMUCapsGet(qemuCaps, QEMU_CAPS_DEVICE_VFIO_PCI)) { + *backend = VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO; + } else if (supportsPassthroughKVM && + (virQEMUCapsGet(qemuCaps, QEMU_CAPS_PCIDEVICE) || + virQEMUCapsGet(qemuCaps, QEMU_CAPS_DEVICE))) { + *backend = VIR_DOMAIN_HOSTDEV_PCI_BACKEND_KVM; + } else { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("host doesn't support passthrough of " + "host PCI devices")); + return false; + } + + break; +
Ah! And there is the separate case I was asking for in 2/3! But you're changing the "backend" in the data itself - that will lead to incorrect display when someone does a dumpxml (will it cause problems if someone loads the vfio driver? I guess not, since this should only be the "live" data, not the persistent data)
case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO: if (!supportsPassthroughVFIO) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", @@ -1395,7 +1413,6 @@ qemuHostdevHostVerifySupport(virDomainHostdevDefPtr *hostdevs, } break;
- case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_DEFAULT: case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_KVM: if (!supportsPassthroughKVM) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", diff --git a/src/qemu/qemu_hostdev.h b/src/qemu/qemu_hostdev.h index 6d88830..afe67a5 100644 --- a/src/qemu/qemu_hostdev.h +++ b/src/qemu/qemu_hostdev.h @@ -70,7 +70,8 @@ int qemuDomainHostdevNetConfigRestore(virDomainHostdevDefPtr hostdev, char *stateDir);
bool qemuHostdevHostVerifySupport(virDomainHostdevDefPtr *hostdevs, - size_t nhostdevs); + size_t nhostdevs, + virQEMUCapsPtr qemuCaps);
#endif /* __QEMU_HOSTDEV_H__ */ diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c index c72fdc3..0bbbf6e 100644 --- a/src/qemu/qemu_hotplug.c +++ b/src/qemu/qemu_hotplug.c @@ -1144,7 +1144,7 @@ int qemuDomainAttachHostPciDevice(virQEMUDriverPtr driver, return -1;
/* verify the availability of passthrough support */ - if (!qemuHostdevHostVerifySupport(&hostdev, 1)) + if (!qemuHostdevHostVerifySupport(&hostdev, 1, priv->qemuCaps)) goto error;
switch ((virDomainHostdevSubsysPciBackendType) *backend) { diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 4aa9582..64d9326 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3567,9 +3567,16 @@ int qemuProcessStart(virConnectPtr conn, goto cleanup; }
+ VIR_DEBUG("Determining emulator version"); + virObjectUnref(priv->qemuCaps); + if (!(priv->qemuCaps = virQEMUCapsCacheLookupCopy(driver->qemuCapsCache, + vm->def->emulator))) + goto cleanup; + /* check and assign device assignment settings */ if (!qemuHostdevHostVerifySupport(vm->def->hostdevs, - vm->def->nhostdevs)) + vm->def->nhostdevs, + priv->qemuCaps)) goto cleanup;
/* network devices must be "prepared" before hostdevs, because @@ -3664,12 +3671,6 @@ int qemuProcessStart(virConnectPtr conn, } }
- VIR_DEBUG("Determining emulator version"); - virObjectUnref(priv->qemuCaps); - if (!(priv->qemuCaps = virQEMUCapsCacheLookupCopy(driver->qemuCapsCache, - vm->def->emulator))) - goto cleanup; - if (!qemuValidateCpuMax(vm->def, priv->qemuCaps)) goto cleanup;
diff --git a/tests/qemuxml2argvtest.c b/tests/qemuxml2argvtest.c index 38319e5..6fe8c6a 100644 --- a/tests/qemuxml2argvtest.c +++ b/tests/qemuxml2argvtest.c @@ -98,6 +98,7 @@ static int testCompareXMLToArgvFiles(const char *xml, virConnectPtr conn; char *log = NULL; virCommandPtr cmd = NULL; + size_t i;
if (!(conn = virGetConnect())) goto out; @@ -154,6 +155,16 @@ static int testCompareXMLToArgvFiles(const char *xml, if (qemuAssignDeviceAliases(vmdef, extraFlags) < 0) goto out;
+ for (i = 0; i < vmdef->nhostdevs; i++) { + virDomainHostdevDefPtr hostdev = vmdef->hostdevs[i]; + + if (hostdev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS && + hostdev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI && + hostdev->source.subsys.u.pci.backend == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_DEFAULT) { + hostdev->source.subsys.u.pci.backend = VIR_DOMAIN_HOSTDEV_PCI_BACKEND_KVM; + } + } +
I'm in too much of a rush to read the code and understand why you're changing this unconditionally for the test. Can you just comment on what you're doing to save me the time?
if (!(cmd = qemuBuildCommandLine(conn, &driver, vmdef, &monitor_chr, (flags & FLAG_JSON), extraFlags, migrateFrom, migrateFd, NULL,

On 10/07/13 16:19, Laine Stump wrote:
On 10/04/2013 08:55 AM, Peter Krempa wrote:
Prefer using VFIO (if available) to the legacy KVM device passthrough.
With this patch a PCI passthrough device without the driver configured will be started with VFIO if it's available on the host. If not legacy KVM passthrough is checked and error is reported if it's not available. --- docs/formatdomain.html.in | 9 ++++----- src/conf/domain_conf.h | 2 +- src/qemu/qemu_command.c | 3 ++- src/qemu/qemu_hostdev.c | 21 +++++++++++++++++++-- src/qemu/qemu_hostdev.h | 3 ++- src/qemu/qemu_hotplug.c | 2 +- src/qemu/qemu_process.c | 15 ++++++++------- tests/qemuxml2argvtest.c | 11 +++++++++++ 8 files changed, 48 insertions(+), 18 deletions(-)
...
diff --git a/tests/qemuxml2argvtest.c b/tests/qemuxml2argvtest.c index 38319e5..6fe8c6a 100644 --- a/tests/qemuxml2argvtest.c +++ b/tests/qemuxml2argvtest.c @@ -98,6 +98,7 @@ static int testCompareXMLToArgvFiles(const char *xml, virConnectPtr conn; char *log = NULL; virCommandPtr cmd = NULL; + size_t i;
if (!(conn = virGetConnect())) goto out; @@ -154,6 +155,16 @@ static int testCompareXMLToArgvFiles(const char *xml, if (qemuAssignDeviceAliases(vmdef, extraFlags) < 0) goto out;
+ for (i = 0; i < vmdef->nhostdevs; i++) { + virDomainHostdevDefPtr hostdev = vmdef->hostdevs[i]; + + if (hostdev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS && + hostdev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI && + hostdev->source.subsys.u.pci.backend == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_DEFAULT) { + hostdev->source.subsys.u.pci.backend = VIR_DOMAIN_HOSTDEV_PCI_BACKEND_KVM; + } + } +
I'm in too much of a rush to read the code and understand why you're changing this unconditionally for the test. Can you just comment on what you're doing to save me the time?
We can't call the detection code in the test suite as we could get different results on different environments. This just "simulates" that legacy passthrough was selected instead of leaving the default so that: a) the command line generator doesn't error out as it now won't work with _DEFAULT b) the test results won't have to be changed to cope with a change to VFIO.
if (!(cmd = qemuBuildCommandLine(conn, &driver, vmdef, &monitor_chr, (flags & FLAG_JSON), extraFlags, migrateFrom, migrateFd, NULL,
Peter

On 10/07/13 16:19, Laine Stump wrote:
On 10/04/2013 08:55 AM, Peter Krempa wrote:
Prefer using VFIO (if available) to the legacy KVM device passthrough.
With this patch a PCI passthrough device without the driver configured will be started with VFIO if it's available on the host. If not legacy KVM passthrough is checked and error is reported if it's not available. --- docs/formatdomain.html.in | 9 ++++----- src/conf/domain_conf.h | 2 +- src/qemu/qemu_command.c | 3 ++- src/qemu/qemu_hostdev.c | 21 +++++++++++++++++++-- src/qemu/qemu_hostdev.h | 3 ++- src/qemu/qemu_hotplug.c | 2 +- src/qemu/qemu_process.c | 15 ++++++++------- tests/qemuxml2argvtest.c | 11 +++++++++++ 8 files changed, 48 insertions(+), 18 deletions(-)
...
diff --git a/src/qemu/qemu_hostdev.c b/src/qemu/qemu_hostdev.c index dbbc2b4..ad408d8 100644 --- a/src/qemu/qemu_hostdev.c +++ b/src/qemu/qemu_hostdev.c @@ -1366,7 +1366,8 @@ qemuHostdevHostSupportsPassthroughLegacy(void)
bool qemuHostdevHostVerifySupport(virDomainHostdevDefPtr *hostdevs, - size_t nhostdevs) + size_t nhostdevs, + virQEMUCapsPtr qemuCaps)
Aha. I guess if you follow my recommendation in 2/3, you'll need to pass qemuCaps down through the qemuPrepareHostDevices() call chain (and I don't think that's a bad thing).
Seems doable. I will post another version incorporating that.
{ int supportsPassthroughKVM = -1; int supportsPassthroughVFIO = -1; @@ -1387,6 +1388,23 @@ qemuHostdevHostVerifySupport(virDomainHostdevDefPtr *hostdevs, }
switch ((virDomainHostdevSubsysPciBackendType) *backend) { + case VIR_DOMAIN_HOSTDEV_PCI_BACKEND_DEFAULT: + if (supportsPassthroughVFIO && + virQEMUCapsGet(qemuCaps, QEMU_CAPS_DEVICE_VFIO_PCI)) { + *backend = VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO; + } else if (supportsPassthroughKVM && + (virQEMUCapsGet(qemuCaps, QEMU_CAPS_PCIDEVICE) || + virQEMUCapsGet(qemuCaps, QEMU_CAPS_DEVICE))) { + *backend = VIR_DOMAIN_HOSTDEV_PCI_BACKEND_KVM; + } else { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("host doesn't support passthrough of " + "host PCI devices")); + return false; + } + + break; +
Ah! And there is the separate case I was asking for in 2/3!
But you're changing the "backend" in the data itself - that will lead to incorrect display when someone does a dumpxml (will it cause problems if someone loads the vfio driver? I guess not, since this should only be the "live" data, not the persistent data)
Hmm, Is this an issue? I think it might be useful to update the info in the live XML so that the users are notified. If we don't want to do this I will change it in the next version. Peter
participants (3)
-
Eric Blake
-
Laine Stump
-
Peter Krempa