[libvirt] [PATCH v4 0/4] Host device isolation for pSeries guests

Changes from [v3]: * correctly handle interfaces connected to hostdev-backed networks; * drop patches implementing support for multiple PHBs, as they have been merged already; * some minor cleanups. Changes from [v2]: * support hot(un)plug properly; * add documentation. Changes from [v1]: * address review comments; * implement a much better isolation algorithm that doesn't require parsing and formatting the isolation group and can handle more dynamic scenarios, such as empty PHBs changing their isolation groups to accomodate hotplugged hostdevs; * add more test cases. [v3] https://www.redhat.com/archives/libvir-list/2017-June/msg01018.html [v2] https://www.redhat.com/archives/libvir-list/2017-June/msg00695.html [v1] https://www.redhat.com/archives/libvir-list/2017-June/msg00110.html Andrea Bolognani (4): conf: Introduce isolation groups conf: Implement isolation rules qemu: Isolate hostdevs on pSeries guests news: Update for hostdev isolation docs/news.xml | 10 + src/bhyve/bhyve_device.c | 4 +- src/conf/device_conf.h | 10 + src/conf/domain_addr.c | 86 ++++++- src/conf/domain_addr.h | 12 +- src/conf/domain_conf.c | 2 + src/qemu/qemu_domain_address.c | 276 +++++++++++++++++++-- src/qemu/qemu_domain_address.h | 4 + src/qemu/qemu_hotplug.c | 7 + tests/qemumemlocktest.c | 2 +- .../qemuxml2argv-pseries-hostdevs-1.args | 8 +- .../qemuxml2argv-pseries-hostdevs-2.args | 3 +- .../qemuxml2argv-pseries-hostdevs-3.args | 2 +- .../qemuxml2xmlout-pseries-hostdevs-1.xml | 14 +- .../qemuxml2xmlout-pseries-hostdevs-2.xml | 6 +- .../qemuxml2xmlout-pseries-hostdevs-3.xml | 2 +- 16 files changed, 410 insertions(+), 38 deletions(-) -- 2.7.5

Isolation groups will eventually allow us to make sure certain devices, eg. PCI hostdevs, are assigned to guest PCI buses in a way that guarantees improved isolation, error detection and recovery for machine types and hypervisors that support it, eg. pSeries guest on QEMU. This patch merely defines storage for the new information we're going to need later on and makes sure it is passed from the hypervisor driver (QEMU / bhyve) down to the generic PCI address allocation code. Signed-off-by: Andrea Bolognani <abologna@redhat.com> Reviewed-by: Laine Stump <laine@laine.org> --- src/bhyve/bhyve_device.c | 4 ++-- src/conf/device_conf.h | 10 ++++++++++ src/conf/domain_addr.c | 17 ++++++++++++----- src/conf/domain_addr.h | 9 ++++++++- src/conf/domain_conf.c | 2 ++ src/qemu/qemu_domain_address.c | 35 ++++++++++++++++++----------------- 6 files changed, 52 insertions(+), 25 deletions(-) diff --git a/src/bhyve/bhyve_device.c b/src/bhyve/bhyve_device.c index fdfd512..03aa6c9 100644 --- a/src/bhyve/bhyve_device.c +++ b/src/bhyve/bhyve_device.c @@ -57,7 +57,7 @@ bhyveCollectPCIAddress(virDomainDefPtr def ATTRIBUTE_UNUSED, } if (virDomainPCIAddressReserveAddr(addrs, addr, - VIR_PCI_CONNECT_TYPE_PCI_DEVICE) < 0) { + VIR_PCI_CONNECT_TYPE_PCI_DEVICE, 0) < 0) { goto cleanup; } @@ -100,7 +100,7 @@ bhyveAssignDevicePCISlots(virDomainDefPtr def, lpc_addr.slot = 0x1; if (virDomainPCIAddressReserveAddr(addrs, &lpc_addr, - VIR_PCI_CONNECT_TYPE_PCI_DEVICE) < 0) { + VIR_PCI_CONNECT_TYPE_PCI_DEVICE, 0) < 0) { goto error; } diff --git a/src/conf/device_conf.h b/src/conf/device_conf.h index 14be2e3..68615f6 100644 --- a/src/conf/device_conf.h +++ b/src/conf/device_conf.h @@ -164,6 +164,16 @@ struct _virDomainDeviceInfo { */ int pciConnectFlags; /* enum virDomainPCIConnectFlags */ char *loadparm; + + /* PCI devices will only be automatically placed on a PCI bus + * that shares the same isolation group */ + unsigned int isolationGroup; + + /* Usually, PCI buses will take on the same isolation group + * as the first device that is plugged into them, but in some + * cases we might want to prevent that from happening by + * locking the isolation group */ + bool isolationGroupLocked; }; diff --git a/src/conf/domain_addr.c b/src/conf/domain_addr.c index 362e996..bb095a3 100644 --- a/src/conf/domain_addr.c +++ b/src/conf/domain_addr.c @@ -548,6 +548,7 @@ static int ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) virDomainPCIAddressReserveAddrInternal(virDomainPCIAddressSetPtr addrs, virPCIDeviceAddressPtr addr, virDomainPCIConnectFlags flags, + unsigned int isolationGroup ATTRIBUTE_UNUSED, bool fromConfig) { int ret = -1; @@ -600,9 +601,11 @@ virDomainPCIAddressReserveAddrInternal(virDomainPCIAddressSetPtr addrs, int virDomainPCIAddressReserveAddr(virDomainPCIAddressSetPtr addrs, virPCIDeviceAddressPtr addr, - virDomainPCIConnectFlags flags) + virDomainPCIConnectFlags flags, + unsigned int isolationGroup) { - return virDomainPCIAddressReserveAddrInternal(addrs, addr, flags, true); + return virDomainPCIAddressReserveAddrInternal(addrs, addr, flags, + isolationGroup, true); } int @@ -638,7 +641,8 @@ virDomainPCIAddressEnsureAddr(virDomainPCIAddressSetPtr addrs, goto cleanup; ret = virDomainPCIAddressReserveAddrInternal(addrs, &dev->addr.pci, - flags, true); + flags, dev->isolationGroup, + true); } else { ret = virDomainPCIAddressReserveNextAddr(addrs, dev, flags, -1); } @@ -759,6 +763,7 @@ static int ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) virDomainPCIAddressGetNextAddr(virDomainPCIAddressSetPtr addrs, virPCIDeviceAddressPtr next_addr, virDomainPCIConnectFlags flags, + unsigned int isolationGroup ATTRIBUTE_UNUSED, int function) { virPCIDeviceAddress a = { 0 }; @@ -839,10 +844,12 @@ virDomainPCIAddressReserveNextAddr(virDomainPCIAddressSetPtr addrs, { virPCIDeviceAddress addr; - if (virDomainPCIAddressGetNextAddr(addrs, &addr, flags, function) < 0) + if (virDomainPCIAddressGetNextAddr(addrs, &addr, flags, + dev->isolationGroup, function) < 0) return -1; - if (virDomainPCIAddressReserveAddrInternal(addrs, &addr, flags, false) < 0) + if (virDomainPCIAddressReserveAddrInternal(addrs, &addr, flags, + dev->isolationGroup, false) < 0) return -1; if (!addrs->dryRun) { diff --git a/src/conf/domain_addr.h b/src/conf/domain_addr.h index c53dceb..ac6d64f 100644 --- a/src/conf/domain_addr.h +++ b/src/conf/domain_addr.h @@ -100,6 +100,12 @@ typedef struct { * bit is set, that function is in use by a device. */ virDomainPCIAddressSlot slot[VIR_PCI_ADDRESS_SLOT_LAST + 1]; + + /* See virDomainDeviceInfo::isolationGroup */ + unsigned int isolationGroup; + + /* See virDomainDeviceInfo::isolationGroupLocked */ + bool isolationGroupLocked; } virDomainPCIAddressBus; typedef virDomainPCIAddressBus *virDomainPCIAddressBusPtr; @@ -142,7 +148,8 @@ bool virDomainPCIAddressSlotInUse(virDomainPCIAddressSetPtr addrs, int virDomainPCIAddressReserveAddr(virDomainPCIAddressSetPtr addrs, virPCIDeviceAddressPtr addr, - virDomainPCIConnectFlags flags) + virDomainPCIConnectFlags flags, + unsigned int isolationGroup) ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2); int virDomainPCIAddressReserveNextAddr(virDomainPCIAddressSetPtr addrs, diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 3feeccb..9320794 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -3670,6 +3670,8 @@ void virDomainDeviceInfoClear(virDomainDeviceInfoPtr info) info->type = VIR_DOMAIN_DEVICE_ADDRESS_TYPE_NONE; VIR_FREE(info->romfile); VIR_FREE(info->loadparm); + info->isolationGroup = 0; + info->isolationGroupLocked = false; } diff --git a/src/qemu/qemu_domain_address.c b/src/qemu/qemu_domain_address.c index 44d0a5b..b247c85 100644 --- a/src/qemu/qemu_domain_address.c +++ b/src/qemu/qemu_domain_address.c @@ -1037,7 +1037,8 @@ qemuDomainCollectPCIAddress(virDomainDefPtr def ATTRIBUTE_UNUSED, } if (virDomainPCIAddressReserveAddr(addrs, addr, - info->pciConnectFlags) < 0) { + info->pciConnectFlags, + info->isolationGroup) < 0) { goto cleanup; } @@ -1082,6 +1083,10 @@ qemuDomainPCIAddressSetCreate(virDomainDefPtr def, if (virDomainPCIAddressBusSetModel(&addrs->buses[idx], cont->model) < 0) goto error; + /* Forward the information about isolation groups */ + addrs->buses[idx].isolationGroup = cont->info.isolationGroup; + addrs->buses[idx].isolationGroupLocked = cont->info.isolationGroupLocked; + if (cont->model == VIR_DOMAIN_CONTROLLER_MODEL_PCIE_ROOT) hasPCIeRoot = true; } @@ -1198,7 +1203,7 @@ qemuDomainValidateDevicePCISlotsPIIX3(virDomainDefPtr def, if (addrs->nbuses) { memset(&tmp_addr, 0, sizeof(tmp_addr)); tmp_addr.slot = 1; - if (virDomainPCIAddressReserveAddr(addrs, &tmp_addr, flags) < 0) + if (virDomainPCIAddressReserveAddr(addrs, &tmp_addr, flags, 0) < 0) goto cleanup; } @@ -1233,7 +1238,7 @@ qemuDomainValidateDevicePCISlotsPIIX3(virDomainDefPtr def, goto cleanup; } } else { - if (virDomainPCIAddressReserveAddr(addrs, &tmp_addr, flags) < 0) + if (virDomainPCIAddressReserveAddr(addrs, &tmp_addr, flags, 0) < 0) goto cleanup; primaryVideo->info.addr.pci = tmp_addr; primaryVideo->info.type = VIR_DOMAIN_DEVICE_ADDRESS_TYPE_PCI; @@ -1258,7 +1263,7 @@ qemuDomainValidateDevicePCISlotsPIIX3(virDomainDefPtr def, VIR_DEBUG("PCI address 0:0:2.0 in use, future addition of a video" " device will not be possible without manual" " intervention"); - } else if (virDomainPCIAddressReserveAddr(addrs, &tmp_addr, flags) < 0) { + } else if (virDomainPCIAddressReserveAddr(addrs, &tmp_addr, flags, 0) < 0) { goto cleanup; } } @@ -1334,10 +1339,8 @@ qemuDomainValidateDevicePCISlotsQ35(virDomainDefPtr def, assign = true; } if (assign) { - if (virDomainPCIAddressReserveAddr(addrs, - &tmp_addr, flags) < 0) { + if (virDomainPCIAddressReserveAddr(addrs, &tmp_addr, flags, 0) < 0) goto cleanup; - } cont->info.type = VIR_DOMAIN_DEVICE_ADDRESS_TYPE_PCI; cont->info.addr.pci.domain = 0; @@ -1359,10 +1362,8 @@ qemuDomainValidateDevicePCISlotsQ35(virDomainDefPtr def, memset(&tmp_addr, 0, sizeof(tmp_addr)); tmp_addr.slot = 0x1E; if (!virDomainPCIAddressSlotInUse(addrs, &tmp_addr)) { - if (virDomainPCIAddressReserveAddr(addrs, - &tmp_addr, flags) < 0) { + if (virDomainPCIAddressReserveAddr(addrs, &tmp_addr, flags, 0) < 0) goto cleanup; - } cont->info.type = VIR_DOMAIN_DEVICE_ADDRESS_TYPE_PCI; cont->info.addr.pci.domain = 0; @@ -1385,12 +1386,12 @@ qemuDomainValidateDevicePCISlotsQ35(virDomainDefPtr def, tmp_addr.slot = 0x1F; tmp_addr.function = 0; tmp_addr.multi = VIR_TRISTATE_SWITCH_ON; - if (virDomainPCIAddressReserveAddr(addrs, &tmp_addr, flags) < 0) + if (virDomainPCIAddressReserveAddr(addrs, &tmp_addr, flags, 0) < 0) goto cleanup; tmp_addr.function = 3; tmp_addr.multi = VIR_TRISTATE_SWITCH_ABSENT; - if (virDomainPCIAddressReserveAddr(addrs, &tmp_addr, flags) < 0) + if (virDomainPCIAddressReserveAddr(addrs, &tmp_addr, flags, 0) < 0) goto cleanup; } @@ -1424,7 +1425,7 @@ qemuDomainValidateDevicePCISlotsQ35(virDomainDefPtr def, goto cleanup; } } else { - if (virDomainPCIAddressReserveAddr(addrs, &tmp_addr, flags) < 0) + if (virDomainPCIAddressReserveAddr(addrs, &tmp_addr, flags, 0) < 0) goto cleanup; primaryVideo->info.type = VIR_DOMAIN_DEVICE_ADDRESS_TYPE_PCI; primaryVideo->info.addr.pci = tmp_addr; @@ -1450,8 +1451,7 @@ qemuDomainValidateDevicePCISlotsQ35(virDomainDefPtr def, " device will not be possible without manual" " intervention"); virResetLastError(); - } else if (virDomainPCIAddressReserveAddr(addrs, - &tmp_addr, flags) < 0) { + } else if (virDomainPCIAddressReserveAddr(addrs, &tmp_addr, flags, 0) < 0) { goto cleanup; } } @@ -1472,7 +1472,7 @@ qemuDomainValidateDevicePCISlotsQ35(virDomainDefPtr def, !virDeviceInfoPCIAddressWanted(&sound->info)) { continue; } - if (virDomainPCIAddressReserveAddr(addrs, &tmp_addr, flags) < 0) + if (virDomainPCIAddressReserveAddr(addrs, &tmp_addr, flags, 0) < 0) goto cleanup; sound->info.type = VIR_DOMAIN_DEVICE_ADDRESS_TYPE_PCI; @@ -1676,7 +1676,8 @@ qemuDomainAssignDevicePCISlots(virDomainDefPtr def, if (foundAddr) { /* Reserve this function on the slot we found */ if (virDomainPCIAddressReserveAddr(addrs, &addr, - cont->info.pciConnectFlags) < 0) { + cont->info.pciConnectFlags, + cont->info.isolationGroup) < 0) { goto error; } -- 2.7.5

These rules will make it possible for libvirt to automatically assign PCI addresses in a way that respects any isolation constraints devices might have. Signed-off-by: Andrea Bolognani <abologna@redhat.com> Reviewed-by: Laine Stump <laine@laine.org> --- src/conf/domain_addr.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++--- src/conf/domain_addr.h | 3 +++ 2 files changed, 72 insertions(+), 4 deletions(-) diff --git a/src/conf/domain_addr.c b/src/conf/domain_addr.c index bb095a3..531fc68 100644 --- a/src/conf/domain_addr.c +++ b/src/conf/domain_addr.c @@ -369,6 +369,20 @@ virDomainPCIAddressBusIsFullyReserved(virDomainPCIAddressBusPtr bus) } +bool +virDomainPCIAddressBusIsEmpty(virDomainPCIAddressBusPtr bus) +{ + size_t i; + + for (i = bus->minSlot; i <= bus->maxSlot; i++) { + if (bus->slot[i].functions) + return false; + } + + return true; +} + + /* Ensure addr fits in the address set, by expanding it if needed * * Return value: @@ -548,7 +562,7 @@ static int ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) virDomainPCIAddressReserveAddrInternal(virDomainPCIAddressSetPtr addrs, virPCIDeviceAddressPtr addr, virDomainPCIConnectFlags flags, - unsigned int isolationGroup ATTRIBUTE_UNUSED, + unsigned int isolationGroup, bool fromConfig) { int ret = -1; @@ -586,6 +600,26 @@ virDomainPCIAddressReserveAddrInternal(virDomainPCIAddressSetPtr addrs, bus->slot[addr->slot].aggregate = true; } + if (virDomainPCIAddressBusIsEmpty(bus) && !bus->isolationGroupLocked) { + /* The first device decides the isolation group for the + * entire bus */ + bus->isolationGroup = isolationGroup; + VIR_DEBUG("PCI bus %.4x:%.2x assigned isolation group %u because of " + "first device %s", + addr->domain, addr->bus, isolationGroup, addrStr); + } else if (bus->isolationGroup != isolationGroup && fromConfig) { + /* If this is not the first function and its isolation group + * doesn't match the bus', then it should not be using this + * address. However, if the address comes from the user then + * we comply with the request and change the isolation group + * back to the default (because at that point isolation can't + * be guaranteed anymore) */ + bus->isolationGroup = 0; + VIR_DEBUG("PCI bus %.4x:%.2x assigned isolation group %u because of " + "user assigned address %s", + addr->domain, addr->bus, isolationGroup, addrStr); + } + /* mark the requested function as reserved */ bus->slot[addr->slot].functions |= (1 << addr->function); VIR_DEBUG("Reserving PCI address %s (aggregate='%s')", addrStr, @@ -763,7 +797,7 @@ static int ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) virDomainPCIAddressGetNextAddr(virDomainPCIAddressSetPtr addrs, virPCIDeviceAddressPtr next_addr, virDomainPCIConnectFlags flags, - unsigned int isolationGroup ATTRIBUTE_UNUSED, + unsigned int isolationGroup, int function) { virPCIDeviceAddress a = { 0 }; @@ -779,12 +813,41 @@ virDomainPCIAddressGetNextAddr(virDomainPCIAddressSetPtr addrs, else a.function = function; - /* "Begin at the beginning," the King said, very gravely, "and go on - * till you come to the end: then stop." */ + /* When looking for a suitable bus for the device, start by being + * very strict and ignoring all those where the isolation groups + * don't match. This ensures all devices sharing the same isolation + * group will end up on the same bus */ for (a.bus = 0; a.bus < addrs->nbuses; a.bus++) { virDomainPCIAddressBusPtr bus = &addrs->buses[a.bus]; bool found = false; + if (bus->isolationGroup != isolationGroup) + continue; + + a.slot = bus->minSlot; + + if (virDomainPCIAddressFindUnusedFunctionOnBus(bus, &a, function, + flags, &found) < 0) { + goto error; + } + + if (found) + goto success; + } + + /* We haven't been able to find a perfectly matching bus, but we + * might still be able to make this work by altering the isolation + * group for a bus that's currently empty. So let's try that */ + for (a.bus = 0; a.bus < addrs->nbuses; a.bus++) { + virDomainPCIAddressBusPtr bus = &addrs->buses[a.bus]; + bool found = false; + + /* We can only change the isolation group for a bus when + * plugging in the first device; moreover, some buses are + * prevented from ever changing it */ + if (!virDomainPCIAddressBusIsEmpty(bus) || bus->isolationGroupLocked) + continue; + a.slot = bus->minSlot; if (virDomainPCIAddressFindUnusedFunctionOnBus(bus, &a, function, @@ -792,6 +855,8 @@ virDomainPCIAddressGetNextAddr(virDomainPCIAddressSetPtr addrs, goto error; } + /* The isolation group for the bus will actually be changed + * later, in virDomainPCIAddressReserveAddrInternal() */ if (found) goto success; } diff --git a/src/conf/domain_addr.h b/src/conf/domain_addr.h index ac6d64f..205e7cf 100644 --- a/src/conf/domain_addr.h +++ b/src/conf/domain_addr.h @@ -142,6 +142,9 @@ int virDomainPCIAddressBusSetModel(virDomainPCIAddressBusPtr bus, bool virDomainPCIAddressBusIsFullyReserved(virDomainPCIAddressBusPtr bus) ATTRIBUTE_NONNULL(1); +bool virDomainPCIAddressBusIsEmpty(virDomainPCIAddressBusPtr bus) + ATTRIBUTE_NONNULL(1); + bool virDomainPCIAddressSlotInUse(virDomainPCIAddressSetPtr addrs, virPCIDeviceAddressPtr addr) ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2); -- 2.7.5

On 07/15/2017 11:30 AM, Andrea Bolognani wrote:
These rules will make it possible for libvirt to automatically assign PCI addresses in a way that respects any isolation constraints devices might have.
Signed-off-by: Andrea Bolognani <abologna@redhat.com> Reviewed-by: Laine Stump <laine@laine.org> --- src/conf/domain_addr.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++--- src/conf/domain_addr.h | 3 +++ 2 files changed, 72 insertions(+), 4 deletions(-)
[...]
@@ -763,7 +797,7 @@ static int ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) virDomainPCIAddressGetNextAddr(virDomainPCIAddressSetPtr addrs, virPCIDeviceAddressPtr next_addr, virDomainPCIConnectFlags flags, - unsigned int isolationGroup ATTRIBUTE_UNUSED, + unsigned int isolationGroup, int function) { virPCIDeviceAddress a = { 0 }; @@ -779,12 +813,41 @@ virDomainPCIAddressGetNextAddr(virDomainPCIAddressSetPtr addrs, else a.function = function;
- /* "Begin at the beginning," the King said, very gravely, "and go on - * till you come to the end: then stop." */ + /* When looking for a suitable bus for the device, start by being + * very strict and ignoring all those where the isolation groups + * don't match. This ensures all devices sharing the same isolation + * group will end up on the same bus */ for (a.bus = 0; a.bus < addrs->nbuses; a.bus++) { virDomainPCIAddressBusPtr bus = &addrs->buses[a.bus]; bool found = false;
+ if (bus->isolationGroup != isolationGroup) + continue; + + a.slot = bus->minSlot;
Something for a followup patch - it occurred to me while looking at this that due to your earlier patch changing the "find next" algorithm to always start over at bus='0' slot='0', virDomainPCIAddressFindUnusedFunctionOnBus() is now always called with a.slot == bus->minSlot. We should change that function internally to start at bus->minSlot rather than starting at a.slot - then we can remove the "pre-load" of a.slot = bus->minSlot in this function. Behavior will be identical, but the code will be easier for an outsider to follow (and look less like it has been hacked up multiple times from its original form :-P) My Reviewed-By still stands BTW.
+ + if (virDomainPCIAddressFindUnusedFunctionOnBus(bus, &a, function, + flags, &found) < 0) { + goto error; + } + + if (found)

All the pieces are now in place, so we can finally start using isolation groups to achieve our initial goal, which is separating hostdevs from emulated PCI devices while keeping hostdevs that belong to the same host IOMMU group together. Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=1280542 Signed-off-by: Andrea Bolognani <abologna@redhat.com> --- src/qemu/qemu_domain_address.c | 241 +++++++++++++++++++++ src/qemu/qemu_domain_address.h | 4 + src/qemu/qemu_hotplug.c | 7 + tests/qemumemlocktest.c | 2 +- .../qemuxml2argv-pseries-hostdevs-1.args | 8 +- .../qemuxml2argv-pseries-hostdevs-2.args | 3 +- .../qemuxml2argv-pseries-hostdevs-3.args | 2 +- .../qemuxml2xmlout-pseries-hostdevs-1.xml | 14 +- .../qemuxml2xmlout-pseries-hostdevs-2.xml | 6 +- .../qemuxml2xmlout-pseries-hostdevs-3.xml | 2 +- 10 files changed, 278 insertions(+), 11 deletions(-) diff --git a/src/qemu/qemu_domain_address.c b/src/qemu/qemu_domain_address.c index b247c85..2594712 100644 --- a/src/qemu/qemu_domain_address.c +++ b/src/qemu/qemu_domain_address.c @@ -25,6 +25,7 @@ #include "qemu_domain_address.h" #include "qemu_domain.h" +#include "network/bridge_driver.h" #include "viralloc.h" #include "virerror.h" #include "virlog.h" @@ -906,6 +907,243 @@ qemuDomainFillAllPCIConnectFlags(virDomainDefPtr def, /** + * qemuDomainFindUnusedIsolationGroupIter: + * @def: domain definition + * @dev: device definition + * @info: device information + * @opaque: user data + * + * Used to implement qemuDomainFindUnusedIsolationGroup(). You probably + * don't want to call this directly. + * + * Return: 0 if the isolation group is not used by the device, <1 otherwise. + */ +static int +qemuDomainFindUnusedIsolationGroupIter(virDomainDefPtr def ATTRIBUTE_UNUSED, + virDomainDeviceDefPtr dev ATTRIBUTE_UNUSED, + virDomainDeviceInfoPtr info, + void *opaque) +{ + unsigned int *isolationGroup = opaque; + + if (info->isolationGroup == *isolationGroup) + return -1; + + return 0; +} + + +/** + * qemuDomainFindUnusedIsolationGroup: + * @def: domain definition + * + * Find an isolation group that is not used by any device in @def yet. + * + * Normally, we'd look up the device's IOMMU group and base its isolation + * group on that; however, when a network interface uses a network backed + * by SR-IOV Virtual Functions, we can't know at PCI address assignment + * time which host device will be used so we can't look up its IOMMU group. + * + * We still want such a device to be isolated: this function can be used + * to obtain a synthetic isolation group usable for the purpose. + * + * Return: unused isolation group + */ +static unsigned int +qemuDomainFindUnusedIsolationGroup(virDomainDefPtr def) +{ + unsigned int isolationGroup = UINT_MAX; + + /* We start from the highest possible isolation group and work our + * way backwards so that we're working in a completely different range + * from IOMMU groups, thus avoiding clashes. We're realistically going + * to call this function just a few times per guest anyway */ + while (isolationGroup > 0 && + virDomainDeviceInfoIterate(def, + qemuDomainFindUnusedIsolationGroupIter, + &isolationGroup) < 0) { + isolationGroup--; + } + + return isolationGroup; +} + + +/** + * qemuDomainFillDeviceIsolationGroup: + * @def: domain definition + * @dev: device definition + * + * Fill isolation group information for a single device. + * + * Return: 0 on success, <0 on failure + * */ +int +qemuDomainFillDeviceIsolationGroup(virDomainDefPtr def, + virDomainDeviceDefPtr dev) +{ + int ret = -1; + + /* Only host devices need their isolation group to be different from + * the default. Interfaces of type hostdev are just host devices in + * disguise, but we don't need to handle them separately because for + * each such interface a corresponding hostdev is also added to the + * guest configuration */ + if (dev->type == VIR_DOMAIN_DEVICE_HOSTDEV) { + virDomainHostdevDefPtr hostdev = dev->data.hostdev; + virDomainDeviceInfoPtr info = hostdev->info; + virPCIDeviceAddressPtr hostAddr; + int tmp; + + /* Only PCI host devices are subject to isolation */ + if (hostdev->mode != VIR_DOMAIN_HOSTDEV_MODE_SUBSYS || + hostdev->source.subsys.type != VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI) { + goto skip; + } + + hostAddr = &hostdev->source.subsys.u.pci.addr; + + /* If a non-default isolation has already been assigned to the + * device, we can avoid looking up the information again */ + if (info->isolationGroup > 0) + goto skip; + + /* The isolation group depends on the IOMMU group assigned by the host */ + tmp = virPCIDeviceAddressGetIOMMUGroupNum(hostAddr); + + if (tmp < 0) { + VIR_WARN("Can't look up isolation group for host device " + "%04x:%02x:%02x.%x", + hostAddr->domain, hostAddr->bus, + hostAddr->slot, hostAddr->function); + goto cleanup; + } + + /* The isolation group for a host device is its IOMMU group, + * increased by one: this is because zero is a valid IOMMU group but + * that's also the default isolation group, which we want to save + * for emulated devices. Shifting isolation groups for host devices + * by one ensures there is no overlap */ + info->isolationGroup = tmp + 1; + + VIR_DEBUG("Isolation group for host device %04x:%02x:%02x.%x is %u", + hostAddr->domain, hostAddr->bus, + hostAddr->slot, hostAddr->function, + info->isolationGroup); + + } else if (dev->type == VIR_DOMAIN_DEVICE_NET) { + virDomainNetDefPtr iface = dev->data.net; + virDomainDeviceInfoPtr info = &iface->info; + unsigned int tmp; + + /* Network interfaces can ultimately result in the guest being + * assigned a host device if the libvirt network they're connected + * to is of type hostdev. Any other kind of network doesn't require + * us to isolate the guest device, so we can skip them */ + if (iface->type == VIR_DOMAIN_NET_TYPE_NETWORK && + networkGetActualType(iface) != VIR_DOMAIN_NET_TYPE_HOSTDEV) { + goto skip; + } + + /* If a non-default isolation has already been assigned to the + * device, we can avoid looking up the information again */ + if (info->isolationGroup > 0) + goto skip; + + /* Obtain a synthetic isolation group for the device, since at this + * point in time we don't have access to the IOMMU group of the host + * device that will eventually be used by the guest */ + tmp = qemuDomainFindUnusedIsolationGroup(def); + + if (tmp == 0) { + VIR_WARN("Can't obtain usable isolation group for interface " + "configured to use hostdev-backed network '%s'", + iface->data.network.name); + goto cleanup; + } + + info->isolationGroup = tmp; + + VIR_DEBUG("Isolation group for interface configured to use " + "hostdev-backed network '%s' is %u", + iface->data.network.name, info->isolationGroup); + } + + skip: + ret = 0; + + cleanup: + return ret; +} + + +/** + * qemuDomainFillDeviceIsolationGroupIter: + * @def: domain definition + * @dev: device definition + * @info: device information + * @opaque: user data + * + * A version of qemuDomainFillDeviceIsolationGroup() to be used + * with virDomainDeviceInfoIterate() + * + * Return: 0 on success, <0 on failure + */ +static int +qemuDomainFillDeviceIsolationGroupIter(virDomainDefPtr def, + virDomainDeviceDefPtr dev, + virDomainDeviceInfoPtr info ATTRIBUTE_UNUSED, + void *opaque ATTRIBUTE_UNUSED) +{ + return qemuDomainFillDeviceIsolationGroup(def, dev); +} + + +/** + * qemuDomainSetupIsolationGroups: + * @def: domain definition + * + * High-level function to set up isolation groups for all devices + * and controllers in @def. Isolation groups will only be set up if + * the guest architecture and machine type require it, so this + * function can and should be called unconditionally before attempting + * to assign any PCI address. + * + * Return: 0 on success, <0 on failure + */ +static int +qemuDomainSetupIsolationGroups(virDomainDefPtr def) +{ + int idx; + int ret = -1; + + /* Only pSeries guests care about isolation groups at the moment */ + if (!qemuDomainIsPSeries(def)) + return 0; + + idx = virDomainControllerFind(def, VIR_DOMAIN_CONTROLLER_TYPE_PCI, 0); + if (idx < 0) + goto cleanup; + + /* We want to prevent hostdevs from being plugged into the default PHB: + * we can make sure that doesn't happen by locking its isolation group */ + def->controllers[idx]->info.isolationGroupLocked = true; + + /* Fill in isolation groups for all other devices */ + if (virDomainDeviceInfoIterate(def, + qemuDomainFillDeviceIsolationGroupIter, + NULL) < 0) { + goto cleanup; + } + + ret = 0; + + cleanup: + return ret; +} + + +/** * qemuDomainFillDevicePCIConnectFlags: * * @def: the entire DomainDef @@ -2054,6 +2292,9 @@ qemuDomainAssignPCIAddresses(virDomainDefPtr def, if (qemuDomainFillAllPCIConnectFlags(def, qemuCaps, driver) < 0) goto cleanup; + if (qemuDomainSetupIsolationGroups(def) < 0) + goto cleanup; + if (nbuses > 0) { /* 1st pass to figure out how many PCI bridges we need */ if (!(addrs = qemuDomainPCIAddressSetCreate(def, nbuses, true))) diff --git a/src/qemu/qemu_domain_address.h b/src/qemu/qemu_domain_address.h index 067f4e7..b5644fa 100644 --- a/src/qemu/qemu_domain_address.h +++ b/src/qemu/qemu_domain_address.h @@ -44,6 +44,10 @@ int qemuDomainEnsurePCIAddress(virDomainObjPtr obj, virQEMUDriverPtr driver) ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) ATTRIBUTE_NONNULL(3); +int qemuDomainFillDeviceIsolationGroup(virDomainDefPtr def, + virDomainDeviceDefPtr dev) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2); + void qemuDomainReleaseDeviceAddress(virDomainObjPtr vm, virDomainDeviceInfoPtr info, const char *devstr); diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c index 4bc4972..da5aafa 100644 --- a/src/qemu/qemu_hotplug.c +++ b/src/qemu/qemu_hotplug.c @@ -1476,6 +1476,13 @@ qemuDomainAttachHostPCIDevice(virQEMUDriverPtr driver, if (qemuAssignDeviceHostdevAlias(vm->def, &info->alias, -1) < 0) goto error; + + if (qemuDomainIsPSeries(vm->def)) { + /* Isolation groups are only relevant for pSeries guests */ + if (qemuDomainFillDeviceIsolationGroup(vm->def, &dev) < 0) + goto error; + } + if (qemuDomainEnsurePCIAddress(vm, &dev, driver) < 0) goto error; releaseaddr = true; diff --git a/tests/qemumemlocktest.c b/tests/qemumemlocktest.c index c0f1dc3..268563d 100644 --- a/tests/qemumemlocktest.c +++ b/tests/qemumemlocktest.c @@ -131,7 +131,7 @@ mymain(void) DO_TEST("pseries-hardlimit", 2147483648); DO_TEST("pseries-locked", VIR_DOMAIN_MEMORY_PARAM_UNLIMITED); - DO_TEST("pseries-hostdev", 2168455168); + DO_TEST("pseries-hostdev", 4320133120); DO_TEST("pseries-hardlimit+locked", 2147483648); DO_TEST("pseries-hardlimit+hostdev", 2147483648); diff --git a/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-1.args b/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-1.args index 051ffde..8a4a4c5 100644 --- a/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-1.args +++ b/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-1.args @@ -18,6 +18,8 @@ QEMU_AUDIO_DRV=none \ server,nowait \ -mon chardev=charmonitor,id=monitor,mode=readline \ -boot c \ --device vfio-pci,host=0005:90:01.0,id=hostdev0,bus=pci.0,addr=0x1 \ --device vfio-pci,host=0001:01:00.0,id=hostdev1,bus=pci.0,addr=0x2 \ --device vfio-pci,host=0001:01:00.1,id=hostdev2,bus=pci.0,addr=0x3 +-device spapr-pci-host-bridge,index=1,id=pci.1 \ +-device spapr-pci-host-bridge,index=2,id=pci.2 \ +-device vfio-pci,host=0005:90:01.0,id=hostdev0,bus=pci.1.0,addr=0x1 \ +-device vfio-pci,host=0001:01:00.0,id=hostdev1,bus=pci.2.0,addr=0x1 \ +-device vfio-pci,host=0001:01:00.1,id=hostdev2,bus=pci.2.0,addr=0x2 diff --git a/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-2.args b/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-2.args index 83d4306..cd5b664 100644 --- a/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-2.args +++ b/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-2.args @@ -19,6 +19,7 @@ server,nowait \ -mon chardev=charmonitor,id=monitor,mode=readline \ -boot c \ -device spapr-pci-host-bridge,index=1,id=pci.1 \ +-device spapr-pci-host-bridge,index=2,id=pci.2 \ -device virtio-scsi-pci,id=scsi0,bus=pci.1.0,addr=0x1 \ -device vfio-pci,host=0001:01:00.0,id=hostdev0,bus=pci.1.0,addr=0x2 \ --device vfio-pci,host=0005:90:01.0,id=hostdev1,bus=pci.0,addr=0x1 +-device vfio-pci,host=0005:90:01.0,id=hostdev1,bus=pci.2.0,addr=0x1 diff --git a/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-3.args b/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-3.args index eda6cc7..66a31ba 100644 --- a/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-3.args +++ b/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-3.args @@ -21,4 +21,4 @@ server,nowait \ -device spapr-pci-host-bridge,index=1,id=pci.1 \ -device spapr-pci-host-bridge,index=2,id=pci.2 \ -device vfio-pci,host=0001:01:00.0,id=hostdev0,bus=pci.2.0,addr=0x1 \ --device vfio-pci,host=0001:01:00.1,id=hostdev1,bus=pci.0,addr=0x1 +-device vfio-pci,host=0001:01:00.1,id=hostdev1,bus=pci.2.0,addr=0x2 diff --git a/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-1.xml b/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-1.xml index fa9e4da..e77a060 100644 --- a/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-1.xml +++ b/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-1.xml @@ -19,27 +19,35 @@ <model name='spapr-pci-host-bridge'/> <target index='0'/> </controller> + <controller type='pci' index='1' model='pci-root'> + <model name='spapr-pci-host-bridge'/> + <target index='1'/> + </controller> + <controller type='pci' index='2' model='pci-root'> + <model name='spapr-pci-host-bridge'/> + <target index='2'/> + </controller> <interface type='hostdev' managed='yes'> <mac address='52:54:00:6d:90:02'/> <driver name='vfio'/> <source> <address type='pci' domain='0x0005' bus='0x90' slot='0x01' function='0x0'/> </source> - <address type='pci' domain='0x0000' bus='0x00' slot='0x01' function='0x0'/> + <address type='pci' domain='0x0000' bus='0x01' slot='0x01' function='0x0'/> </interface> <hostdev mode='subsystem' type='pci' managed='yes'> <driver name='vfio'/> <source> <address domain='0x0001' bus='0x01' slot='0x00' function='0x0'/> </source> - <address type='pci' domain='0x0000' bus='0x00' slot='0x02' function='0x0'/> + <address type='pci' domain='0x0000' bus='0x02' slot='0x01' function='0x0'/> </hostdev> <hostdev mode='subsystem' type='pci' managed='yes'> <driver name='vfio'/> <source> <address domain='0x0001' bus='0x01' slot='0x00' function='0x1'/> </source> - <address type='pci' domain='0x0000' bus='0x00' slot='0x03' function='0x0'/> + <address type='pci' domain='0x0000' bus='0x02' slot='0x02' function='0x0'/> </hostdev> <memballoon model='none'/> <panic model='pseries'/> diff --git a/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-2.xml b/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-2.xml index 17ff4c8..cfa395b 100644 --- a/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-2.xml +++ b/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-2.xml @@ -26,6 +26,10 @@ <model name='spapr-pci-host-bridge'/> <target index='1'/> </controller> + <controller type='pci' index='2' model='pci-root'> + <model name='spapr-pci-host-bridge'/> + <target index='2'/> + </controller> <hostdev mode='subsystem' type='pci' managed='yes'> <driver name='vfio'/> <source> @@ -38,7 +42,7 @@ <source> <address domain='0x0005' bus='0x90' slot='0x01' function='0x0'/> </source> - <address type='pci' domain='0x0000' bus='0x00' slot='0x01' function='0x0'/> + <address type='pci' domain='0x0000' bus='0x02' slot='0x01' function='0x0'/> </hostdev> <memballoon model='none'/> <panic model='pseries'/> diff --git a/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-3.xml b/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-3.xml index 58023ec..f91959b 100644 --- a/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-3.xml +++ b/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-3.xml @@ -39,7 +39,7 @@ <source> <address domain='0x0001' bus='0x01' slot='0x00' function='0x1'/> </source> - <address type='pci' domain='0x0000' bus='0x00' slot='0x01' function='0x0'/> + <address type='pci' domain='0x0000' bus='0x02' slot='0x02' function='0x0'/> </hostdev> <memballoon model='none'/> <panic model='pseries'/> -- 2.7.5

On 07/15/2017 11:30 AM, Andrea Bolognani wrote:
All the pieces are now in place, so we can finally start using isolation groups to achieve our initial goal, which is separating hostdevs from emulated PCI devices while keeping hostdevs that belong to the same host IOMMU group together.
Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=1280542
Signed-off-by: Andrea Bolognani <abologna@redhat.com>
Reviewed-by: Laine Stump <laine@laine.org> (I wonder if maybe it would be better to make the "default iommu group" that's used for emulated devices something other than 0, so that the debug log messages won't confuse anyone due to the displayed group being 1 off from that of the actual device on the host (see my comments a bit further down), but I'm willing to ACK it either way.
--- src/qemu/qemu_domain_address.c | 241 +++++++++++++++++++++ src/qemu/qemu_domain_address.h | 4 + src/qemu/qemu_hotplug.c | 7 + tests/qemumemlocktest.c | 2 +- .../qemuxml2argv-pseries-hostdevs-1.args | 8 +- .../qemuxml2argv-pseries-hostdevs-2.args | 3 +- .../qemuxml2argv-pseries-hostdevs-3.args | 2 +- .../qemuxml2xmlout-pseries-hostdevs-1.xml | 14 +- .../qemuxml2xmlout-pseries-hostdevs-2.xml | 6 +- .../qemuxml2xmlout-pseries-hostdevs-3.xml | 2 +- 10 files changed, 278 insertions(+), 11 deletions(-)
diff --git a/src/qemu/qemu_domain_address.c b/src/qemu/qemu_domain_address.c index b247c85..2594712 100644 --- a/src/qemu/qemu_domain_address.c +++ b/src/qemu/qemu_domain_address.c @@ -25,6 +25,7 @@
#include "qemu_domain_address.h" #include "qemu_domain.h" +#include "network/bridge_driver.h" #include "viralloc.h" #include "virerror.h" #include "virlog.h" @@ -906,6 +907,243 @@ qemuDomainFillAllPCIConnectFlags(virDomainDefPtr def,
/** + * qemuDomainFindUnusedIsolationGroupIter: + * @def: domain definition + * @dev: device definition + * @info: device information + * @opaque: user data + * + * Used to implement qemuDomainFindUnusedIsolationGroup(). You probably + * don't want to call this directly. + * + * Return: 0 if the isolation group is not used by the device, <1 otherwise. + */ +static int +qemuDomainFindUnusedIsolationGroupIter(virDomainDefPtr def ATTRIBUTE_UNUSED, + virDomainDeviceDefPtr dev ATTRIBUTE_UNUSED, + virDomainDeviceInfoPtr info, + void *opaque) +{ + unsigned int *isolationGroup = opaque; + + if (info->isolationGroup == *isolationGroup) + return -1; + + return 0; +} + + +/** + * qemuDomainFindUnusedIsolationGroup: + * @def: domain definition + * + * Find an isolation group that is not used by any device in @def yet. + * + * Normally, we'd look up the device's IOMMU group and base its isolation + * group on that; however, when a network interface uses a network backed + * by SR-IOV Virtual Functions, we can't know at PCI address assignment + * time which host device will be used so we can't look up its IOMMU group. + * + * We still want such a device to be isolated: this function can be used + * to obtain a synthetic isolation group usable for the purpose. + * + * Return: unused isolation group + */ +static unsigned int +qemuDomainFindUnusedIsolationGroup(virDomainDefPtr def) +{ + unsigned int isolationGroup = UINT_MAX; + + /* We start from the highest possible isolation group and work our + * way backwards so that we're working in a completely different range + * from IOMMU groups, thus avoiding clashes. We're realistically going + * to call this function just a few times per guest anyway */
Oh sure, you say that *now*. But what's going to happen when someone decides that iommu groups need to be represented by random numbers in the kernel rather than assigning them sequentially starting at 0! (No, I'm not serious - even if that was even a possibility, the chances of a clash would be infinitesimal.)
+ while (isolationGroup > 0 && + virDomainDeviceInfoIterate(def, + qemuDomainFindUnusedIsolationGroupIter, + &isolationGroup) < 0) { + isolationGroup--; + } + + return isolationGroup; +} + + +/** + * qemuDomainFillDeviceIsolationGroup: + * @def: domain definition + * @dev: device definition + * + * Fill isolation group information for a single device. + * + * Return: 0 on success, <0 on failure + * */ +int +qemuDomainFillDeviceIsolationGroup(virDomainDefPtr def, + virDomainDeviceDefPtr dev) +{ + int ret = -1; + + /* Only host devices need their isolation group to be different from + * the default. Interfaces of type hostdev are just host devices in + * disguise, but we don't need to handle them separately because for + * each such interface a corresponding hostdev is also added to the + * guest configuration */
I had totally forgotten about that. My God, what was I thinking????!?
+ if (dev->type == VIR_DOMAIN_DEVICE_HOSTDEV) { + virDomainHostdevDefPtr hostdev = dev->data.hostdev; + virDomainDeviceInfoPtr info = hostdev->info; + virPCIDeviceAddressPtr hostAddr; + int tmp; + + /* Only PCI host devices are subject to isolation */ + if (hostdev->mode != VIR_DOMAIN_HOSTDEV_MODE_SUBSYS || + hostdev->source.subsys.type != VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI) { + goto skip; + } + + hostAddr = &hostdev->source.subsys.u.pci.addr; + + /* If a non-default isolation has already been assigned to the + * device, we can avoid looking up the information again */ + if (info->isolationGroup > 0) + goto skip; + + /* The isolation group depends on the IOMMU group assigned by the host */ + tmp = virPCIDeviceAddressGetIOMMUGroupNum(hostAddr); + + if (tmp < 0) { + VIR_WARN("Can't look up isolation group for host device " + "%04x:%02x:%02x.%x", + hostAddr->domain, hostAddr->bus, + hostAddr->slot, hostAddr->function); + goto cleanup; + } + + /* The isolation group for a host device is its IOMMU group, + * increased by one: this is because zero is a valid IOMMU group but + * that's also the default isolation group, which we want to save + * for emulated devices. Shifting isolation groups for host devices + * by one ensures there is no overlap */
Which is fine because this is only used internally - we never actually try to relate this internal number to the external number. Of course, the whole reason you're doing this is so that the default value can be 0, making it easier to initialize objects. Otherwise, we could make the default group -1 (or UINT_MAX) and then there would be no confusion (e.g. if someone was looking at the debug log message a few lines down from here). So if the simplified initialization of objects is a good tradeoff in exchange for confusion when looking at debug output which probably nobody will ever look at, then I guess this works.
+ info->isolationGroup = tmp + 1; + + VIR_DEBUG("Isolation group for host device %04x:%02x:%02x.%x is %u", + hostAddr->domain, hostAddr->bus, + hostAddr->slot, hostAddr->function, + info->isolationGroup); + + } else if (dev->type == VIR_DOMAIN_DEVICE_NET) { + virDomainNetDefPtr iface = dev->data.net; + virDomainDeviceInfoPtr info = &iface->info; + unsigned int tmp; + + /* Network interfaces can ultimately result in the guest being + * assigned a host device if the libvirt network they're connected + * to is of type hostdev. Any other kind of network doesn't require + * us to isolate the guest device, so we can skip them */ + if (iface->type == VIR_DOMAIN_NET_TYPE_NETWORK && + networkGetActualType(iface) != VIR_DOMAIN_NET_TYPE_HOSTDEV) { + goto skip; + } + + /* If a non-default isolation has already been assigned to the + * device, we can avoid looking up the information again */ + if (info->isolationGroup > 0) + goto skip; + + /* Obtain a synthetic isolation group for the device, since at this + * point in time we don't have access to the IOMMU group of the host + * device that will eventually be used by the guest */ + tmp = qemuDomainFindUnusedIsolationGroup(def); + + if (tmp == 0) { + VIR_WARN("Can't obtain usable isolation group for interface " + "configured to use hostdev-backed network '%s'", + iface->data.network.name); + goto cleanup; + } + + info->isolationGroup = tmp; + + VIR_DEBUG("Isolation group for interface configured to use " + "hostdev-backed network '%s' is %u", + iface->data.network.name, info->isolationGroup); + } + + skip: + ret = 0; + + cleanup: + return ret; +} + + +/** + * qemuDomainFillDeviceIsolationGroupIter: + * @def: domain definition + * @dev: device definition + * @info: device information + * @opaque: user data + * + * A version of qemuDomainFillDeviceIsolationGroup() to be used + * with virDomainDeviceInfoIterate() + * + * Return: 0 on success, <0 on failure + */ +static int +qemuDomainFillDeviceIsolationGroupIter(virDomainDefPtr def, + virDomainDeviceDefPtr dev, + virDomainDeviceInfoPtr info ATTRIBUTE_UNUSED, + void *opaque ATTRIBUTE_UNUSED) +{ + return qemuDomainFillDeviceIsolationGroup(def, dev); +} + + +/** + * qemuDomainSetupIsolationGroups: + * @def: domain definition + * + * High-level function to set up isolation groups for all devices + * and controllers in @def. Isolation groups will only be set up if + * the guest architecture and machine type require it, so this + * function can and should be called unconditionally before attempting + * to assign any PCI address. + * + * Return: 0 on success, <0 on failure + */ +static int +qemuDomainSetupIsolationGroups(virDomainDefPtr def) +{ + int idx; + int ret = -1; + + /* Only pSeries guests care about isolation groups at the moment */ + if (!qemuDomainIsPSeries(def)) + return 0; + + idx = virDomainControllerFind(def, VIR_DOMAIN_CONTROLLER_TYPE_PCI, 0); + if (idx < 0) + goto cleanup; + + /* We want to prevent hostdevs from being plugged into the default PHB: + * we can make sure that doesn't happen by locking its isolation group */ + def->controllers[idx]->info.isolationGroupLocked = true; + + /* Fill in isolation groups for all other devices */ + if (virDomainDeviceInfoIterate(def, + qemuDomainFillDeviceIsolationGroupIter, + NULL) < 0) { + goto cleanup; + } + + ret = 0; + + cleanup: + return ret; +} + + +/** * qemuDomainFillDevicePCIConnectFlags: * * @def: the entire DomainDef @@ -2054,6 +2292,9 @@ qemuDomainAssignPCIAddresses(virDomainDefPtr def, if (qemuDomainFillAllPCIConnectFlags(def, qemuCaps, driver) < 0) goto cleanup;
+ if (qemuDomainSetupIsolationGroups(def) < 0) + goto cleanup; + if (nbuses > 0) { /* 1st pass to figure out how many PCI bridges we need */ if (!(addrs = qemuDomainPCIAddressSetCreate(def, nbuses, true))) diff --git a/src/qemu/qemu_domain_address.h b/src/qemu/qemu_domain_address.h index 067f4e7..b5644fa 100644 --- a/src/qemu/qemu_domain_address.h +++ b/src/qemu/qemu_domain_address.h @@ -44,6 +44,10 @@ int qemuDomainEnsurePCIAddress(virDomainObjPtr obj, virQEMUDriverPtr driver) ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) ATTRIBUTE_NONNULL(3);
+int qemuDomainFillDeviceIsolationGroup(virDomainDefPtr def, + virDomainDeviceDefPtr dev) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2); + void qemuDomainReleaseDeviceAddress(virDomainObjPtr vm, virDomainDeviceInfoPtr info, const char *devstr); diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c index 4bc4972..da5aafa 100644 --- a/src/qemu/qemu_hotplug.c +++ b/src/qemu/qemu_hotplug.c @@ -1476,6 +1476,13 @@ qemuDomainAttachHostPCIDevice(virQEMUDriverPtr driver,
if (qemuAssignDeviceHostdevAlias(vm->def, &info->alias, -1) < 0) goto error; + + if (qemuDomainIsPSeries(vm->def)) { + /* Isolation groups are only relevant for pSeries guests */ + if (qemuDomainFillDeviceIsolationGroup(vm->def, &dev) < 0) + goto error; + } +
Hotplug of course will work only if a PHB already exists that has the right iommu group.
if (qemuDomainEnsurePCIAddress(vm, &dev, driver) < 0) goto error; releaseaddr = true; diff --git a/tests/qemumemlocktest.c b/tests/qemumemlocktest.c index c0f1dc3..268563d 100644 --- a/tests/qemumemlocktest.c +++ b/tests/qemumemlocktest.c @@ -131,7 +131,7 @@ mymain(void)
DO_TEST("pseries-hardlimit", 2147483648); DO_TEST("pseries-locked", VIR_DOMAIN_MEMORY_PARAM_UNLIMITED); - DO_TEST("pseries-hostdev", 2168455168); + DO_TEST("pseries-hostdev", 4320133120);
DO_TEST("pseries-hardlimit+locked", 2147483648); DO_TEST("pseries-hardlimit+hostdev", 2147483648); diff --git a/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-1.args b/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-1.args index 051ffde..8a4a4c5 100644 --- a/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-1.args +++ b/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-1.args @@ -18,6 +18,8 @@ QEMU_AUDIO_DRV=none \ server,nowait \ -mon chardev=charmonitor,id=monitor,mode=readline \ -boot c \ --device vfio-pci,host=0005:90:01.0,id=hostdev0,bus=pci.0,addr=0x1 \ --device vfio-pci,host=0001:01:00.0,id=hostdev1,bus=pci.0,addr=0x2 \ --device vfio-pci,host=0001:01:00.1,id=hostdev2,bus=pci.0,addr=0x3 +-device spapr-pci-host-bridge,index=1,id=pci.1 \ +-device spapr-pci-host-bridge,index=2,id=pci.2 \ +-device vfio-pci,host=0005:90:01.0,id=hostdev0,bus=pci.1.0,addr=0x1 \ +-device vfio-pci,host=0001:01:00.0,id=hostdev1,bus=pci.2.0,addr=0x1 \ +-device vfio-pci,host=0001:01:00.1,id=hostdev2,bus=pci.2.0,addr=0x2 diff --git a/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-2.args b/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-2.args index 83d4306..cd5b664 100644 --- a/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-2.args +++ b/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-2.args @@ -19,6 +19,7 @@ server,nowait \ -mon chardev=charmonitor,id=monitor,mode=readline \ -boot c \ -device spapr-pci-host-bridge,index=1,id=pci.1 \ +-device spapr-pci-host-bridge,index=2,id=pci.2 \ -device virtio-scsi-pci,id=scsi0,bus=pci.1.0,addr=0x1 \ -device vfio-pci,host=0001:01:00.0,id=hostdev0,bus=pci.1.0,addr=0x2 \ --device vfio-pci,host=0005:90:01.0,id=hostdev1,bus=pci.0,addr=0x1 +-device vfio-pci,host=0005:90:01.0,id=hostdev1,bus=pci.2.0,addr=0x1 diff --git a/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-3.args b/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-3.args index eda6cc7..66a31ba 100644 --- a/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-3.args +++ b/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-3.args @@ -21,4 +21,4 @@ server,nowait \ -device spapr-pci-host-bridge,index=1,id=pci.1 \ -device spapr-pci-host-bridge,index=2,id=pci.2 \ -device vfio-pci,host=0001:01:00.0,id=hostdev0,bus=pci.2.0,addr=0x1 \ --device vfio-pci,host=0001:01:00.1,id=hostdev1,bus=pci.0,addr=0x1 +-device vfio-pci,host=0001:01:00.1,id=hostdev1,bus=pci.2.0,addr=0x2 diff --git a/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-1.xml b/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-1.xml index fa9e4da..e77a060 100644 --- a/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-1.xml +++ b/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-1.xml @@ -19,27 +19,35 @@ <model name='spapr-pci-host-bridge'/> <target index='0'/> </controller> + <controller type='pci' index='1' model='pci-root'> + <model name='spapr-pci-host-bridge'/> + <target index='1'/> + </controller> + <controller type='pci' index='2' model='pci-root'> + <model name='spapr-pci-host-bridge'/> + <target index='2'/> + </controller> <interface type='hostdev' managed='yes'> <mac address='52:54:00:6d:90:02'/> <driver name='vfio'/> <source> <address type='pci' domain='0x0005' bus='0x90' slot='0x01' function='0x0'/> </source> - <address type='pci' domain='0x0000' bus='0x00' slot='0x01' function='0x0'/> + <address type='pci' domain='0x0000' bus='0x01' slot='0x01' function='0x0'/> </interface> <hostdev mode='subsystem' type='pci' managed='yes'> <driver name='vfio'/> <source> <address domain='0x0001' bus='0x01' slot='0x00' function='0x0'/> </source> - <address type='pci' domain='0x0000' bus='0x00' slot='0x02' function='0x0'/> + <address type='pci' domain='0x0000' bus='0x02' slot='0x01' function='0x0'/> </hostdev> <hostdev mode='subsystem' type='pci' managed='yes'> <driver name='vfio'/> <source> <address domain='0x0001' bus='0x01' slot='0x00' function='0x1'/> </source> - <address type='pci' domain='0x0000' bus='0x00' slot='0x03' function='0x0'/> + <address type='pci' domain='0x0000' bus='0x02' slot='0x02' function='0x0'/> </hostdev> <memballoon model='none'/> <panic model='pseries'/> diff --git a/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-2.xml b/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-2.xml index 17ff4c8..cfa395b 100644 --- a/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-2.xml +++ b/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-2.xml @@ -26,6 +26,10 @@ <model name='spapr-pci-host-bridge'/> <target index='1'/> </controller> + <controller type='pci' index='2' model='pci-root'> + <model name='spapr-pci-host-bridge'/> + <target index='2'/> + </controller> <hostdev mode='subsystem' type='pci' managed='yes'> <driver name='vfio'/> <source> @@ -38,7 +42,7 @@ <source> <address domain='0x0005' bus='0x90' slot='0x01' function='0x0'/> </source> - <address type='pci' domain='0x0000' bus='0x00' slot='0x01' function='0x0'/> + <address type='pci' domain='0x0000' bus='0x02' slot='0x01' function='0x0'/> </hostdev> <memballoon model='none'/> <panic model='pseries'/> diff --git a/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-3.xml b/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-3.xml index 58023ec..f91959b 100644 --- a/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-3.xml +++ b/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-3.xml @@ -39,7 +39,7 @@ <source> <address domain='0x0001' bus='0x01' slot='0x00' function='0x1'/> </source> - <address type='pci' domain='0x0000' bus='0x00' slot='0x01' function='0x0'/> + <address type='pci' domain='0x0000' bus='0x02' slot='0x02' function='0x0'/> </hostdev> <memballoon model='none'/> <panic model='pseries'/>

On Sat, 2017-07-15 at 22:28 -0400, Laine Stump wrote:
+ /* The isolation group for a host device is its IOMMU group, + * increased by one: this is because zero is a valid IOMMU group but + * that's also the default isolation group, which we want to save + * for emulated devices. Shifting isolation groups for host devices + * by one ensures there is no overlap */ Which is fine because this is only used internally - we never actually try to relate this internal number to the external number. Of course, the whole reason you're doing this is so that the default value can be 0, making it easier to initialize objects. Otherwise, we could make the default group -1 (or UINT_MAX) and then there would be no confusion (e.g. if someone was looking at the debug log message a few lines down from here). So if the simplified initialization of objects is a good tradeoff in exchange for confusion when looking at debug output which probably nobody will ever look at, then I guess this works.
I tried pushing for having explicit allocation/initialization functions for structs rather than using VIR_ALLOC(), which for all intents and purposes means mandating the default value to be zero, but there was pushback. Since this alternative approach ultimately achieves the same result, I thought it would be best to just drop it. Another factor to keep in mind is that I took great care in making sure "isolation groups" is the only name used for the concept outside of this function, which means unless you're looking at this specific bit of code you should not even realize there is a correlation between isolation groups and IOMMU groups.
+++ b/src/qemu/qemu_hotplug.c @@ -1476,6 +1476,13 @@ qemuDomainAttachHostPCIDevice(virQEMUDriverPtr driver, if (qemuAssignDeviceHostdevAlias(vm->def, &info->alias, -1) < 0) goto error; + + if (qemuDomainIsPSeries(vm->def)) { + /* Isolation groups are only relevant for pSeries guests */ + if (qemuDomainFillDeviceIsolationGroup(vm->def, &dev) < 0) + goto error; + } Hotplug of course will work only if a PHB already exists that has the right iommu group.
It only needs to exist and not have any device with a different isolation group plugged in: an existing, empty PHB will simply change its isolation group to accept the hotplugged device. -- Andrea Bolognani / Red Hat / Virtualization

On Sat, 2017-07-15 at 17:30 +0200, Andrea Bolognani wrote:
All the pieces are now in place, so we can finally start using isolation groups to achieve our initial goal, which is separating hostdevs from emulated PCI devices while keeping hostdevs that belong to the same host IOMMU group together. Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=1280542 Signed-off-by: Andrea Bolognani <abologna@redhat.com> --- src/qemu/qemu_domain_address.c | 241 +++++++++++++++++++++ src/qemu/qemu_domain_address.h | 4 + src/qemu/qemu_hotplug.c | 7 + tests/qemumemlocktest.c | 2 +- .../qemuxml2argv-pseries-hostdevs-1.args | 8 +- .../qemuxml2argv-pseries-hostdevs-2.args | 3 +- .../qemuxml2argv-pseries-hostdevs-3.args | 2 +- .../qemuxml2xmlout-pseries-hostdevs-1.xml | 14 +- .../qemuxml2xmlout-pseries-hostdevs-2.xml | 6 +- .../qemuxml2xmlout-pseries-hostdevs-3.xml | 2 +- 10 files changed, 278 insertions(+), 11 deletions(-)
I'll squash this in before pushing: diff --git a/src/qemu/qemu_domain_address.c b/src/qemu/qemu_domain_address.c index 2594712..d943c8b 100644 --- a/src/qemu/qemu_domain_address.c +++ b/src/qemu/qemu_domain_address.c @@ -1038,9 +1038,9 @@ qemuDomainFillDeviceIsolationGroup(virDomainDefPtr def, /* Network interfaces can ultimately result in the guest being * assigned a host device if the libvirt network they're connected - * to is of type hostdev. Any other kind of network doesn't require - * us to isolate the guest device, so we can skip them */ - if (iface->type == VIR_DOMAIN_NET_TYPE_NETWORK && + * to is of type hostdev. All other kinds of network interfaces don't + * require us to isolate the guest device, so we can skip them */ + if (iface->type != VIR_DOMAIN_NET_TYPE_NETWORK || networkGetActualType(iface) != VIR_DOMAIN_NET_TYPE_HOSTDEV) { goto skip; } You know, to prevent libvirtd from crashing :) -- Andrea Bolognani / Red Hat / Virtualization

Signed-off-by: Andrea Bolognani <abologna@redhat.com> Reviewed-by: Laine Stump <laine@laine.org> --- docs/news.xml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/news.xml b/docs/news.xml index d519b72..ea21cbc 100644 --- a/docs/news.xml +++ b/docs/news.xml @@ -51,6 +51,16 @@ to the guest configuration. </description> </change> + <change> + <summary> + qemu: Isolate hostdevs on pSeries guests + </summary> + <description> + To enable better error reporting and recovery, unrelated hostdevs + will now be automatically isolated on pSeries guests by placing them + on separate PHBs (PCI Host Bridges). + </description> + </change> </section> <section title="Improvements"> <change> -- 2.7.5
participants (2)
-
Andrea Bolognani
-
Laine Stump