[libvirt] [PATCH v1 1/1] qemu_process.c: add CAP_IPC_LOCK when using libcap-ng

QEMU virtual machines with PCI passthrough of certain devices, such as the NVIDIA Tesla V100 GPU, require allocation of an amount of memory pages that can break KVM limits. When that happens, the KVM module checks whether the process has the IPC_LOCK capability and, if it does not, refuses to allocate the memory pages, causing the guest to misbehave.

When Libvirt is compiled with libcap-ng support, the resulting QEMU guest Libvirt creates does not have CAP_IPC_LOCK, hurting guests that are doing PCI passthrough and that require extra memory to work.

This patch addresses this issue by checking whether we're running with libcap-ng support (WITH_CAPNG) and whether the guest is using PCI passthrough with the VFIO driver. If those conditions are met, allow the CAP_IPC_LOCK capability to the QEMU process.

Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com>
---
 src/libvirt_private.syms |  1 +
 src/qemu/qemu_process.c  | 34 ++++++++++++++++++++++++++++++++++
 src/util/virhostdev.c    |  2 +-
 src/util/virhostdev.h    |  4 ++++
 4 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
index 6b401aa5e0..b7384a67a8 100644
--- a/src/libvirt_private.syms
+++ b/src/libvirt_private.syms
@@ -1992,6 +1992,7 @@ virHostCPUStatsAssign;

 # util/virhostdev.h
 virHostdevFindUSBDevice;
+virHostdevGetPCIHostDeviceList;
 virHostdevIsMdevDevice;
 virHostdevIsSCSIDevice;
 virHostdevManagerGetDefault;
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index 0583eb03f2..24aef5904f 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -4997,6 +4997,38 @@ qemuProcessSetupRawIO(virQEMUDriverPtr driver,
 }


+static void
+qemuProcessSetupVFIOCaps(virDomainObjPtr vm ATTRIBUTE_UNUSED,
+                         virCommandPtr cmd ATTRIBUTE_UNUSED)
+{
+#if WITH_CAPNG
+    virPCIDeviceListPtr pcidevs = NULL;
+    bool has_vfio = false;
+    size_t i;
+
+    pcidevs = virHostdevGetPCIHostDeviceList(vm->def->hostdevs,
+                                             vm->def->nhostdevs);
+    if (!pcidevs)
+        return;
+
+    for (i = 0; i < virPCIDeviceListCount(pcidevs); i++) {
+        virPCIDevicePtr pci = virPCIDeviceListGet(pcidevs, i);
+        if (virPCIDeviceGetStubDriver(pci) == VIR_PCI_STUB_DRIVER_VFIO) {
+            has_vfio = true;
+            break;
+        }
+    }
+
+    if (has_vfio) {
+        VIR_DEBUG("Adding CAP_IPC_LOCK to QEMU process");
+        virCommandAllowCap(cmd, CAP_IPC_LOCK);
+    }
+
+    virObjectUnref(pcidevs);
+#endif
+}
+
+
 static int
 qemuProcessSetupBalloon(virQEMUDriverPtr driver,
                         virDomainObjPtr vm,
@@ -6569,6 +6601,8 @@ qemuProcessLaunch(virConnectPtr conn,
     if (qemuProcessSetupRawIO(driver, vm, cmd) < 0)
         goto cleanup;

+    qemuProcessSetupVFIOCaps(vm, cmd);
+
     virCommandSetPreExecHook(cmd, qemuProcessHook, &hookData);
     virCommandSetMaxProcesses(cmd, cfg->maxProcesses);
     virCommandSetMaxFiles(cmd, cfg->maxFiles);
diff --git a/src/util/virhostdev.c b/src/util/virhostdev.c
index 6be395cdda..ee389a1729 100644
--- a/src/util/virhostdev.c
+++ b/src/util/virhostdev.c
@@ -218,7 +218,7 @@ virHostdevManagerGetDefault(void)
     return virObjectRef(manager);
 }

-static virPCIDeviceListPtr
+virPCIDeviceListPtr
 virHostdevGetPCIHostDeviceList(virDomainHostdevDefPtr *hostdevs, int nhostdevs)
 {
     virPCIDeviceListPtr pcidevs;
diff --git a/src/util/virhostdev.h b/src/util/virhostdev.h
index 7263f320a2..9235581242 100644
--- a/src/util/virhostdev.h
+++ b/src/util/virhostdev.h
@@ -57,6 +57,10 @@ struct _virHostdevManager {
 };

 virHostdevManagerPtr virHostdevManagerGetDefault(void);
+virPCIDeviceListPtr
+virHostdevGetPCIHostDeviceList(virDomainHostdevDefPtr *hostdevs,
+                               int nhostdevs)
+    ATTRIBUTE_NONNULL(1);
 int
 virHostdevPreparePCIDevices(virHostdevManagerPtr hostdev_mgr,
                             const char *drv_name,
--
2.20.1
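[Editor's note] For readers without the kernel context: the limit the commit message refers to is the process's RLIMIT_MEMLOCK, and CAP_IPC_LOCK is the capability that exempts a process from it; the VFIO/KVM pinning paths apply an analogous capable(CAP_IPC_LOCK) escape hatch when accounting pinned pages. A minimal, self-contained user-space sketch of the same failure mode (illustrative C, not libvirt or kernel code):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/resource.h>

int main(void)
{
    struct rlimit rl;

    if (getrlimit(RLIMIT_MEMLOCK, &rl) < 0) {
        perror("getrlimit");
        return 1;
    }
    if (rl.rlim_cur == RLIM_INFINITY) {
        printf("RLIMIT_MEMLOCK is unlimited; nothing to demonstrate\n");
        return 0;
    }
    printf("RLIMIT_MEMLOCK soft limit: %llu bytes\n",
           (unsigned long long)rl.rlim_cur);

    /* Try to lock twice the soft limit. Without CAP_IPC_LOCK the
     * kernel refuses with ENOMEM -- the same class of refusal QEMU
     * hits when a large passthrough DMA window must be pinned. */
    size_t len = (size_t)rl.rlim_cur * 2;
    void *buf = malloc(len);

    if (!buf) {
        perror("malloc");
        return 1;
    }
    if (mlock(buf, len) < 0)
        printf("mlock(%zu bytes) failed: %s\n", len, strerror(errno));
    else
        printf("mlock succeeded (CAP_IPC_LOCK is likely in effect)\n");

    munlock(buf, len);
    free(buf);
    return 0;
}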

On Wed, Feb 06, 2019 at 01:44:53PM -0200, Daniel Henrique Barboza wrote:
QEMU virtual machines with PCI passthrough of certain devices, such as the NVIDIA Tesla V100 GPU, require allocation of an amount of memory pages that can break KVM limits. When that
Wow, ^this means only one thing: this Tesla card requires more than 1GB of extra memory to be locked, is that right?

Erik

On 2/6/19 1:51 PM, Erik Skultety wrote:
On Wed, Feb 06, 2019 at 01:44:53PM -0200, Daniel Henrique Barboza wrote:
QEMU virtual machines with PCI passthrough of certain devices, such as the NVIDIA Tesla V100 GPU, require allocation of an amount of memory pages that can break KVM limits. When that

Wow, ^this means only one thing: this Tesla card requires more than 1GB of extra memory to be locked, is that right?

Erik
Yes. Just checked in the dmesg of the guest out of curiosity. The very first big DMA call (the one that causes the IPC_LOCK issue) is already a bit larger than that.

There are a few nuances to be considered. As I mentioned earlier in Alex's reply, this is a custom QEMU that has NVLink2 passthrough support. The V100 card has 16GB of RAM that is allocated in its own NUMA node, and the pseries machine emulates this behavior. So I am not sure whether these DMAs are locking regular host RAM or the GPU memory used to populate the NUMA node the guest uses. If the latter, then a case can be made for allowing QEMU to run with IPC_LOCK - even if we limit it to the pseries machine assigning a Tesla V100 device (although Yuval reported a similar issue with a Mellanox mlx5 card in a reply to the previous thread ...).

Thanks,

DHB

[cc +Alexey]

On Wed, 6 Feb 2019 13:44:53 -0200
Daniel Henrique Barboza <danielhb413@gmail.com> wrote:
QEMU virtual machines with PCI passthrough of certain devices, such as the NVIDIA Tesla V100 GPU, require allocation of an amount of memory pages that can break KVM limits. When that happens, the KVM module checks whether the process has the IPC_LOCK capability and, if it does not, refuses to allocate the memory pages, causing the guest to misbehave.

When Libvirt is compiled with libcap-ng support, the resulting QEMU guest Libvirt creates does not have CAP_IPC_LOCK, hurting guests that are doing PCI passthrough and that require extra memory to work.

This patch addresses this issue by checking whether we're running with libcap-ng support (WITH_CAPNG) and whether the guest is using PCI passthrough with the VFIO driver. If those conditions are met, allow the CAP_IPC_LOCK capability to the QEMU process.
That effectively allows the QEMU process to lock all the memory it wants and thereby generate a denial of service on the host, which is the very thing the pinned-page accounting and the user's locked-memory limits were put in place to prevent. The commit log doesn't mention that this is only seen on POWER, but I infer that from the previous thread. Does this happen for any assigned device on POWER, or is it something to do with the new NVLink support that Alexey has been adding? Thanks,

Alex
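[Editor's note] The enforcement Alex is defending runs through RLIMIT_MEMLOCK, not through the capability: libvirt's usual answer for VFIO guests is to raise that rlimit on the one QEMU process by a computed amount (qemuDomainGetMemLockLimitBytes feeds the value) rather than exempting the process from accounting altogether. A rough sketch of that per-process mechanism using the prlimit(2) syscall directly -- a simplification for illustration, not libvirt's actual code:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/resource.h>

/* Raise RLIMIT_MEMLOCK for one already-running process instead of
 * granting it CAP_IPC_LOCK: the process can then pin up to 'bytes'
 * and not a byte more, so the host keeps its denial-of-service
 * guard. Adjusting another process's hard limit requires
 * CAP_SYS_RESOURCE in the caller. */
static int
raise_memlock_limit(pid_t pid, unsigned long long bytes)
{
    struct rlimit rl = { .rlim_cur = bytes, .rlim_max = bytes };

    if (prlimit(pid, RLIMIT_MEMLOCK, &rl, NULL) < 0) {
        perror("prlimit");
        return -1;
    }
    return 0;
}

int main(int argc, char **argv)
{
    if (argc != 3) {
        fprintf(stderr, "usage: %s <pid> <bytes>\n", argv[0]);
        return 1;
    }
    return raise_memlock_limit((pid_t)atol(argv[1]),
                               strtoull(argv[2], NULL, 10)) < 0;
}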

On 2/6/19 2:34 PM, Alex Williamson wrote:
[cc +Alexey]
On Wed, 6 Feb 2019 13:44:53 -0200 Daniel Henrique Barboza <danielhb413@gmail.com> wrote:
QEMU virtual machines with PCI passthrough of certain devices, such as the NVIDIA Tesla V100 GPU, require allocation of an amount of memory pages that can break KVM limits. When that happens, the KVM module checks whether the process has the IPC_LOCK capability and, if it does not, refuses to allocate the memory pages, causing the guest to misbehave.

When Libvirt is compiled with libcap-ng support, the resulting QEMU guest Libvirt creates does not have CAP_IPC_LOCK, hurting guests that are doing PCI passthrough and that require extra memory to work.

This patch addresses this issue by checking whether we're running with libcap-ng support (WITH_CAPNG) and whether the guest is using PCI passthrough with the VFIO driver. If those conditions are met, allow the CAP_IPC_LOCK capability to the QEMU process.

That effectively allows the QEMU process to lock all the memory it wants and thereby generate a denial of service on the host, which is the very thing the pinned-page accounting and the user's locked-memory limits were put in place to prevent. The commit log doesn't mention that this is only seen on POWER, but I infer that from the previous thread. Does this happen for any assigned device on POWER, or is it something to do with the new NVLink support that Alexey has been adding? Thanks,

Alex
You're right on both counts. The behavior I am addressing here is seen on a Power 9 server using a QEMU build with NVLink2 passthrough support from Alexey. Not all assigned devices on Power want to lock more memory than KVM allows, at least as far as I can tell.

About your argument of allowing QEMU to lock all memory: I agree that is a problem. Aside from trusting that QEMU will not get greedy and lock down all the host memory (which is what I'm doing here, in a sense), I can only speculate about alternatives. I'd rather wait for Alexey's input.

Thanks,

DHB

Alexey replied in the QEMU mailing list in his patch series. Here's the link: https://patchwork.kernel.org/cover/10767473/

Based on Alexey's explanation, I was able to create a quick PoC that changes the calculation of the passthrough memory lock limit for the PPC64 machine in qemu_domain.c, qemuDomainGetMemLockLimitBytes, making the guest boot up without IPC_LOCK privileges and with NVLink2 passthrough. Given that this is based on how the NVLink2 support is implemented in the pseries machine in QEMU, this code will also need to verify the GPU model being passed through; a sketch of the idea follows below.

This deprecates the solution I was aiming for in this patch. There's no need to grant IPC_LOCK to guests that use VFIO, because the guest will not allocate RAM beyond the limit. The new approach is also confined to the PPC64 code, so the impact on x86 and other archs is minimal.*

Thanks,

DHB

* we can always generalize this code if other archs end up doing DMA window resizes with passthrough devices the same way Power does.
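[Editor's note] To make the shape of that PoC concrete, here is a hypothetical sketch of the kind of per-machine limit calculation described above. The helper name and the constants (the V100's 16GiB of on-board RAM, a 1GiB baseline for DMA window overhead) are assumptions drawn from this thread, not the actual PoC or the final libvirt implementation:

#include <stddef.h>

#define GIB (1024ULL * 1024 * 1024)

/* Hypothetical ppc64 memlock limit: instead of handing QEMU
 * CAP_IPC_LOCK, size RLIMIT_MEMLOCK so that guest RAM, the IOMMU
 * DMA window overhead, and each NVLink2 GPU's on-board RAM (16GiB
 * on the Tesla V100, exposed to the guest as an extra NUMA node)
 * are all accounted for. */
static unsigned long long
ppc64NVLink2MemLockLimitBytes(unsigned long long guestRamBytes,
                              size_t nNvlink2Gpus)
{
    /* baseline: the guest's own RAM plus fixed DMA window overhead */
    unsigned long long limit = guestRamBytes + 1 * GIB;

    /* each passed-through NVLink2 GPU adds its on-board RAM to what
     * QEMU may legitimately need to pin */
    limit += nNvlink2Gpus * 16 * GIB;

    return limit;
}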