On 2/6/19 2:34 PM, Alex Williamson wrote:
[cc +Alexey]
On Wed, 6 Feb 2019 13:44:53 -0200
Daniel Henrique Barboza <danielhb413(a)gmail.com> wrote:
> QEMU virtual machines with PCI passthrough of certain devices,
> such as the NVIDIA Tesla V100 GPU, requires allocation of an
> amount of memory pages that can break KVM limits. When that
> happens, the KVM module checks if the process is IPC_LOCK capable
> and, in case it doesn't, it refuses to allocate the mem pages,
> causing the guest to misbehave.
>
> When Libvirt is compiled to use libcap-ng support, the resulting
> QEMU guest Libvirt creates does not have CAP_IPC_LOCK, hurting
> guests that are doing PCI passthrough and that requires extra
> memory to work.
>
> This patch addresses this issue by checking whether we're
> running with libcap-ng support (WITH_CAPNG) and if the
> guest is using PCI passthrough with the VFIO driver. If those
> conditions are met, allow CAP_IPC_LOCK capability to the
> QEMU process.
That effectively allows the QEMU process to lock all the memory it
wants and thereby generate a denial of service on the host, which is
the thing we're originally trying to prevent by accounting pinned pages
and enforcing the user's locked memory limits. The commit log doesn't
mention that this is only seen on POWER, but I infer that from the
previous thread. Does this happen for any assigned device on POWER or
is this something to do with the new NVLink support that Alexey has been
adding? Thanks,
You're right in both accounts. The behavior I am addressing here is seen
in a
Power 9 server using a QEMU build with NVLink2 passthrough support
from Alexey.
Not all assigned devices in Power wants to lock more memory than KVM allows,
at least as far as I can tell.
About your argument of allowing QEMU to lock all memory, I agree that
is a problem. Aside from trusting that QEMU will not get greedy and lock
down all the host memory (which is what I'm doing here, in a sense), I
can only speculate alternatives. I'd rather wait for Alexey's input.
Thanks,
DHB
Alex
> Signed-off-by: Daniel Henrique Barboza <danielhb413(a)gmail.com>
> ---
> src/libvirt_private.syms | 1 +
> src/qemu/qemu_process.c | 34 ++++++++++++++++++++++++++++++++++
> src/util/virhostdev.c | 2 +-
> src/util/virhostdev.h | 4 ++++
> 4 files changed, 40 insertions(+), 1 deletion(-)
>
> diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
> index 6b401aa5e0..b7384a67a8 100644
> --- a/src/libvirt_private.syms
> +++ b/src/libvirt_private.syms
> @@ -1992,6 +1992,7 @@ virHostCPUStatsAssign;
>
> # util/virhostdev.h
> virHostdevFindUSBDevice;
> +virHostdevGetPCIHostDeviceList;
> virHostdevIsMdevDevice;
> virHostdevIsSCSIDevice;
> virHostdevManagerGetDefault;
> diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
> index 0583eb03f2..24aef5904f 100644
> --- a/src/qemu/qemu_process.c
> +++ b/src/qemu/qemu_process.c
> @@ -4997,6 +4997,38 @@ qemuProcessSetupRawIO(virQEMUDriverPtr driver,
> }
>
>
> +static void
> +qemuProcessSetupVFIOCaps(virDomainObjPtr vm ATTRIBUTE_UNUSED,
> + virCommandPtr cmd ATTRIBUTE_UNUSED)
> +{
> +#if WITH_CAPNG
> + virPCIDeviceListPtr pcidevs = NULL;
> + bool has_vfio = false;
> + size_t i;
> +
> + pcidevs = virHostdevGetPCIHostDeviceList(vm->def->hostdevs,
> + vm->def->nhostdevs);
> + if (!pcidevs)
> + return;
> +
> + for (i = 0; i < virPCIDeviceListCount(pcidevs); i++) {
> + virPCIDevicePtr pci = virPCIDeviceListGet(pcidevs, i);
> + if (virPCIDeviceGetStubDriver(pci) == VIR_PCI_STUB_DRIVER_VFIO) {
> + has_vfio = true;
> + break;
> + }
> + }
> +
> + if (has_vfio) {
> + VIR_DEBUG("Adding CAP_IPC_LOCK to QEMU process");
> + virCommandAllowCap(cmd, CAP_IPC_LOCK);
> + }
> +
> + virObjectUnref(pcidevs);
> +#endif
> +}
> +
> +
> static int
> qemuProcessSetupBalloon(virQEMUDriverPtr driver,
> virDomainObjPtr vm,
> @@ -6569,6 +6601,8 @@ qemuProcessLaunch(virConnectPtr conn,
> if (qemuProcessSetupRawIO(driver, vm, cmd) < 0)
> goto cleanup;
>
> + qemuProcessSetupVFIOCaps(vm, cmd);
> +
> virCommandSetPreExecHook(cmd, qemuProcessHook, &hookData);
> virCommandSetMaxProcesses(cmd, cfg->maxProcesses);
> virCommandSetMaxFiles(cmd, cfg->maxFiles);
> diff --git a/src/util/virhostdev.c b/src/util/virhostdev.c
> index 6be395cdda..ee389a1729 100644
> --- a/src/util/virhostdev.c
> +++ b/src/util/virhostdev.c
> @@ -218,7 +218,7 @@ virHostdevManagerGetDefault(void)
> return virObjectRef(manager);
> }
>
> -static virPCIDeviceListPtr
> +virPCIDeviceListPtr
> virHostdevGetPCIHostDeviceList(virDomainHostdevDefPtr *hostdevs, int nhostdevs)
> {
> virPCIDeviceListPtr pcidevs;
> diff --git a/src/util/virhostdev.h b/src/util/virhostdev.h
> index 7263f320a2..9235581242 100644
> --- a/src/util/virhostdev.h
> +++ b/src/util/virhostdev.h
> @@ -57,6 +57,10 @@ struct _virHostdevManager {
> };
>
> virHostdevManagerPtr virHostdevManagerGetDefault(void);
> +virPCIDeviceListPtr
> +virHostdevGetPCIHostDeviceList(virDomainHostdevDefPtr *hostdevs,
> + int nhostdevs)
> + ATTRIBUTE_NONNULL(1);
> int
> virHostdevPreparePCIDevices(virHostdevManagerPtr hostdev_mgr,
> const char *drv_name,