[libvirt] [PATCH (v4, kind of) 0/3] Use correct cpu set when doing automatic NUMA placement

With automatic cpu placement mode, the actual NUMA node set was ignored when setting CPUs. This patchset introduces helpers and the code to set the cpu binding policy according to the requested NUMA node set.

Osier Yang (1):
  qemu: Set cpuset.cpus for domain process

Peter Krempa (2):
  caps: Add helpers to convert NUMA nodes to corresponding CPUs
  qemu: Cleanup coding style nits in qemu_cgroup.c

 po/POTFILES.in           |   1 +
 src/conf/capabilities.c  |  64 ++++++++++++++++++++++++++
 src/conf/capabilities.h  |   2 +
 src/libvirt_private.syms |   1 +
 src/qemu/qemu_cgroup.c   | 114 ++++++++++++++++++++++++++++++++---------------
 5 files changed, 146 insertions(+), 36 deletions(-)

--
1.8.3.2

These helpers use the remembered host capabilities to retrieve the cpu map rather than querying the host again. The intended usage for these helpers is to fix automatic NUMA placement with strict memory allocation. The code doing the prepare step needs to pin the emulator process only to CPUs belonging to a subset of the host's NUMA nodes.
---
 po/POTFILES.in           |  1 +
 src/conf/capabilities.c  | 64 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/conf/capabilities.h  |  2 ++
 src/libvirt_private.syms |  1 +
 4 files changed, 68 insertions(+)

diff --git a/po/POTFILES.in b/po/POTFILES.in
index 0b65765..0770b8f 100644
--- a/po/POTFILES.in
+++ b/po/POTFILES.in
@@ -8,6 +8,7 @@ gnulib/lib/gai_strerror.c
 gnulib/lib/regcomp.c
 src/access/viraccessdriverpolkit.c
 src/access/viraccessmanager.c
+src/conf/capabilities.c
 src/conf/cpu_conf.c
 src/conf/device_conf.c
 src/conf/domain_conf.c
diff --git a/src/conf/capabilities.c b/src/conf/capabilities.c
index bb5fe4e..1acc936 100644
--- a/src/conf/capabilities.c
+++ b/src/conf/capabilities.c
@@ -916,3 +916,67 @@ virCapabilitiesFormatXML(virCapsPtr caps)
     return virBufferContentAndReset(&xml);
 }
+
+/* get the maximum ID of cpus in the host */
+static unsigned int
+virCapabilitiesGetHostMaxcpu(virCapsPtr caps)
+{
+    unsigned int maxcpu = 0;
+    size_t node;
+    size_t cpu;
+
+    for (node = 0; node < caps->host.nnumaCell; node++) {
+        virCapsHostNUMACellPtr cell = caps->host.numaCell[node];
+
+        for (cpu = 0; cpu < cell->ncpus; cpu++) {
+            if (cell->cpus[cpu].id > maxcpu)
+                maxcpu = cell->cpus[cpu].id;
+        }
+    }
+
+    return maxcpu;
+}
+
+/* set cpus of a numa node in the bitmask */
+static int
+virCapabilitiesGetCpusForNode(virCapsPtr caps,
+                              size_t node,
+                              virBitmapPtr cpumask)
+{
+    virCapsHostNUMACellPtr cell = caps->host.numaCell[node];
+    size_t cpu;
+
+    for (cpu = 0; cpu < cell->ncpus; cpu++) {
+        if (virBitmapSetBit(cpumask, cell->cpus[cpu].id) < 0) {
+            virReportError(VIR_ERR_INTERNAL_ERROR,
+                           _("Cpu '%u' in node '%zu' is out of range "
+                             "of the provided bitmap"),
+                           cell->cpus[cpu].id, node);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+virBitmapPtr
+virCapabilitiesGetCpusForNodemask(virCapsPtr caps,
+                                  virBitmapPtr nodemask)
+{
+    virBitmapPtr ret = NULL;
+    unsigned int maxcpu = virCapabilitiesGetHostMaxcpu(caps);
+    ssize_t node = -1;
+
+    if (!(ret = virBitmapNew(maxcpu + 1)))
+        return NULL;
+
+
+    while ((node = virBitmapNextSetBit(nodemask, node)) >= 0) {
+        if (virCapabilitiesGetCpusForNode(caps, node, ret) < 0) {
+            virBitmapFree(ret);
+            return NULL;
+        }
+    }
+
+    return ret;
+}
diff --git a/src/conf/capabilities.h b/src/conf/capabilities.h
index 6c7efde..88ec454 100644
--- a/src/conf/capabilities.h
+++ b/src/conf/capabilities.h
@@ -257,5 +257,7 @@ virCapabilitiesDefaultGuestEmulator(virCapsPtr caps,
 extern char *
 virCapabilitiesFormatXML(virCapsPtr caps);

+virBitmapPtr virCapabilitiesGetCpusForNodemask(virCapsPtr caps,
+                                               virBitmapPtr nodemask);

 #endif /* __VIR_CAPABILITIES_H */
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
index fc4e750..d07ceaa 100644
--- a/src/libvirt_private.syms
+++ b/src/libvirt_private.syms
@@ -57,6 +57,7 @@ virCapabilitiesDefaultGuestMachine;
 virCapabilitiesFormatXML;
 virCapabilitiesFreeMachines;
 virCapabilitiesFreeNUMAInfo;
+virCapabilitiesGetCpusForNodemask;
 virCapabilitiesNew;
 virCapabilitiesSetHostCPU;
--
1.8.3.2
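For anyone who wants to exercise the new helper in isolation, here is a minimal usage sketch. The wrapper name exampleCpusFromNodes is made up for illustration, the 8-node sizing mirrors the example host in patch 2/3, and "caps" is assumed to come from virQEMUDriverGetCapabilities() or any other populated virCapsPtr; the real caller is added in patch 2/3.

/* Sketch only: assumes conf/capabilities.h and util/virbitmap.h are available. */
static char *
exampleCpusFromNodes(virCapsPtr caps)
{
    virBitmapPtr nodemask = NULL;
    virBitmapPtr cpumap = NULL;
    char *cpustr = NULL;

    /* pretend numad advised node 1 on an 8-node host */
    if (!(nodemask = virBitmapNew(8)))
        return NULL;
    if (virBitmapSetBit(nodemask, 1) < 0)
        goto cleanup;

    /* resolve the node set to the CPU ids it contains;
     * the helper reports its own error on failure */
    if (!(cpumap = virCapabilitiesGetCpusForNodemask(caps, nodemask)))
        goto cleanup;

    /* e.g. "32,36,40,44,48,52,56,60" for node 1 of the host shown in 2/3 */
    cpustr = virBitmapFormat(cpumap);

cleanup:
    virBitmapFree(cpumap);
    virBitmapFree(nodemask);
    return cpustr;
}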

On 07/18/2013 12:49 PM, Peter Krempa wrote:
These helpers use the remembered host capabilities to retrieve the cpu map rather than querying the host again. The intended usage for these helpers is to fix automatic NUMA placement with strict memory allocation. The code doing the prepare step needs to pin the emulator process only to CPUs belonging to a subset of the host's NUMA nodes.
ACK,

Martin

From: Osier Yang <jyang@redhat.com>

When either "cpuset" of <vcpu> is specified, or the "placement" of <vcpu> is "auto", only setting cpuset.mems might cause the guest to fail to start. E.g. ("placement" of both <vcpu> and <numatune> is "auto"):

1) Related XMLs
  <vcpu placement='auto'>4</vcpu>
  <numatune>
    <memory mode='strict' placement='auto'/>
  </numatune>

2) Host NUMA topology
  % numactl --hardware
  available: 8 nodes (0-7)
  node 0 cpus: 0 4 8 12 16 20 24 28
  node 0 size: 16374 MB
  node 0 free: 11899 MB
  node 1 cpus: 32 36 40 44 48 52 56 60
  node 1 size: 16384 MB
  node 1 free: 15318 MB
  node 2 cpus: 2 6 10 14 18 22 26 30
  node 2 size: 16384 MB
  node 2 free: 15766 MB
  node 3 cpus: 34 38 42 46 50 54 58 62
  node 3 size: 16384 MB
  node 3 free: 15347 MB
  node 4 cpus: 3 7 11 15 19 23 27 31
  node 4 size: 16384 MB
  node 4 free: 15041 MB
  node 5 cpus: 35 39 43 47 51 55 59 63
  node 5 size: 16384 MB
  node 5 free: 15202 MB
  node 6 cpus: 1 5 9 13 17 21 25 29
  node 6 size: 16384 MB
  node 6 free: 15197 MB
  node 7 cpus: 33 37 41 45 49 53 57 61
  node 7 size: 16368 MB
  node 7 free: 15669 MB

4) cpuset.cpus will be set as: (from debug log)
  2013-05-09 16:50:17.296+0000: 417: debug : virCgroupSetValueStr:331 : Set value '/sys/fs/cgroup/cpuset/libvirt/qemu/toy/cpuset.cpus' to '0-63'

5) The advisory nodeset got from querying numad (from debug log)
  2013-05-09 16:50:17.295+0000: 417: debug : qemuProcessStart:3614 : Nodeset returned from numad: 1

6) cpuset.mems will be set as: (from debug log)
  2013-05-09 16:50:17.296+0000: 417: debug : virCgroupSetValueStr:331 : Set value '/sys/fs/cgroup/cpuset/libvirt/qemu/toy/cpuset.mems' to '0-7'

I.e. the domain process's memory is restricted to the first NUMA node, but it can use all of the CPUs, which will likely cause the domain process to fail to start because the kernel fails to allocate memory with the memory policy set to "strict".

  % tail -n 20 /var/log/libvirt/qemu/toy.log
  ...
  2013-05-09 05:53:32.972+0000: 7318: debug : virCommandHandshakeChild:377 : Handshake with parent is done
  char device redirected to /dev/pts/2 (label charserial0)
  kvm_init_vcpu failed: Cannot allocate memory
  ...

Signed-off-by: Peter Krempa <pkrempa@redhat.com>
---

Notes:
    Version 4:
    Resolve numa nodes to cpus for binding.
 src/qemu/qemu_cgroup.c | 51 ++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 43 insertions(+), 8 deletions(-)

diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c
index fbe7e6f..0ccbff9 100644
--- a/src/qemu/qemu_cgroup.c
+++ b/src/qemu/qemu_cgroup.c
@@ -615,10 +615,12 @@ cleanup:

 static int
 qemuSetupCpusetCgroup(virDomainObjPtr vm,
-                      virBitmapPtr nodemask)
+                      virBitmapPtr nodemask,
+                      virCapsPtr caps)
 {
     qemuDomainObjPrivatePtr priv = vm->privateData;
-    char *mask = NULL;
+    char *mem_mask = NULL;
+    char *cpu_mask = NULL;
     int rc;
     int ret = -1;

@@ -632,17 +634,17 @@ qemuSetupCpusetCgroup(virDomainObjPtr vm,
         if (vm->def->numatune.memory.placement_mode ==
             VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO)
-            mask = virBitmapFormat(nodemask);
+            mem_mask = virBitmapFormat(nodemask);
         else
-            mask = virBitmapFormat(vm->def->numatune.memory.nodemask);
+            mem_mask = virBitmapFormat(vm->def->numatune.memory.nodemask);

-        if (!mask) {
+        if (!mem_mask) {
             virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                            _("failed to convert memory nodemask"));
             goto cleanup;
         }

-        rc = virCgroupSetCpusetMems(priv->cgroup, mask);
+        rc = virCgroupSetCpusetMems(priv->cgroup, mem_mask);

         if (rc != 0) {
             virReportSystemError(-rc,
@@ -652,9 +654,37 @@ qemuSetupCpusetCgroup(virDomainObjPtr vm,
         }
     }

+    if (vm->def->cpumask ||
+        (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO)) {
+
+        if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
+            virBitmapPtr cpumap = virCapabilitiesGetCpusForNodemask(caps, nodemask);
+            cpu_mask = virBitmapFormat(cpumap);
+            virBitmapFree(cpumap);
+        } else {
+            cpu_mask = virBitmapFormat(vm->def->cpumask);
+        }
+
+        if (!cpu_mask) {
+            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                           _("failed to convert memory nodemask"));
+            goto cleanup;
+        }
+
+        rc = virCgroupSetCpusetCpus(priv->cgroup, cpu_mask);
+
+        if (rc != 0) {
+            virReportSystemError(-rc,
+                                 _("Unable to set cpuset.cpus for domain %s"),
+                                 vm->def->name);
+            goto cleanup;
+        }
+    }
+
     ret = 0;

 cleanup:
-    VIR_FREE(mask);
+    VIR_FREE(mem_mask);
+    VIR_FREE(cpu_mask);
     return ret;
 }

@@ -801,8 +831,12 @@ int qemuSetupCgroup(virQEMUDriverPtr driver,
                     virBitmapPtr nodemask)
 {
     qemuDomainObjPrivatePtr priv = vm->privateData;
+    virCapsPtr caps = NULL;
     int ret = -1;

+    if (!(caps = virQEMUDriverGetCapabilities(driver, false)))
+        goto cleanup;
+
     if (qemuInitCgroup(driver, vm, true) < 0)
         return -1;

@@ -821,11 +855,12 @@ int qemuSetupCgroup(virQEMUDriverPtr driver,
     if (qemuSetupCpuCgroup(vm) < 0)
         goto cleanup;

-    if (qemuSetupCpusetCgroup(vm, nodemask) < 0)
+    if (qemuSetupCpusetCgroup(vm, nodemask, caps) < 0)
         goto cleanup;

     ret = 0;

 cleanup:
+    virObjectUnref(caps);
     return ret;
 }
--
1.8.3.2

On 07/18/2013 12:49 PM, Peter Krempa wrote:
From: Osier Yang <jyang@redhat.com>
When either "cpuset" of <vcpu> is specified, or the "placement" of <vcpu> is "auto", only setting the cpuset.mems might cause the guest starting to fail. E.g. ("placement" of both <vcpu> and <numatune> is "auto"):
[...]
diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c
index fbe7e6f..0ccbff9 100644
--- a/src/qemu/qemu_cgroup.c
+++ b/src/qemu/qemu_cgroup.c
[...]
@@ -652,9 +654,37 @@ qemuSetupCpusetCgroup(virDomainObjPtr vm,
         }
     }
+    if (vm->def->cpumask ||
+        (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO)) {
+
+        if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
+            virBitmapPtr cpumap = virCapabilitiesGetCpusForNodemask(caps, nodemask);
You don't check for error after this line and then ...
+            cpu_mask = virBitmapFormat(cpumap);
+            virBitmapFree(cpumap);
+        } else {
+            cpu_mask = virBitmapFormat(vm->def->cpumask);
+        }
+
+        if (!cpu_mask) {
+            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                           _("failed to convert memory nodemask"));
... rewrite it here in case it fails, with an error about some fancy "memory nodemask" of yours (copy-paste error) ;-)

ACK with these two nits fixed.

Martin

On 07/18/13 14:39, Martin Kletzander wrote:
On 07/18/2013 12:49 PM, Peter Krempa wrote:
From: Osier Yang <jyang@redhat.com>
When either "cpuset" of <vcpu> is specified, or the "placement" of <vcpu> is "auto", only setting the cpuset.mems might cause the guest starting to fail. E.g. ("placement" of both <vcpu> and <numatune> is "auto"):
[...]
diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c
index fbe7e6f..0ccbff9 100644
--- a/src/qemu/qemu_cgroup.c
+++ b/src/qemu/qemu_cgroup.c
[...]
@@ -652,9 +654,37 @@ qemuSetupCpusetCgroup(virDomainObjPtr vm,
         }
     }
+    if (vm->def->cpumask ||
+        (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO)) {
+
+        if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
+            virBitmapPtr cpumap = virCapabilitiesGetCpusForNodemask(caps, nodemask);
You don't check for error after this line and then ...
+            cpu_mask = virBitmapFormat(cpumap);
+            virBitmapFree(cpumap);
+        } else {
+            cpu_mask = virBitmapFormat(vm->def->cpumask);
+        }
+
+        if (!cpu_mask) {
+            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                           _("failed to convert memory nodemask"));
... rewrite it here in case it fails, with an error about some fancy "memory nodemask" of yours (copy-paste error) ;-)
ACK with these two nits fixed.
I noticed that the last hunk of the patch would actually leak the caps in case the cgroup setup fails. I will be pushing this with the following patch squashed in:

diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c
index 9d2f757..4bebd31 100644
--- a/src/qemu/qemu_cgroup.c
+++ b/src/qemu/qemu_cgroup.c
@@ -836,15 +836,15 @@ int qemuSetupCgroup(virQEMUDriverPtr driver,
     virCapsPtr caps = NULL;
     int ret = -1;

-    if (!(caps = virQEMUDriverGetCapabilities(driver, false)))
-        goto cleanup;
-
     if (qemuInitCgroup(driver, vm, true) < 0)
         return -1;

     if (!priv->cgroup)
         return 0;

+    if (!(caps = virQEMUDriverGetCapabilities(driver, false)))
+        goto cleanup;
+
     if (qemuSetupDevicesCgroup(driver, vm) < 0)
         goto cleanup;
Martin
Peter

On 07/18/2013 02:57 PM, Peter Krempa wrote:
On 07/18/13 14:39, Martin Kletzander wrote: [...]
ACK with these two nits fixed.
I noticed that the last hunk of the patch would actually leak the caps in case the cgroup setup fails. I will be pushing this with the following patch squashed in:
diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c
index 9d2f757..4bebd31 100644
--- a/src/qemu/qemu_cgroup.c
+++ b/src/qemu/qemu_cgroup.c
@@ -836,15 +836,15 @@ int qemuSetupCgroup(virQEMUDriverPtr driver,
     virCapsPtr caps = NULL;
     int ret = -1;

-    if (!(caps = virQEMUDriverGetCapabilities(driver, false)))
-        goto cleanup;
-
     if (qemuInitCgroup(driver, vm, true) < 0)
         return -1;

     if (!priv->cgroup)
         return 0;

+    if (!(caps = virQEMUDriverGetCapabilities(driver, false)))
+        goto cleanup;
+
     if (qemuSetupDevicesCgroup(driver, vm) < 0)
         goto cleanup;
Oh, you're right, I missed the fact that there were returns instead of gotos; with this squashed in it is OK, since there are no returns after that point.

Martin

---
 src/qemu/qemu_cgroup.c | 69 +++++++++++++++++++++++++++-----------------------
 1 file changed, 38 insertions(+), 31 deletions(-)

diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c
index 0ccbff9..1a24fb3 100644
--- a/src/qemu/qemu_cgroup.c
+++ b/src/qemu/qemu_cgroup.c
@@ -76,8 +76,9 @@ qemuSetupDiskPathAllow(virDomainDiskDefPtr disk,
 }

-int qemuSetupDiskCgroup(virDomainObjPtr vm,
-                        virDomainDiskDefPtr disk)
+int
+qemuSetupDiskCgroup(virDomainObjPtr vm,
+                    virDomainDiskDefPtr disk)
 {
     qemuDomainObjPrivatePtr priv = vm->privateData;

@@ -85,10 +86,7 @@ int qemuSetupDiskCgroup(virDomainObjPtr vm,
                              VIR_CGROUP_CONTROLLER_DEVICES))
         return 0;

-    return virDomainDiskDefForeachPath(disk,
-                                       true,
-                                       qemuSetupDiskPathAllow,
-                                       vm);
+    return virDomainDiskDefForeachPath(disk, true, qemuSetupDiskPathAllow, vm);
 }

@@ -120,8 +118,9 @@ qemuTeardownDiskPathDeny(virDomainDiskDefPtr disk ATTRIBUTE_UNUSED,
 }

-int qemuTeardownDiskCgroup(virDomainObjPtr vm,
-                           virDomainDiskDefPtr disk)
+int
+qemuTeardownDiskCgroup(virDomainObjPtr vm,
+                       virDomainDiskDefPtr disk)
 {
     qemuDomainObjPrivatePtr priv = vm->privateData;

@@ -671,9 +670,7 @@ qemuSetupCpusetCgroup(virDomainObjPtr vm,
             goto cleanup;
         }

-        rc = virCgroupSetCpusetCpus(priv->cgroup, cpu_mask);
-
-        if (rc != 0) {
+        if ((rc = virCgroupSetCpusetCpus(priv->cgroup, cpu_mask)) != 0) {
             virReportSystemError(-rc,
                                  _("Unable to set cpuset.cpus for domain %s"),
                                  vm->def->name);
@@ -719,9 +716,10 @@ qemuSetupCpuCgroup(virDomainObjPtr vm)
 }

-int qemuInitCgroup(virQEMUDriverPtr driver,
-                   virDomainObjPtr vm,
-                   bool startup)
+int
+qemuInitCgroup(virQEMUDriverPtr driver,
+               virDomainObjPtr vm,
+               bool startup)
 {
     int rc = -1;
     qemuDomainObjPrivatePtr priv = vm->privateData;
@@ -826,9 +824,10 @@ cleanup:
 }

-int qemuSetupCgroup(virQEMUDriverPtr driver,
-                    virDomainObjPtr vm,
-                    virBitmapPtr nodemask)
+int
+qemuSetupCgroup(virQEMUDriverPtr driver,
+                virDomainObjPtr vm,
+                virBitmapPtr nodemask)
 {
     qemuDomainObjPrivatePtr priv = vm->privateData;
     virCapsPtr caps = NULL;
@@ -864,8 +863,10 @@ cleanup:
     return ret;
 }

-int qemuSetupCgroupVcpuBW(virCgroupPtr cgroup, unsigned long long period,
-                          long long quota)
+int
+qemuSetupCgroupVcpuBW(virCgroupPtr cgroup,
+                      unsigned long long period,
+                      long long quota)
 {
     int rc;
     unsigned long long old_period;
@@ -912,10 +913,11 @@ cleanup:
     return -1;
 }

-int qemuSetupCgroupVcpuPin(virCgroupPtr cgroup,
-                           virDomainVcpuPinDefPtr *vcpupin,
-                           int nvcpupin,
-                           int vcpuid)
+int
+qemuSetupCgroupVcpuPin(virCgroupPtr cgroup,
+                       virDomainVcpuPinDefPtr *vcpupin,
+                       int nvcpupin,
+                       int vcpuid)
 {
     size_t i;

@@ -928,8 +930,9 @@ int qemuSetupCgroupVcpuPin(virCgroupPtr cgroup,
     return -1;
 }

-int qemuSetupCgroupEmulatorPin(virCgroupPtr cgroup,
-                               virBitmapPtr cpumask)
+int
+qemuSetupCgroupEmulatorPin(virCgroupPtr cgroup,
+                           virBitmapPtr cpumask)
 {
     int rc = 0;
     char *new_cpus = NULL;
@@ -955,7 +958,8 @@ cleanup:
     return rc;
 }

-int qemuSetupCgroupForVcpu(virDomainObjPtr vm)
+int
+qemuSetupCgroupForVcpu(virDomainObjPtr vm)
 {
     virCgroupPtr cgroup_vcpu = NULL;
     qemuDomainObjPrivatePtr priv = vm->privateData;
@@ -1043,9 +1047,10 @@ cleanup:
     return -1;
 }

-int qemuSetupCgroupForEmulator(virQEMUDriverPtr driver,
-                               virDomainObjPtr vm,
-                               virBitmapPtr nodemask)
+int
+qemuSetupCgroupForEmulator(virQEMUDriverPtr driver,
+                           virDomainObjPtr vm,
+                           virBitmapPtr nodemask)
 {
     virBitmapPtr cpumask = NULL;
     virBitmapPtr cpumap = NULL;
@@ -1125,7 +1130,8 @@ cleanup:
     return rc;
 }

-int qemuRemoveCgroup(virDomainObjPtr vm)
+int
+qemuRemoveCgroup(virDomainObjPtr vm)
 {
     qemuDomainObjPrivatePtr priv = vm->privateData;

@@ -1135,7 +1141,8 @@ int qemuRemoveCgroup(virDomainObjPtr vm)
     return virCgroupRemove(priv->cgroup);
 }

-int qemuAddToCgroup(virDomainObjPtr vm)
+int
+qemuAddToCgroup(virDomainObjPtr vm)
 {
     qemuDomainObjPrivatePtr priv = vm->privateData;
     int rc;
--
1.8.3.2

On 07/18/13 14:39, Martin Kletzander wrote:
On 07/18/2013 12:49 PM, Peter Krempa wrote:
---
 src/qemu/qemu_cgroup.c | 69 +++++++++++++++++++++++++++-----------------------
 1 file changed, 38 insertions(+), 31 deletions(-)
ACK,
Martin
Thanks for the review. The series is now pushed with the fixes you requested in 2/3.

Peter
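For reference, a minimal sketch of how the cpuset.cpus hunk from 2/3 might look with both review nits folded in. This is only an illustration derived from the comments above, not the pushed commit; it assumes the surrounding qemuSetupCpusetCgroup() context from the patch (vm, priv, caps, nodemask, rc, cpu_mask and the cleanup label), and the reworded error message is a guess.

    if (vm->def->cpumask ||
        vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {

        if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
            /* resolve the numad-provided node set to the CPUs it covers */
            virBitmapPtr cpumap = virCapabilitiesGetCpusForNodemask(caps, nodemask);

            /* nit 1: bail out if the conversion itself failed */
            if (!cpumap)
                goto cleanup;

            cpu_mask = virBitmapFormat(cpumap);
            virBitmapFree(cpumap);
        } else {
            cpu_mask = virBitmapFormat(vm->def->cpumask);
        }

        if (!cpu_mask) {
            /* nit 2: don't report this as a "memory nodemask" failure */
            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                           _("failed to convert cpu mask"));
            goto cleanup;
        }

        if ((rc = virCgroupSetCpusetCpus(priv->cgroup, cpu_mask)) != 0) {
            virReportSystemError(-rc,
                                 _("Unable to set cpuset.cpus for domain %s"),
                                 vm->def->name);
            goto cleanup;
        }
    }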