[libvirt] [PATCH 0/3 v3] Cgroup fixes
Instead of only posting v2 of 5/7 (v2) in the old mail thread, this resends the remaining patches of v2 in an independent thread for easier reviewing. v2 - v3: * 1/7 ~ 4/7 of v1 are pushed. * goto label "cleanup" is kept with Daniel's suggestion for 5/7 (v1) * Commit log of 6/7 (v1) is updated a bit to reflect the cause more clearly v1 - v2: * Just resending v2: https://www.redhat.com/archives/libvir-list/2013-May/msg01314.html Osier Yang (3): qemu: Abstract code for the cpu controller setting into a helper qemu: Set cpuset.cpus for domain process qemu: Prohibit getting the numa parameters if mode is not strict src/qemu/qemu_cgroup.c | 94 ++++++++++++++++++++++++++++++++++++-------------- src/qemu/qemu_driver.c | 11 ++++-- 2 files changed, 78 insertions(+), 27 deletions(-) -- 1.8.1.4
--- src/qemu/qemu_cgroup.c | 55 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index bb1b235..cf46993 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -676,6 +676,36 @@ cleanup: } +static int +qemuSetupCpuCgroup(virDomainObjPtr vm) +{ + qemuDomainObjPrivatePtr priv = vm->privateData; + int rc = -1; + + if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) { + if (vm->def->cputune.shares) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("CPU tuning is not available on this host")); + return -1; + } else { + return 0; + } + } + + if (vm->def->cputune.shares) { + rc = virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to set io cpu shares for domain %s"), + vm->def->name); + return -1; + } + } + + return 0; +} + + int qemuInitCgroup(virQEMUDriverPtr driver, virDomainObjPtr vm, bool startup) @@ -789,14 +819,14 @@ int qemuSetupCgroup(virQEMUDriverPtr driver, virDomainObjPtr vm, virBitmapPtr nodemask) { - int rc = -1; qemuDomainObjPrivatePtr priv = vm->privateData; + int ret = -1; if (qemuInitCgroup(driver, vm, true) < 0) return -1; if (!priv->cgroup) - goto done; + return 0; if (qemuSetupDevicesCgroup(driver, vm) < 0) goto cleanup; @@ -807,28 +837,15 @@ int qemuSetupCgroup(virQEMUDriverPtr driver, if (qemuSetupMemoryCgroup(vm) < 0) goto cleanup; - if (vm->def->cputune.shares != 0) { - if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) { - rc = virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares); - if (rc != 0) { - virReportSystemError(-rc, - _("Unable to set io cpu shares for domain %s"), - vm->def->name); - goto cleanup; - } - } else { - virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", - _("CPU tuning is not available on this host")); - } - } + if (qemuSetupCpuCgroup(vm) < 0) + goto cleanup; if (qemuSetupCpusetCgroup(vm, 
nodemask) < 0) goto cleanup; -done: - rc = 0; + ret = 0; cleanup: - return rc == 0 ? 0 : -1; + return ret; } int qemuSetupCgroupVcpuBW(virCgroupPtr cgroup, unsigned long long period, -- 1.8.1.4
On 05/24/2013 11:08 AM, Osier Yang wrote:
--- src/qemu/qemu_cgroup.c | 55 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 19 deletions(-)
diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index bb1b235..cf46993 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -676,6 +676,36 @@ cleanup: }
+static int +qemuSetupCpuCgroup(virDomainObjPtr vm) +{ + qemuDomainObjPrivatePtr priv = vm->privateData; + int rc = -1; + + if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) { + if (vm->def->cputune.shares) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("CPU tuning is not available on this host")); + return -1; + } else { + return 0; + } + } + + if (vm->def->cputune.shares) { + rc = virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to set io cpu shares for domain %s"), + vm->def->name); + return -1; + } + } + + return 0; +} + +
I think this would be a bit more readable: if (!vm->def->cputune.shares) return 0; if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) { virReportError(...); return -1; } rc = virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares); if (rc != 0) { ... but that's a matter of opinion. ACK either way. Martin
On 04/06/13 22:23, Martin Kletzander wrote:
On 05/24/2013 11:08 AM, Osier Yang wrote:
--- src/qemu/qemu_cgroup.c | 55 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 19 deletions(-)
diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index bb1b235..cf46993 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -676,6 +676,36 @@ cleanup: }
+static int +qemuSetupCpuCgroup(virDomainObjPtr vm) +{ + qemuDomainObjPrivatePtr priv = vm->privateData; + int rc = -1; + + if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) { + if (vm->def->cputune.shares) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("CPU tuning is not available on this host")); + return -1; + } else { + return 0; + } + } + + if (vm->def->cputune.shares) { + rc = virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to set io cpu shares for domain %s"), + vm->def->name); + return -1; + } + } + + return 0; +} + + I think this would be a bit more readable:
if (!vm->def->cputune.shares) return 0;
if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) { virReportError(...); return -1; }
rc = virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares); if (rc != 0) { ...
but that's a matter of opinion. ACK either way.
Clearer indeed, but it's the same case for the other helpers; I can change them together in a later patch. Pushed this as-is. Thanks. Osier
When either "cpuset" of <vcpu> is specified, or the "placement" of <vcpu> is "auto", only setting the cpuset.mems might cause the guest starting to fail. E.g. ("placement" of both <vcpu> and <numatune> is "auto"): 1) Related XMLs <vcpu placement='auto'>4</vcpu> <numatune> <memory mode='strict' placement='auto'/> </numatune> 2) Host NUMA topology % numactl --hardware available: 8 nodes (0-7) node 0 cpus: 0 4 8 12 16 20 24 28 node 0 size: 16374 MB node 0 free: 11899 MB node 1 cpus: 32 36 40 44 48 52 56 60 node 1 size: 16384 MB node 1 free: 15318 MB node 2 cpus: 2 6 10 14 18 22 26 30 node 2 size: 16384 MB node 2 free: 15766 MB node 3 cpus: 34 38 42 46 50 54 58 62 node 3 size: 16384 MB node 3 free: 15347 MB node 4 cpus: 3 7 11 15 19 23 27 31 node 4 size: 16384 MB node 4 free: 15041 MB node 5 cpus: 35 39 43 47 51 55 59 63 node 5 size: 16384 MB node 5 free: 15202 MB node 6 cpus: 1 5 9 13 17 21 25 29 node 6 size: 16384 MB node 6 free: 15197 MB node 7 cpus: 33 37 41 45 49 53 57 61 node 7 size: 16368 MB node 7 free: 15669 MB 4) cpuset.cpus will be set as: (from debug log) 2013-05-09 16:50:17.296+0000: 417: debug : virCgroupSetValueStr:331 : Set value '/sys/fs/cgroup/cpuset/libvirt/qemu/toy/cpuset.cpus' to '0-63' 5) The advisory nodeset got from querying numad (from debug log) 2013-05-09 16:50:17.295+0000: 417: debug : qemuProcessStart:3614 : Nodeset returned from numad: 1 6) cpuset.mems will be set as: (from debug log) 2013-05-09 16:50:17.296+0000: 417: debug : virCgroupSetValueStr:331 : Set value '/sys/fs/cgroup/cpuset/libvirt/qemu/toy/cpuset.mems' to '0-7' I.E, the domain process's memory is restricted on the first NUMA node, however, it can use all of the CPUs, which will likely cause the domain process to fail to start because of the kernel fails to allocate memory with the the memory policy as "strict". % tail -n 20 /var/log/libvirt/qemu/toy.log ... 
2013-05-09 05:53:32.972+0000: 7318: debug : virCommandHandshakeChild:377 : Handshake with parent is done char device redirected to /dev/pts/2 (label charserial0) kvm_init_vcpu failed: Cannot allocate memory ... --- src/qemu/qemu_cgroup.c | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index cf46993..5bfae77 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -635,7 +635,8 @@ qemuSetupCpusetCgroup(virDomainObjPtr vm, virBitmapPtr nodemask) { qemuDomainObjPrivatePtr priv = vm->privateData; - char *mask = NULL; + char *mem_mask = NULL; + char *cpu_mask = NULL; int rc; int ret = -1; @@ -649,17 +650,17 @@ qemuSetupCpusetCgroup(virDomainObjPtr vm, if (vm->def->numatune.memory.placement_mode == VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO) - mask = virBitmapFormat(nodemask); + mem_mask = virBitmapFormat(nodemask); else - mask = virBitmapFormat(vm->def->numatune.memory.nodemask); + mem_mask = virBitmapFormat(vm->def->numatune.memory.nodemask); - if (!mask) { + if (!mem_mask) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("failed to convert memory nodemask")); goto cleanup; } - rc = virCgroupSetCpusetMems(priv->cgroup, mask); + rc = virCgroupSetCpusetMems(priv->cgroup, mem_mask); if (rc != 0) { virReportSystemError(-rc, @@ -669,9 +670,35 @@ qemuSetupCpusetCgroup(virDomainObjPtr vm, } } + if (vm->def->cpumask || + (vm->def->placement_mode == + VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO)) { + if (vm->def->placement_mode == + VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) + cpu_mask = virBitmapFormat(nodemask); + else + cpu_mask = virBitmapFormat(vm->def->cpumask); + + if (!cpu_mask) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("failed to convert memory nodemask")); + goto cleanup; + } + + rc = virCgroupSetCpusetCpus(priv->cgroup, cpu_mask); + + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to set cpuset.cpus for domain %s"), + vm->def->name); + goto cleanup; + } 
+ } + ret = 0; cleanup: - VIR_FREE(mask); + VIR_FREE(mem_mask); + VIR_FREE(cpu_mask); return ret; } -- 1.8.1.4
On 05/24/2013 11:08 AM, Osier Yang wrote:
When either "cpuset" of <vcpu> is specified, or the "placement" of <vcpu> is "auto", only setting the cpuset.mems might cause the guest to fail to start. E.g. ("placement" of both <vcpu> and <numatune> is "auto"):
After spending a lot of time with this, I'm still not convinced that this makes the domain unbootable. Even when mempolicy is set to 'strict' and the cpuset.cpus are from a different node than cpuset.mems, the allocation shouldn't fail; it just slows down access to the memory. Might be a kernel bug?
1) Related XMLs <vcpu placement='auto'>4</vcpu> <numatune> <memory mode='strict' placement='auto'/> </numatune>
2) Host NUMA topology % numactl --hardware available: 8 nodes (0-7) node 0 cpus: 0 4 8 12 16 20 24 28 node 0 size: 16374 MB node 0 free: 11899 MB node 1 cpus: 32 36 40 44 48 52 56 60 node 1 size: 16384 MB node 1 free: 15318 MB node 2 cpus: 2 6 10 14 18 22 26 30 node 2 size: 16384 MB node 2 free: 15766 MB node 3 cpus: 34 38 42 46 50 54 58 62 node 3 size: 16384 MB node 3 free: 15347 MB node 4 cpus: 3 7 11 15 19 23 27 31 node 4 size: 16384 MB node 4 free: 15041 MB node 5 cpus: 35 39 43 47 51 55 59 63 node 5 size: 16384 MB node 5 free: 15202 MB node 6 cpus: 1 5 9 13 17 21 25 29 node 6 size: 16384 MB node 6 free: 15197 MB node 7 cpus: 33 37 41 45 49 53 57 61 node 7 size: 16368 MB node 7 free: 15669 MB
4) cpuset.cpus will be set as: (from debug log)
2013-05-09 16:50:17.296+0000: 417: debug : virCgroupSetValueStr:331 : Set value '/sys/fs/cgroup/cpuset/libvirt/qemu/toy/cpuset.cpus' to '0-63'
5) The advisory nodeset got from querying numad (from debug log)
2013-05-09 16:50:17.295+0000: 417: debug : qemuProcessStart:3614 : Nodeset returned from numad: 1
6) cpuset.mems will be set as: (from debug log)
2013-05-09 16:50:17.296+0000: 417: debug : virCgroupSetValueStr:331 : Set value '/sys/fs/cgroup/cpuset/libvirt/qemu/toy/cpuset.mems' to '0-7'
I.e., the domain process's memory is restricted to the first NUMA node; however, it can use all of the CPUs, which will likely cause the domain process to fail to start because the kernel fails to allocate memory with the memory policy set to "strict".
% tail -n 20 /var/log/libvirt/qemu/toy.log ... 2013-05-09 05:53:32.972+0000: 7318: debug : virCommandHandshakeChild:377 : Handshake with parent is done char device redirected to /dev/pts/2 (label charserial0) kvm_init_vcpu failed: Cannot allocate memory ... --- src/qemu/qemu_cgroup.c | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-)
diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index cf46993..5bfae77 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -635,7 +635,8 @@ qemuSetupCpusetCgroup(virDomainObjPtr vm, virBitmapPtr nodemask) { qemuDomainObjPrivatePtr priv = vm->privateData; - char *mask = NULL; + char *mem_mask = NULL; + char *cpu_mask = NULL; int rc; int ret = -1;
@@ -649,17 +650,17 @@ qemuSetupCpusetCgroup(virDomainObjPtr vm,
if (vm->def->numatune.memory.placement_mode == VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO) - mask = virBitmapFormat(nodemask); + mem_mask = virBitmapFormat(nodemask); else - mask = virBitmapFormat(vm->def->numatune.memory.nodemask); + mem_mask = virBitmapFormat(vm->def->numatune.memory.nodemask);
- if (!mask) { + if (!mem_mask) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("failed to convert memory nodemask")); goto cleanup; }
- rc = virCgroupSetCpusetMems(priv->cgroup, mask); + rc = virCgroupSetCpusetMems(priv->cgroup, mem_mask);
if (rc != 0) { virReportSystemError(-rc, @@ -669,9 +670,35 @@ qemuSetupCpusetCgroup(virDomainObjPtr vm, } }
+ if (vm->def->cpumask || + (vm->def->placement_mode == + VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO)) { + if (vm->def->placement_mode == + VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) + cpu_mask = virBitmapFormat(nodemask);
You're right that we are not setting the cpuset.cpus at all and we should, but in the scenario you described in the commit message this will set the cpuset.cpus to the same value as cpuset.mems. Let's say 'numad -w X:Y' will return '3'. In this case cpuset.mems will be set correctly, but cpuset.cpus must be set to all cpus on that node ('34,38,42,46,50,54,58,62' in this case). ACK if you convert the node to CPUs as described. Martin.
I don't see any reason to getting the numa parameters if mode is not strict, as long as setNumaParameters doesn't allow to set "nodeset" if the mode is not strict, and cpuset.mems only understand "strict" mode. Things could be changed if we support to get numa parameters with libnuma one day, but let's prohibit it now. --- src/qemu/qemu_driver.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 3df26b8..9168072 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -7841,8 +7841,15 @@ qemuDomainGetNumaParameters(virDomainPtr dom, if (flags & VIR_DOMAIN_AFFECT_LIVE) { if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_MEMORY)) { - virReportError(VIR_ERR_OPERATION_INVALID, - "%s", _("cgroup memory controller is not mounted")); + virReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("cgroup memory controller is not mounted")); + goto cleanup; + } + + if (vm->def->numatune.memory.mode != VIR_DOMAIN_NUMATUNE_MEM_STRICT) { + virReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("getting numa parameters of running domain is " + "only valid for strict numa mode")); goto cleanup; } } -- 1.8.1.4
On 05/24/2013 11:08 AM, Osier Yang wrote:
I don't see any reason for getting the numa parameters if the mode is not strict, as long as setNumaParameters doesn't allow setting "nodeset" if the mode is not strict, and cpuset.mems only understands "strict" mode.
NACK, this makes sense for 'interleave' as well, and checking this adds more code without any special benefit. This also makes the API fail when run with default params even when the caller only wants to know what mode the domain was started with. The worst thing that can happen now is that the caller will not get the second parameter, which is OK (especially when it makes sense). Martin
ping? On 24/05/13 17:08, Osier Yang wrote:
Instead of only posting v2 of 5/7 (v2) in the old mail thread, this resends the remaining patches of v2 in an independent thread for easier reviewing.
v2 - v3: * 1/7 ~ 4/7 of v1 are pushed. * goto label "cleanup" is kept with Daniel's suggestion for 5/7 (v1) * Commit log of 6/7 (v1) is updated a bit to reflect the cause more clearly
v1 - v2: * Just resending
v2: https://www.redhat.com/archives/libvir-list/2013-May/msg01314.html
Osier Yang (3): qemu: Abstract code for the cpu controller setting into a helper qemu: Set cpuset.cpus for domain process qemu: Prohibit getting the numa parameters if mode is not strict
src/qemu/qemu_cgroup.c | 94 ++++++++++++++++++++++++++++++++++++-------------- src/qemu/qemu_driver.c | 11 ++++-- 2 files changed, 78 insertions(+), 27 deletions(-)
participants (2)
-
Martin Kletzander -
Osier Yang