[libvirt] [PATCH 0/6] qemu: Fix hotplugging cpus with strict memory pinning

Details are in the patches themselves, but the basic idea is this: Setup: $ grep DMA32 /proc/zoneinfo Node 0, zone DMA32 $ virsh dumpxml domain | grep -C1 strict <numatune> <memory mode='strict' nodeset='1'/> </numatune> $ virsh start domain Domain domain started Before: $ virsh setvcpus domain 2 error: Unable to read from monitor: Connection reset by peer # Domain died After: $ virsh setvcpus domain 2 # hotplug successful Martin Martin Kletzander (6): util: Add function virCgroupHasEmptyTasks util: Add virNumaGetHostNodeset qemu: Remove unnecessary qemuSetupCgroupPostInit function qemu: Save numad advice into qemuDomainObjPrivate qemu: Leave cpuset.mems in parent cgroup alone qemu: Fix hotplugging cpus with strict memory pinning src/libvirt_private.syms | 2 ++ src/qemu/qemu_cgroup.c | 94 +++++++++++++++++++++++++++++++++++++----------- src/qemu/qemu_cgroup.h | 9 ++--- src/qemu/qemu_domain.c | 1 + src/qemu/qemu_domain.h | 1 + src/qemu/qemu_driver.c | 88 +++++++++++++++++++++++++-------------------- src/qemu/qemu_process.c | 21 ++++++----- src/util/vircgroup.c | 23 ++++++++++++ src/util/vircgroup.h | 4 ++- src/util/virnuma.c | 28 +++++++++++++++ src/util/virnuma.h | 1 + 11 files changed, 194 insertions(+), 78 deletions(-) -- 2.2.0

That function helps checking whether there's a task in that cgroup. Signed-off-by: Martin Kletzander <mkletzan@redhat.com> --- src/libvirt_private.syms | 1 + src/util/vircgroup.c | 23 +++++++++++++++++++++++ src/util/vircgroup.h | 4 +++- 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 6df2784..60979f8 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1096,6 +1096,7 @@ virCgroupGetMemSwapHardLimit; virCgroupGetMemSwapUsage; virCgroupGetPercpuStats; virCgroupHasController; +virCgroupHasEmptyTasks; virCgroupIsolateMount; virCgroupKill; virCgroupKillPainfully; diff --git a/src/util/vircgroup.c b/src/util/vircgroup.c index 3995477..4857ef3 100644 --- a/src/util/vircgroup.c +++ b/src/util/vircgroup.c @@ -3926,6 +3926,20 @@ virCgroupSupportsCpuBW(virCgroupPtr cgroup) return ret; } +int +virCgroupHasEmptyTasks(virCgroupPtr cgroup, int controller) +{ + int ret = -1; + char *content = NULL; + + ret = virCgroupGetValueStr(cgroup, controller, "tasks", &content); + + if (ret == 0 && content[0] == '\0') + ret = 1; + + VIR_FREE(content); + return ret; +} #else /* !VIR_CGROUP_SUPPORTED */ @@ -4655,4 +4669,13 @@ virCgroupSetOwner(virCgroupPtr cgroup ATTRIBUTE_UNUSED, return -1; } +int +virCgroupHasEmptyTasks(virCgroupPtr cgroup ATTRIBUTE_UNUSED, + int controller ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("Control groups not supported on this platform")); + return -1; +} + #endif /* !VIR_CGROUP_SUPPORTED */ diff --git a/src/util/vircgroup.h b/src/util/vircgroup.h index 7718a07..f07c1a7 100644 --- a/src/util/vircgroup.h +++ b/src/util/vircgroup.h @@ -1,7 +1,7 @@ /* * vircgroup.h: methods for managing control cgroups * - * Copyright (C) 2011-2013 Red Hat, Inc. + * Copyright (C) 2011-2014 Red Hat, Inc. * Copyright IBM Corp. 
2008 * * This library is free software; you can redistribute it and/or @@ -270,4 +270,6 @@ int virCgroupSetOwner(virCgroupPtr cgroup, gid_t gid, int controllers); +int virCgroupHasEmptyTasks(virCgroupPtr cgroup, int controller); + #endif /* __VIR_CGROUP_H__ */ -- 2.2.0

That function tries its best to create a bitmap of host NUMA nodes. Signed-off-by: Martin Kletzander <mkletzan@redhat.com> --- src/libvirt_private.syms | 1 + src/util/virnuma.c | 28 ++++++++++++++++++++++++++++ src/util/virnuma.h | 1 + 3 files changed, 30 insertions(+) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 60979f8..b21f5fb 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1748,6 +1748,7 @@ virNodeSuspendGetTargetMask; # util/virnuma.h virNumaGetAutoPlacementAdvice; virNumaGetDistances; +virNumaGetHostNodeset; virNumaGetMaxNode; virNumaGetNodeMemory; virNumaGetPageInfo; diff --git a/src/util/virnuma.c b/src/util/virnuma.c index b8d76f4..86564d4 100644 --- a/src/util/virnuma.c +++ b/src/util/virnuma.c @@ -983,3 +983,31 @@ virNumaNodesetIsAvailable(virBitmapPtr nodeset) } return true; } + +virBitmapPtr +virNumaGetHostNodeset(void) +{ + int maxnode = virNumaGetMaxNode(); + size_t i = 0; + virBitmapPtr nodeset = NULL; + + if (maxnode < 0) + return NULL; + + if (!(nodeset = virBitmapNew(maxnode + 1))) + return NULL; + + for (i = 0; i <= maxnode; i++) { + if (!virNumaNodeIsAvailable(i)) + continue; + + if (virBitmapSetBit(nodeset, i) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("Problem setting bit in bitmap")); + virBitmapFree(nodeset); + return NULL; + } + } + + return nodeset; +} diff --git a/src/util/virnuma.h b/src/util/virnuma.h index 09034a2..1f3c0ad 100644 --- a/src/util/virnuma.h +++ b/src/util/virnuma.h @@ -33,6 +33,7 @@ char *virNumaGetAutoPlacementAdvice(unsigned short vcups, int virNumaSetupMemoryPolicy(virDomainNumatuneMemMode mode, virBitmapPtr nodeset); +virBitmapPtr virNumaGetHostNodeset(void); bool virNumaNodesetIsAvailable(virBitmapPtr nodeset); bool virNumaIsAvailable(void); int virNumaGetMaxNode(void); -- 2.2.0

Signed-off-by: Martin Kletzander <mkletzan@redhat.com> --- src/qemu/qemu_cgroup.c | 9 +-------- src/qemu/qemu_cgroup.h | 4 ++-- src/qemu/qemu_process.c | 2 +- 3 files changed, 4 insertions(+), 11 deletions(-) diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index 0e94cae..ed5416b 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -607,7 +607,7 @@ qemuSetupDevicesCgroup(virQEMUDriverPtr driver, } -static int +int qemuSetupCpusetMems(virDomainObjPtr vm, virBitmapPtr nodemask) { @@ -867,13 +867,6 @@ qemuSetupCgroup(virQEMUDriverPtr driver, } int -qemuSetupCgroupPostInit(virDomainObjPtr vm, - virBitmapPtr nodemask) -{ - return qemuSetupCpusetMems(vm, nodemask); -} - -int qemuSetupCgroupVcpuBW(virCgroupPtr cgroup, unsigned long long period, long long quota) diff --git a/src/qemu/qemu_cgroup.h b/src/qemu/qemu_cgroup.h index 4a4f22c..72c28f7 100644 --- a/src/qemu/qemu_cgroup.h +++ b/src/qemu/qemu_cgroup.h @@ -47,8 +47,8 @@ int qemuConnectCgroup(virQEMUDriverPtr driver, int qemuSetupCgroup(virQEMUDriverPtr driver, virDomainObjPtr vm, virBitmapPtr nodemask); -int qemuSetupCgroupPostInit(virDomainObjPtr vm, - virBitmapPtr nodemask); +int qemuSetupCpusetMems(virDomainObjPtr vm, + virBitmapPtr nodemask); int qemuSetupCgroupVcpuBW(virCgroupPtr cgroup, unsigned long long period, long long quota); diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index a19e71a..9781e44 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -4710,7 +4710,7 @@ int qemuProcessStart(virConnectPtr conn, goto cleanup; VIR_DEBUG("Setting up post-init cgroup restrictions"); - if (qemuSetupCgroupPostInit(vm, nodemask) < 0) + if (qemuSetupCpusetMems(vm, nodemask) < 0) goto cleanup; VIR_DEBUG("Detecting VCPU PIDs"); -- 2.2.0

Thanks to that we don't need to drag the pointer everywhere and future code will get cleaner. Signed-off-by: Martin Kletzander <mkletzan@redhat.com> --- src/qemu/qemu_cgroup.c | 18 +++++++----------- src/qemu/qemu_cgroup.h | 9 +++------ src/qemu/qemu_domain.c | 1 + src/qemu/qemu_domain.h | 1 + src/qemu/qemu_process.c | 21 ++++++++++----------- 5 files changed, 22 insertions(+), 28 deletions(-) diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index ed5416b..a0370bc 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -608,8 +608,7 @@ qemuSetupDevicesCgroup(virQEMUDriverPtr driver, int -qemuSetupCpusetMems(virDomainObjPtr vm, - virBitmapPtr nodemask) +qemuSetupCpusetMems(virDomainObjPtr vm) { virCgroupPtr cgroup_temp = NULL; qemuDomainObjPrivatePtr priv = vm->privateData; @@ -624,7 +623,7 @@ qemuSetupCpusetMems(virDomainObjPtr vm, return 0; if (virDomainNumatuneMaybeFormatNodeset(vm->def->numatune, - nodemask, + priv->autoNodeset, &mem_mask, -1) < 0) goto cleanup; @@ -644,7 +643,6 @@ qemuSetupCpusetMems(virDomainObjPtr vm, static int qemuSetupCpusetCgroup(virDomainObjPtr vm, - virBitmapPtr nodemask, virCapsPtr caps) { qemuDomainObjPrivatePtr priv = vm->privateData; @@ -659,7 +657,7 @@ qemuSetupCpusetCgroup(virDomainObjPtr vm, if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) { virBitmapPtr cpumap; - if (!(cpumap = virCapabilitiesGetCpusForNodemask(caps, nodemask))) + if (!(cpumap = virCapabilitiesGetCpusForNodemask(caps, priv->autoNodeset))) goto cleanup; cpu_mask = virBitmapFormat(cpumap); virBitmapFree(cpumap); @@ -823,8 +821,7 @@ qemuConnectCgroup(virQEMUDriverPtr driver, int qemuSetupCgroup(virQEMUDriverPtr driver, - virDomainObjPtr vm, - virBitmapPtr nodemask) + virDomainObjPtr vm) { qemuDomainObjPrivatePtr priv = vm->privateData; virCapsPtr caps = NULL; @@ -857,7 +854,7 @@ qemuSetupCgroup(virQEMUDriverPtr driver, if (qemuSetupCpuCgroup(driver, vm) < 0) goto cleanup; - if (qemuSetupCpusetCgroup(vm, nodemask, caps) < 0) 
+ if (qemuSetupCpusetCgroup(vm, caps) < 0) goto cleanup; ret = 0; @@ -1042,8 +1039,7 @@ qemuSetupCgroupForVcpu(virDomainObjPtr vm) int qemuSetupCgroupForEmulator(virQEMUDriverPtr driver, - virDomainObjPtr vm, - virBitmapPtr nodemask) + virDomainObjPtr vm) { virBitmapPtr cpumask = NULL; virBitmapPtr cpumap = NULL; @@ -1079,7 +1075,7 @@ qemuSetupCgroupForEmulator(virQEMUDriverPtr driver, goto cleanup; if (def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) { - if (!(cpumap = qemuPrepareCpumap(driver, nodemask))) + if (!(cpumap = qemuPrepareCpumap(driver, priv->autoNodeset))) goto cleanup; cpumask = cpumap; } else if (def->cputune.emulatorpin) { diff --git a/src/qemu/qemu_cgroup.h b/src/qemu/qemu_cgroup.h index 72c28f7..df7e3a1 100644 --- a/src/qemu/qemu_cgroup.h +++ b/src/qemu/qemu_cgroup.h @@ -45,10 +45,8 @@ int qemuTeardownHostdevCgroup(virDomainObjPtr vm, int qemuConnectCgroup(virQEMUDriverPtr driver, virDomainObjPtr vm); int qemuSetupCgroup(virQEMUDriverPtr driver, - virDomainObjPtr vm, - virBitmapPtr nodemask); -int qemuSetupCpusetMems(virDomainObjPtr vm, - virBitmapPtr nodemask); + virDomainObjPtr vm); +int qemuSetupCpusetMems(virDomainObjPtr vm); int qemuSetupCgroupVcpuBW(virCgroupPtr cgroup, unsigned long long period, long long quota); @@ -64,8 +62,7 @@ int qemuSetupCgroupIOThreadsPin(virCgroupPtr cgroup, int qemuSetupCgroupForVcpu(virDomainObjPtr vm); int qemuSetupCgroupForIOThreads(virDomainObjPtr vm); int qemuSetupCgroupForEmulator(virQEMUDriverPtr driver, - virDomainObjPtr vm, - virBitmapPtr nodemask); + virDomainObjPtr vm); int qemuRemoveCgroup(virQEMUDriverPtr driver, virDomainObjPtr vm); int qemuAddToCgroup(virDomainObjPtr vm); diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index 220304f..8dd427a 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -451,6 +451,7 @@ qemuDomainObjPrivateFree(void *data) qemuAgentClose(priv->agent); } VIR_FREE(priv->cleanupCallbacks); + virBitmapFree(priv->autoNodeset); VIR_FREE(priv); 
} diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index efabd82..ce5eba3 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -187,6 +187,7 @@ struct _qemuDomainObjPrivate { char **qemuDevices; /* NULL-terminated list of devices aliases known to QEMU */ bool hookRun; /* true if there was a hook run over this domain */ + virBitmapPtr autoNodeset; }; typedef enum { diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 9781e44..d683918 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -2377,12 +2377,12 @@ qemuPrepareCpumap(virQEMUDriverPtr driver, */ static int qemuProcessInitCpuAffinity(virQEMUDriverPtr driver, - virDomainObjPtr vm, - virBitmapPtr nodemask) + virDomainObjPtr vm) { int ret = -1; virBitmapPtr cpumap = NULL; virBitmapPtr cpumapToSet = NULL; + qemuDomainObjPrivatePtr priv = vm->privateData; if (!vm->pid) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", @@ -2390,7 +2390,7 @@ qemuProcessInitCpuAffinity(virQEMUDriverPtr driver, return -1; } - if (!(cpumap = qemuPrepareCpumap(driver, nodemask))) + if (!(cpumap = qemuPrepareCpumap(driver, priv->autoNodeset))) return -1; if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) { @@ -3041,13 +3041,13 @@ struct qemuProcessHookData { virConnectPtr conn; virDomainObjPtr vm; virQEMUDriverPtr driver; - virBitmapPtr nodemask; virQEMUDriverConfigPtr cfg; }; static int qemuProcessHook(void *data) { struct qemuProcessHookData *h = data; + qemuDomainObjPrivatePtr priv = h->vm->privateData; int ret = -1; int fd; virBitmapPtr nodeset = NULL; @@ -3084,7 +3084,7 @@ static int qemuProcessHook(void *data) mode = virDomainNumatuneGetMode(h->vm->def->numatune, -1); nodeset = virDomainNumatuneGetNodeset(h->vm->def->numatune, - h->nodemask, -1); + priv->autoNodeset, -1); if (virNumaSetupMemoryPolicy(mode, nodeset) < 0) goto cleanup; @@ -4426,7 +4426,7 @@ int qemuProcessStart(virConnectPtr conn, if (virBitmapParse(nodeset, 0, &nodemask, VIR_DOMAIN_CPUMASK_LEN) < 
0) goto cleanup; } - hookData.nodemask = nodemask; + priv->autoNodeset = nodemask; /* "volume" type disk's source must be translated before * cgroup and security setting. @@ -4630,13 +4630,13 @@ int qemuProcessStart(virConnectPtr conn, } VIR_DEBUG("Setting up domain cgroup (if required)"); - if (qemuSetupCgroup(driver, vm, nodemask) < 0) + if (qemuSetupCgroup(driver, vm) < 0) goto cleanup; /* This must be done after cgroup placement to avoid resetting CPU * affinity */ if (!vm->def->cputune.emulatorpin && - qemuProcessInitCpuAffinity(driver, vm, nodemask) < 0) + qemuProcessInitCpuAffinity(driver, vm) < 0) goto cleanup; VIR_DEBUG("Setting domain security labels"); @@ -4683,7 +4683,7 @@ int qemuProcessStart(virConnectPtr conn, goto cleanup; VIR_DEBUG("Setting cgroup for emulator (if required)"); - if (qemuSetupCgroupForEmulator(driver, vm, nodemask) < 0) + if (qemuSetupCgroupForEmulator(driver, vm) < 0) goto cleanup; VIR_DEBUG("Setting affinity of emulator threads"); @@ -4710,7 +4710,7 @@ int qemuProcessStart(virConnectPtr conn, goto cleanup; VIR_DEBUG("Setting up post-init cgroup restrictions"); - if (qemuSetupCpusetMems(vm, nodemask) < 0) + if (qemuSetupCpusetMems(vm) < 0) goto cleanup; VIR_DEBUG("Detecting VCPU PIDs"); @@ -4848,7 +4848,6 @@ int qemuProcessStart(virConnectPtr conn, * if we failed to initialize the now running VM. kill it off and * pretend we never started it */ VIR_FREE(nodeset); - virBitmapFree(nodemask); virCommandFree(cmd); VIR_FORCE_CLOSE(logfile); if (priv->mon) -- 2.2.0

Instead of setting the value of cpuset.mems once when the domain starts and then re-calculating the value every time we need to change the child cgroup values, leave the cgroup alone and rather set the child data every time there is new cgroup created. We don't leave any task in the parent group anyway. This will ease both current and future code. Signed-off-by: Martin Kletzander <mkletzan@redhat.com> --- src/qemu/qemu_cgroup.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++-- src/qemu/qemu_driver.c | 59 +++++++++++++++----------------------------- 2 files changed, 85 insertions(+), 41 deletions(-) diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index a0370bc..1e383c4 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -35,6 +35,7 @@ #include "virstring.h" #include "virfile.h" #include "virtypedparam.h" +#include "virnuma.h" #define VIR_FROM_THIS VIR_FROM_QEMU @@ -629,8 +630,7 @@ qemuSetupCpusetMems(virDomainObjPtr vm) if (mem_mask) if (virCgroupNewEmulator(priv->cgroup, false, &cgroup_temp) < 0 || - virCgroupSetCpusetMems(cgroup_temp, mem_mask) < 0 || - virCgroupSetCpusetMems(priv->cgroup, mem_mask) < 0) + virCgroupSetCpusetMems(cgroup_temp, mem_mask) < 0) goto cleanup; ret = 0; @@ -785,6 +785,39 @@ qemuInitCgroup(virQEMUDriverPtr driver, return ret; } +static void +qemuRestoreCgroupState(virDomainObjPtr vm) +{ + char *mem_mask; + int empty = -1; + qemuDomainObjPrivatePtr priv = vm->privateData; + virBitmapPtr all_nodes; + + if (!(all_nodes = virNumaGetHostNodeset())) + goto error; + + if (!(mem_mask = virBitmapFormat(all_nodes))) + goto error; + + if ((empty = virCgroupHasEmptyTasks(priv->cgroup, + VIR_CGROUP_CONTROLLER_CPUSET)) < 0) + + if (!empty) + goto error; + + if (virCgroupSetCpusetMems(priv->cgroup, mem_mask) < 0) + goto error; + + cleanup: + VIR_FREE(mem_mask); + virBitmapFree(all_nodes); + return; + + error: + virResetLastError(); + VIR_DEBUG("Couldn't restore cgroups to meaningful state"); + goto cleanup; +} int 
qemuConnectCgroup(virQEMUDriverPtr driver, @@ -812,6 +845,8 @@ qemuConnectCgroup(virQEMUDriverPtr driver, &priv->cgroup) < 0) goto cleanup; + qemuRestoreCgroupState(vm); + done: ret = 0; cleanup: @@ -961,6 +996,7 @@ qemuSetupCgroupForVcpu(virDomainObjPtr vm) size_t i, j; unsigned long long period = vm->def->cputune.period; long long quota = vm->def->cputune.quota; + char *mem_mask = NULL; if ((period || quota) && !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) { @@ -992,6 +1028,13 @@ qemuSetupCgroupForVcpu(virDomainObjPtr vm) return 0; } + if (virDomainNumatuneGetMode(vm->def->numatune, -1) == + VIR_DOMAIN_NUMATUNE_MEM_STRICT && + virDomainNumatuneMaybeFormatNodeset(vm->def->numatune, + priv->autoNodeset, + &mem_mask, -1) < 0) + goto cleanup; + for (i = 0; i < priv->nvcpupids; i++) { if (virCgroupNewVcpu(priv->cgroup, i, true, &cgroup_vcpu) < 0) goto cleanup; @@ -1000,6 +1043,10 @@ qemuSetupCgroupForVcpu(virDomainObjPtr vm) if (virCgroupAddTask(cgroup_vcpu, priv->vcpupids[i]) < 0) goto cleanup; + if (mem_mask && + virCgroupSetCpusetMems(cgroup_vcpu, mem_mask) < 0) + goto cleanup; + if (period || quota) { if (qemuSetupCgroupVcpuBW(cgroup_vcpu, period, quota) < 0) goto cleanup; @@ -1025,6 +1072,7 @@ qemuSetupCgroupForVcpu(virDomainObjPtr vm) virCgroupFree(&cgroup_vcpu); } + VIR_FREE(mem_mask); return 0; @@ -1033,6 +1081,7 @@ qemuSetupCgroupForVcpu(virDomainObjPtr vm) virCgroupRemove(cgroup_vcpu); virCgroupFree(&cgroup_vcpu); } + VIR_FREE(mem_mask); return -1; } @@ -1121,6 +1170,7 @@ qemuSetupCgroupForIOThreads(virDomainObjPtr vm) size_t i, j; unsigned long long period = vm->def->cputune.period; long long quota = vm->def->cputune.quota; + char *mem_mask = NULL; if ((period || quota) && !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) { @@ -1149,6 +1199,13 @@ qemuSetupCgroupForIOThreads(virDomainObjPtr vm) return 0; } + if (virDomainNumatuneGetMode(vm->def->numatune, -1) == + VIR_DOMAIN_NUMATUNE_MEM_STRICT && + 
virDomainNumatuneMaybeFormatNodeset(vm->def->numatune, + priv->autoNodeset, + &mem_mask, -1) < 0) + goto cleanup; + for (i = 0; i < priv->niothreadpids; i++) { /* IOThreads are numbered 1..n, although the array is 0..n-1, * so we will account for that here @@ -1166,6 +1223,10 @@ qemuSetupCgroupForIOThreads(virDomainObjPtr vm) goto cleanup; } + if (mem_mask && + virCgroupSetCpusetMems(cgroup_iothread, mem_mask) < 0) + goto cleanup; + /* Set iothreadpin in cgroup if iothreadpin xml is provided */ if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) { @@ -1188,6 +1249,7 @@ qemuSetupCgroupForIOThreads(virDomainObjPtr vm) virCgroupFree(&cgroup_iothread); } + VIR_FREE(mem_mask); return 0; @@ -1196,6 +1258,7 @@ qemuSetupCgroupForIOThreads(virDomainObjPtr vm) virCgroupRemove(cgroup_iothread); virCgroupFree(&cgroup_iothread); } + VIR_FREE(mem_mask); return -1; } diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index df3ba6d..bfa62c4 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -4427,6 +4427,7 @@ static int qemuDomainHotplugVcpus(virQEMUDriverPtr driver, pid_t *cpupids = NULL; int ncpupids; virCgroupPtr cgroup_vcpu = NULL; + char *mem_mask = NULL; qemuDomainObjEnterMonitor(driver, vm); @@ -4490,6 +4491,13 @@ static int qemuDomainHotplugVcpus(virQEMUDriverPtr driver, goto cleanup; } + if (virDomainNumatuneGetMode(vm->def->numatune, -1) == + VIR_DOMAIN_NUMATUNE_MEM_STRICT && + virDomainNumatuneMaybeFormatNodeset(vm->def->numatune, + priv->autoNodeset, + &mem_mask, -1) < 0) + goto cleanup; + if (nvcpus > oldvcpus) { for (i = oldvcpus; i < nvcpus; i++) { if (priv->cgroup) { @@ -4498,6 +4506,10 @@ static int qemuDomainHotplugVcpus(virQEMUDriverPtr driver, if (virCgroupNewVcpu(priv->cgroup, i, true, &cgroup_vcpu) < 0) goto cleanup; + if (mem_mask && + virCgroupSetCpusetMems(cgroup_vcpu, mem_mask) < 0) + goto cleanup; + /* Add vcpu thread to the cgroup */ rv = virCgroupAddTask(cgroup_vcpu, cpupids[i]); if (rv < 0) { @@ -4507,6 
+4519,7 @@ static int qemuDomainHotplugVcpus(virQEMUDriverPtr driver, virCgroupRemove(cgroup_vcpu); goto cleanup; } + } /* Inherit def->cpuset */ @@ -4579,6 +4592,7 @@ static int qemuDomainHotplugVcpus(virQEMUDriverPtr driver, qemuDomainObjExitMonitor(driver, vm); vm->def->vcpus = vcpus; VIR_FREE(cpupids); + VIR_FREE(mem_mask); virDomainAuditVcpu(vm, oldvcpus, nvcpus, "update", rc == 1); if (cgroup_vcpu) virCgroupFree(&cgroup_vcpu); @@ -9215,11 +9229,9 @@ qemuDomainGetMemoryParameters(virDomainPtr dom, static int qemuDomainSetNumaParamsLive(virDomainObjPtr vm, - virCapsPtr caps, virBitmapPtr nodeset) { virCgroupPtr cgroup_temp = NULL; - virBitmapPtr temp_nodeset = NULL; qemuDomainObjPrivatePtr priv = vm->privateData; char *nodeset_str = NULL; size_t i = 0; @@ -9233,39 +9245,15 @@ qemuDomainSetNumaParamsLive(virDomainObjPtr vm, goto cleanup; } - /* Get existing nodeset values */ - if (virCgroupGetCpusetMems(priv->cgroup, &nodeset_str) < 0 || - virBitmapParse(nodeset_str, 0, &temp_nodeset, - VIR_DOMAIN_CPUMASK_LEN) < 0) - goto cleanup; - VIR_FREE(nodeset_str); - - for (i = 0; i < caps->host.nnumaCell; i++) { - bool result; - virCapsHostNUMACellPtr cell = caps->host.numaCell[i]; - if (virBitmapGetBit(nodeset, cell->num, &result) < 0) { - virReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("Failed to get cpuset bit values")); - goto cleanup; - } - if (result && (virBitmapSetBit(temp_nodeset, cell->num) < 0)) { - virReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("Failed to set temporary cpuset bit values")); - goto cleanup; - } - } - - if (!(nodeset_str = virBitmapFormat(temp_nodeset))) - goto cleanup; - - if (virCgroupSetCpusetMems(priv->cgroup, nodeset_str) < 0) - goto cleanup; - VIR_FREE(nodeset_str); - /* Ensure the cpuset string is formatted before passing to cgroup */ if (!(nodeset_str = virBitmapFormat(nodeset))) goto cleanup; + if (virCgroupNewEmulator(priv->cgroup, false, &cgroup_temp) < 0 || + virCgroupSetCpusetMems(cgroup_temp, nodeset_str) < 0) + goto cleanup; + 
virCgroupFree(&cgroup_temp); + for (i = 0; i < priv->nvcpupids; i++) { if (virCgroupNewVcpu(priv->cgroup, i, false, &cgroup_temp) < 0 || virCgroupSetCpusetMems(cgroup_temp, nodeset_str) < 0) @@ -9273,11 +9261,6 @@ qemuDomainSetNumaParamsLive(virDomainObjPtr vm, virCgroupFree(&cgroup_temp); } - if (virCgroupNewEmulator(priv->cgroup, false, &cgroup_temp) < 0 || - virCgroupSetCpusetMems(cgroup_temp, nodeset_str) < 0 || - virCgroupSetCpusetMems(priv->cgroup, nodeset_str) < 0) - goto cleanup; - for (i = 0; i < priv->niothreadpids; i++) { if (virCgroupNewIOThread(priv->cgroup, i + 1, false, &cgroup_temp) < 0 || @@ -9286,11 +9269,9 @@ qemuDomainSetNumaParamsLive(virDomainObjPtr vm, virCgroupFree(&cgroup_temp); } - ret = 0; cleanup: VIR_FREE(nodeset_str); - virBitmapFree(temp_nodeset); virCgroupFree(&cgroup_temp); return ret; @@ -9392,7 +9373,7 @@ qemuDomainSetNumaParameters(virDomainPtr dom, } if (nodeset && - qemuDomainSetNumaParamsLive(vm, caps, nodeset) < 0) + qemuDomainSetNumaParamsLive(vm, nodeset) < 0) goto endjob; if (virDomainNumatuneSet(&vm->def->numatune, -- 2.2.0

On 12/15/2014 12:58 AM, Martin Kletzander wrote:
Instead of setting the value of cpuset.mems once when the domain starts and then re-calculating the value every time we need to change the child cgroup values, leave the cgroup alone and rather set the child data every time there is new cgroup created. We don't leave any task in the parent group anyway. This will ease both current and future code.
Signed-off-by: Martin Kletzander <mkletzan@redhat.com> --- src/qemu/qemu_cgroup.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++-- src/qemu/qemu_driver.c | 59 +++++++++++++++----------------------------- 2 files changed, 85 insertions(+), 41 deletions(-)
This patch causes libvirtd to segfault on startup: Program received signal SIGSEGV, Segmentation fault. [Switching to Thread 0x7fffda116700 (LWP 15311)] 0x00007ffff7400890 in virCgroupPathOfController (group=0x0, controller=2, key=0x7ffff768a019 "tasks", path=0x7fffda115b30) at util/vircgroup.c:1867 1867 if (group->controllers[controller].mountPoint == NULL) { (gdb) bt #0 0x00007ffff7400890 in virCgroupPathOfController (group=0x0, controller=2, key=0x7ffff768a019 "tasks", path=0x7fffda115b30) at util/vircgroup.c:1867 #1 0x00007ffff73fe4db in virCgroupGetValueStr (group=0x0, controller=2, key=0x7ffff768a019 "tasks", value=0x7fffda115b80) at util/vircgroup.c:751 #2 0x00007ffff7405673 in virCgroupHasEmptyTasks (cgroup=0x0, controller=2) at util/vircgroup.c:3935 #3 0x00007fffdc79c687 in qemuRestoreCgroupState (vm=0x7fffd41dd8d0) at qemu/qemu_cgroup.c:802 #4 0x00007fffdc79c80c in qemuConnectCgroup (driver=0x7fffd4152ae0, vm=0x7fffd41dd8d0) at qemu/qemu_cgroup.c:848 #5 0x00007fffdc7b8553 in qemuProcessReconnect (opaque=0x7fffd41da150) at qemu/qemu_process.c:3616 #6 0x00007ffff7469830 in virThreadHelper (data=0x7fffd41c5b00) at util/virthread.c:197 #7 0x00007ffff3e65ee5 in start_thread (arg=0x7fffda116700) at pthread_create.c:309 #8 0x00007ffff3b94b8d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111 -- Eric Blake eblake redhat com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On 12/16/2014 11:51 PM, Eric Blake wrote:
On 12/15/2014 12:58 AM, Martin Kletzander wrote:
Instead of setting the value of cpuset.mems once when the domain starts and then re-calculating the value every time we need to change the child cgroup values, leave the cgroup alone and rather set the child data every time there is new cgroup created. We don't leave any task in the parent group anyway. This will ease both current and future code.
Signed-off-by: Martin Kletzander <mkletzan@redhat.com> --- src/qemu/qemu_cgroup.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++-- src/qemu/qemu_driver.c | 59 +++++++++++++++----------------------------- 2 files changed, 85 insertions(+), 41 deletions(-)
This patch causes libvirtd to segfault on startup:
More particularly, I'm doing an upgrade from older libvirtd to this version, while leaving a transient domain running that was started by the older libvirtd. Hope that helps you narrow in on the problem. Reverting 86759e and af2a1f0 was sufficient to get me going again locally, but I'm not going to push my reversions until you've first had a chance to address the regression.
Program received signal SIGSEGV, Segmentation fault. [Switching to Thread 0x7fffda116700 (LWP 15311)] 0x00007ffff7400890 in virCgroupPathOfController (group=0x0, controller=2, key=0x7ffff768a019 "tasks", path=0x7fffda115b30) at util/vircgroup.c:1867 1867 if (group->controllers[controller].mountPoint == NULL) { (gdb) bt #0 0x00007ffff7400890 in virCgroupPathOfController (group=0x0, controller=2, key=0x7ffff768a019 "tasks", path=0x7fffda115b30) at util/vircgroup.c:1867 #1 0x00007ffff73fe4db in virCgroupGetValueStr (group=0x0, controller=2, key=0x7ffff768a019 "tasks", value=0x7fffda115b80) at util/vircgroup.c:751 #2 0x00007ffff7405673 in virCgroupHasEmptyTasks (cgroup=0x0, controller=2) at util/vircgroup.c:3935 #3 0x00007fffdc79c687 in qemuRestoreCgroupState (vm=0x7fffd41dd8d0) at qemu/qemu_cgroup.c:802 #4 0x00007fffdc79c80c in qemuConnectCgroup (driver=0x7fffd4152ae0, vm=0x7fffd41dd8d0) at qemu/qemu_cgroup.c:848 #5 0x00007fffdc7b8553 in qemuProcessReconnect (opaque=0x7fffd41da150) at qemu/qemu_process.c:3616 #6 0x00007ffff7469830 in virThreadHelper (data=0x7fffd41c5b00) at util/virthread.c:197 #7 0x00007ffff3e65ee5 in start_thread (arg=0x7fffda116700) at pthread_create.c:309 #8 0x00007ffff3b94b8d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111
-- Eric Blake eblake redhat com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On Wed, Dec 17, 2014 at 12:00:36AM -0700, Eric Blake wrote:
On 12/16/2014 11:51 PM, Eric Blake wrote:
On 12/15/2014 12:58 AM, Martin Kletzander wrote:
Instead of setting the value of cpuset.mems once when the domain starts and then re-calculating the value every time we need to change the child cgroup values, leave the cgroup alone and rather set the child data every time there is new cgroup created. We don't leave any task in the parent group anyway. This will ease both current and future code.
Signed-off-by: Martin Kletzander <mkletzan@redhat.com> --- src/qemu/qemu_cgroup.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++-- src/qemu/qemu_driver.c | 59 +++++++++++++++----------------------------- 2 files changed, 85 insertions(+), 41 deletions(-)
This patch causes libvirtd to segfault on startup:
More particularly, I'm doing an upgrade from older libvirtd to this version, while leaving a transient domain running that was started by the older libvirtd. Hope that helps you narrow in on the problem.
I tried that and it works for me. And I tried various domains, both heavily cgroup dependent and simple ones.
Reverting 86759e and af2a1f0 was sufficient to get me going again locally, but I'm not going to push my reversions until you've first had a chance to address the regression.
Program received signal SIGSEGV, Segmentation fault. [Switching to Thread 0x7fffda116700 (LWP 15311)] 0x00007ffff7400890 in virCgroupPathOfController (group=0x0, controller=2, key=0x7ffff768a019 "tasks", path=0x7fffda115b30) at util/vircgroup.c:1867 1867 if (group->controllers[controller].mountPoint == NULL) { (gdb) bt #0 0x00007ffff7400890 in virCgroupPathOfController (group=0x0, controller=2, key=0x7ffff768a019 "tasks", path=0x7fffda115b30) at util/vircgroup.c:1867 #1 0x00007ffff73fe4db in virCgroupGetValueStr (group=0x0, controller=2, key=0x7ffff768a019 "tasks", value=0x7fffda115b80) at util/vircgroup.c:751 #2 0x00007ffff7405673 in virCgroupHasEmptyTasks (cgroup=0x0, controller=2) at util/vircgroup.c:3935
From this line it looks like priv->cgroup was not initialized. I did not add a check for that, so that may be the cause. I'll send a patch soon. But I wonder how did you manage to do that, is that a session libvirtd you restarted? Otherwise how come virCgroupNewDetectMachine() didn't fill the cgroup?
#3 0x00007fffdc79c687 in qemuRestoreCgroupState (vm=0x7fffd41dd8d0) at qemu/qemu_cgroup.c:802 #4 0x00007fffdc79c80c in qemuConnectCgroup (driver=0x7fffd4152ae0, vm=0x7fffd41dd8d0) at qemu/qemu_cgroup.c:848 #5 0x00007fffdc7b8553 in qemuProcessReconnect (opaque=0x7fffd41da150) at qemu/qemu_process.c:3616 #6 0x00007ffff7469830 in virThreadHelper (data=0x7fffd41c5b00) at util/virthread.c:197 #7 0x00007ffff3e65ee5 in start_thread (arg=0x7fffda116700) at pthread_create.c:309 #8 0x00007ffff3b94b8d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111
-- Eric Blake eblake redhat com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On 12/17/2014 08:06 AM, Martin Kletzander wrote:
On Wed, Dec 17, 2014 at 12:00:36AM -0700, Eric Blake wrote:
On 12/16/2014 11:51 PM, Eric Blake wrote:
On 12/15/2014 12:58 AM, Martin Kletzander wrote:
Instead of setting the value of cpuset.mems once when the domain starts and then re-calculating the value every time we need to change the child cgroup values, leave the cgroup alone and rather set the child data every time there is new cgroup created. We don't leave any task in the parent group anyway. This will ease both current and future code.
Signed-off-by: Martin Kletzander <mkletzan@redhat.com> --- src/qemu/qemu_cgroup.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++-- src/qemu/qemu_driver.c | 59 +++++++++++++++----------------------------- 2 files changed, 85 insertions(+), 41 deletions(-)
This patch causes libvirtd to segfault on startup:
More particularly, I'm doing an upgrade from older libvirtd to this version, while leaving a transient domain running that was started by the older libvirtd. Hope that helps you narrow in on the problem.
Weird - At the time I made the report, I ran 'git bisect' and reliably reproduced the crash across multiple libvirtd restarts (a restart for each new build while trying to nail down the culprit commit), as long as I left that transient domain running.
I tried that and it works for me. And I tried various domains, both heavily cgroup dependent and simple ones.
But now that I've rebooted, and then proceeded to do incremental builds from both before and after the patch, I can't reproduce the crash. Although my formula for creating my transient domain was the same both yesterday and today, there may be some difference in the version of libvirtd that was running at the time I created the domain that then affected the XML affecting the libvirtd restarts.
Reverting 86759e and af2a1f0 was sufficient to get me going again locally, but I'm not going to push my reversions until you've first had a chance to address the regression.
#2 0x00007ffff7405673 in virCgroupHasEmptyTasks (cgroup=0x0, controller=2) at util/vircgroup.c:3935
From this line it looks like priv->cgroup was not initialized. I did not add a check for that, so that may be the cause. I'll send a patch soon.
But I wonder how did you manage to do that, is that a session libvirtd you restarted? Otherwise how come virCgroupNewDetectMachine() didn't fill the cgroup?
I'm not spotting anything obvious why it wasn't initialized at the time I reproduced the crash, nor why I can't reproduce it now. Hopefully it's not a lurking time bomb. If it happens again, I'll definitely report it; but for now, without a reliable reproduction, it could easily have been something caused on my end (since it is my dev machine, it may have been caused by a half-baked patch on my end that I was testing at the time I created the transient domain, where using only upstream patches wouldn't see the issue). So for now, don't worry too hard if you can't find it either. -- Eric Blake eblake redhat com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On Wed, Dec 17, 2014 at 04:59:30PM -0700, Eric Blake wrote:
On 12/17/2014 08:06 AM, Martin Kletzander wrote:
On Wed, Dec 17, 2014 at 12:00:36AM -0700, Eric Blake wrote:
On 12/16/2014 11:51 PM, Eric Blake wrote:
On 12/15/2014 12:58 AM, Martin Kletzander wrote:
Instead of setting the value of cpuset.mems once when the domain starts and then re-calculating the value every time we need to change the child cgroup values, leave the cgroup alone and rather set the child data every time there is a new cgroup created. We don't leave any task in the parent group anyway. This will ease both current and future code.
Signed-off-by: Martin Kletzander <mkletzan@redhat.com> --- src/qemu/qemu_cgroup.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++-- src/qemu/qemu_driver.c | 59 +++++++++++++++----------------------------- 2 files changed, 85 insertions(+), 41 deletions(-)
This patch causes libvirtd to segfault on startup:
More particularly, I'm doing an upgrade from older libvirtd to this version, while leaving a transient domain running that was started by the older libvirtd. Hope that helps you narrow in on the problem.
Weird - At the time I made the report, I ran 'git bisect' and reliably reproduced the crash across multiple libvirtd restarts (a restart for each new build while trying to nail down the culprit commit), as long as I left that transient domain running.
I tried that and it works for me. And I tried various domains, both heavily cgroup dependent and simple ones.
But now that I've rebooted, and then proceeded to do incremental builds from both before and after the patch, I can't reproduce the crash. Although my formula for creating my transient domain was the same both yesterday and today, there may be some difference in the version of libvirtd that was running at the time I created the domain that then affected the XML affecting the libvirtd restarts.
Reverting 86759e and af2a1f0 was sufficient to get me going again locally, but I'm not going to push my reversions until you've first had a chance to address the regression.
#2 0x00007ffff7405673 in virCgroupHasEmptyTasks (cgroup=0x0, controller=2) at util/vircgroup.c:3935
From this line it looks like priv->cgroup was not initialized. I did not add a check for that, so that may be the cause. I'll send a patch soon.
But I wonder how did you manage to do that, is that a session libvirtd you restarted? Otherwise how come virCgroupNewDetectMachine() didn't fill the cgroup?
I'm not spotting anything obvious why it wasn't initialized at the time I reproduced the crash, nor why I can't reproduce it now. Hopefully it's not a lurking time bomb.
If it happens again, I'll definitely report it; but for now, without a reliable reproduction, it could easily have been something caused on my end (since it is my dev machine, it may have been caused by a half-baked patch on my end that I was testing at the time I created the transient domain, where using only upstream patches wouldn't see the issue). So for now, don't worry too hard if you can't find it either.
There is a valid code path that allows the cgroup to be NULL, although not easy to reach. However, since it is valid, we added a patch that just skips the cgroup restoration for cgroup == NULL. If you happen to stumble upon a similar problem, I'd be happy to try dealing with it. Martin

On Mon, Dec 22, 2014 at 10:45:33AM +0100, Martin Kletzander wrote:
On Wed, Dec 17, 2014 at 04:59:30PM -0700, Eric Blake wrote:
On 12/17/2014 08:06 AM, Martin Kletzander wrote:
On Wed, Dec 17, 2014 at 12:00:36AM -0700, Eric Blake wrote:
On 12/16/2014 11:51 PM, Eric Blake wrote:
On 12/15/2014 12:58 AM, Martin Kletzander wrote:
Instead of setting the value of cpuset.mems once when the domain starts and then re-calculating the value every time we need to change the child cgroup values, leave the cgroup alone and rather set the child data every time there is a new cgroup created. We don't leave any task in the parent group anyway. This will ease both current and future code.
Signed-off-by: Martin Kletzander <mkletzan@redhat.com> --- src/qemu/qemu_cgroup.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++-- src/qemu/qemu_driver.c | 59 +++++++++++++++----------------------------- 2 files changed, 85 insertions(+), 41 deletions(-)
This patch causes libvirtd to segfault on startup:
More particularly, I'm doing an upgrade from older libvirtd to this version, while leaving a transient domain running that was started by the older libvirtd. Hope that helps you narrow in on the problem.
Weird - At the time I made the report, I ran 'git bisect' and reliably reproduced the crash across multiple libvirtd restarts (a restart for each new build while trying to nail down the culprit commit), as long as I left that transient domain running.
I managed to reproduce this (or actually talk to a person who did) and found out a bit more. As you say, until the domain was running everything was reproducible. However, when I started it with the old libvirt a second time to find out the root cause, it just worked...
I tried that and it works for me. And I tried various domains, both heavily cgroup dependent and simple ones.
But now that I've rebooted, and then proceeded to do incremental builds from both before and after the patch, I can't reproduce the crash. Although my formula for creating my transient domain was the same both yesterday and today, there may be some difference in the version of libvirtd that was running at the time I created the domain that then affected the XML affecting the libvirtd restarts.
Reverting 86759e and af2a1f0 was sufficient to get me going again locally, but I'm not going to push my reversions until you've first had a chance to address the regression.
#2 0x00007ffff7405673 in virCgroupHasEmptyTasks (cgroup=0x0, controller=2) at util/vircgroup.c:3935
From this line it looks like priv->cgroup was not initialized. I did not add a check for that, so that may be the cause. I'll send a patch soon.
But I wonder how did you manage to do that, is that a session libvirtd you restarted? Otherwise how come virCgroupNewDetectMachine() didn't fill the cgroup?
I'm not spotting anything obvious why it wasn't initialized at the time I reproduced the crash, nor why I can't reproduce it now. Hopefully it's not a lurking time bomb.
It might be and it may cause more problems than just non-working hot-plugging of vCPUs. And it is not caused by these patches I added. The thing is that old libvirt starts a domain, but it *somehow* (this is the part I was not able to find out yet) doesn't add the task into the cgroup hierarchy for all cgroup controllers. The /proc/$(pidof qemu-kvm)/cgroups file looks like this then: 10:hugetlb:/ 9:perf_event:/machine.slice/machine-qemu\x2da.scope 8:blkio:/ 7:net_cls:/machine.slice/machine-qemu\x2da.scope 6:freezer:/machine.slice/machine-qemu\x2da.scope 5:devices:/ 4:memory:/ 3:cpuacct,cpu:/ 2:cpuset:/machine.slice/machine-qemu\x2da.scope/emulator 1:name=systemd:/machine.slice/machine-qemu\x2da.scope And when new libvirtd is starting, it detects the cgroup mounts and placements (in this example the placement for memory cgroup is "/") and then it goes and validates them. If the cgroup doesn't have a placement that ends with machine-qemu-blah or libvirt-qemu or whatever, the following condition in virCgroupValidateMachineGroup() fails: if (STRNEQ(tmp, name) && STRNEQ(tmp, partname) && STRNEQ(tmp, scopename)) [in here the variable tmp points after the last slash in the partition string] The function returns false, hence virCgroupNewDetectMachine() frees the cgroup (and sets it to NULL) and returns success. This means we will still properly detect the machine but we won't have access to any cgroup settings of that particular machine. The patch I posted fixes it to the extent that the daemon doesn't crash, but it doesn't fix the root cause (which is unknown). I played with it a bit more and I've found out that a domain, which had its own partition in all the cgroups controllers when started, *lost* some of them after a while. It might have been caused by libvirtd restarting, but that's highly unlikely, I guess. Even though inotify works for cgroup fs it does not show us who accessed it. I'll have to resort to using systemtap again.
If it happens again, I'll definitely report it; but for now, without a reliable reproduction, it could easily have been something caused on my end (since it is my dev machine, it may have been caused by a half-baked patch on my end that I was testing at the time I created the transient domain, where using only upstream patches wouldn't see the issue). So for now, don't worry too hard if you can't find it either.
There is a valid code path that allows the cgroup to be NULL, although not easy to reach. However, since it is valid, we added a patch that just skips the cgroup restoration for cgroup == NULL.
If you happen to stumble upon a similar problem, I'd be happy to try dealing with it.
Martin
-- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list

When hot-plugging a VCPU into the guest, kvm needs to allocate some data from the DMA zone, which might be in a memory node that's not allowed in cpuset.mems. Basically the same problem as there was with starting the domain and due to which commit 7e72ac787848b7434c9359a57c1e2789d92350f8 exists. This patch just extends it to hotplugging as well. Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=1161540 Signed-off-by: Martin Kletzander <mkletzan@redhat.com> --- src/qemu/qemu_driver.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index bfa62c4..dbe48ad 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -97,6 +97,7 @@ #include "virhostdev.h" #include "domain_capabilities.h" #include "vircgroup.h" +#include "virnuma.h" #define VIR_FROM_THIS VIR_FROM_QEMU @@ -4621,6 +4622,11 @@ qemuDomainSetVcpusFlags(virDomainPtr dom, unsigned int nvcpus, int ncpuinfo; qemuDomainObjPrivatePtr priv; size_t i; + virCgroupPtr cgroup_temp = NULL; + char *mem_mask = NULL; + char *all_nodes_str = NULL; + virBitmapPtr all_nodes = NULL; + virErrorPtr err = NULL; virCheckFlags(VIR_DOMAIN_AFFECT_LIVE | VIR_DOMAIN_AFFECT_CONFIG | @@ -4646,9 +4652,22 @@ qemuDomainSetVcpusFlags(virDomainPtr dom, unsigned int nvcpus, priv = vm->privateData; + if (virCgroupNewEmulator(priv->cgroup, false, &cgroup_temp) < 0) + goto cleanup; + + if (!(all_nodes = virNumaGetHostNodeset())) + goto cleanup; + + if (!(all_nodes_str = virBitmapFormat(all_nodes))) + goto cleanup; + if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_MODIFY) < 0) goto cleanup; + if (virCgroupGetCpusetMems(cgroup_temp, &mem_mask) < 0 || + virCgroupSetCpusetMems(cgroup_temp, all_nodes_str) < 0) + goto endjob; + maximum = (flags & VIR_DOMAIN_VCPU_MAXIMUM) != 0; flags &= ~VIR_DOMAIN_VCPU_MAXIMUM; @@ -4758,6 +4777,12 @@ qemuDomainSetVcpusFlags(virDomainPtr dom, unsigned int nvcpus, ret = 0; endjob: + if (mem_mask) { + err = virSaveLastError(); + 
virCgroupSetCpusetMems(cgroup_temp, mem_mask); + virSetError(err); + } + if (!qemuDomainObjEndJob(driver, vm)) vm = NULL; @@ -4766,6 +4791,10 @@ qemuDomainSetVcpusFlags(virDomainPtr dom, unsigned int nvcpus, virObjectUnlock(vm); virObjectUnref(caps); VIR_FREE(cpuinfo); + VIR_FREE(mem_mask); + VIR_FREE(all_nodes_str); + virBitmapFree(all_nodes); + virCgroupFree(&cgroup_temp); virObjectUnref(cfg); return ret; } -- 2.2.0

On 15.12.2014 08:58, Martin Kletzander wrote:
Details are in the patches themselves, but the basic idea is this:
Setup:
$ grep DMA32 /proc/zoneinfo Node 0, zone DMA32
$ virsh dumpxml domain | grep -C1 strict <numatune> <memory mode='strict' nodeset='1'/> </numatune>
$ virsh start domain Domain domain started
Before:
$ virsh setvcpus domain 2 error: Unable to read from monitor: Connection reset by peer # Domain died
After:
$ virsh setvcpus domain 2 # hotplug successful
Martin
Martin Kletzander (6): util: Add function virCgroupHasEmptyTasks util: Add virNumaGetHostNodeset qemu: Remove unnecessary qemuSetupCgroupPostInit function qemu: Save numad advice into qemuDomainObjPrivate qemu: Leave cpuset.mems in parent cgroup alone qemu: Fix hotplugging cpus with strict memory pinning
src/libvirt_private.syms | 2 ++ src/qemu/qemu_cgroup.c | 94 +++++++++++++++++++++++++++++++++++++----------- src/qemu/qemu_cgroup.h | 9 ++--- src/qemu/qemu_domain.c | 1 + src/qemu/qemu_domain.h | 1 + src/qemu/qemu_driver.c | 88 +++++++++++++++++++++++++-------------------- src/qemu/qemu_process.c | 21 ++++++----- src/util/vircgroup.c | 23 ++++++++++++ src/util/vircgroup.h | 4 ++- src/util/virnuma.c | 28 +++++++++++++++ src/util/virnuma.h | 1 + 11 files changed, 194 insertions(+), 78 deletions(-)
ACK series. Michal
participants (3)
-
Eric Blake
-
Martin Kletzander
-
Michal Privoznik