[libvirt] [PATCH v2 0/3] A few NUMA fixes

diff to v1: -reworked to follow Jan's review. Hopefully. Michal Privoznik (3): vircgroup: Introduce virCgroupControllerAvailable qemuProcessHook: Call virNuma*() iff needed virLXCControllerSetupResourceLimits: Call virNuma*() iff needed src/libvirt_private.syms | 1 + src/lxc/lxc_controller.c | 22 ++++++++++++++++------ src/qemu/qemu_process.c | 21 +++++++++++++++++---- src/util/vircgroup.c | 19 +++++++++++++++++++ src/util/vircgroup.h | 1 + tests/vircgrouptest.c | 31 +++++++++++++++++++++++++++++++ 6 files changed, 85 insertions(+), 10 deletions(-) -- 2.0.5

This new internal API checks if given CGroup controller is available. It is going to be needed later when we need to make a decision whether pin domain memory onto NUMA nodes using cpuset CGroup controller or using numa_set_membind(). Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/libvirt_private.syms | 1 + src/util/vircgroup.c | 19 +++++++++++++++++++ src/util/vircgroup.h | 1 + tests/vircgrouptest.c | 31 +++++++++++++++++++++++++++++++ 4 files changed, 52 insertions(+) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 5716ece..136036b 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1129,6 +1129,7 @@ virCgroupAllowDevice; virCgroupAllowDeviceMajor; virCgroupAllowDevicePath; virCgroupAvailable; +virCgroupControllerAvailable; virCgroupControllerTypeFromString; virCgroupControllerTypeToString; virCgroupDenyAllDevices; diff --git a/src/util/vircgroup.c b/src/util/vircgroup.c index d42f433..00c0bab 100644 --- a/src/util/vircgroup.c +++ b/src/util/vircgroup.c @@ -4011,6 +4011,20 @@ virCgroupHasEmptyTasks(virCgroupPtr cgroup, int controller) return ret; } +bool +virCgroupControllerAvailable(int controller) +{ + virCgroupPtr cgroup; + bool ret = false; + + if (virCgroupNewSelf(&cgroup) < 0) + return ret; + + ret = virCgroupHasController(cgroup, controller); + virCgroupFree(&cgroup); + return ret; +} + #else /* !VIR_CGROUP_SUPPORTED */ bool @@ -4781,4 +4795,9 @@ virCgroupHasEmptyTasks(virCgroupPtr cgroup ATTRIBUTE_UNUSED, return -1; } +bool +virCgroupControllerAvailable(int controller ATTRIBUTE_UNUSED) +{ + return false; +} #endif /* !VIR_CGROUP_SUPPORTED */ diff --git a/src/util/vircgroup.h b/src/util/vircgroup.h index eee15ca..5d6356d 100644 --- a/src/util/vircgroup.h +++ b/src/util/vircgroup.h @@ -282,4 +282,5 @@ int virCgroupSetOwner(virCgroupPtr cgroup, int virCgroupHasEmptyTasks(virCgroupPtr cgroup, int controller); +bool virCgroupControllerAvailable(int controller); #endif /* __VIR_CGROUP_H__ */ diff --git a/tests/vircgrouptest.c b/tests/vircgrouptest.c index bbf28d1..2b30258 100644 --- a/tests/vircgrouptest.c +++ b/tests/vircgrouptest.c @@ -586,6 +586,34 @@ static int testCgroupAvailable(const void *args) return 0; } +static int testCgroupControllerAvailable(const void *args ATTRIBUTE_UNUSED) +{ + int ret = 0; + +# define CHECK_CONTROLLER(c, present) \ + if ((present && !virCgroupControllerAvailable(c)) || \ + (!present && virCgroupControllerAvailable(c))) { \ + fprintf(stderr, present ? \ + "Expected controller %s not available\n" : \ + "Unexpected controller %s available\n", #c); \ + ret = -1; \ + } + + CHECK_CONTROLLER(VIR_CGROUP_CONTROLLER_CPU, true) + CHECK_CONTROLLER(VIR_CGROUP_CONTROLLER_CPUACCT, true) + CHECK_CONTROLLER(VIR_CGROUP_CONTROLLER_CPUSET, true) + CHECK_CONTROLLER(VIR_CGROUP_CONTROLLER_MEMORY, true) + CHECK_CONTROLLER(VIR_CGROUP_CONTROLLER_DEVICES, false) + CHECK_CONTROLLER(VIR_CGROUP_CONTROLLER_FREEZER, true) + CHECK_CONTROLLER(VIR_CGROUP_CONTROLLER_BLKIO, true) + CHECK_CONTROLLER(VIR_CGROUP_CONTROLLER_NET_CLS, false) + CHECK_CONTROLLER(VIR_CGROUP_CONTROLLER_PERF_EVENT, false) + CHECK_CONTROLLER(VIR_CGROUP_CONTROLLER_SYSTEMD, true) + +# undef CHECK_CONTROLLER + return ret; +} + static int testCgroupGetPercpuStats(const void *args ATTRIBUTE_UNUSED) { virCgroupPtr cgroup = NULL; @@ -886,6 +914,9 @@ mymain(void) if (virtTestRun("Cgroup available", testCgroupAvailable, (void*)0x1) < 0) ret = -1; + if (virtTestRun("Cgroup controller available", testCgroupControllerAvailable, NULL) < 0) + ret = -1; + if (virtTestRun("virCgroupGetBlkioIoServiced works", testCgroupGetBlkioIoServiced, NULL) < 0) ret = -1; -- 2.0.5

On Tue, Mar 31, 2015 at 01:59:08PM +0200, Michal Privoznik wrote:
This new internal API checks if given CGroup controller is available. It is going to be needed later when we need to make a decision whether pin domain memory onto NUMA nodes using cpuset CGroup controller or using numa_set_membind().
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/libvirt_private.syms | 1 + src/util/vircgroup.c | 19 +++++++++++++++++++ src/util/vircgroup.h | 1 + tests/vircgrouptest.c | 31 +++++++++++++++++++++++++++++++ 4 files changed, 52 insertions(+)
ACK Jan

https://bugzilla.redhat.com/show_bug.cgi?id=1198645 Once upon a time, there was a little domain. And the domain was pinned onto a NUMA node and hasn't fully allocated its memory: <memory unit='KiB'>2355200</memory> <currentMemory unit='KiB'>1048576</currentMemory> <numatune> <memory mode='strict' nodeset='0'/> </numatune> Oh little me, said the domain, what will I do with so little memory. If I only had a few megabytes more. But the old admin noticed the whimpering, barely audible to untrained human ear. And good admin he was, he gave the domain yet more memory. But the old NUMA topology witch forbade to allocate more memory on the node zero. So he decided to allocate it on a different node: virsh # numatune little_domain --nodeset 0-1 virsh # setmem little_domain 2355200 The little domain was happy. For a while. Until bad, sharp teeth shaped creature came. Every process in the system was afraid of him. The OOM Killer they called him. Oh no, he's after the little domain. There's no escape. Do you kids know why? Because when the little domain was born, her father, Libvirt, called numa_set_membind(). So even if the admin allowed her to allocate memory from other nodes in the cgroups, the membind() forbid it. So what's the lesson? Libvirt should rely on cgroups, whenever possible and use numa_set_membind() as the last ditch effort. Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/qemu/qemu_process.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 2d86eb8..6c955c3 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3166,6 +3166,7 @@ static int qemuProcessHook(void *data) int fd; virBitmapPtr nodeset = NULL; virDomainNumatuneMemMode mode; + bool doNuma = true; /* This method cannot use any mutexes, which are not * protected across fork() @@ -3197,11 +3198,23 @@ static int qemuProcessHook(void *data) goto cleanup; mode = virDomainNumatuneGetMode(h->vm->def->numa, -1); - nodeset = virDomainNumatuneGetNodeset(h->vm->def->numa, - priv->autoNodeset, -1); - if (virNumaSetupMemoryPolicy(mode, nodeset) < 0) - goto cleanup; + if (mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT && + h->cfg->cgroupControllers & (1 << VIR_CGROUP_CONTROLLER_CPUSET) && + virCgroupControllerAvailable(VIR_CGROUP_CONTROLLER_CPUSET)) { + /* Use virNuma* API iff necessary. Once set and child is exec()-ed, + * there's no way for us to change it. Rely on cgroups (if available + * and enabled in the config) rather then virNuma*. */ + doNuma = false; + } + + if (doNuma) { + nodeset = virDomainNumatuneGetNodeset(h->vm->def->numa, + priv->autoNodeset, -1); + + if (virNumaSetupMemoryPolicy(mode, nodeset) < 0) + goto cleanup; + } ret = 0; -- 2.0.5

On Tue, Mar 31, 2015 at 01:59:09PM +0200, Michal Privoznik wrote:
https://bugzilla.redhat.com/show_bug.cgi?id=1198645
Once upon a time, there was a little domain. And the domain was pinned onto a NUMA node and hasn't fully allocated its memory:
<memory unit='KiB'>2355200</memory> <currentMemory unit='KiB'>1048576</currentMemory>
<numatune> <memory mode='strict' nodeset='0'/> </numatune>
Oh little me, said the domain, what will I do with so little memory. If I only had a few megabytes more. But the old admin noticed the whimpering, barely audible to untrained human ear. And good admin he was, he gave the domain yet more memory. But the old NUMA topology witch forbade to allocate more memory on the node zero. So he decided to allocate it on a different node:
virsh # numatune little_domain --nodeset 0-1
virsh # setmem little_domain 2355200
The little domain was happy. For a while. Until bad, sharp teeth shaped creature came. Every process in the system was afraid of him. The OOM Killer they called him. Oh no, he's after the little domain. There's no escape.
Do you kids know why? Because when the little domain was born, her father, Libvirt, called numa_set_membind(). So even if the admin allowed her to allocate memory from other nodes in the cgroups, the membind() forbid it.
So what's the lesson? Libvirt should rely on cgroups, whenever possible and use numa_set_membind() as the last ditch effort.
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/qemu/qemu_process.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-)
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 2d86eb8..6c955c3 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3166,6 +3166,7 @@ static int qemuProcessHook(void *data) int fd; virBitmapPtr nodeset = NULL; virDomainNumatuneMemMode mode; + bool doNuma = true;
This redundant bool is redundant.
/* This method cannot use any mutexes, which are not * protected across fork() @@ -3197,11 +3198,23 @@ static int qemuProcessHook(void *data) goto cleanup;
mode = virDomainNumatuneGetMode(h->vm->def->numa, -1); - nodeset = virDomainNumatuneGetNodeset(h->vm->def->numa, - priv->autoNodeset, -1);
- if (virNumaSetupMemoryPolicy(mode, nodeset) < 0) - goto cleanup; + if (mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT && + h->cfg->cgroupControllers & (1 << VIR_CGROUP_CONTROLLER_CPUSET) && + virCgroupControllerAvailable(VIR_CGROUP_CONTROLLER_CPUSET)) { + /* Use virNuma* API iff necessary. Once set and child is exec()-ed, + * there's no way for us to change it. Rely on cgroups (if available + * and enabled in the config) rather then virNuma*. */ + doNuma = false;
Just invert the whole condition: if (!(mode == && h->cfg && virCgroupController)) (I also think iff is an obscure obfuscation of 'only if', but given the fairy-tale commit message: ) ACK if you remove the bool Jan

Like we are doing in qemu driver ($COMMIT_HASH_TO_BE_FILLED_DURING_PUSHING), lets call virNumaSetupMemoryPolicy() only if really needed. Problem is, if we numa_set_membind() child, there's no way to change it from the daemon afterwards. So any later attempts to change the pinning will fail. But in very weird way - CGroups will be set, but due to membind child will not allocate memory from any other node. Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/lxc/lxc_controller.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/lxc/lxc_controller.c b/src/lxc/lxc_controller.c index 8545f29..4b340ab 100644 --- a/src/lxc/lxc_controller.c +++ b/src/lxc/lxc_controller.c @@ -742,14 +742,24 @@ static int virLXCControllerSetupResourceLimits(virLXCControllerPtr ctrl) virBitmapPtr nodeset = NULL; virDomainNumatuneMemMode mode; - VIR_DEBUG("Setting up process resource limits"); - - if (virLXCControllerGetNumadAdvice(ctrl, &auto_nodeset) < 0) - goto cleanup; - - nodeset = virDomainNumatuneGetNodeset(ctrl->def->numa, auto_nodeset, -1); mode = virDomainNumatuneGetMode(ctrl->def->numa, -1); + if (mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT && + virCgroupControllerAvailable(VIR_CGROUP_CONTROLLER_CPUSET)) { + /* Use virNuma* API iff necessary. Once set and child is exec()-ed, + * there's no way for us to change it. Rely on cgroups (if available + * and enabled in the config) rather then virNuma*. */ + VIR_DEBUG("Postponing setting up resource limits to CGroup set up phase"); + return virLXCControllerSetupCpuAffinity(ctrl); + } + + VIR_DEBUG("Setting up process resource limits"); + + if (virLXCControllerGetNumadAdvice(ctrl, &auto_nodeset) < 0) + goto cleanup; + + nodeset = virDomainNumatuneGetNodeset(ctrl->def->numa, auto_nodeset, -1); + if (virNumaSetupMemoryPolicy(mode, nodeset) < 0) goto cleanup; -- 2.0.5

On Tue, Mar 31, 2015 at 01:59:10PM +0200, Michal Privoznik wrote:
Like we are doing in qemu driver ($COMMIT_HASH_TO_BE_FILLED_DURING_PUSHING), lets call virNumaSetupMemoryPolicy() only if really needed. Problem is, if we numa_set_membind() child, there's no way to change it from the daemon afterwards. So any later attempts to change the pinning will fail. But in very weird way - CGroups will be set, but due to membind child will not allocate memory from any other node.
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/lxc/lxc_controller.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-)
diff --git a/src/lxc/lxc_controller.c b/src/lxc/lxc_controller.c index 8545f29..4b340ab 100644 --- a/src/lxc/lxc_controller.c +++ b/src/lxc/lxc_controller.c @@ -742,14 +742,24 @@ static int virLXCControllerSetupResourceLimits(virLXCControllerPtr ctrl) virBitmapPtr nodeset = NULL; virDomainNumatuneMemMode mode;
- VIR_DEBUG("Setting up process resource limits"); - - if (virLXCControllerGetNumadAdvice(ctrl, &auto_nodeset) < 0) - goto cleanup; - - nodeset = virDomainNumatuneGetNodeset(ctrl->def->numa, auto_nodeset, -1); mode = virDomainNumatuneGetMode(ctrl->def->numa, -1);
+ if (mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT && + virCgroupControllerAvailable(VIR_CGROUP_CONTROLLER_CPUSET)) { + /* Use virNuma* API iff necessary. Once set and child is exec()-ed, + * there's no way for us to change it. Rely on cgroups (if available + * and enabled in the config) rather then virNuma*. */ + VIR_DEBUG("Postponing setting up resource limits to CGroup set up phase");
'resource limits' is too vague
+ return virLXCControllerSetupCpuAffinity(ctrl);
considering that you limit the available CPUs here. Also, instead of copying the unrelated call here,
+ } + + VIR_DEBUG("Setting up process resource limits"); + + if (virLXCControllerGetNumadAdvice(ctrl, &auto_nodeset) < 0) + goto cleanup; + + nodeset = virDomainNumatuneGetNodeset(ctrl->def->numa, auto_nodeset, -1); + if (virNumaSetupMemoryPolicy(mode, nodeset) < 0) goto cleanup;
Do if (!(STRICT && ...)) { virNuma() } else { VIR_DEBUG } ACK if you convert the function to only have one return call. (And the DEBUG could be useful in QEMU driver as well) Jan
participants (2)
-
Ján Tomko
-
Michal Privoznik