[libvirt] [PATCH] cgroup: Drop resource partition from virSystemdMakeScopeName

The scope name, even according to our docs is "machine-$DRIVER\x2d$VMNAME.scope" virSystemdMakeScopeName would use the resource partition name instead of "machine-" if it was specified thus creating invalid scope paths. This makes libvirt drop cgroups for a VM that uses custom resource partition upon reconnecting since the detected scope name would not match the expected name generated by virSystemdMakeScopeName. The error is exposed by the following log entry: debug : virCgroupValidateMachineGroup:302 : Name 'machine-qemu\x2dtestvm.scope' for controller 'cpu' does not match 'testvm', 'testvm.libvirt-qemu' or 'machine-test-qemu\x2dtestvm.scope' for a "/machine/test" resource and "testvm" vm. Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=1238570 --- src/lxc/lxc_process.c | 6 ------ src/qemu/qemu_cgroup.c | 3 --- src/util/vircgroup.c | 11 ++--------- src/util/vircgroup.h | 1 - src/util/virsystemd.c | 9 ++------- src/util/virsystemd.h | 3 +-- tests/virsystemdtest.c | 20 +++++++------------- 7 files changed, 12 insertions(+), 41 deletions(-) diff --git a/src/lxc/lxc_process.c b/src/lxc/lxc_process.c index 2bdce3b..87ee484 100644 --- a/src/lxc/lxc_process.c +++ b/src/lxc/lxc_process.c @@ -1319,9 +1319,6 @@ int virLXCProcessStart(virConnectPtr conn, * more reliable way to kill everything off if something * goes wrong from here onwards ... */ if (virCgroupNewDetectMachine(vm->def->name, "lxc", vm->pid, - vm->def->resource ? - vm->def->resource->partition : - NULL, -1, &priv->cgroup) < 0) goto cleanup; @@ -1505,9 +1502,6 @@ virLXCProcessReconnectDomain(virDomainObjPtr vm, goto error; if (virCgroupNewDetectMachine(vm->def->name, "lxc", vm->pid, - vm->def->resource ? - vm->def->resource->partition : - NULL, -1, &priv->cgroup) < 0) goto error; diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index 8ed74ee..ab21e12 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -855,9 +855,6 @@ qemuConnectCgroup(virQEMUDriverPtr driver, if (virCgroupNewDetectMachine(vm->def->name, "qemu", vm->pid, - vm->def->resource ? - vm->def->resource->partition : - NULL, cfg->cgroupControllers, &priv->cgroup) < 0) goto cleanup; diff --git a/src/util/vircgroup.c b/src/util/vircgroup.c index 0ef2d29..0599ba5 100644 --- a/src/util/vircgroup.c +++ b/src/util/vircgroup.c @@ -243,7 +243,6 @@ static bool virCgroupValidateMachineGroup(virCgroupPtr group, const char *name, const char *drivername, - const char *partition, bool stripEmulatorSuffix) { size_t i; @@ -258,10 +257,7 @@ virCgroupValidateMachineGroup(virCgroupPtr group, if (virCgroupPartitionEscape(&partname) < 0) goto cleanup; - if (!partition) - partition = "/machine"; - - if (!(scopename = virSystemdMakeScopeName(name, drivername, partition))) + if (!(scopename = virSystemdMakeScopeName(name, drivername))) goto cleanup; if (virCgroupPartitionEscape(&scopename) < 0) @@ -1498,7 +1494,6 @@ int virCgroupNewDetectMachine(const char *name, const char *drivername, pid_t pid, - const char *partition, int controllers, virCgroupPtr *group) { @@ -1508,8 +1503,7 @@ virCgroupNewDetectMachine(const char *name, return -1; } - if (!virCgroupValidateMachineGroup(*group, name, drivername, partition, - true)) { + if (!virCgroupValidateMachineGroup(*group, name, drivername, true)) { VIR_DEBUG("Failed to validate machine name for '%s' driver '%s'", name, drivername); virCgroupFree(group); @@ -4047,7 +4041,6 @@ int virCgroupNewDetectMachine(const char *name ATTRIBUTE_UNUSED, const char *drivername ATTRIBUTE_UNUSED, pid_t pid ATTRIBUTE_UNUSED, - const char *partition ATTRIBUTE_UNUSED, int controllers ATTRIBUTE_UNUSED, virCgroupPtr *group ATTRIBUTE_UNUSED) { diff --git a/src/util/vircgroup.h b/src/util/vircgroup.h index e75c522..675a185 100644 --- a/src/util/vircgroup.h +++ b/src/util/vircgroup.h @@ -92,7 +92,6 @@ int virCgroupNewDetect(pid_t pid, int virCgroupNewDetectMachine(const char *name, const char *drivername, pid_t pid, - const char *partition, int controllers, virCgroupPtr *group); diff --git a/src/util/virsystemd.c b/src/util/virsystemd.c index 8cedf8d..54c409d 100644 --- a/src/util/virsystemd.c +++ b/src/util/virsystemd.c @@ -80,16 +80,11 @@ static void virSystemdEscapeName(virBufferPtr buf, char *virSystemdMakeScopeName(const char *name, - const char *drivername, - const char *partition) + const char *drivername) { virBuffer buf = VIR_BUFFER_INITIALIZER; - if (*partition == '/') - partition++; - - virSystemdEscapeName(&buf, partition); - virBufferAddChar(&buf, '-'); + virBufferAddLit(&buf, "machine-"); virSystemdEscapeName(&buf, drivername); virBufferAddLit(&buf, "\\x2d"); virSystemdEscapeName(&buf, name); diff --git a/src/util/virsystemd.h b/src/util/virsystemd.h index 7a29dba..8af2169 100644 --- a/src/util/virsystemd.h +++ b/src/util/virsystemd.h @@ -25,8 +25,7 @@ # include "internal.h" char *virSystemdMakeScopeName(const char *name, - const char *drivername, - const char *slicename); + const char *drivername); char *virSystemdMakeSliceName(const char *partition); char *virSystemdMakeMachineName(const char *name, diff --git a/tests/virsystemdtest.c b/tests/virsystemdtest.c index 261c4cc..d0b9335 100644 --- a/tests/virsystemdtest.c +++ b/tests/virsystemdtest.c @@ -340,7 +340,6 @@ static int testCreateNetwork(const void *opaque ATTRIBUTE_UNUSED) struct testScopeData { const char *name; - const char *partition; const char *expected; }; @@ -351,9 +350,7 @@ testScopeName(const void *opaque) int ret = -1; char *actual = NULL; - if (!(actual = virSystemdMakeScopeName(data->name, - "lxc", - data->partition))) + if (!(actual = virSystemdMakeScopeName(data->name, "lxc"))) goto cleanup; if (STRNEQ(actual, data->expected)) { @@ -472,22 +469,19 @@ mymain(void) if (virtTestRun("Test create with network ", testCreateNetwork, NULL) < 0) ret = -1; -# define TEST_SCOPE(name, partition, unitname) \ +# define TEST_SCOPE(name, unitname) \ do { \ struct testScopeData data = { \ - name, partition, unitname \ + name, unitname \ }; \ if (virtTestRun("Test scopename", testScopeName, &data) < 0) \ ret = -1; \ } while (0) - TEST_SCOPE("demo", "/machine", "machine-lxc\\x2ddemo.scope"); - TEST_SCOPE("demo-name", "/machine", "machine-lxc\\x2ddemo\\x2dname.scope"); - TEST_SCOPE("demo!name", "/machine", "machine-lxc\\x2ddemo\\x21name.scope"); - TEST_SCOPE(".demo", "/machine", "machine-lxc\\x2d\\x2edemo.scope"); - TEST_SCOPE("demo", "/machine/eng-dept", "machine-eng\\x2ddept-lxc\\x2ddemo.scope"); - TEST_SCOPE("demo", "/machine/eng-dept/testing!stuff", - "machine-eng\\x2ddept-testing\\x21stuff-lxc\\x2ddemo.scope"); + TEST_SCOPE("demo", "machine-lxc\\x2ddemo.scope"); + TEST_SCOPE("demo-name", "machine-lxc\\x2ddemo\\x2dname.scope"); + TEST_SCOPE("demo!name", "machine-lxc\\x2ddemo\\x21name.scope"); + TEST_SCOPE(".demo", "machine-lxc\\x2d\\x2edemo.scope"); # define TESTS_PM_SUPPORT_HELPER(name, function) \ do { \ -- 2.4.5

On Thu, Jul 16, 2015 at 04:18:08PM +0200, Peter Krempa wrote:
The scope name, even according to our docs is "machine-$DRIVER\x2d$VMNAME.scope" virSystemdMakeScopeName would use the resource partition name instead of "machine-" if it was specified thus creating invalid scope paths.
This makes libvirt drop cgroups for a VM that uses custom resource partition upon reconnecting since the detected scope name would not match the expected name generated by virSystemdMakeScopeName.
The error is exposed by the following log entry:
debug : virCgroupValidateMachineGroup:302 : Name 'machine-qemu\x2dtestvm.scope' for controller 'cpu' does not match 'testvm', 'testvm.libvirt-qemu' or 'machine-test-qemu\x2dtestvm.scope'
for a "/machine/test" resource and "testvm" vm.
This doesn't look right - the whole idea of the resource partition name is that it overrides the default /machine based path. So ignoring the resource partition is not right, unless I'm mis- understanding what this patch is doing. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On Fri, Jul 17, 2015 at 15:40:31 +0100, Daniel Berrange wrote:
On Thu, Jul 16, 2015 at 04:18:08PM +0200, Peter Krempa wrote:
The scope name, even according to our docs is "machine-$DRIVER\x2d$VMNAME.scope" virSystemdMakeScopeName would use the resource partition name instead of "machine-" if it was specified thus creating invalid scope paths.
This makes libvirt drop cgroups for a VM that uses custom resource partition upon reconnecting since the detected scope name would not match the expected name generated by virSystemdMakeScopeName.
The error is exposed by the following log entry:
debug : virCgroupValidateMachineGroup:302 : Name 'machine-qemu\x2dtestvm.scope' for controller 'cpu' does not match 'testvm', 'testvm.libvirt-qemu' or 'machine-test-qemu\x2dtestvm.scope'
for a "/machine/test" resource and "testvm" vm.
This doesn't look right - the whole idea of the resource partition name is that it overrides the default /machine based path. So ignoring the resource partition is not right, unless I'm mis- understanding what this patch is doing.
Perhaps I wasn't clear enough in the explanation. The code path that calls the function where I've removed the partition is called on daemon restart, where the paths to cgroups are re-detected upon startup. At first virCgroupNewDetect() is called on a VM's pid and the function detects cgroup paths for the given process. The detected data for a VM with <partition>/machine/asdf/test</partiotion> look like: (gdb) p **group $2 = {path = 0x7fff94000ec0 "", controllers = {{type = 0, mountPoint = 0x7fff94001330 "/sys/fs/cgroup/cpu,cpuacct", linkPoint = 0x7fff94001470 "/sys/fs/cgroup/cpu", placement = 0x7fff94001fb0 "/machine.slice/machine-asdf.slice/machine-asdf-test.slice/machine-qemu\\x2df20live2.scope/emulator"}, {type = 0, mountPoint = 0x7fff94001200 "/sys/fs/cgroup/cpu,cpuacct", linkPoint = 0x7fff94001450 "/sys/fs/cgroup/cpuacct", placement = 0x7fff94002020 "/machine.slice/machine-asdf.slice/machine-asdf-test.slice/machine-qemu\\x2df20live2.scope/emulator"}, {type = 0, mountPoint = 0x7fff94001410 "/sys/fs/cgroup/cpuset", linkPoint = 0x0, placement = 0x7fff94001ad0 "/machine.slice/machine-asdf.slice/machine-asdf-test.slice/machine-qemu\\x2df20live2.scope/emulator"}, ... and so on for other cgroups. After the detection virCgroupValidateMachineGroup() is called on the detected cgroup array to validate them with the following algorithm: FOREACH entry IN 'cgroup' array: Find the last directory name in entry.placement (strrchr(..., '/')). IF the cgroup is one of CPU* cgroups AND the last directory component is '/emulator': find next previous directory component and strip '/emuator' from it # This equals to 'machine-qemu\\x2df20live2.scope' in the example above. # validate the name IF directory name IS EQUAL TO 'MACHINE_NAME' OR 'MACHINE_NAME.libvirt-DRIVER_NAME' OR virSystemdMakeScopeName(...): cgroup is valid Since the algorithm does not extract the partition name or validate it in any way in non-systemd scenarios and in scenarios where systemd is used the generated name wouldn't ever match since the scope name as extracted in the code does not include the slice generated from <partition> the cgroups code does not work on systemd machines after libvirt restart. The patch as-is removes the partition name from the code that generates the systemd SCOPE name since that does not contain it. The partition is used to generate SLICE name which is contained in the cgroup path before the SCOPE. The question that comes into my mind is whether we should actually make sure that the <partition> corresponds to the SLICE in the detected path, or just matching the machine SCOPE is good enough in this case. Peter

On Mon, Jul 20, 2015 at 11:10:18AM +0200, Peter Krempa wrote:
On Fri, Jul 17, 2015 at 15:40:31 +0100, Daniel Berrange wrote:
On Thu, Jul 16, 2015 at 04:18:08PM +0200, Peter Krempa wrote:
The scope name, even according to our docs is "machine-$DRIVER\x2d$VMNAME.scope" virSystemdMakeScopeName would use the resource partition name instead of "machine-" if it was specified thus creating invalid scope paths.
This makes libvirt drop cgroups for a VM that uses custom resource partition upon reconnecting since the detected scope name would not match the expected name generated by virSystemdMakeScopeName.
The error is exposed by the following log entry:
debug : virCgroupValidateMachineGroup:302 : Name 'machine-qemu\x2dtestvm.scope' for controller 'cpu' does not match 'testvm', 'testvm.libvirt-qemu' or 'machine-test-qemu\x2dtestvm.scope'
for a "/machine/test" resource and "testvm" vm.
This doesn't look right - the whole idea of the resource partition name is that it overrides the default /machine based path. So ignoring the resource partition is not right, unless I'm mis- understanding what this patch is doing.
Perhaps I wasn't clear enough in the explanation. The code path that calls the function where I've removed the partition is called on daemon restart, where the paths to cgroups are re-detected upon startup.
At first virCgroupNewDetect() is called on a VM's pid and the function detects cgroup paths for the given process.
The detected data for a VM with <partition>/machine/asdf/test</partiotion> look like: (gdb) p **group $2 = {path = 0x7fff94000ec0 "", controllers = {{type = 0, mountPoint = 0x7fff94001330 "/sys/fs/cgroup/cpu,cpuacct", linkPoint = 0x7fff94001470 "/sys/fs/cgroup/cpu", placement = 0x7fff94001fb0 "/machine.slice/machine-asdf.slice/machine-asdf-test.slice/machine-qemu\\x2df20live2.scope/emulator"}, {type = 0, mountPoint = 0x7fff94001200 "/sys/fs/cgroup/cpu,cpuacct", linkPoint = 0x7fff94001450 "/sys/fs/cgroup/cpuacct", placement = 0x7fff94002020 "/machine.slice/machine-asdf.slice/machine-asdf-test.slice/machine-qemu\\x2df20live2.scope/emulator"}, {type = 0, mountPoint = 0x7fff94001410 "/sys/fs/cgroup/cpuset", linkPoint = 0x0, placement = 0x7fff94001ad0 "/machine.slice/machine-asdf.slice/machine-asdf-test.slice/machine-qemu\\x2df20live2.scope/emulator"},
... and so on for other cgroups.
After the detection virCgroupValidateMachineGroup() is called on the detected cgroup array to validate them with the following algorithm:
FOREACH entry IN 'cgroup' array: Find the last directory name in entry.placement (strrchr(..., '/')).
IF the cgroup is one of CPU* cgroups AND the last directory component is '/emulator':
find next previous directory component and strip '/emuator' from it
# This equals to 'machine-qemu\\x2df20live2.scope' in the example above.
# validate the name
IF directory name IS EQUAL TO 'MACHINE_NAME' OR 'MACHINE_NAME.libvirt-DRIVER_NAME' OR virSystemdMakeScopeName(...): cgroup is valid
Since the algorithm does not extract the partition name or validate it in any way in non-systemd scenarios and in scenarios where systemd is used the generated name wouldn't ever match since the scope name as extracted in the code does not include the slice generated from <partition> the cgroups code does not work on systemd machines after libvirt restart.
The patch as-is removes the partition name from the code that generates the systemd SCOPE name since that does not contain it. The partition is used to generate SLICE name which is contained in the cgroup path before the SCOPE.
Ok, I understand what's going on now and reproduced the problem locally too. So ACK to your proposed patch.
The question that comes into my mind is whether we should actually make sure that the <partition> corresponds to the SLICE in the detected path, or just matching the machine SCOPE is good enough in this case.
I'm not sure it matters much - as long as we've detected the cgroup tree we'll do the right thing cleaning it up on guest shutdown, even if the <partition> in XML is different. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On Tue, Jul 21, 2015 at 16:12:09 +0100, Daniel Berrange wrote:
On Mon, Jul 20, 2015 at 11:10:18AM +0200, Peter Krempa wrote:
On Fri, Jul 17, 2015 at 15:40:31 +0100, Daniel Berrange wrote:
On Thu, Jul 16, 2015 at 04:18:08PM +0200, Peter Krempa wrote:
...
The patch as-is removes the partition name from the code that generates the systemd SCOPE name since that does not contain it. The partition is used to generate SLICE name which is contained in the cgroup path before the SCOPE.
Ok, I understand what's going on now and reproduced the problem locally too. So ACK to your proposed patch.
I've pushed it now. Thanks.
The question that comes into my mind is whether we should actually make sure that the <partition> corresponds to the SLICE in the detected path, or just matching the machine SCOPE is good enough in this case.
I'm not sure it matters much - as long as we've detected the cgroup tree we'll do the right thing cleaning it up on guest shutdown, even if the <partition> in XML is different.
I have the same opinion. Additionally if we'd kill off the ability to configure cgroups just because that doesn't match it would be rather user-unfriendly. Peter
participants (2)
-
Daniel P. Berrange
-
Peter Krempa