[libvirt] Make systemd work with LXC user namespaces

These two patches fix the issue that control groups are unusable if user namespaces are enabled. We have to chown() the control group to the correct user. As the container mounts the control group and only the controller is allowed to chown() the mount point we need a new barrier to synchronize them after the container has setup the control groups. Thanks, //richard [PATCH 1/2] lxc: Add another barrier [PATCH 2/2] lxc: Add virCgroupSetOwner()

Add another barrier to give the controller a chance to setup additional things after the container setup is done. This new barrier is needed to chown() the cgroup after the container has mounted it. Signed-off-by: Richard Weinberger <richard@nod.at> --- src/lxc/lxc_container.c | 16 +++++++++++++++- src/lxc/lxc_container.h | 1 + src/lxc/lxc_controller.c | 17 +++++++++++++++++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index c6bdc8c..24af73a 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -108,6 +108,7 @@ struct __lxc_child_argv { size_t nttyPaths; char **ttyPaths; int handshakefd; + int posthandshakefd; }; static int lxcContainerMountFSBlock(virDomainFSDefPtr fs, @@ -1880,10 +1881,20 @@ static int lxcContainerChild(void *data) goto cleanup; } + /* wait for controller to setup final tasks */ + VIR_DEBUG("Received container continue message"); + if (lxcContainerWaitForContinue(argv->posthandshakefd) < 0) { + virReportSystemError(errno, "%s", + _("Failed to read the container continue message")); + goto cleanup; + } + VIR_DEBUG("Received container continue message"); + VIR_DEBUG("Setting up security labeling"); if (virSecurityManagerSetProcessLabel(argv->securityDriver, vmDef) < 0) goto cleanup; + VIR_FORCE_CLOSE(argv->posthandshakefd); VIR_FORCE_CLOSE(argv->handshakefd); VIR_FORCE_CLOSE(argv->monitor); if (lxcContainerSetupFDs(&ttyfd, @@ -1896,6 +1907,7 @@ cleanup: VIR_FORCE_CLOSE(ttyfd); VIR_FORCE_CLOSE(argv->monitor); VIR_FORCE_CLOSE(argv->handshakefd); + VIR_FORCE_CLOSE(argv->posthandshakefd); if (ret == 0) { /* this function will only return if an error occurred */ @@ -1984,6 +1996,7 @@ int lxcContainerStart(virDomainDefPtr def, int *passFDs, int control, int handshakefd, + int posthandshakefd, size_t nttyPaths, char **ttyPaths) { @@ -2001,7 +2014,8 @@ int lxcContainerStart(virDomainDefPtr def, .monitor = control, .nttyPaths = nttyPaths, .ttyPaths = ttyPaths, - .handshakefd = handshakefd + .handshakefd = handshakefd, + .posthandshakefd = posthandshakefd }; /* allocate a stack for the container */ diff --git a/src/lxc/lxc_container.h b/src/lxc/lxc_container.h index e74a7d7..03102f4 100644 --- a/src/lxc/lxc_container.h +++ b/src/lxc/lxc_container.h @@ -60,6 +60,7 @@ int lxcContainerStart(virDomainDefPtr def, int *passFDs, int control, int handshakefd, + int posthandshakefd, size_t nttyPaths, char **ttyPaths); diff --git a/src/lxc/lxc_controller.c b/src/lxc/lxc_controller.c index 5ca960f..f7b614b 100644 --- a/src/lxc/lxc_controller.c +++ b/src/lxc/lxc_controller.c @@ -2128,6 +2128,7 @@ virLXCControllerRun(virLXCControllerPtr ctrl) int rc = -1; int control[2] = { -1, -1}; int containerhandshake[2] = { -1, -1 }; + int containerposthandshake[2] = { -1, -1 }; char **containerTTYPaths = NULL; size_t i; @@ -2146,6 +2147,12 @@ virLXCControllerRun(virLXCControllerPtr ctrl) goto cleanup; } + if (socketpair(PF_UNIX, SOCK_STREAM, 0, containerposthandshake) < 0) { + virReportSystemError(errno, "%s", + _("socketpair failed")); + goto cleanup; + } + if (virLXCControllerSetupPrivateNS() < 0) goto cleanup; @@ -2184,11 +2191,13 @@ virLXCControllerRun(virLXCControllerPtr ctrl) ctrl->passFDs, control[1], containerhandshake[1], + containerposthandshake[1], ctrl->nconsoles, containerTTYPaths)) < 0) goto cleanup; VIR_FORCE_CLOSE(control[1]); VIR_FORCE_CLOSE(containerhandshake[1]); + VIR_FORCE_CLOSE(containerposthandshake[1]); for (i = 0; i < ctrl->npassFDs; i++) VIR_FORCE_CLOSE(ctrl->passFDs[i]); @@ -2214,6 +2223,12 @@ virLXCControllerRun(virLXCControllerPtr ctrl) goto cleanup; } + if (lxcContainerSendContinue(containerposthandshake[0]) < 0) { + virReportSystemError(errno, "%s", + _("Unable to send container continue message")); + goto cleanup; + } + /* ...and reduce our privileges */ if (lxcControllerClearCapabilities() < 0) goto cleanup; @@ -2240,6 +2255,8 @@ cleanup: VIR_FORCE_CLOSE(control[1]); VIR_FORCE_CLOSE(containerhandshake[0]); VIR_FORCE_CLOSE(containerhandshake[1]); + VIR_FORCE_CLOSE(containerposthandshake[0]); + VIR_FORCE_CLOSE(containerposthandshake[1]); for (i = 0; i < ctrl->nconsoles; i++) VIR_FREE(containerTTYPaths[i]); -- 1.8.4.5

Add a new helper function to change the permissions of a control group. Signed-off-by: Richard Weinberger <richard@nod.at> --- src/lxc/lxc_controller.c | 7 +++++++ src/util/vircgroup.c | 43 +++++++++++++++++++++++++++++++++++++++++++ src/util/vircgroup.h | 2 ++ 3 files changed, 52 insertions(+) diff --git a/src/lxc/lxc_controller.c b/src/lxc/lxc_controller.c index f7b614b..6e348b3 100644 --- a/src/lxc/lxc_controller.c +++ b/src/lxc/lxc_controller.c @@ -2223,6 +2223,13 @@ virLXCControllerRun(virLXCControllerPtr ctrl) goto cleanup; } + /* setup control group permissions for user namespace */ + if (ctrl->def->idmap.uidmap) { + if (virCgroupSetOwner(ctrl->cgroup, ctrl->def->idmap.uidmap[0].target, + ctrl->def->idmap.gidmap[0].target)) + goto cleanup; + } + if (lxcContainerSendContinue(containerposthandshake[0]) < 0) { virReportSystemError(errno, "%s", _("Unable to send container continue message")); diff --git a/src/util/vircgroup.c b/src/util/vircgroup.c index a6d60c5..b66ffed 100644 --- a/src/util/vircgroup.c +++ b/src/util/vircgroup.c @@ -3252,6 +3252,49 @@ cleanup: return ret; } +int virCgroupSetOwner(virCgroupPtr cgroup, uid_t uid, gid_t gid) { + size_t i; + + for (i = 0; i < VIR_CGROUP_CONTROLLER_LAST; i++) { + char *base, *entry; + DIR *dh; + struct dirent *de; + + if (virAsprintf(&base, "%s%s", cgroup->controllers[i].mountPoint, + cgroup->controllers[i].placement) < 0) { + virReportOOMError(); + return -1; + } + + dh = opendir(base); + while ((de = readdir(dh)) != NULL) { + if (STREQ(de->d_name, ".") || + STREQ(de->d_name, "..")) + continue; + + if (virAsprintf(&entry, "%s/%s", base, de->d_name) < 0) { + VIR_FREE(base); + virReportOOMError(); + } + + if (chown(entry, uid, gid) < 0) + virReportSystemError(errno, _("cannot chown '%s' to (%u, %u)"), + entry, uid, gid); + + VIR_FREE(entry); + } + closedir(dh); + + if (chown(base, uid, gid) < 0) + virReportSystemError(errno, _("cannot chown '%s' to (%u, %u)"), + base, uid, gid); + + VIR_FREE(base); + } + + return 0; +} + /** * virCgroupSupportsCpuBW(): diff --git a/src/util/vircgroup.h b/src/util/vircgroup.h index a70eb18..6e00f28 100644 --- a/src/util/vircgroup.h +++ b/src/util/vircgroup.h @@ -225,4 +225,6 @@ int virCgroupIsolateMount(virCgroupPtr group, bool virCgroupSupportsCpuBW(virCgroupPtr cgroup); +int virCgroupSetOwner(virCgroupPtr cgroup, uid_t uid, gid_t gid); + #endif /* __VIR_CGROUP_H__ */ -- 1.8.4.5

On Sat, Feb 08, 2014 at 06:37:43PM +0100, Richard Weinberger wrote:
Add a new helper function to change the permissions of a control group.
Signed-off-by: Richard Weinberger <richard@nod.at> --- src/lxc/lxc_controller.c | 7 +++++++ src/util/vircgroup.c | 43 +++++++++++++++++++++++++++++++++++++++++++ src/util/vircgroup.h | 2 ++ 3 files changed, 52 insertions(+)
diff --git a/src/lxc/lxc_controller.c b/src/lxc/lxc_controller.c index f7b614b..6e348b3 100644 --- a/src/lxc/lxc_controller.c +++ b/src/lxc/lxc_controller.c @@ -2223,6 +2223,13 @@ virLXCControllerRun(virLXCControllerPtr ctrl) goto cleanup; }
+ /* setup control group permissions for user namespace */ + if (ctrl->def->idmap.uidmap) { + if (virCgroupSetOwner(ctrl->cgroup, ctrl->def->idmap.uidmap[0].target, + ctrl->def->idmap.gidmap[0].target)) + goto cleanup; + } +
I wonder is it possible to just do the ownership change in the virLXCCgroupCreate method ? When the container then does the bind mount, it should preserve the ownership correctly. This would avoid any need for the extra handshake synchronization.
diff --git a/src/util/vircgroup.c b/src/util/vircgroup.c index a6d60c5..b66ffed 100644 --- a/src/util/vircgroup.c +++ b/src/util/vircgroup.c @@ -3252,6 +3252,49 @@ cleanup: return ret; }
+int virCgroupSetOwner(virCgroupPtr cgroup, uid_t uid, gid_t gid) { + size_t i; + + for (i = 0; i < VIR_CGROUP_CONTROLLER_LAST; i++) {
I think we only want to set ownership for VIR_CGROUP_CONTROLLER_SYSTEMD. We don't want to give the container permission to change resource controllers, this that would allow it to potentially escape its resource limits. You might think the hierarchy would lock things down, but I believe the kernel developers still say it is unsafe to delegate cgroup resource controllers to unprivileged users. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

Am 11.02.2014 13:05, schrieb Daniel P. Berrange:
On Sat, Feb 08, 2014 at 06:37:43PM +0100, Richard Weinberger wrote:
Add a new helper function to change the permissions of a control group.
Signed-off-by: Richard Weinberger <richard@nod.at> --- src/lxc/lxc_controller.c | 7 +++++++ src/util/vircgroup.c | 43 +++++++++++++++++++++++++++++++++++++++++++ src/util/vircgroup.h | 2 ++ 3 files changed, 52 insertions(+)
diff --git a/src/lxc/lxc_controller.c b/src/lxc/lxc_controller.c index f7b614b..6e348b3 100644 --- a/src/lxc/lxc_controller.c +++ b/src/lxc/lxc_controller.c @@ -2223,6 +2223,13 @@ virLXCControllerRun(virLXCControllerPtr ctrl) goto cleanup; }
+ /* setup control group permissions for user namespace */ + if (ctrl->def->idmap.uidmap) { + if (virCgroupSetOwner(ctrl->cgroup, ctrl->def->idmap.uidmap[0].target, + ctrl->def->idmap.gidmap[0].target)) + goto cleanup; + } +
I wonder is it possible to just do the ownership change in the virLXCCgroupCreate method ? When the container then does the bind mount, it should preserve the ownership correctly. This would avoid any need for the extra handshake synchronization.
Bah, I've overlooked that. Of course it works. Much simplified patch is on its way. Thanks for pointing this out!
diff --git a/src/util/vircgroup.c b/src/util/vircgroup.c index a6d60c5..b66ffed 100644 --- a/src/util/vircgroup.c +++ b/src/util/vircgroup.c @@ -3252,6 +3252,49 @@ cleanup: return ret; }
+int virCgroupSetOwner(virCgroupPtr cgroup, uid_t uid, gid_t gid) { + size_t i; + + for (i = 0; i < VIR_CGROUP_CONTROLLER_LAST; i++) {
I think we only want to set ownership for VIR_CGROUP_CONTROLLER_SYSTEMD. We don't want to give the container permission to change resource controllers, this that would allow it to potentially escape its resource limits. You might think the hierarchy would lock things down, but I believe the kernel developers still say it is unsafe to delegate cgroup resource controllers to unprivileged users.
That's right. But then we should generally only delegate the systemd cgroup to a container. ...patch sent. Thanks, //richard
participants (2)
-
Daniel P. Berrange
-
Richard Weinberger