[libvirt] [PATCH 0/4] Improve mount setup in LXC containers

These 4 patches improve the way we setup special mounts like /sys and /proc in LXC containers. This should help ensure systemd is happier with the cgroups setup These apply ontop of the earlier series to improve SELinux driver integration

From: "Daniel P. Berrange" <berrange@redhat.com> Currently to make sysfs readonly, we remount the existing instance and then bind it readonly. Unfortunately this means sysfs is still showing device objects wrt the host OS namespace. We need it to reflect the container namespace, so we must mount a completely new instance of it. Do the same for selinuxfs since there is no benefit to bind mounting & this lets us simplify the code. * src/lxc/lxc_container.c: Mount fresh sysfs instance Signed-off-by: Daniel P. Berrange <berrange@redhat.com> --- src/lxc/lxc_container.c | 32 +++++++++++--------------------- 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index ca5696d..2076c04 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -423,12 +423,10 @@ err: static int lxcContainerMountBasicFS(virDomainDefPtr def, - const char *srcprefix, bool pivotRoot, virSecurityManagerPtr securityDriver) { const struct { - bool needPrefix; const char *src; const char *dst; const char *type; @@ -441,20 +439,20 @@ static int lxcContainerMountBasicFS(virDomainDefPtr def, * mount point in the main OS becomes readonly too which is not what * we want. Hence some things have two entries here. */ - { false, "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV }, - { false, "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND }, - { false, "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY }, - { true, "/sys", "/sys", NULL, NULL, MS_BIND }, - { true, "/sys", "/sys", NULL, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY }, + { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV }, + { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND }, + { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY }, + { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV }, + { "sysfs", "/sys", "sysfs", NULL, MS_BIND|MS_REMOUNT|MS_RDONLY }, #if HAVE_SELINUX - { true, SELINUX_MOUNT, SELINUX_MOUNT, NULL, NULL, MS_BIND }, - { true, SELINUX_MOUNT, SELINUX_MOUNT, NULL, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY }, + { SELINUX_MOUNT, SELINUX_MOUNT, "selinuxfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV }, + { SELINUX_MOUNT, SELINUX_MOUNT, NULL, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY }, #endif }; int i, rc = -1; char *opts = NULL; - VIR_DEBUG("Mounting basic filesystems %s pivotRoot=%d", NULLSTR(srcprefix), pivotRoot); + VIR_DEBUG("Mounting basic filesystems pivotRoot=%d", pivotRoot); for (i = 0 ; i < ARRAY_CARDINALITY(mnts) ; i++) { char *src = NULL; @@ -470,15 +468,7 @@ static int lxcContainerMountBasicFS(virDomainDefPtr def, goto cleanup; } - if (mnts[i].needPrefix && srcprefix) { - if (virAsprintf(&src, "%s%s", srcprefix, mnts[i].src) < 0) { - virReportOOMError(); - goto cleanup; - } - srcpath = src; - } else { - srcpath = mnts[i].src; - } + srcpath = mnts[i].src; /* Skip if mount doesn't exist in source */ if ((srcpath[0] == '/') && @@ -1121,7 +1111,7 @@ static int lxcContainerSetupPivotRoot(virDomainDefPtr vmDef, return -1; /* Mounts the core /proc, /sys, etc filesystems */ - if (lxcContainerMountBasicFS(vmDef, "/.oldroot", true, securityDriver) < 0) + if (lxcContainerMountBasicFS(vmDef, true, securityDriver) < 0) return -1; /* Mounts /dev/pts */ @@ -1166,7 +1156,7 @@ static int lxcContainerSetupExtraMounts(virDomainDefPtr vmDef, return -1; /* Mounts the core /proc, /sys, etc filesystems */ - if (lxcContainerMountBasicFS(vmDef, NULL, false, securityDriver) < 0) + if (lxcContainerMountBasicFS(vmDef, false, securityDriver) < 0) return -1; VIR_DEBUG("Mounting completed"); -- 1.7.10.1

On 05/11/2012 10:48 AM, Daniel P. Berrange wrote:
From: "Daniel P. Berrange" <berrange@redhat.com>
Currently to make sysfs readonly, we remount the existing instance and then bind it readonly. Unfortunately this means sysfs is still showing device objects wrt the host OS namespace. We need it to reflect the container namespace, so we must mount a completely new instance of it. Do the same for selinuxfs since there is no benefit to bind mounting & this lets us simplify the code.
* src/lxc/lxc_container.c: Mount fresh sysfs instance
Signed-off-by: Daniel P. Berrange <berrange@redhat.com> --- src/lxc/lxc_container.c | 32 +++++++++++--------------------- 1 file changed, 11 insertions(+), 21 deletions(-)
ACK. -- Eric Blake eblake@redhat.com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

From: "Daniel P. Berrange" <berrange@redhat.com> If the LXC config has a filesystem <filesystem> <source dir='/'/> <target dir='/'/> </filesystem> then there is no need to go down the pivot root codepath. We can simply use the existing root as needed. Signed-off-by: Daniel P. Berrange <berrange@redhat.com> --- src/lxc/lxc_container.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index 2076c04..0e22de5 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -1137,6 +1137,7 @@ static int lxcContainerSetupPivotRoot(virDomainDefPtr vmDef, /* Nothing mapped to /, we're using the main root, but with extra stuff mapped in */ static int lxcContainerSetupExtraMounts(virDomainDefPtr vmDef, + virDomainFSDefPtr root, virSecurityManagerPtr securityDriver) { VIR_DEBUG("def=%p", vmDef); @@ -1151,6 +1152,14 @@ static int lxcContainerSetupExtraMounts(virDomainDefPtr vmDef, return -1; } + if (root && root->readonly) { + if (mount("", "/", NULL, MS_BIND|MS_REC|MS_RDONLY|MS_REMOUNT, NULL) < 0) { + virReportSystemError(errno, "%s", + _("Failed to make root readonly")); + return -1; + } + } + VIR_DEBUG("Mounting config FS"); if (lxcContainerMountAllFS(vmDef, "", false) < 0) return -1; @@ -1192,10 +1201,14 @@ static int lxcContainerSetupMounts(virDomainDefPtr vmDef, if (lxcContainerResolveSymlinks(vmDef) < 0) return -1; - if (root) + /* If the user has specified a dst '/' with a source of '/' + * then we don't really want to go down the pivot root + * path, as we're just tuning the existing root + */ + if (root && root->src && STRNEQ(root->src, "/")) return lxcContainerSetupPivotRoot(vmDef, root, ttyPaths, nttyPaths, securityDriver); else - return lxcContainerSetupExtraMounts(vmDef, securityDriver); + return lxcContainerSetupExtraMounts(vmDef, root, securityDriver); } -- 1.7.10.1

On 05/11/2012 10:48 AM, Daniel P. Berrange wrote:
From: "Daniel P. Berrange" <berrange@redhat.com>
If the LXC config has a filesystem
<filesystem> <source dir='/'/> <target dir='/'/> </filesystem>
then there is no need to go down the pivot root codepath. We can simply use the existing root as needed.
Signed-off-by: Daniel P. Berrange <berrange@redhat.com> --- src/lxc/lxc_container.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-)
@@ -1192,10 +1201,14 @@ static int lxcContainerSetupMounts(virDomainDefPtr vmDef, if (lxcContainerResolveSymlinks(vmDef) < 0) return -1;
- if (root) + /* If the user has specified a dst '/' with a source of '/' + * then we don't really want to go down the pivot root + * path, as we're just tuning the existing root + */ + if (root && root->src && STRNEQ(root->src, "/"))
Do we want to handle users that spell things as "///" instead of "/"? But that's a corner case that doesn't affect this change. ACK. -- Eric Blake eblake@redhat.com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

From: "Daniel P. Berrange" <berrange@redhat.com> Both /proc and /sys may have sub-mounts in them from the host OS. We must explicitly unmount them all before mounting the new instance over that location. If we don't then /proc/mounts will show the sub-mounts as existing, even though nothing will be able to access them, due to the over-mount. Signed-off-by: Daniel P. Berrange <berrange@redhat.com> --- src/lxc/lxc_container.c | 61 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 47 insertions(+), 14 deletions(-) diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index 0e22de5..a3ca76c 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -1017,26 +1017,29 @@ static int lxcContainerMountAllFS(virDomainDefPtr vmDef, } -static int lxcContainerUnmountOldFS(void) +static int lxcContainerGetSubtree(const char *prefix, + char ***mountsret, + size_t *nmountsret) { - struct mntent mntent; - char **mounts = NULL; - int nmounts = 0; FILE *procmnt; - int i; + struct mntent mntent; char mntbuf[1024]; - int saveErrno; - const char *failedUmount = NULL; int ret = -1; + char **mounts = NULL; + size_t nmounts = 0; + + *mountsret = NULL; + *nmountsret = 0; if (!(procmnt = setmntent("/proc/mounts", "r"))) { virReportSystemError(errno, "%s", _("Failed to read /proc/mounts")); return -1; } + while (getmntent_r(procmnt, &mntent, mntbuf, sizeof(mntbuf)) != NULL) { VIR_DEBUG("Got %s", mntent.mnt_dir); - if (!STRPREFIX(mntent.mnt_dir, "/.oldroot")) + if (!STRPREFIX(mntent.mnt_dir, prefix)) continue; if (VIR_REALLOC_N(mounts, nmounts+1) < 0) { @@ -1054,13 +1057,36 @@ static int lxcContainerUnmountOldFS(void) qsort(mounts, nmounts, sizeof(mounts[0]), lxcContainerChildMountSort); + *mountsret = mounts; + *nmountsret = nmounts; + ret = 0; + +cleanup: + endmntent(procmnt); + return ret; +} + +static int lxcContainerUnmountSubtree(const char *prefix, + bool isOldRootFS) +{ + char **mounts = NULL; + size_t nmounts = 0; + size_t i; + int saveErrno; + const char *failedUmount = NULL; + int ret = -1; + + VIR_DEBUG("Unmount subtreee from %s", prefix); + + if (lxcContainerGetSubtree(prefix, &mounts, &nmounts) < 0) + return -1; for (i = 0 ; i < nmounts ; i++) { VIR_DEBUG("Umount %s", mounts[i]); if (umount(mounts[i]) < 0) { char ebuf[1024]; failedUmount = mounts[i]; saveErrno = errno; - VIR_WARN("Failed to unmount '%s', trying to detach root '%s': %s", + VIR_WARN("Failed to unmount '%s', trying to detach subtree '%s': %s", failedUmount, mounts[nmounts-1], virStrerror(errno, ebuf, sizeof(ebuf))); break; @@ -1068,15 +1094,16 @@ static int lxcContainerUnmountOldFS(void) } if (failedUmount) { - /* This detaches the old root filesystem */ + /* This detaches the subtree */ if (umount2(mounts[nmounts-1], MNT_DETACH) < 0) { virReportSystemError(saveErrno, - _("Failed to unmount '%s' and could not detach old root '%s'"), + _("Failed to unmount '%s' and could not detach subtree '%s'"), failedUmount, mounts[nmounts-1]); goto cleanup; } /* This unmounts the tmpfs on which the old root filesystem was hosted */ - if (umount(mounts[nmounts-1]) < 0) { + if (isOldRootFS && + umount(mounts[nmounts-1]) < 0) { virReportSystemError(saveErrno, _("Failed to unmount '%s' and could not unmount old root '%s'"), failedUmount, mounts[nmounts-1]); @@ -1089,7 +1116,6 @@ static int lxcContainerUnmountOldFS(void) cleanup: for (i = 0 ; i < nmounts ; i++) VIR_FREE(mounts[i]); - endmntent(procmnt); VIR_FREE(mounts); return ret; @@ -1127,7 +1153,7 @@ static int lxcContainerSetupPivotRoot(virDomainDefPtr vmDef, return -1; /* Gets rid of all remaining mounts from host OS, including /.oldroot itself */ - if (lxcContainerUnmountOldFS() < 0) + if (lxcContainerUnmountSubtree("/.oldroot", true) < 0) return -1; return 0; @@ -1164,6 +1190,13 @@ static int lxcContainerSetupExtraMounts(virDomainDefPtr vmDef, if (lxcContainerMountAllFS(vmDef, "", false) < 0) return -1; + /* Gets rid of any existing stuff under /proc, since we need new + * namespace aware versions of those. We must do /proc second + * otherwise we won't find /proc/mounts :-) */ + if (lxcContainerUnmountSubtree("/sys", false) < 0 || + lxcContainerUnmountSubtree("/proc", false) < 0) + return -1; + /* Mounts the core /proc, /sys, etc filesystems */ if (lxcContainerMountBasicFS(vmDef, false, securityDriver) < 0) return -1; -- 1.7.10.1

On 05/11/2012 12:48 PM, Daniel P. Berrange wrote:
From: "Daniel P. Berrange"<berrange@redhat.com>
Both /proc and /sys may have sub-mounts in them from the host OS. We must explicitly unmount them all before mounting the new instance over that location. If we don't then /proc/mounts will show the sub-mounts as existing, even though nothing will be able to access them, due to the over-mount.
Signed-off-by: Daniel P. Berrange<berrange@redhat.com> --- src/lxc/lxc_container.c | 61 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 47 insertions(+), 14 deletions(-)
@@ -1054,13 +1057,36 @@ static int lxcContainerUnmountOldFS(void) qsort(mounts, nmounts, sizeof(mounts[0]), lxcContainerChildMountSort);
+ *mountsret = mounts; + *nmountsret = nmounts; + ret = 0; + +cleanup: + endmntent(procmnt); + return ret; +} + +static int lxcContainerUnmountSubtree(const char *prefix, + bool isOldRootFS) +{ + char **mounts = NULL; + size_t nmounts = 0; + size_t i; + int saveErrno; + const char *failedUmount = NULL; + int ret = -1; + + VIR_DEBUG("Unmount subtreee from %s", prefix); + + if (lxcContainerGetSubtree(prefix,&mounts,&nmounts)< 0) + return -1; for (i = 0 ; i< nmounts ; i++) { VIR_DEBUG("Umount %s", mounts[i]); if (umount(mounts[i])< 0) { char ebuf[1024]; failedUmount = mounts[i]; saveErrno = errno; - VIR_WARN("Failed to unmount '%s', trying to detach root '%s': %s", + VIR_WARN("Failed to unmount '%s', trying to detach subtree '%s': %s", failedUmount, mounts[nmounts-1], virStrerror(errno, ebuf, sizeof(ebuf))); break;
This may be an existing issue - should the code not try to continue unmounting rather than break'ing follwing error above? Would leaving the loop here leave stale mounts behind? Stefan

On Fri, May 11, 2012 at 10:04:24PM -0400, Stefan Berger wrote:
On 05/11/2012 12:48 PM, Daniel P. Berrange wrote:
From: "Daniel P. Berrange"<berrange@redhat.com>
Both /proc and /sys may have sub-mounts in them from the host OS. We must explicitly unmount them all before mounting the new instance over that location. If we don't then /proc/mounts will show the sub-mounts as existing, even though nothing will be able to access them, due to the over-mount.
Signed-off-by: Daniel P. Berrange<berrange@redhat.com> --- src/lxc/lxc_container.c | 61 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 47 insertions(+), 14 deletions(-)
@@ -1054,13 +1057,36 @@ static int lxcContainerUnmountOldFS(void) qsort(mounts, nmounts, sizeof(mounts[0]), lxcContainerChildMountSort);
+ *mountsret = mounts; + *nmountsret = nmounts; + ret = 0; + +cleanup: + endmntent(procmnt); + return ret; +} + +static int lxcContainerUnmountSubtree(const char *prefix, + bool isOldRootFS) +{ + char **mounts = NULL; + size_t nmounts = 0; + size_t i; + int saveErrno; + const char *failedUmount = NULL; + int ret = -1; + + VIR_DEBUG("Unmount subtreee from %s", prefix); + + if (lxcContainerGetSubtree(prefix,&mounts,&nmounts)< 0) + return -1; for (i = 0 ; i< nmounts ; i++) { VIR_DEBUG("Umount %s", mounts[i]); if (umount(mounts[i])< 0) { char ebuf[1024]; failedUmount = mounts[i]; saveErrno = errno; - VIR_WARN("Failed to unmount '%s', trying to detach root '%s': %s", + VIR_WARN("Failed to unmount '%s', trying to detach subtree '%s': %s", failedUmount, mounts[nmounts-1], virStrerror(errno, ebuf, sizeof(ebuf))); break;
This may be an existing issue - should the code not try to continue unmounting rather than break'ing follwing error above? Would leaving the loop here leave stale mounts behind?
In this scenario, if we fail to unmount any path, then code later will check 'failedUmount' and unmount2(MNT_DETACH) the entire root filesystem. So any mounts are lazily cleaned up by the kernel & inaccessible to the container. That said, we probably could continue trying to unmount other dirs, just to avoid wasting kernel memory from the detached mount tree. Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

From: "Daniel P. Berrange" <berrange@redhat.com> Normal practice is for cgroups controllers to be mounted at /sys/fs/cgroup. When setting up a container, /sys is mounted with a new sysfs instance, thus we must re-mount all the cgroups controllers. The complexity is that we must mount them in the same layout as the host OS. ie if 'cpu' and 'cpuacct' were mounted at the same location in the host we must preserve this in the container. Also if any controllers are co-located we must setup symlinks from the individual controller name to the co-located mount-point Signed-off-by: Daniel P. Berrange <berrange@redhat.com> --- src/lxc/lxc_container.c | 243 +++++++++++++++++++++++++++++++++++++++++++++-- src/util/cgroup.h | 2 + 2 files changed, 236 insertions(+), 9 deletions(-) diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index a3ca76c..79ecae8 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -35,6 +35,7 @@ #include <sys/stat.h> #include <unistd.h> #include <mntent.h> +#include <dirent.h> /* Yes, we want linux private one, for _syscall2() macro */ #include <linux/unistd.h> @@ -1122,6 +1123,181 @@ cleanup: } +struct lxcContainerCGroup { + const char *dir; + const char *linkDest; +}; + + +static int lxcContainerIdentifyCGroups(struct lxcContainerCGroup **mountsret, + size_t *nmountsret) +{ + FILE *procmnt = NULL; + struct mntent mntent; + struct dirent *dent; + char mntbuf[1024]; + int ret = -1; + struct lxcContainerCGroup *mounts = NULL; + size_t nmounts = 0; + size_t i; + DIR *dh = NULL; + char *path = NULL; + + *mountsret = NULL; + *nmountsret = 0; + + VIR_DEBUG("Finding cgroups mount points under %s", VIR_CGROUP_SYSFS_MOUNT); + + if (!(procmnt = setmntent("/proc/mounts", "r"))) { + virReportSystemError(errno, "%s", + _("Failed to read /proc/mounts")); + return -1; + } + + while (getmntent_r(procmnt, &mntent, mntbuf, sizeof(mntbuf)) != NULL) { + VIR_DEBUG("Got %s", mntent.mnt_dir); + if (STRNEQ(mntent.mnt_type, "cgroup") || + !STRPREFIX(mntent.mnt_dir, VIR_CGROUP_SYSFS_MOUNT)) + continue; + + /* Skip named mounts with no controller since they're + * for application use only ie systemd */ + if (strstr(mntent.mnt_opts, "name=")) + continue; + + if (VIR_EXPAND_N(mounts, nmounts, 1) < 0) { + virReportOOMError(); + goto cleanup; + } + if (!(mounts[nmounts-1].dir = strdup(mntent.mnt_dir))) { + virReportOOMError(); + goto cleanup; + } + VIR_DEBUG("Grabbed %s", mntent.mnt_dir); + } + + VIR_DEBUG("Checking for symlinks in %s", VIR_CGROUP_SYSFS_MOUNT); + if (!(dh = opendir(VIR_CGROUP_SYSFS_MOUNT))) { + virReportSystemError(errno, + _("Unable to read directory %s"), + VIR_CGROUP_SYSFS_MOUNT); + goto cleanup; + } + + while ((dent = readdir(dh)) != NULL) { + ssize_t rv; + /* The cgroups links are just relative to the local + * dir so we don't need a large buf */ + char linkbuf[100]; + if (dent->d_name[0] == '.') + continue; + + VIR_DEBUG("Checking entry %s", dent->d_name); + if (virAsprintf(&path, "%s/%s", VIR_CGROUP_SYSFS_MOUNT, dent->d_name) < 0) { + virReportOOMError(); + goto cleanup; + } + + if ((rv = readlink(path, linkbuf, sizeof(linkbuf)-1)) < 0) { + if (errno != EINVAL) { + virReportSystemError(errno, + _("Unable to resolve link %s"), + path); + VIR_FREE(path); + goto cleanup; + } + /* Ok not a link */ + VIR_FREE(path); + } else { + linkbuf[rv] = '\0'; + VIR_DEBUG("Got a link %s to %s", path, linkbuf); + if (VIR_EXPAND_N(mounts, nmounts, 1) < 0) { + virReportOOMError(); + goto cleanup; + } + if (!(mounts[nmounts-1].linkDest = strdup(linkbuf))) { + virReportOOMError(); + goto cleanup; + } + mounts[nmounts-1].dir = path; + path = NULL; + } + } + + *mountsret = mounts; + *nmountsret = nmounts; + ret = 0; + +cleanup: + closedir(dh); + endmntent(procmnt); + VIR_FREE(path); + + if (ret < 0) { + for (i = 0 ; i < nmounts ; i++) { + VIR_FREE(mounts[i].dir); + VIR_FREE(mounts[i].linkDest); + } + VIR_FREE(mounts); + } + return ret; +} + + +static int lxcContainerMountCGroups(struct lxcContainerCGroup *mounts, + size_t nmounts) +{ + size_t i; + + VIR_DEBUG("Mounting cgroups at '%s'", VIR_CGROUP_SYSFS_MOUNT); + + if (virFileMakePath(VIR_CGROUP_SYSFS_MOUNT) < 0) { + virReportSystemError(errno, + _("Unable to create directory %s"), + VIR_CGROUP_SYSFS_MOUNT); + return -1; + } + + if (mount("tmpfs", VIR_CGROUP_SYSFS_MOUNT, "tmpfs", MS_NOSUID|MS_NODEV|MS_NOEXEC, "mode=755") < 0) { + virReportSystemError(errno, + _("Failed to mount %s on %s type %s"), + "tmpfs", VIR_CGROUP_SYSFS_MOUNT, "tmpfs"); + return -1; + } + + for (i = 0 ; i < nmounts ; i++) { + if (mounts[i].linkDest) { + VIR_DEBUG("Link mount point '%s' to '%s'", + mounts[i].dir, mounts[i].linkDest); + if (symlink(mounts[i].linkDest, mounts[i].dir) < 0) { + virReportSystemError(errno, + _("Unable to symlink directory %s to %s"), + mounts[i].dir, mounts[i].linkDest); + return -1; + } + } else { + VIR_DEBUG("Create mount point '%s'", mounts[i].dir); + if (virFileMakePath(mounts[i].dir) < 0) { + virReportSystemError(errno, + _("Unable to create directory %s"), + mounts[i].dir); + return -1; + } + + if (mount("cgroup", mounts[i].dir, "cgroup", + 0, mounts[i].dir + strlen(VIR_CGROUP_SYSFS_MOUNT) + 1) < 0) { + virReportSystemError(errno, + _("Failed to mount %s on %s"), + "cgroup", mounts[i].dir); + return -1; + } + } + } + + return 0; +} + + /* Got a FS mapped to /, we're going the pivot_root * approach to do a better-chroot-than-chroot * this is based on this thread http://lkml.org/lkml/2008/3/5/29 @@ -1132,31 +1308,54 @@ static int lxcContainerSetupPivotRoot(virDomainDefPtr vmDef, size_t nttyPaths, virSecurityManagerPtr securityDriver) { + struct lxcContainerCGroup *mounts = NULL; + size_t nmounts = 0; + size_t i; + int ret = -1; + + /* Before pivoting we need to identify any + * cgroups controllers that are mounted */ + if (lxcContainerIdentifyCGroups(&mounts, &nmounts) < 0) + goto cleanup; + /* Gives us a private root, leaving all parent OS mounts on /.oldroot */ if (lxcContainerPivotRoot(root) < 0) - return -1; + goto cleanup; /* Mounts the core /proc, /sys, etc filesystems */ if (lxcContainerMountBasicFS(vmDef, true, securityDriver) < 0) - return -1; + goto cleanup; + + /* Now we can re-mount the cgroups controllers in the + * same configuration as before */ + if (lxcContainerMountCGroups(mounts, nmounts) < 0) + goto cleanup; /* Mounts /dev/pts */ if (lxcContainerMountFSDevPTS(root) < 0) - return -1; + goto cleanup; /* Populates device nodes in /dev/ */ if (lxcContainerPopulateDevices(ttyPaths, nttyPaths) < 0) - return -1; + goto cleanup; /* Sets up any non-root mounts from guest config */ if (lxcContainerMountAllFS(vmDef, "/.oldroot", true) < 0) - return -1; + goto cleanup; /* Gets rid of all remaining mounts from host OS, including /.oldroot itself */ if (lxcContainerUnmountSubtree("/.oldroot", true) < 0) - return -1; + goto cleanup; - return 0; + ret = 0; + +cleanup: + for (i = 0 ; i < nmounts ; i++) { + VIR_FREE(mounts[i].dir); + VIR_FREE(mounts[i].linkDest); + } + VIR_FREE(mounts); + return ret; } @@ -1166,6 +1365,11 @@ static int lxcContainerSetupExtraMounts(virDomainDefPtr vmDef, virDomainFSDefPtr root, virSecurityManagerPtr securityDriver) { + int ret = -1; + struct lxcContainerCGroup *mounts = NULL; + size_t nmounts = 0; + size_t i; + VIR_DEBUG("def=%p", vmDef); /* * This makes sure that any new filesystems in the @@ -1190,19 +1394,40 @@ static int lxcContainerSetupExtraMounts(virDomainDefPtr vmDef, if (lxcContainerMountAllFS(vmDef, "", false) < 0) return -1; + /* Before replacing /sys we need to identify any + * cgroups controllers that are mounted */ + if (lxcContainerIdentifyCGroups(&mounts, &nmounts) < 0) + goto cleanup; + /* Gets rid of any existing stuff under /proc, since we need new * namespace aware versions of those. We must do /proc second * otherwise we won't find /proc/mounts :-) */ if (lxcContainerUnmountSubtree("/sys", false) < 0 || lxcContainerUnmountSubtree("/proc", false) < 0) - return -1; + goto cleanup; /* Mounts the core /proc, /sys, etc filesystems */ if (lxcContainerMountBasicFS(vmDef, false, securityDriver) < 0) - return -1; + goto cleanup; + + /* Now we can re-mount the cgroups controllers in the + * same configuration as before */ + if (lxcContainerMountCGroups(mounts, nmounts) < 0) + goto cleanup; VIR_DEBUG("Mounting completed"); return 0; + + ret = 0; + +cleanup: + for (i = 0 ; i < nmounts ; i++) { + VIR_FREE(mounts[i].dir); + VIR_FREE(mounts[i].linkDest); + } + VIR_FREE(mounts); + + return ret; } diff --git a/src/util/cgroup.h b/src/util/cgroup.h index 8486c42..857945d 100644 --- a/src/util/cgroup.h +++ b/src/util/cgroup.h @@ -16,6 +16,8 @@ struct virCgroup; typedef struct virCgroup *virCgroupPtr; +#define VIR_CGROUP_SYSFS_MOUNT "/sys/fs/cgroup" + enum { VIR_CGROUP_CONTROLLER_CPU, VIR_CGROUP_CONTROLLER_CPUACCT, -- 1.7.10.1

On 05/11/2012 12:48 PM, Daniel P. Berrange wrote:
From: "Daniel P. Berrange"<berrange@redhat.com>
Normal practice is for cgroups controllers to be mounted at /sys/fs/cgroup. When setting up a container, /sys is mounted with a new sysfs instance, thus we must re-mount all the cgroups controllers. The complexity is that we must mount them in the same layout as the host OS. ie if 'cpu' and 'cpuacct' were mounted at the same location in the host we must preserve this in the container. Also if any controllers are co-located we must setup symlinks from the individual controller name to the co-located mount-point
Signed-off-by: Daniel P. Berrange<berrange@redhat.com> --- src/lxc/lxc_container.c | 243 +++++++++++++++++++++++++++++++++++++++++++++-- src/util/cgroup.h | 2 + 2 files changed, 236 insertions(+), 9 deletions(-)
diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index a3ca76c..79ecae8 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -35,6 +35,7 @@ #include<sys/stat.h> #include<unistd.h> #include<mntent.h> +#include<dirent.h>
/* Yes, we want linux private one, for _syscall2() macro */ #include<linux/unistd.h>
+static int lxcContainerIdentifyCGroups(struct lxcContainerCGroup **mountsret, + size_t *nmountsret) +{
[...]
+ while ((dent = readdir(dh)) != NULL) { + ssize_t rv; + /* The cgroups links are just relative to the local + * dir so we don't need a large buf */ + char linkbuf[100];
Nit: empty line here.
+ if (dent->d_name[0] == '.') + continue; + [...] +cleanup: + closedir(dh); + endmntent(procmnt); + VIR_FREE(path); + + if (ret< 0) { + for (i = 0 ; i< nmounts ; i++) { + VIR_FREE(mounts[i].dir); + VIR_FREE(mounts[i].linkDest); + } + VIR_FREE(mounts);
You could put this into a function lxcContainerCGroupFree().
+ +cleanup: + for (i = 0 ; i< nmounts ; i++) { + VIR_FREE(mounts[i].dir); + VIR_FREE(mounts[i].linkDest); + } + VIR_FREE(mounts);
... and call it again from here and save a few lines ...
@@ -1190,19 +1394,40 @@ static int lxcContainerSetupExtraMounts(virDomainDefPtr vmDef, if (lxcContainerMountAllFS(vmDef, "", false)< 0) return -1;
+ /* Before replacing /sys we need to identify any + * cgroups controllers that are mounted */ + if (lxcContainerIdentifyCGroups(&mounts,&nmounts)< 0) + goto cleanup; + /* Gets rid of any existing stuff under /proc, since we need new * namespace aware versions of those. We must do /proc second * otherwise we won't find /proc/mounts :-) */ if (lxcContainerUnmountSubtree("/sys", false)< 0 || lxcContainerUnmountSubtree("/proc", false)< 0) - return -1; + goto cleanup;
/* Mounts the core /proc, /sys, etc filesystems */ if (lxcContainerMountBasicFS(vmDef, false, securityDriver)< 0) - return -1; + goto cleanup; + + /* Now we can re-mount the cgroups controllers in the + * same configuration as before */ + if (lxcContainerMountCGroups(mounts, nmounts)< 0) + goto cleanup;
VIR_DEBUG("Mounting completed"); return 0; + + ret = 0; + +cleanup: + for (i = 0 ; i< nmounts ; i++) { + VIR_FREE(mounts[i].dir); + VIR_FREE(mounts[i].linkDest); + } + VIR_FREE(mounts);
... and save a few more lines here as well. With this nit, I'd ACK the series. Stefan

On Fri, May 11, 2012 at 10:05:52PM -0400, Stefan Berger wrote:
On 05/11/2012 12:48 PM, Daniel P. Berrange wrote:
From: "Daniel P. Berrange"<berrange@redhat.com>
Normal practice is for cgroups controllers to be mounted at /sys/fs/cgroup. When setting up a container, /sys is mounted with a new sysfs instance, thus we must re-mount all the cgroups controllers. The complexity is that we must mount them in the same layout as the host OS. ie if 'cpu' and 'cpuacct' were mounted at the same location in the host we must preserve this in the container. Also if any controllers are co-located we must setup symlinks from the individual controller name to the co-located mount-point
With this nit, I'd ACK the series.
I applied the other previously ACK'd patches in this series, and have re-sent an update of just this last patch. Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|
participants (3)
-
Daniel P. Berrange
-
Eric Blake
-
Stefan Berger