[libvirt] [PATCH 0/3] Support readonly mounts in LXC

This patch series introduces support for readonly mounts in LXC. In includes general refactoring which will help other patches coming later too.

From: "Daniel P. Berrange" <berrange@redhat.com> The bind mount setup is about to get more complicated. To avoid having to deal with several copies, pull it out into a separate lxcContainerMountFSBind method. Also pull out the iteration over container filesystems, so that it will be easier to drop in support for non-bind mount filesystems * src/lxc/lxc_container.c: Pull bind mount code out into lxcContainerMountFSBind --- src/libvirt_private.syms | 2 + src/lxc/lxc_container.c | 118 ++++++++++++++++++++++++++++----------------- 2 files changed, 75 insertions(+), 45 deletions(-) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index acf7bb1..cac4995 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -290,6 +290,8 @@ virDomainDiskRemoveByName; virDomainDiskTypeFromString; virDomainDiskTypeToString; virDomainFSDefFree; +virDomainFSTypeFromString; +virDomainFSTypeToString; virDomainFindByID; virDomainFindByName; virDomainFindByUUID; diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index 432b7f8..c5ef609 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -176,6 +176,7 @@ static int lxcContainerSetStdio(int control, int ttyfd, int handshakefd) rc = 0; cleanup: + VIR_DEBUG("rc=%d", rc); return rc; } @@ -513,41 +514,77 @@ static int lxcContainerPopulateDevices(void) } -static int lxcContainerMountNewFS(virDomainDefPtr vmDef) +static int lxcContainerMountFSBind(virDomainFSDefPtr fs, + const char *srcprefix) { - int i; + char *src = NULL; + int ret = -1; + + if (virAsprintf(&src, "%s%s", srcprefix, fs->src) < 0) { + virReportOOMError(); + goto cleanup; + } + + if (virFileMakePath(fs->dst) < 0) { + virReportSystemError(errno, + _("Failed to create %s"), + fs->dst); + goto cleanup; + } + + if (mount(src, fs->dst, NULL, MS_BIND, NULL) < 0) { + virReportSystemError(errno, + _("Failed to bind mount directory %s to %s"), + src, fs->dst); + goto cleanup; + } + + ret = 0; + + VIR_DEBUG("Done mounting filesystem ret=%d tryProc=%d", ret, tryProc); + +cleanup: + VIR_FREE(src); + return ret; +} + + +static int lxcContainerMountFS(virDomainFSDefPtr fs, + const char *srcprefix) +{ + switch (fs->type) { + case VIR_DOMAIN_FS_TYPE_MOUNT: + if (lxcContainerMountFSBind(fs, srcprefix) < 0) + return -1; + break; + default: + lxcError(VIR_ERR_CONFIG_UNSUPPORTED, + _("Cannot mount filesystem type %s"), + virDomainFSTypeToString(fs->type)); + break; + } + return 0; +} + + +static int lxcContainerMountAllFS(virDomainDefPtr vmDef, + const char *dstprefix, + bool skipRoot) +{ + size_t i; + VIR_DEBUG("Mounting %s %d", dstprefix, skipRoot); /* Pull in rest of container's mounts */ for (i = 0 ; i < vmDef->nfss ; i++) { - char *src; - if (STREQ(vmDef->fss[i]->dst, "/")) - continue; - /* XXX fix */ - if (vmDef->fss[i]->type != VIR_DOMAIN_FS_TYPE_MOUNT) + if (skipRoot && + STREQ(vmDef->fss[i]->dst, "/")) continue; - if (virAsprintf(&src, "/.oldroot/%s", vmDef->fss[i]->src) < 0) { - virReportOOMError(); - return -1; - } - - if (virFileMakePath(vmDef->fss[i]->dst) < 0) { - virReportSystemError(errno, - _("Failed to create %s"), - vmDef->fss[i]->dst); - VIR_FREE(src); - return -1; - } - if (mount(src, vmDef->fss[i]->dst, NULL, MS_BIND, NULL) < 0) { - virReportSystemError(errno, - _("Failed to mount %s at %s"), - src, vmDef->fss[i]->dst); - VIR_FREE(src); + if (lxcContainerMountFS(vmDef->fss[i], dstprefix) < 0) return -1; - } - VIR_FREE(src); } + VIR_DEBUG("Mounted all filesystems"); return 0; } @@ -624,7 +661,7 @@ static int lxcContainerSetupPivotRoot(virDomainDefPtr vmDef, return -1; /* Sets up any non-root mounts from guest config */ - if (lxcContainerMountNewFS(vmDef) < 0) + if (lxcContainerMountAllFS(vmDef, "/.oldroot", true) < 0) return -1; /* Gets rid of all remaining mounts from host OS, including /.oldroot itself */ @@ -634,34 +671,25 @@ static int lxcContainerSetupPivotRoot(virDomainDefPtr vmDef, return 0; } + /* Nothing mapped to /, we're using the main root, but with extra stuff mapped in */ static int lxcContainerSetupExtraMounts(virDomainDefPtr vmDef) { - int i; - + VIR_DEBUG("def=%p", vmDef); + /* + * This makes sure that any new filesystems in the + * host OS propagate to the container, but any + * changes in the container are private + */ if (mount("", "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) { virReportSystemError(errno, "%s", _("Failed to make / slave")); return -1; } - for (i = 0 ; i < vmDef->nfss ; i++) { - /* XXX fix to support other mount types */ - if (vmDef->fss[i]->type != VIR_DOMAIN_FS_TYPE_MOUNT) - continue; - if (mount(vmDef->fss[i]->src, - vmDef->fss[i]->dst, - NULL, - MS_BIND, - NULL) < 0) { - virReportSystemError(errno, - _("Failed to mount %s at %s"), - vmDef->fss[i]->src, - vmDef->fss[i]->dst); - return -1; - } - } + if (lxcContainerMountAllFS(vmDef, "", false) < 0) + return -1; /* mount /proc */ if (mount("lxcproc", "/proc", "proc", 0, NULL) < 0) { -- 1.7.6

On 07/22/2011 07:41 AM, Daniel P. Berrange wrote:
From: "Daniel P. Berrange"<berrange@redhat.com>
The bind mount setup is about to get more complicated. To avoid having to deal with several copies, pull it out into a separate lxcContainerMountFSBind method.
Also pull out the iteration over container filesystems, so that it will be easier to drop in support for non-bind mount filesystems
* src/lxc/lxc_container.c: Pull bind mount code out into lxcContainerMountFSBind --- src/libvirt_private.syms | 2 + src/lxc/lxc_container.c | 118 ++++++++++++++++++++++++++++----------------- 2 files changed, 75 insertions(+), 45 deletions(-)
ACK. -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org

From: "Daniel P. Berrange" <berrange@redhat.com> Even in non-virtual root filesystem mode we should be mounting more than just a new /proc. Refactor lxcContainerMountBasicFS so that it does everything except for /dev and /dev/pts moving that into lxcContainerMountDevFS. Pass in a source prefix to lxcContainerMountBasicFS() so it can be used in both shared root and private root modes. * src/lxc/lxc_container.c: Unify mounting code for special filesystems --- src/lxc/lxc_container.c | 85 ++++++++++++++++++++++++++++++++++------------- 1 files changed, 62 insertions(+), 23 deletions(-) diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index c5ef609..10ebca3 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -393,41 +393,75 @@ err: } -static int lxcContainerMountBasicFS(virDomainFSDefPtr root) +static int lxcContainerMountBasicFS(const char *srcprefix) { const struct { + bool needPrefix; const char *src; const char *dst; const char *type; + const char *opts; + int flags; } mnts[] = { - { "/dev", "/dev", "tmpfs" }, - { "/proc", "/proc", "proc" }, - { "/sys", "/sys", "sysfs" }, -#if WITH_SELINUX - { "none", "/selinux", "selinuxfs" }, -#endif + { false, "devfs", "/dev", "tmpfs", "mode=755", MS_NOSUID }, + { false, "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV }, + { false, "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND }, + { true, "/sys", "/sys", NULL, NULL, MS_BIND }, + { true, "/selinux", "/selinux", NULL, NULL, MS_BIND }, }; int i, rc = -1; - char *devpts; - - if (virAsprintf(&devpts, "/.oldroot%s/dev/pts", root->src) < 0) { - virReportOOMError(); - return rc; - } for (i = 0 ; i < ARRAY_CARDINALITY(mnts) ; i++) { + char *src = NULL; + const char *srcpath = NULL; if (virFileMakePath(mnts[i].dst) < 0) { virReportSystemError(errno, _("Failed to mkdir %s"), mnts[i].src); goto cleanup; } - if (mount(mnts[i].src, mnts[i].dst, mnts[i].type, 0, NULL) < 0) { + + if (mnts[i].needPrefix && srcprefix) { + if (virAsprintf(&src, "%s%s", srcprefix, mnts[i].src) < 0) { + virReportOOMError(); + goto cleanup; + } + srcpath = src; + } else { + srcpath = mnts[i].src; + } + + /* Skip if mount doesn't exist in source */ + if ((srcpath[0] == '/') && + (access(srcpath, R_OK) < 0)) + continue; + + if (mount(srcpath, mnts[i].dst, mnts[i].type, mnts[i].flags, mnts[i].opts) < 0) { + VIR_FREE(src); virReportSystemError(errno, - _("Failed to mount %s on %s"), - mnts[i].type, mnts[i].type); + _("Failed to mount %s on %s type %s"), + mnts[i].src, mnts[i].dst, NULLSTR(mnts[i].type)); goto cleanup; } + VIR_FREE(src); + } + + rc = 0; + +cleanup: + VIR_DEBUG("rc=%d", rc); + return rc; +} + + +static int lxcContainerMountDevFS(virDomainFSDefPtr root) +{ + char *devpts = NULL; + int rc = -1; + + if (virAsprintf(&devpts, "/.oldroot%s/dev/pts", root->src) < 0) { + virReportOOMError(); + goto cleanup; } if (virFileMakePath("/dev/pts") < 0) { @@ -652,8 +686,12 @@ static int lxcContainerSetupPivotRoot(virDomainDefPtr vmDef, if (lxcContainerPivotRoot(root) < 0) return -1; - /* Mounts the core /proc, /sys, /dev, /dev/pts filesystems */ - if (lxcContainerMountBasicFS(root) < 0) + /* Mounts the core /proc, /sys, etc filesystems */ + if (lxcContainerMountBasicFS("/.oldroot") < 0) + return -1; + + /* Mounts /dev and /dev/pts */ + if (lxcContainerMountDevFS(root) < 0) return -1; /* Populates device nodes in /dev/ */ @@ -688,16 +726,16 @@ static int lxcContainerSetupExtraMounts(virDomainDefPtr vmDef) return -1; } + VIR_DEBUG("Mounting config FS"); if (lxcContainerMountAllFS(vmDef, "", false) < 0) return -1; - /* mount /proc */ - if (mount("lxcproc", "/proc", "proc", 0, NULL) < 0) { - virReportSystemError(errno, "%s", - _("Failed to mount /proc")); + /* Mounts the core /proc, /sys, etc filesystems */ + VIR_DEBUG("Mounting basic FS"); + if (lxcContainerMountBasicFS(NULL) < 0) return -1; - } + VIR_DEBUG("Mounting completed"); return 0; } @@ -994,5 +1032,6 @@ int lxcContainerAvailable(int features) waitpid(cpid, &childStatus, 0); } + VIR_DEBUG("Mounted all filesystems"); return 0; } -- 1.7.6

On 07/22/2011 07:42 AM, Daniel P. Berrange wrote:
From: "Daniel P. Berrange"<berrange@redhat.com>
Even in non-virtual root filesystem mode we should be mounting more than just a new /proc. Refactor lxcContainerMountBasicFS so that it does everything except for /dev and /dev/pts moving that into lxcContainerMountDevFS. Pass in a source prefix to lxcContainerMountBasicFS() so it can be used in both shared root and private root modes.
* src/lxc/lxc_container.c: Unify mounting code for special filesystems --- src/lxc/lxc_container.c | 85 ++++++++++++++++++++++++++++++++++------------- 1 files changed, 62 insertions(+), 23 deletions(-)
ACK. -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org

From: "Daniel P. Berrange" <berrange@redhat.com> A container should not be allowed to modify stuff in /sys or /proc/sys so make them readonly. Make /selinux readonly so that containers think that selinux is disabled. Honour the readonly flag when mounting container filesystems from the guest XML config * src/lxc/lxc_container.c: Support readonly mounts --- src/lxc/lxc_container.c | 29 +++++++++++++++++++++++++++++ 1 files changed, 29 insertions(+), 0 deletions(-) diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index 10ebca3..5cb090e 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -363,6 +363,15 @@ static int lxcContainerPivotRoot(virDomainFSDefPtr root) goto err; } + if (root->readonly) { + if (mount(root->src, newroot, NULL, MS_BIND|MS_REC|MS_RDONLY|MS_REMOUNT, NULL) < 0) { + virReportSystemError(errno, + _("Failed to make new root %s readonly"), + root->src); + goto err; + } + } + /* Now we chroot into the tmpfs, then pivot into the * root->src bind-mounted onto '/new' */ if (chdir(newroot) < 0) { @@ -403,11 +412,20 @@ static int lxcContainerMountBasicFS(const char *srcprefix) const char *opts; int flags; } mnts[] = { + /* When we want to make a bind mount readonly, for unknown reasons, + * it is currently neccessary to bind it once, and then remount the + * bind with the readonly flag. If this is not done, then the original + * mount point in the main OS becomes readonly too which si not what + * we want. Hence some things have two entries here. + */ { false, "devfs", "/dev", "tmpfs", "mode=755", MS_NOSUID }, { false, "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV }, { false, "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND }, + { false, "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY }, { true, "/sys", "/sys", NULL, NULL, MS_BIND }, + { true, "/sys", "/sys", NULL, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY }, { true, "/selinux", "/selinux", NULL, NULL, MS_BIND }, + { true, "/selinux", "/selinux", NULL, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY }, }; int i, rc = -1; @@ -573,6 +591,17 @@ static int lxcContainerMountFSBind(virDomainFSDefPtr fs, goto cleanup; } + if (fs->readonly) { + VIR_DEBUG("Binding %s readonly", fs->dst); + if (mount(fs->dst, fs->dst, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) { + virReportSystemError(errno, + _("Failed to make directory %s readonly"), + fs->dst); + goto cleanup; + } + + } + ret = 0; VIR_DEBUG("Done mounting filesystem ret=%d tryProc=%d", ret, tryProc); -- 1.7.6

On 07/22/2011 07:42 AM, Daniel P. Berrange wrote:
From: "Daniel P. Berrange"<berrange@redhat.com>
A container should not be allowed to modify stuff in /sys or /proc/sys so make them readonly. Make /selinux readonly so that containers think that selinux is disabled.
Are we ever going to want to mix selinux and containers? But for now, I guess this makes sense.
Honour the readonly flag when mounting container filesystems from the guest XML config
* src/lxc/lxc_container.c: Support readonly mounts --- src/lxc/lxc_container.c | 29 +++++++++++++++++++++++++++++ 1 files changed, 29 insertions(+), 0 deletions(-)
} mnts[] = { + /* When we want to make a bind mount readonly, for unknown reasons, + * it is currently neccessary to bind it once, and then remount the
s/neccessary/necessary/
+ * bind with the readonly flag. If this is not done, then the original + * mount point in the main OS becomes readonly too which si not what
s/si/is/ ACK with spelling nits fixed. -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org

On Fri, Jul 22, 2011 at 08:03:59AM -0600, Eric Blake wrote:
On 07/22/2011 07:42 AM, Daniel P. Berrange wrote:
From: "Daniel P. Berrange"<berrange@redhat.com>
A container should not be allowed to modify stuff in /sys or /proc/sys so make them readonly. Make /selinux readonly so that containers think that selinux is disabled.
Are we ever going to want to mix selinux and containers? But for now, I guess this makes sense.
Yes, I have patches that support sVirt with LXC but they're not quite ready. SELinux is something that is enabled from the host OS pov though. eg the container init process is run with an sVirt container, and all further processes inherit this. What this change is doing, is making the container OS think that SELinux is not enabled. This is not true, but we need to trick it, otherwise the container will try to use SELinux which won't work, because you can't have different policy inside the container vs the host OS, the host OS has to be in control Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|
participants (2)
-
Daniel P. Berrange
-
Eric Blake