[libvirt] [PATCH 0/2] Handle read only root in LXC

From: "Daniel P. Berrange" <berrange@redhat.com> A v2 of this patch https://www.redhat.com/archives/libvir-list/2013-September/msg00405.html Daniel P. Berrange (2): Move array of mounts out of lxcContainerMountBasicFS Ensure root filesystem is recursively mounted readonly src/lxc/lxc_container.c | 169 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 130 insertions(+), 39 deletions(-) -- 1.8.3.1

From: "Daniel P. Berrange" <berrange@redhat.com> Move the array of basic mounts out of the lxcContainerMountBasicFS function, to a global variable. This is to allow it to be referenced by other methods wanting to know what the basic mount paths are. Signed-off-by: Daniel P. Berrange <berrange@redhat.com> --- src/lxc/lxc_container.c | 79 ++++++++++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 37 deletions(-) diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index 9c04d06..d51cdc4 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -750,45 +750,50 @@ err: } -static int lxcContainerMountBasicFS(bool userns_enabled) -{ - const struct { - const char *src; - const char *dst; - const char *type; - const char *opts; - int mflags; - } mnts[] = { - /* When we want to make a bind mount readonly, for unknown reasons, - * it is currently necessary to bind it once, and then remount the - * bind with the readonly flag. If this is not done, then the original - * mount point in the main OS becomes readonly too which is not what - * we want. Hence some things have two entries here. 
- */ - { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV }, - { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND }, - { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY }, - { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV }, - { "sysfs", "/sys", "sysfs", NULL, MS_BIND|MS_REMOUNT|MS_RDONLY }, - { "securityfs", "/sys/kernel/security", "securityfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV }, - { "securityfs", "/sys/kernel/security", "securityfs", NULL, MS_BIND|MS_REMOUNT|MS_RDONLY }, +typedef struct { + const char *src; + const char *dst; + const char *type; + const char *opts; + int mflags; +} virLXCBasicMountInfo; + +static const virLXCBasicMountInfo lxcBasicMounts[] = { + /* When we want to make a bind mount readonly, for unknown reasons, + * it is currently necessary to bind it once, and then remount the + * bind with the readonly flag. If this is not done, then the original + * mount point in the main OS becomes readonly too which is not what + * we want. Hence some things have two entries here. 
+ */ + { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV }, + { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND }, + { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY }, + { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV }, + { "sysfs", "/sys", "sysfs", NULL, MS_BIND|MS_REMOUNT|MS_RDONLY }, + { "securityfs", "/sys/kernel/security", "securityfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV }, + { "securityfs", "/sys/kernel/security", "securityfs", NULL, MS_BIND|MS_REMOUNT|MS_RDONLY }, #if WITH_SELINUX - { SELINUX_MOUNT, SELINUX_MOUNT, "selinuxfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV }, - { SELINUX_MOUNT, SELINUX_MOUNT, NULL, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY }, + { SELINUX_MOUNT, SELINUX_MOUNT, "selinuxfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV }, + { SELINUX_MOUNT, SELINUX_MOUNT, NULL, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY }, #endif - }; +}; + + +static int lxcContainerMountBasicFS(bool userns_enabled) +{ size_t i; int rc = -1; VIR_DEBUG("Mounting basic filesystems"); - for (i = 0; i < ARRAY_CARDINALITY(mnts); i++) { + for (i = 0; i < ARRAY_CARDINALITY(lxcBasicMounts); i++) { + virLXCBasicMountInfo const *mnt = &lxcBasicMounts[i]; const char *srcpath = NULL; VIR_DEBUG("Processing %s -> %s", - mnts[i].src, mnts[i].dst); + mnt->src, mnt->dst); - srcpath = mnts[i].src; + srcpath = mnt->src; /* Skip if mount doesn't exist in source */ if ((srcpath[0] == '/') && @@ -796,34 +801,34 @@ static int lxcContainerMountBasicFS(bool userns_enabled) continue; #if WITH_SELINUX - if (STREQ(mnts[i].src, SELINUX_MOUNT) && + if (STREQ(mnt->src, SELINUX_MOUNT) && !is_selinux_enabled()) continue; #endif - if (STREQ(mnts[i].src, "securityfs") && userns_enabled) + if (STREQ(mnt->src, "securityfs") && userns_enabled) continue; - if (virFileMakePath(mnts[i].dst) < 0) { + if (virFileMakePath(mnt->dst) < 0) { virReportSystemError(errno, _("Failed to mkdir %s"), - mnts[i].src); + mnt->src); goto cleanup; } VIR_DEBUG("Mount %s on %s type=%s flags=%x, opts=%s", - 
srcpath, mnts[i].dst, mnts[i].type, mnts[i].mflags, mnts[i].opts); - if (mount(srcpath, mnts[i].dst, mnts[i].type, mnts[i].mflags, mnts[i].opts) < 0) { + srcpath, mnt->dst, mnt->type, mnt->mflags, mnt->opts); + if (mount(srcpath, mnt->dst, mnt->type, mnt->mflags, mnt->opts) < 0) { #if WITH_SELINUX - if (STREQ(mnts[i].src, SELINUX_MOUNT) && + if (STREQ(mnt->src, SELINUX_MOUNT) && (errno == EINVAL || errno == EPERM)) continue; #endif virReportSystemError(errno, _("Failed to mount %s on %s type %s flags=%x opts=%s"), - srcpath, mnts[i].dst, NULLSTR(mnts[i].type), - mnts[i].mflags, NULLSTR(mnts[i].opts)); + srcpath, mnt->dst, NULLSTR(mnt->type), + mnt->mflags, NULLSTR(mnt->opts)); goto cleanup; } } -- 1.8.3.1

On 09/10/2013 06:46 AM, Daniel P. Berrange wrote:
From: "Daniel P. Berrange" <berrange@redhat.com>
Move the array of basic mounts out of the lxcContainerMountBasicFS function, to a global variable. This is to allow it to be referenced by other methods wanting to know what the basic mount paths are.
Signed-off-by: Daniel P. Berrange <berrange@redhat.com> --- src/lxc/lxc_container.c | 79 ++++++++++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 37 deletions(-)
ACK. -- Eric Blake eblake redhat com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

From: "Daniel P. Berrange" <berrange@redhat.com> If the guest is configured with <filesystem type='mount'> <source dir='/'/> <target dir='/'/> <readonly/> </filesystem> Then any submounts under / should also end up readonly, except for those setup as basic mounts. eg if the user has /home on a separate volume, they'd expect /home to be readonly, but we should not touch the /sys, /proc, etc dirs we setup ourselves. Users can selectively make sub-mounts read-write again by simply listing them as new mounts without the <readonly> flag set <filesystem type='mount'> <source dir='/home'/> <target dir='/home'/> </filesystem> Signed-off-by: Daniel P. Berrange <berrange@redhat.com> --- src/lxc/lxc_container.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 88 insertions(+), 2 deletions(-) diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index d51cdc4..38d95b0 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -532,7 +532,6 @@ static int lxcContainerGetSubtree(const char *prefix, } while (getmntent_r(procmnt, &mntent, mntbuf, sizeof(mntbuf)) != NULL) { - VIR_DEBUG("Got %s", mntent.mnt_dir); if (!STRPREFIX(mntent.mnt_dir, prefix)) continue; @@ -541,7 +540,6 @@ static int lxcContainerGetSubtree(const char *prefix, if (VIR_STRDUP(mounts[nmounts], mntent.mnt_dir) < 0) goto cleanup; nmounts++; - VIR_DEBUG("Grabbed %s", mntent.mnt_dir); } if (mounts) @@ -779,6 +777,76 @@ static const virLXCBasicMountInfo lxcBasicMounts[] = { }; +static bool lxcIsBasicMountLocation(const char *path) +{ + size_t i; + + for (i = 0; i < ARRAY_CARDINALITY(lxcBasicMounts); i++) { + if (STRPREFIX(path, lxcBasicMounts[i].dst)) + return true; + } + + return false; +} + + +static int lxcContainerSetReadOnly(virDomainFSDefPtr root) +{ + FILE *procmnt; + struct mntent mntent; + char mntbuf[1024]; + int ret = -1; + char **mounts = NULL; + size_t nmounts = 0; + size_t i; + + VIR_DEBUG("root=%s", root->src); + + if (!(procmnt = setmntent("/proc/mounts", 
"r"))) { + virReportSystemError(errno, "%s", + _("Failed to read /proc/mounts")); + return -1; + } + + while (getmntent_r(procmnt, &mntent, mntbuf, sizeof(mntbuf)) != NULL) { + if (STREQ(mntent.mnt_dir, "/") || + STREQ(mntent.mnt_dir, "/.oldroot") || + STRPREFIX(mntent.mnt_dir, "/.oldroot/") || + lxcIsBasicMountLocation(mntent.mnt_dir)) + continue; + + if (VIR_REALLOC_N(mounts, nmounts + 1) < 0) + goto cleanup; + if (VIR_STRDUP(mounts[nmounts], mntent.mnt_dir) < 0) + goto cleanup; + nmounts++; + } + + if (mounts) + qsort(mounts, nmounts, sizeof(mounts[0]), + lxcContainerChildMountSort); + + for (i = 0; i < nmounts; i++) { + VIR_DEBUG("Bind readonly %s", mounts[i]); + if (mount(mounts[i], mounts[i], NULL, MS_BIND|MS_REC|MS_RDONLY|MS_REMOUNT, NULL) < 0) { + virReportSystemError(errno, + _("Failed to make mount %s readonly"), + mounts[i]); + goto cleanup; + } + } + + ret = 0; +cleanup: + for (i = 0; i < nmounts; i++) + VIR_FREE(mounts[i]); + VIR_FREE(mounts); + endmntent(procmnt); + return ret; + +} + + static int lxcContainerMountBasicFS(bool userns_enabled) { size_t i; @@ -1006,6 +1074,8 @@ static int lxcContainerMountFSBind(virDomainFSDefPtr fs, int ret = -1; struct stat st; + VIR_DEBUG("src=%s dst=%s", fs->src, fs->dst); + if (virAsprintf(&src, "%s%s", srcprefix, fs->src) < 0) goto cleanup; @@ -1062,6 +1132,13 @@ static int lxcContainerMountFSBind(virDomainFSDefPtr fs, _("Failed to make directory %s readonly"), fs->dst); } + } else { + VIR_DEBUG("Binding %s readwrite", fs->dst); + if (mount(src, fs->dst, NULL, MS_BIND|MS_REMOUNT, NULL) < 0) { + virReportSystemError(errno, + _("Failed to make directory %s readwrite"), + fs->dst); + } } ret = 0; @@ -1335,6 +1412,8 @@ static int lxcContainerMountFSBlock(virDomainFSDefPtr fs, char *src = NULL; int ret = -1; + VIR_DEBUG("src=%s dst=%s", fs->src, fs->dst); + if (virAsprintf(&src, "%s%s", srcprefix, fs->src) < 0) goto cleanup; @@ -1354,6 +1433,8 @@ static int lxcContainerMountFSTmpfs(virDomainFSDefPtr fs, int ret = -1; 
char *data = NULL; + VIR_DEBUG("usage=%lld sec=%s", fs->usage, sec_mount_options); + if (virAsprintf(&data, "size=%lldk%s", fs->usage, sec_mount_options) < 0) goto cleanup; @@ -1541,6 +1622,11 @@ static int lxcContainerSetupPivotRoot(virDomainDefPtr vmDef, if (lxcContainerMountBasicFS(vmDef->idmap.nuidmap) < 0) goto cleanup; + /* Ensure entire root filesystem (except /.oldroot) is readonly */ + if (root->readonly && + lxcContainerSetReadOnly(root) < 0) + goto cleanup; + /* Mounts /proc/meminfo etc sysinfo */ if (lxcContainerMountProcFuse(vmDef, stateDir) < 0) goto cleanup; -- 1.8.3.1

On 09/10/2013 08:46 PM, Daniel P. Berrange wrote:
From: "Daniel P. Berrange" <berrange@redhat.com>
If the guest is configured with
<filesystem type='mount'> <source dir='/'/> <target dir='/'/> <readonly/> </filesystem>
Then any submounts under / should also end up readonly, except for those setup as basic mounts. eg if the user has /home on a separate volume, they'd expect /home to be readonly, but we should not touch the /sys, /proc, etc dirs we setup ourselves.
Users can selectively make sub-mounts read-write again by simply listing them as new mounts without the <readonly> flag set
<filesystem type='mount'> <source dir='/home'/> <target dir='/home'/> </filesystem>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com> --- src/lxc/lxc_container.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 88 insertions(+), 2 deletions(-)
diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index d51cdc4..38d95b0 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -532,7 +532,6 @@ static int lxcContainerGetSubtree(const char *prefix, }
while (getmntent_r(procmnt, &mntent, mntbuf, sizeof(mntbuf)) != NULL) { - VIR_DEBUG("Got %s", mntent.mnt_dir); if (!STRPREFIX(mntent.mnt_dir, prefix)) continue;
@@ -541,7 +540,6 @@ static int lxcContainerGetSubtree(const char *prefix, if (VIR_STRDUP(mounts[nmounts], mntent.mnt_dir) < 0) goto cleanup; nmounts++; - VIR_DEBUG("Grabbed %s", mntent.mnt_dir); }
if (mounts) @@ -779,6 +777,76 @@ static const virLXCBasicMountInfo lxcBasicMounts[] = { };
+static bool lxcIsBasicMountLocation(const char *path) +{ + size_t i; + + for (i = 0; i < ARRAY_CARDINALITY(lxcBasicMounts); i++) { + if (STRPREFIX(path, lxcBasicMounts[i].dst)) + return true; + } + + return false; +} + + +static int lxcContainerSetReadOnly(virDomainFSDefPtr root) +{ + FILE *procmnt; + struct mntent mntent; + char mntbuf[1024]; + int ret = -1; + char **mounts = NULL; + size_t nmounts = 0; + size_t i; + + VIR_DEBUG("root=%s", root->src);
It seems root is only used for the debug message? The rest looks good to me. ACK

On 09/10/2013 06:46 AM, Daniel P. Berrange wrote:
From: "Daniel P. Berrange" <berrange@redhat.com>
If the guest is configured with
<filesystem type='mount'> <source dir='/'/> <target dir='/'/> <readonly/> </filesystem>
Then any submounts under / should also end up readonly, except for those setup as basic mounts. eg if the user has /home on a separate volume, they'd expect /home to be readonly, but we should not touch the /sys, /proc, etc dirs we setup ourselves.
Users can selectively make sub-mounts read-write again by simply listing them as new mounts without the <readonly> flag set
<filesystem type='mount'> <source dir='/home'/> <target dir='/home'/> </filesystem>
Signed-off-by: Daniel P. Berrange <berrange@redhat.com> --- src/lxc/lxc_container.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 88 insertions(+), 2 deletions(-)
diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index d51cdc4..38d95b0 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -532,7 +532,6 @@ static int lxcContainerGetSubtree(const char *prefix, }
while (getmntent_r(procmnt, &mntent, mntbuf, sizeof(mntbuf)) != NULL) { - VIR_DEBUG("Got %s", mntent.mnt_dir); if (!STRPREFIX(mntent.mnt_dir, prefix)) continue;
@@ -541,7 +540,6 @@ static int lxcContainerGetSubtree(const char *prefix, if (VIR_STRDUP(mounts[nmounts], mntent.mnt_dir) < 0) goto cleanup; nmounts++; - VIR_DEBUG("Grabbed %s", mntent.mnt_dir); }
if (mounts) @@ -779,6 +777,76 @@ static const virLXCBasicMountInfo lxcBasicMounts[] = { };
+static bool lxcIsBasicMountLocation(const char *path) +{ + size_t i; + + for (i = 0; i < ARRAY_CARDINALITY(lxcBasicMounts); i++) { + if (STRPREFIX(path, lxcBasicMounts[i].dst)) + return true;
This will still have false positives. If I create a bind mount at /sys/fs, then STRPREFIX("/sys/fs", "/sys") will return true, even though it's not the basic mount. Likewise, if I create a mount at /system, it will return true. Don't you want STREQ rather than STRPREFIX?
+ + while (getmntent_r(procmnt, &mntent, mntbuf, sizeof(mntbuf)) != NULL) { + if (STREQ(mntent.mnt_dir, "/") || + STREQ(mntent.mnt_dir, "/.oldroot") || + STRPREFIX(mntent.mnt_dir, "/.oldroot/") || + lxcIsBasicMountLocation(mntent.mnt_dir)) + continue;
This part looks okay, though. -- Eric Blake eblake redhat com +1-919-301-3266 Libvirt virtualization library http://libvirt.org
participants (3)
-
Daniel P. Berrange
-
Eric Blake
-
Gao feng