On Tue, Sep 10, 2013 at 09:58:13AM +0800, Gao feng wrote:
On 09/09/2013 11:30 PM, Daniel P. Berrange wrote:
> From: "Daniel P. Berrange" <berrange(a)redhat.com>
>
> If the guest is configured with
>
> <filesystem type='mount'>
> <source dir='/'/>
> <target dir='/'/>
> <readonly/>
> </filesystem>
>
> Then any submounts under / should also end up readonly. eg if
> the user has /home on a separate volume, they'd expect /home
> to be readonly.
>
> Users can selectively make sub-mounts read-write again by
> simply listing them as new mounts without the <readonly>
> flag set
>
> <filesystem type='mount'>
> <source dir='/home'/>
> <target dir='/home'/>
> </filesystem>
>
> Signed-off-by: Daniel P. Berrange <berrange(a)redhat.com>
> ---
> src/lxc/lxc_container.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++--
> 1 file changed, 73 insertions(+), 2 deletions(-)
>
> diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c
> index 9c04d06..ae672c8 100644
> --- a/src/lxc/lxc_container.c
> +++ b/src/lxc/lxc_container.c
> @@ -532,7 +532,6 @@ static int lxcContainerGetSubtree(const char *prefix,
> }
>
> while (getmntent_r(procmnt, &mntent, mntbuf, sizeof(mntbuf)) != NULL) {
> - VIR_DEBUG("Got %s", mntent.mnt_dir);
> if (!STRPREFIX(mntent.mnt_dir, prefix))
> continue;
>
> @@ -541,7 +540,6 @@ static int lxcContainerGetSubtree(const char *prefix,
> if (VIR_STRDUP(mounts[nmounts], mntent.mnt_dir) < 0)
> goto cleanup;
> nmounts++;
> - VIR_DEBUG("Grabbed %s", mntent.mnt_dir);
> }
>
> if (mounts)
> @@ -750,6 +748,61 @@ err:
> }
>
>
> +static int lxcContainerSetReadOnly(virDomainFSDefPtr root)
> +{
> + FILE *procmnt;
> + struct mntent mntent;
> + char mntbuf[1024];
> + int ret = -1;
> + char **mounts = NULL;
> + size_t nmounts = 0;
> + size_t i;
> +
> + VIR_DEBUG("root=%s", root->src);
> +
> + if (!(procmnt = setmntent("/proc/mounts", "r"))) {
> + virReportSystemError(errno, "%s",
> + _("Failed to read /proc/mounts"));
> + return -1;
> + }
> +
> + while (getmntent_r(procmnt, &mntent, mntbuf, sizeof(mntbuf)) != NULL) {
> + if (STREQ(mntent.mnt_dir, "/") ||
> + STRPREFIX(mntent.mnt_dir, "/.oldroot"))
> + continue;
> +
> + if (VIR_REALLOC_N(mounts, nmounts+1) < 0)
> + goto cleanup;
> + if (VIR_STRDUP(mounts[nmounts], mntent.mnt_dir) < 0)
> + goto cleanup;
> + nmounts++;
> + }
> +
> + if (mounts)
> + qsort(mounts, nmounts, sizeof(mounts[0]),
> + lxcContainerChildMountSort);
> +
> + for (i = 0 ; i < nmounts ; i++) {
> + VIR_DEBUG("Bind readonly %s", mounts[i]);
> + if (mount(mounts[i], mounts[i], NULL, MS_BIND|MS_REC|MS_RDONLY|MS_REMOUNT,
NULL) < 0) {
> + virReportSystemError(errno,
> + _("Failed to make mount %s readonly"),
> + mounts[i]);
> + goto cleanup;
> + }
> + }
> +
> + ret = 0;
> +cleanup:
> + for (i = 0; i < nmounts; i++)
> + VIR_FREE(mounts[i]);
> + VIR_FREE(mounts);
> + endmntent(procmnt);
> + return ret;
> +
> +}
> +
> +
> static int lxcContainerMountBasicFS(bool userns_enabled)
> {
> const struct {
> @@ -1001,6 +1054,8 @@ static int lxcContainerMountFSBind(virDomainFSDefPtr fs,
> int ret = -1;
> struct stat st;
>
> + VIR_DEBUG("src=%s dst=%s", fs->src, fs->dst);
> +
> if (virAsprintf(&src, "%s%s", srcprefix, fs->src) < 0)
> goto cleanup;
>
> @@ -1057,6 +1112,13 @@ static int lxcContainerMountFSBind(virDomainFSDefPtr fs,
> _("Failed to make directory %s
readonly"),
> fs->dst);
> }
> + } else {
> + VIR_DEBUG("Binding %s readwrite", fs->dst);
> + if (mount(src, fs->dst, NULL, MS_BIND|MS_REMOUNT, NULL) < 0) {
> + virReportSystemError(errno,
> + _("Failed to make directory %s
readwrite"),
> + fs->dst);
> + }
> }
>
> ret = 0;
> @@ -1330,6 +1392,8 @@ static int lxcContainerMountFSBlock(virDomainFSDefPtr fs,
> char *src = NULL;
> int ret = -1;
>
> + VIR_DEBUG("src=%s dst=%s", fs->src, fs->dst);
> +
> if (virAsprintf(&src, "%s%s", srcprefix, fs->src) < 0)
> goto cleanup;
>
> @@ -1349,6 +1413,8 @@ static int lxcContainerMountFSTmpfs(virDomainFSDefPtr fs,
> int ret = -1;
> char *data = NULL;
>
> + VIR_DEBUG("usage=%lld sec=%s", fs->usage, sec_mount_options);
> +
> if (virAsprintf(&data,
> "size=%lldk%s", fs->usage, sec_mount_options) <
0)
> goto cleanup;
> @@ -1536,6 +1602,11 @@ static int lxcContainerSetupPivotRoot(virDomainDefPtr vmDef,
> if (lxcContainerMountBasicFS(vmDef->idmap.nuidmap) < 0)
> goto cleanup;
>
> + /* Ensure entire root filesystem (except /.oldroot) is readonly */
> + if (root->readonly &&
> + lxcContainerSetReadOnly(root) < 0)
> + goto cleanup;
lxcContainerSetReadOnly should be called before lxcContainerMountBasicFS,
it's meaningless to make /proc/ /sys/ readonly again.
Ah yes, good point.
And I would like to say user in container can still use "mount
-o remount,rw --bind /"
to remount root directory writable.
Using SELinux, or dropping certain capabilities will prevent that, so
this is still useful protection even if unconfined root can get around
it. In addition Eric Biederman has a change to allow the mount state
to be locked & prevent this approach.
Daniel
--
|:
http://berrange.com -o-
http://www.flickr.com/photos/dberrange/ :|
|:
http://libvirt.org -o-
http://virt-manager.org :|
|:
http://autobuild.org -o-
http://search.cpan.org/~danberr/ :|
|:
http://entangle-photo.org -o-
http://live.gnome.org/gtk-vnc :|