The LXC driver currently allows custom mount points to be set up inside the
container. This only works for non-root mount points; you cannot replace
the entire root filesystem. This patch adds support for replacing the entire
root filesystem, thus allowing the use of LXC containers as a 'better chroot
than chroot'. Well, with one minor flaw - the Linux kernel currently has no
device namespace virtualization, so the admin inside the container can just
do a 'mknod' and access the real devices of the host. So for now this patch
doesn't make LXC containers secure, but a future kernel release will enable
them to be secure.
lxc_container.c | 253 ++++++++++++++++++++++++++++++++++++++++++++++++--------
util.c | 12 +-
2 files changed, 226 insertions(+), 39 deletions(-)
Daniel
diff -r eaa42985aed4 src/lxc_container.c
--- a/src/lxc_container.c Tue Aug 05 16:50:59 2008 +0100
+++ b/src/lxc_container.c Tue Aug 05 16:51:14 2008 +0100
@@ -1,10 +1,12 @@
/*
* Copyright IBM Corp. 2008
+ * Copyright Red Hat 2008
*
* lxc_container.c: file description
*
* Authors:
* David L. Leskovec <dlesko at linux.vnet.ibm.com>
+ * Daniel P. Berrange <berrange(a)redhat.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
@@ -28,10 +30,18 @@
#include <fcntl.h>
#include <limits.h>
#include <stdlib.h>
+#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mount.h>
#include <sys/wait.h>
#include <unistd.h>
+#include <mntent.h>
+
+/* Yes, we want linux private one, for _syscall2() macro */
+#include <linux/unistd.h>
+
+/* For MS_MOVE */
+#include <linux/fs.h>
#include "lxc_container.h"
#include "util.h"
@@ -105,23 +115,15 @@
*
* Returns 0 on success or -1 in case of error
*/
-static int lxcContainerSetStdio(int control, const char *ttyPath)
+static int lxcContainerSetStdio(int control, int ttyfd)
{
int rc = -1;
- int ttyfd;
int open_max, i;
if (setsid() < 0) {
lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
_("setsid failed: %s"), strerror(errno));
- goto error_out;
- }
-
- ttyfd = open(ttyPath, O_RDWR|O_NOCTTY);
- if (ttyfd < 0) {
- lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
- _("open(%s) failed: %s"), ttyPath, strerror(errno));
- goto error_out;
+ goto cleanup;
}
if (ioctl(ttyfd, TIOCSCTTY, NULL) < 0) {
@@ -159,8 +161,6 @@
cleanup:
close(ttyfd);
-
-error_out:
return rc;
}
@@ -223,6 +223,7 @@
return 0;
}
+
/**
* lxcEnableInterfaces:
* @vm: Pointer to vm structure
@@ -252,6 +253,20 @@
return rc;
}
+
+//_syscall2(int, pivot_root, char *, newroot, const char *, oldroot)
+extern int pivot_root(const char * new_root,const char * put_old);
+
+static int lxcContainerChildMountSort(const void *a, const void *b)
+{
+ const char **sa = (const char**)a;
+ const char **sb = (const char**)b;
+
+ /* qsort() comparator over an array of mount-path strings.
+ Deliberately reversed args - the resulting descending order puts
+ a nested path ahead of its parent, so that we unmount the deepest
+ children first */
+ return strcmp(*sb, *sa);
+}
+
/**
* lxcChild:
* @argv: Pointer to container arguments
@@ -269,8 +284,8 @@
int rc = -1;
lxc_child_argv_t *argv = data;
virDomainDefPtr vmDef = argv->config;
- virDomainFSDefPtr curMount;
- int i;
+ virDomainFSDefPtr tmp, root = NULL;
+ int ttyfd, i;
if (NULL == vmDef) {
lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
@@ -278,36 +293,210 @@
return -1;
}
+#if 0
+ ttyfd = open(argv->ttyPath, O_RDWR|O_NOCTTY);
+ if (ttyfd < 0) {
+ lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+ _("open(%s) failed: %s"), argv->ttyPath, strerror(errno));
+ return -1;
+ }
+#endif
+
/* handle the bind mounts first before doing anything else that may */
/* then access those mounted dirs */
- curMount = vmDef->fss;
- for (i = 0; curMount; curMount = curMount->next) {
- // XXX fix
- if (curMount->type != VIR_DOMAIN_FS_TYPE_MOUNT)
+ for (tmp = vmDef->fss; tmp && !root; tmp = tmp->next) {
+ if (tmp->type != VIR_DOMAIN_FS_TYPE_MOUNT)
continue;
- rc = mount(curMount->src,
- curMount->dst,
- NULL,
- MS_BIND,
- NULL);
- if (0 != rc) {
+ if (STREQ(tmp->dst, "/"))
+ root = tmp;
+ }
+
+ if (root) {
+ char *oldroot;
+ struct mntent *mntent;
+ char **mounts = NULL;
+ int nmounts = 0;
+ FILE *procmnt;
+ struct {
+ int maj;
+ int min;
+ const char *path;
+ } devs[] = {
+ { 1, 3, "/dev/null" },
+ { 1, 5, "/dev/zero" },
+ { 1, 7, "/dev/full" },
+ { 5, 1, "/dev/console" },
+ };
+
+ /* Got a FS mapped to /, we're going the pivot_root
+ approach to do a better-chroot-than-chroot */
+
+ /* this is based on this thread http://lkml.org/lkml/2008/3/5/29 */
+
+ /* First step is to ensure the new root itself is
+ a mount point */
+ if (mount(root->src, root->src, NULL, MS_BIND, NULL) < 0) {
lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
- _("failed to mount %s at %s for container: %s"),
- curMount->src, curMount->dst, strerror(errno));
+ _("failed to bind new root %s: %s"),
+ root->src, strerror(errno));
+ return -1;
+ }
+
+ if (asprintf(&oldroot, "%s/.oldroot", root->src) < 0) {
+ lxcError(NULL, NULL, VIR_ERR_NO_MEMORY, NULL);
+ return -1;
+ }
+
+ if (virFileMakePath(oldroot) < 0) {
+ lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+ _("failed to create %s: %s"),
+ oldroot, strerror(errno));
+ return -1;
+ }
+
+ /* The old root directory will live at /.oldroot after
+ * this and will soon be unmounted completely */
+ if (pivot_root(root->src, oldroot) < 0) {
+ lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+ _("failed to pivot root %s to %s: %s"),
+ oldroot, root->src, strerror(errno));
+ return -1;
+ }
+
+ /* CWD is undefined after pivot_root, so go to / */
+ if (chdir("/") < 0) {
+ return -1;
+ }
+
+ if (virFileMakePath("/proc") < 0 ||
+ mount("none", "/proc", "proc", 0, NULL) < 0)
{
+ lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+ _("failed to mount /proc for container: %s"),
+ strerror(errno));
+ return -1;
+ }
+ if (virFileMakePath("/dev") < 0 ||
+ mount("none", "/dev", "tmpfs", 0, NULL) < 0)
{
+ lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+ _("failed to mount /dev tmpfs for container: %s"),
+ strerror(errno));
+ return -1;
+ }
+ /* Move old devpts into container, since we have to
+ connect to the master ptmx which was opened in
+ the parent.
+ XXX This sucks, we need to figure out how to get our
+ own private devpts for isolation
+ */
+ if (virFileMakePath("/dev/pts") < 0 ||
+ mount("/.oldroot/dev/pts", "/dev/pts", NULL,
+ MS_MOVE, NULL) < 0) {
+ lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+ _("failed to move /dev/pts into container: %s"),
+ strerror(errno));
+ return -1;
+ }
+
+ /* Populate /dev/ with a few important bits */
+ umask(0);
+ for (i = 0 ; i < ARRAY_CARDINALITY(devs) ; i++) {
+ dev_t dev = makedev(devs[i].maj, devs[i].min);
+ if (mknod(devs[i].path,
+ 0777 | S_IFCHR,
+ dev) < 0) {
+ lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+ _("failed to make device %s: %s"),
+ devs[i].path, strerror(errno));
+ return -1;
+ }
+ }
+ umask(0700);
+
+ /* Pull in rest of container's mounts */
+ for (tmp = vmDef->fss; tmp; tmp = tmp->next) {
+ char *src;
+ if (STREQ(tmp->dst, "/"))
+ continue;
+ // XXX fix
+ if (tmp->type != VIR_DOMAIN_FS_TYPE_MOUNT)
+ continue;
+
+ if (asprintf(&src, "/.oldroot/%s", tmp->src) < 0)
+ return -1;
+
+ if (virFileMakePath(tmp->dst) < 0 ||
+ mount(src, tmp->dst, NULL, MS_BIND, NULL) < 0) {
+ lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+ _("failed to mount %s at %s for container: %s"),
+ tmp->src, tmp->dst, strerror(errno));
+ return -1;
+ }
+ VIR_FREE(src);
+ }
+
+ if (!(procmnt = setmntent("/proc/mounts", "r"))) {
+ lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+ _("failed to read /proc/mounts: %s"),
+ strerror(errno));
+ return -1;
+ }
+ while ((mntent = getmntent(procmnt)) != NULL) {
+ if (!STRPREFIX(mntent->mnt_dir, "/.oldroot"))
+ continue;
+ if (VIR_REALLOC_N(mounts, nmounts+1) < 0)
+ return -1;
+ mounts[nmounts++] = strdup(mntent->mnt_dir);
+ }
+ endmntent(procmnt);
+
+ qsort(mounts, nmounts, sizeof(mounts[0]),
+ lxcContainerChildMountSort);
+
+ for (i = 0 ; i < nmounts ; i++) {
+ if (umount(mounts[i]) < 0) {
+ lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+ _("failed to unmount %s: %s"),
+ mounts[i], strerror(errno));
+ return -1;
+ }
+ }
+ } else {
+ /* Nothing mapped to /, we're using the main root,
+ but with extra stuff mapped in */
+ for (tmp = vmDef->fss; tmp; tmp = tmp->next) {
+ // XXX fix
+ if (tmp->type != VIR_DOMAIN_FS_TYPE_MOUNT)
+ continue;
+ rc = mount(tmp->src,
+ tmp->dst,
+ NULL,
+ MS_BIND,
+ NULL);
+ if (0 != rc) {
+ lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+ _("failed to mount %s at %s for container: %s"),
+ tmp->src, tmp->dst, strerror(errno));
+ return -1;
+ }
+ }
+
+ /* mount /proc */
+ if (mount("lxcproc", "/proc", "proc", 0, NULL) <
0) {
+ lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+ _("failed to mount /proc for container: %s"),
+ strerror(errno));
return -1;
}
}
- /* mount /proc */
- rc = mount("lxcproc", "/proc", "proc", 0, NULL);
- if (0 != rc) {
+ ttyfd = open(argv->ttyPath, O_RDWR|O_NOCTTY);
+ if (ttyfd < 0) {
lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
- _("failed to mount /proc for container: %s"),
- strerror(errno));
+ _("open(%s) failed: %s"), argv->ttyPath, strerror(errno));
return -1;
}
- if (lxcContainerSetStdio(argv->monitor, argv->ttyPath) < 0)
+ if (lxcContainerSetStdio(argv->monitor, ttyfd) < 0)
return -1;
/* Wait for interface devices to show up */
diff -r eaa42985aed4 src/util.c
--- a/src/util.c Tue Aug 05 16:50:59 2008 +0100
+++ b/src/util.c Tue Aug 05 16:51:14 2008 +0100
@@ -524,13 +524,11 @@
if (!(p = strrchr(parent, '/')))
return EINVAL;
- if (p == parent)
- return EPERM;
-
- *p = '\0';
-
- if ((err = virFileMakePath(parent)))
- return err;
+ if (p != parent) {
+ *p = '\0';
+ if ((err = virFileMakePath(parent)))
+ return err;
+ }
if (mkdir(path, 0777) < 0 && errno != EEXIST)
return errno;
--
|: Red Hat, Engineering, London   -o-   http://people.redhat.com/berrange/ :|
|: http://libvirt.org  -o-  http://virt-manager.org  -o-  http://ovirt.org :|
|: http://autobuild.org       -o-         http://search.cpan.org/~danberr/ :|
|: GnuPG: 7D3B9505  -o-  F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|