[libvirt] [PATCH] LXC: create a bind mount for sysfs when enable userns but disable netns

kernel commit 7dc5dbc879bd0779924b5132a48b731a0bc04a1e forbid us doing a fresh mount for sysfs when enable userns but disable netns. This patch will create a bind mount in this senario. Signed-off-by: Chen Hanxiao <chenhanxiao@cn.fujitsu.com> --- src/lxc/lxc_container.c | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index 4d89677..8a27215 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -815,10 +815,13 @@ static int lxcContainerSetReadOnly(void) } -static int lxcContainerMountBasicFS(bool userns_enabled) +static int lxcContainerMountBasicFS(bool userns_enabled, + bool netns_disabled) { size_t i; int rc = -1; + char* mnt_src = NULL; + int mnt_mflags; VIR_DEBUG("Mounting basic filesystems"); @@ -826,8 +829,25 @@ static int lxcContainerMountBasicFS(bool userns_enabled) bool bindOverReadonly; virLXCBasicMountInfo const *mnt = &lxcBasicMounts[i]; + /* When enable userns but disable netns, kernel will + * forbid us doing a new fresh mount for sysfs. + * So we had to do a bind mount for sysfs instead. + */ + if (userns_enabled && netns_disabled && + STREQ(mnt->src, "sysfs")) { + if (VIR_STRDUP(mnt_src, "/sys") < 0) { + goto cleanup; + } + mnt_mflags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY|MS_BIND; + } else { + if (VIR_STRDUP(mnt_src, mnt->src) < 0) { + goto cleanup; + } + mnt_mflags = mnt->mflags; + } + VIR_DEBUG("Processing %s -> %s", - mnt->src, mnt->dst); + mnt_src, mnt->dst); if (mnt->skipUnmounted) { char *hostdir; @@ -856,7 +876,7 @@ static int lxcContainerMountBasicFS(bool userns_enabled) if (virFileMakePath(mnt->dst) < 0) { virReportSystemError(errno, _("Failed to mkdir %s"), - mnt->src); + mnt_src); goto cleanup; } @@ -867,24 +887,24 @@ static int lxcContainerMountBasicFS(bool userns_enabled) * we mount the filesystem in read-write mode initially, and then do a * separate read-only bind mount on top of that. */ - bindOverReadonly = !!(mnt->mflags & MS_RDONLY); + bindOverReadonly = !!(mnt_mflags & MS_RDONLY); VIR_DEBUG("Mount %s on %s type=%s flags=%x", - mnt->src, mnt->dst, mnt->type, mnt->mflags & ~MS_RDONLY); - if (mount(mnt->src, mnt->dst, mnt->type, mnt->mflags & ~MS_RDONLY, NULL) < 0) { + mnt_src, mnt->dst, mnt->type, mnt_mflags & ~MS_RDONLY); + if (mount(mnt_src, mnt->dst, mnt->type, mnt_mflags & ~MS_RDONLY, NULL) < 0) { virReportSystemError(errno, _("Failed to mount %s on %s type %s flags=%x"), - mnt->src, mnt->dst, NULLSTR(mnt->type), - mnt->mflags & ~MS_RDONLY); + mnt_src, mnt->dst, NULLSTR(mnt->type), + mnt_mflags & ~MS_RDONLY); goto cleanup; } if (bindOverReadonly && - mount(mnt->src, mnt->dst, NULL, + mount(mnt_src, mnt->dst, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) { virReportSystemError(errno, _("Failed to re-mount %s on %s flags=%x"), - mnt->src, mnt->dst, + mnt_src, mnt->dst, MS_BIND|MS_REMOUNT|MS_RDONLY); goto cleanup; } @@ -893,6 +913,7 @@ static int lxcContainerMountBasicFS(bool userns_enabled) rc = 0; cleanup: + VIR_FREE(mnt_src); VIR_DEBUG("rc=%d", rc); return rc; } @@ -1643,7 +1664,8 @@ static int lxcContainerSetupPivotRoot(virDomainDefPtr vmDef, goto cleanup; /* Mounts the core /proc, /sys, etc filesystems */ - if (lxcContainerMountBasicFS(vmDef->idmap.nuidmap) < 0) + if (lxcContainerMountBasicFS(vmDef->idmap.nuidmap, + !vmDef->nnets) < 0) goto cleanup; /* Ensure entire root filesystem (except /.oldroot) is readonly */ -- 1.9.0

ping
-----Original Message----- From: libvir-list-bounces@redhat.com [mailto:libvir-list-bounces@redhat.com] On Behalf Of Chen Hanxiao Sent: Monday, July 14, 2014 6:02 PM To: libvir-list@redhat.com Subject: [libvirt] [PATCH] LXC: create a bind mount for sysfs when enable userns but disable netns
kernel commit 7dc5dbc879bd0779924b5132a48b731a0bc04a1e forbid us doing a fresh mount for sysfs when enable userns but disable netns. This patch will create a bind mount in this senario.
Signed-off-by: Chen Hanxiao <chenhanxiao@cn.fujitsu.com> --- src/lxc/lxc_container.c | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-)
diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index 4d89677..8a27215 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -815,10 +815,13 @@ static int lxcContainerSetReadOnly(void) }
-static int lxcContainerMountBasicFS(bool userns_enabled) +static int lxcContainerMountBasicFS(bool userns_enabled, + bool netns_disabled) { size_t i; int rc = -1; + char* mnt_src = NULL; + int mnt_mflags;
VIR_DEBUG("Mounting basic filesystems");
@@ -826,8 +829,25 @@ static int lxcContainerMountBasicFS(bool userns_enabled) bool bindOverReadonly; virLXCBasicMountInfo const *mnt = &lxcBasicMounts[i];
+ /* When enable userns but disable netns, kernel will + * forbid us doing a new fresh mount for sysfs. + * So we had to do a bind mount for sysfs instead. + */ + if (userns_enabled && netns_disabled && + STREQ(mnt->src, "sysfs")) { + if (VIR_STRDUP(mnt_src, "/sys") < 0) { + goto cleanup; + } + mnt_mflags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY|MS_BIND; + } else { + if (VIR_STRDUP(mnt_src, mnt->src) < 0) { + goto cleanup; + } + mnt_mflags = mnt->mflags; + } + VIR_DEBUG("Processing %s -> %s", - mnt->src, mnt->dst); + mnt_src, mnt->dst);
if (mnt->skipUnmounted) { char *hostdir; @@ -856,7 +876,7 @@ static int lxcContainerMountBasicFS(bool userns_enabled) if (virFileMakePath(mnt->dst) < 0) { virReportSystemError(errno, _("Failed to mkdir %s"), - mnt->src); + mnt_src); goto cleanup; }
@@ -867,24 +887,24 @@ static int lxcContainerMountBasicFS(bool userns_enabled) * we mount the filesystem in read-write mode initially, and then do a * separate read-only bind mount on top of that. */ - bindOverReadonly = !!(mnt->mflags & MS_RDONLY); + bindOverReadonly = !!(mnt_mflags & MS_RDONLY);
VIR_DEBUG("Mount %s on %s type=%s flags=%x", - mnt->src, mnt->dst, mnt->type, mnt->mflags & ~MS_RDONLY); - if (mount(mnt->src, mnt->dst, mnt->type, mnt->mflags & ~MS_RDONLY, NULL) < 0) { + mnt_src, mnt->dst, mnt->type, mnt_mflags & ~MS_RDONLY); + if (mount(mnt_src, mnt->dst, mnt->type, mnt_mflags & ~MS_RDONLY, NULL) < 0) { virReportSystemError(errno, _("Failed to mount %s on %s type %s flags=%x"), - mnt->src, mnt->dst, NULLSTR(mnt->type), - mnt->mflags & ~MS_RDONLY); + mnt_src, mnt->dst, NULLSTR(mnt->type), + mnt_mflags & ~MS_RDONLY); goto cleanup; }
if (bindOverReadonly && - mount(mnt->src, mnt->dst, NULL, + mount(mnt_src, mnt->dst, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) { virReportSystemError(errno, _("Failed to re-mount %s on %s flags=%x"), - mnt->src, mnt->dst, + mnt_src, mnt->dst, MS_BIND|MS_REMOUNT|MS_RDONLY); goto cleanup; } @@ -893,6 +913,7 @@ static int lxcContainerMountBasicFS(bool userns_enabled) rc = 0;
cleanup: + VIR_FREE(mnt_src); VIR_DEBUG("rc=%d", rc); return rc; } @@ -1643,7 +1664,8 @@ static int lxcContainerSetupPivotRoot(virDomainDefPtr vmDef, goto cleanup;
/* Mounts the core /proc, /sys, etc filesystems */ - if (lxcContainerMountBasicFS(vmDef->idmap.nuidmap) < 0) + if (lxcContainerMountBasicFS(vmDef->idmap.nuidmap, + !vmDef->nnets) < 0) goto cleanup;
/* Ensure entire root filesystem (except /.oldroot) is readonly */ -- 1.9.0
-- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list

On 07/14/2014 06:01 PM, Chen Hanxiao wrote:
kernel commit 7dc5dbc879bd0779924b5132a48b731a0bc04a1e forbid us doing a fresh mount for sysfs when enable userns but disable netns. This patch will create a bind mount in this senario.
Signed-off-by: Chen Hanxiao <chenhanxiao@cn.fujitsu.com> ---
Looks good to me, ACK
src/lxc/lxc_container.c | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-)
diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index 4d89677..8a27215 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -815,10 +815,13 @@ static int lxcContainerSetReadOnly(void) }
-static int lxcContainerMountBasicFS(bool userns_enabled) +static int lxcContainerMountBasicFS(bool userns_enabled, + bool netns_disabled) { size_t i; int rc = -1; + char* mnt_src = NULL; + int mnt_mflags;
VIR_DEBUG("Mounting basic filesystems");
@@ -826,8 +829,25 @@ static int lxcContainerMountBasicFS(bool userns_enabled) bool bindOverReadonly; virLXCBasicMountInfo const *mnt = &lxcBasicMounts[i];
+ /* When enable userns but disable netns, kernel will + * forbid us doing a new fresh mount for sysfs. + * So we had to do a bind mount for sysfs instead. + */ + if (userns_enabled && netns_disabled && + STREQ(mnt->src, "sysfs")) { + if (VIR_STRDUP(mnt_src, "/sys") < 0) { + goto cleanup; + } + mnt_mflags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY|MS_BIND; + } else { + if (VIR_STRDUP(mnt_src, mnt->src) < 0) { + goto cleanup; + } + mnt_mflags = mnt->mflags; + } + VIR_DEBUG("Processing %s -> %s", - mnt->src, mnt->dst); + mnt_src, mnt->dst);
if (mnt->skipUnmounted) { char *hostdir; @@ -856,7 +876,7 @@ static int lxcContainerMountBasicFS(bool userns_enabled) if (virFileMakePath(mnt->dst) < 0) { virReportSystemError(errno, _("Failed to mkdir %s"), - mnt->src); + mnt_src); goto cleanup; }
@@ -867,24 +887,24 @@ static int lxcContainerMountBasicFS(bool userns_enabled) * we mount the filesystem in read-write mode initially, and then do a * separate read-only bind mount on top of that. */ - bindOverReadonly = !!(mnt->mflags & MS_RDONLY); + bindOverReadonly = !!(mnt_mflags & MS_RDONLY);
VIR_DEBUG("Mount %s on %s type=%s flags=%x", - mnt->src, mnt->dst, mnt->type, mnt->mflags & ~MS_RDONLY); - if (mount(mnt->src, mnt->dst, mnt->type, mnt->mflags & ~MS_RDONLY, NULL) < 0) { + mnt_src, mnt->dst, mnt->type, mnt_mflags & ~MS_RDONLY); + if (mount(mnt_src, mnt->dst, mnt->type, mnt_mflags & ~MS_RDONLY, NULL) < 0) { virReportSystemError(errno, _("Failed to mount %s on %s type %s flags=%x"), - mnt->src, mnt->dst, NULLSTR(mnt->type), - mnt->mflags & ~MS_RDONLY); + mnt_src, mnt->dst, NULLSTR(mnt->type), + mnt_mflags & ~MS_RDONLY); goto cleanup; }
if (bindOverReadonly && - mount(mnt->src, mnt->dst, NULL, + mount(mnt_src, mnt->dst, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) { virReportSystemError(errno, _("Failed to re-mount %s on %s flags=%x"), - mnt->src, mnt->dst, + mnt_src, mnt->dst, MS_BIND|MS_REMOUNT|MS_RDONLY); goto cleanup; } @@ -893,6 +913,7 @@ static int lxcContainerMountBasicFS(bool userns_enabled) rc = 0;
cleanup: + VIR_FREE(mnt_src); VIR_DEBUG("rc=%d", rc); return rc; } @@ -1643,7 +1664,8 @@ static int lxcContainerSetupPivotRoot(virDomainDefPtr vmDef, goto cleanup;
/* Mounts the core /proc, /sys, etc filesystems */ - if (lxcContainerMountBasicFS(vmDef->idmap.nuidmap) < 0) + if (lxcContainerMountBasicFS(vmDef->idmap.nuidmap, + !vmDef->nnets) < 0) goto cleanup;
/* Ensure entire root filesystem (except /.oldroot) is readonly */

On 07/14/2014 06:01 PM, Chen Hanxiao wrote:
kernel commit 7dc5dbc879bd0779924b5132a48b731a0bc04a1e forbid us doing a fresh mount for sysfs when enable userns but disable netns. This patch will create a bind mount in this senario.
Signed-off-by: Chen Hanxiao <chenhanxiao@cn.fujitsu.com> --- src/lxc/lxc_container.c | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-)
Pushed, thanks!
diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index 4d89677..8a27215 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -815,10 +815,13 @@ static int lxcContainerSetReadOnly(void) }
-static int lxcContainerMountBasicFS(bool userns_enabled) +static int lxcContainerMountBasicFS(bool userns_enabled, + bool netns_disabled) { size_t i; int rc = -1; + char* mnt_src = NULL; + int mnt_mflags;
VIR_DEBUG("Mounting basic filesystems");
@@ -826,8 +829,25 @@ static int lxcContainerMountBasicFS(bool userns_enabled) bool bindOverReadonly; virLXCBasicMountInfo const *mnt = &lxcBasicMounts[i];
+ /* When enable userns but disable netns, kernel will + * forbid us doing a new fresh mount for sysfs. + * So we had to do a bind mount for sysfs instead. + */ + if (userns_enabled && netns_disabled && + STREQ(mnt->src, "sysfs")) { + if (VIR_STRDUP(mnt_src, "/sys") < 0) { + goto cleanup; + } + mnt_mflags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY|MS_BIND; + } else { + if (VIR_STRDUP(mnt_src, mnt->src) < 0) { + goto cleanup; + } + mnt_mflags = mnt->mflags; + } + VIR_DEBUG("Processing %s -> %s", - mnt->src, mnt->dst); + mnt_src, mnt->dst);
if (mnt->skipUnmounted) { char *hostdir; @@ -856,7 +876,7 @@ static int lxcContainerMountBasicFS(bool userns_enabled) if (virFileMakePath(mnt->dst) < 0) { virReportSystemError(errno, _("Failed to mkdir %s"), - mnt->src); + mnt_src); goto cleanup; }
@@ -867,24 +887,24 @@ static int lxcContainerMountBasicFS(bool userns_enabled) * we mount the filesystem in read-write mode initially, and then do a * separate read-only bind mount on top of that. */ - bindOverReadonly = !!(mnt->mflags & MS_RDONLY); + bindOverReadonly = !!(mnt_mflags & MS_RDONLY);
VIR_DEBUG("Mount %s on %s type=%s flags=%x", - mnt->src, mnt->dst, mnt->type, mnt->mflags & ~MS_RDONLY); - if (mount(mnt->src, mnt->dst, mnt->type, mnt->mflags & ~MS_RDONLY, NULL) < 0) { + mnt_src, mnt->dst, mnt->type, mnt_mflags & ~MS_RDONLY); + if (mount(mnt_src, mnt->dst, mnt->type, mnt_mflags & ~MS_RDONLY, NULL) < 0) { virReportSystemError(errno, _("Failed to mount %s on %s type %s flags=%x"), - mnt->src, mnt->dst, NULLSTR(mnt->type), - mnt->mflags & ~MS_RDONLY); + mnt_src, mnt->dst, NULLSTR(mnt->type), + mnt_mflags & ~MS_RDONLY); goto cleanup; }
if (bindOverReadonly && - mount(mnt->src, mnt->dst, NULL, + mount(mnt_src, mnt->dst, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) { virReportSystemError(errno, _("Failed to re-mount %s on %s flags=%x"), - mnt->src, mnt->dst, + mnt_src, mnt->dst, MS_BIND|MS_REMOUNT|MS_RDONLY); goto cleanup; } @@ -893,6 +913,7 @@ static int lxcContainerMountBasicFS(bool userns_enabled) rc = 0;
cleanup: + VIR_FREE(mnt_src); VIR_DEBUG("rc=%d", rc); return rc; } @@ -1643,7 +1664,8 @@ static int lxcContainerSetupPivotRoot(virDomainDefPtr vmDef, goto cleanup;
/* Mounts the core /proc, /sys, etc filesystems */ - if (lxcContainerMountBasicFS(vmDef->idmap.nuidmap) < 0) + if (lxcContainerMountBasicFS(vmDef->idmap.nuidmap, + !vmDef->nnets) < 0) goto cleanup;
/* Ensure entire root filesystem (except /.oldroot) is readonly */

On Mon, Jul 14, 2014 at 12:01 PM, Chen Hanxiao <chenhanxiao@cn.fujitsu.com> wrote:
kernel commit 7dc5dbc879bd0779924b5132a48b731a0bc04a1e forbid us doing a fresh mount for sysfs when enable userns but disable netns. This patch will create a bind mount in this senario.
Sorry for exhuming an already merged patch but today I ran into a nasty issue caused by it.
Signed-off-by: Chen Hanxiao <chenhanxiao@cn.fujitsu.com> --- src/lxc/lxc_container.c | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-)
diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index 4d89677..8a27215 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -815,10 +815,13 @@ static int lxcContainerSetReadOnly(void) }
-static int lxcContainerMountBasicFS(bool userns_enabled) +static int lxcContainerMountBasicFS(bool userns_enabled, + bool netns_disabled) { size_t i; int rc = -1; + char* mnt_src = NULL; + int mnt_mflags;
VIR_DEBUG("Mounting basic filesystems");
@@ -826,8 +829,25 @@ static int lxcContainerMountBasicFS(bool userns_enabled) bool bindOverReadonly; virLXCBasicMountInfo const *mnt = &lxcBasicMounts[i];
+ /* When enable userns but disable netns, kernel will + * forbid us doing a new fresh mount for sysfs. + * So we had to do a bind mount for sysfs instead. + */ + if (userns_enabled && netns_disabled && + STREQ(mnt->src, "sysfs")) { + if (VIR_STRDUP(mnt_src, "/sys") < 0) { + goto cleanup; + }
This is clearly broken and looks very untested to me. It will issue this mount call: mount("/sys", "/sys", "sysfs", MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_BIND, NULL) because the code runs after pivot_root(2). i.e, /sys will be still empty after that and no sysfs at all there. As libvirt will later remount /sys readonly creating a container will fail with the most useless error message: Error: internal error: guest failed to start: Unable to create directory /sys/fs/: Read-only file system or Error: internal error: guest failed to start: Unable to create directory /sys/fs/cgroup: Read-only file system Please note that changing "/sys" to "/.oldroot/sys" will not solve the issue as this code runs already in the new namespace and therefore the old mount tree is locked, thus MS_BIND is not allowed. This brings me to the question, why do you handle the netns_disabled case anyway? If in the XML file no network is specified just create a new and empty network namespace. Bindmounting /sys into the container is a security issue. This is why mounting sysfs without a netns was disabled to begin with. P.S: Sorry for the grumpy mail, I've wasted almost the whole day with debugging that issue. -- Thanks, //richard

-----Original Message----- From: Richard Weinberger [mailto:richard.weinberger@gmail.com] Sent: Wednesday, March 11, 2015 4:56 AM To: Chen, Hanxiao/陈 晗霄 Cc: libvir-list@redhat.com; Daniel P. Berrange; Gao feng Subject: Re: [libvirt] [PATCH] LXC: create a bind mount for sysfs when enable userns but disable netns
On Mon, Jul 14, 2014 at 12:01 PM, Chen Hanxiao <chenhanxiao@cn.fujitsu.com> wrote:
kernel commit 7dc5dbc879bd0779924b5132a48b731a0bc04a1e forbid us doing a fresh mount for sysfs when enable userns but disable netns. This patch will create a bind mount in this senario.
Sorry for exhuming an already merged patch but today I ran into a nasty issue caused by it.
Signed-off-by: Chen Hanxiao <chenhanxiao@cn.fujitsu.com> --- src/lxc/lxc_container.c | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-)
diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index 4d89677..8a27215 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -815,10 +815,13 @@ static int lxcContainerSetReadOnly(void) }
-static int lxcContainerMountBasicFS(bool userns_enabled) +static int lxcContainerMountBasicFS(bool userns_enabled, + bool netns_disabled) { size_t i; int rc = -1; + char* mnt_src = NULL; + int mnt_mflags;
VIR_DEBUG("Mounting basic filesystems");
@@ -826,8 +829,25 @@ static int lxcContainerMountBasicFS(bool userns_enabled) bool bindOverReadonly; virLXCBasicMountInfo const *mnt = &lxcBasicMounts[i];
+ /* When enable userns but disable netns, kernel will + * forbid us doing a new fresh mount for sysfs. + * So we had to do a bind mount for sysfs instead. + */ + if (userns_enabled && netns_disabled && + STREQ(mnt->src, "sysfs")) { + if (VIR_STRDUP(mnt_src, "/sys") < 0) { + goto cleanup; + }
This is clearly broken and looks very untested to me.
It's broken now. But when I submitted this patch last year, it's not.
It will issue this mount call: mount("/sys", "/sys", "sysfs", MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_BIND, NULL) because the code runs after pivot_root(2). i.e, /sys will be still empty after that and no sysfs at all there. As libvirt will later remount /sys readonly creating a container will fail with the most useless error message: Error: internal error: guest failed to start: Unable to create directory /sys/fs/: Read-only file system or Error: internal error: guest failed to start: Unable to create directory /sys/fs/cgroup: Read-only file system
Please note that changing "/sys" to "/.oldroot/sys" will not solve the issue as this code runs already in the new namespace and therefore the old mount tree is locked, thus MS_BIND is not allowed.
This brings me to the question, why do you handle the netns_disabled case anyway?
Please check the discussion at: http://lists.linux-foundation.org/pipermail/containers/2014-July/034721.html
If in the XML file no network is specified just create a new and empty network namespace. Bindmounting /sys into the container is a security issue. This is why mounting sysfs without a netns was disabled to begin with.
Yes, I tried to propose enable netns by default, but Dan thought that we should allow containers sharing the host's network: http://www.redhat.com/archives/libvir-list/2013-August/msg01025.html So we should allow user create containers without netns, they should know what they do if they read libvirt's docs See docs patch describe security considerations: http://www.redhat.com/archives/libvir-list/2013-September/msg00562.html Regards, - Chen
P.S: Sorry for the grumpy mail, I've wasted almost the whole day with debugging that issue.
-- Thanks, //richard

Am 11.03.2015 um 03:30 schrieb Chen, Hanxiao:
@@ -826,8 +829,25 @@ static int lxcContainerMountBasicFS(bool userns_enabled) bool bindOverReadonly; virLXCBasicMountInfo const *mnt = &lxcBasicMounts[i];
+ /* When enable userns but disable netns, kernel will + * forbid us doing a new fresh mount for sysfs. + * So we had to do a bind mount for sysfs instead. + */ + if (userns_enabled && netns_disabled && + STREQ(mnt->src, "sysfs")) { + if (VIR_STRDUP(mnt_src, "/sys") < 0) { + goto cleanup; + }
This is clearly broken and looks very untested to me.
It's broken now. But when I submitted this patch last year, it's not.
Are you sure? Just built libvirt v1.2.6-222-ga86b621, head is commit a86b6215a74b1feb2667204e214fbfd2f7decc5c Author: Chen Hanxiao <chenhanxiao@cn.fujitsu.com> Date: Mon Jul 14 18:01:51 2014 +0800 LXC: create a bind mount for sysfs when enable userns but disable netns /sys is still an empty directory but as at this time (most likely due to another bug) libvirt was able to create /sys/fs/cgroup and mounted groups there. But no sysfs at all is at /sys. I mean, how is this supposed to work? You bind mount /sys over /sys...
It will issue this mount call: mount("/sys", "/sys", "sysfs", MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_BIND, NULL) because the code runs after pivot_root(2). i.e, /sys will be still empty after that and no sysfs at all there. As libvirt will later remount /sys readonly creating a container will fail with the most useless error message: Error: internal error: guest failed to start: Unable to create directory /sys/fs/: Read-only file system or Error: internal error: guest failed to start: Unable to create directory /sys/fs/cgroup: Read-only file system
Please note that changing "/sys" to "/.oldroot/sys" will not solve the issue as this code runs already in the new namespace and therefore the old mount tree is locked, thus MS_BIND is not allowed.
This brings me to the question, why do you handle the netns_disabled case anyway?
Please check the discussion at: http://lists.linux-foundation.org/pipermail/containers/2014-July/034721.html
If in the XML file no network is specified just create a new and empty network namespace. Bindmounting /sys into the container is a security issue. This is why mounting sysfs without a netns was disabled to begin with.
Yes, I tried to propose enable netns by default, but Dan thought that we should allow containers sharing the host's network: http://www.redhat.com/archives/libvir-list/2013-August/msg01025.html
So we should allow user create containers without netns, they should know what they do if they read libvirt's docs See docs patch describe security considerations: http://www.redhat.com/archives/libvir-list/2013-September/msg00562.html
Thanks for the links! //richard

Am 11.03.2015 um 10:36 schrieb Richard Weinberger:
Am 11.03.2015 um 03:30 schrieb Chen, Hanxiao:
@@ -826,8 +829,25 @@ static int lxcContainerMountBasicFS(bool userns_enabled) bool bindOverReadonly; virLXCBasicMountInfo const *mnt = &lxcBasicMounts[i];
+ /* When enable userns but disable netns, kernel will + * forbid us doing a new fresh mount for sysfs. + * So we had to do a bind mount for sysfs instead. + */ + if (userns_enabled && netns_disabled && + STREQ(mnt->src, "sysfs")) { + if (VIR_STRDUP(mnt_src, "/sys") < 0) { + goto cleanup; + }
This is clearly broken and looks very untested to me.
It's broken now. But when I submitted this patch last year, it's not.
Are you sure? Just built libvirt v1.2.6-222-ga86b621, head is commit a86b6215a74b1feb2667204e214fbfd2f7decc5c Author: Chen Hanxiao <chenhanxiao@cn.fujitsu.com> Date: Mon Jul 14 18:01:51 2014 +0800
LXC: create a bind mount for sysfs when enable userns but disable netns
/sys is still an empty directory but as at this time (most likely due to another bug) libvirt was able to create /sys/fs/cgroup and mounted groups there. But no sysfs at all is at /sys.
I mean, how is this supposed to work? You bind mount /sys over /sys...
Any further comments on that? Thanks, //richard

On Thu, Mar 19, 2015 at 05:54:32PM +0100, Richard Weinberger wrote:
Am 11.03.2015 um 10:36 schrieb Richard Weinberger:
Am 11.03.2015 um 03:30 schrieb Chen, Hanxiao:
@@ -826,8 +829,25 @@ static int lxcContainerMountBasicFS(bool userns_enabled) bool bindOverReadonly; virLXCBasicMountInfo const *mnt = &lxcBasicMounts[i];
+ /* When enable userns but disable netns, kernel will + * forbid us doing a new fresh mount for sysfs. + * So we had to do a bind mount for sysfs instead. + */ + if (userns_enabled && netns_disabled && + STREQ(mnt->src, "sysfs")) { + if (VIR_STRDUP(mnt_src, "/sys") < 0) { + goto cleanup; + }
This is clearly broken and looks very untested to me.
It's broken now. But when I submitted this patch last year, it's not.
Are you sure? Just built libvirt v1.2.6-222-ga86b621, head is commit a86b6215a74b1feb2667204e214fbfd2f7decc5c Author: Chen Hanxiao <chenhanxiao@cn.fujitsu.com> Date: Mon Jul 14 18:01:51 2014 +0800
LXC: create a bind mount for sysfs when enable userns but disable netns
/sys is still an empty directory but as at this time (most likely due to another bug) libvirt was able to create /sys/fs/cgroup and mounted groups there. But no sysfs at all is at /sys.
I mean, how is this supposed to work? You bind mount /sys over /sys...
Any further comments on that?
It just looks impossible for it to work in this way Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

Am 19.03.2015 um 17:58 schrieb Daniel P. Berrange:
On Thu, Mar 19, 2015 at 05:54:32PM +0100, Richard Weinberger wrote:
Am 11.03.2015 um 10:36 schrieb Richard Weinberger:
Am 11.03.2015 um 03:30 schrieb Chen, Hanxiao:
@@ -826,8 +829,25 @@ static int lxcContainerMountBasicFS(bool userns_enabled) bool bindOverReadonly; virLXCBasicMountInfo const *mnt = &lxcBasicMounts[i];
+ /* When enable userns but disable netns, kernel will + * forbid us doing a new fresh mount for sysfs. + * So we had to do a bind mount for sysfs instead. + */ + if (userns_enabled && netns_disabled && + STREQ(mnt->src, "sysfs")) { + if (VIR_STRDUP(mnt_src, "/sys") < 0) { + goto cleanup; + }
This is clearly broken and looks very untested to me.
It's broken now. But when I submitted this patch last year, it's not.
Are you sure? Just built libvirt v1.2.6-222-ga86b621, head is commit a86b6215a74b1feb2667204e214fbfd2f7decc5c Author: Chen Hanxiao <chenhanxiao@cn.fujitsu.com> Date: Mon Jul 14 18:01:51 2014 +0800
LXC: create a bind mount for sysfs when enable userns but disable netns
/sys is still an empty directory but as at this time (most likely due to another bug) libvirt was able to create /sys/fs/cgroup and mounted groups there. But no sysfs at all is at /sys.
I mean, how is this supposed to work? You bind mount /sys over /sys...
Any further comments on that?
It just looks impossible for it to work in this way
That's also my impression. Therefore containers without their own network namespace currently don't work and have never worked as expected. Shall we revert commit a86b6215a74b and try to bind mount before the pivot_root()? Thanks, //richard

On Thu, Mar 19, 2015 at 06:04:57PM +0100, Richard Weinberger wrote:
Am 19.03.2015 um 17:58 schrieb Daniel P. Berrange:
On Thu, Mar 19, 2015 at 05:54:32PM +0100, Richard Weinberger wrote:
Am 11.03.2015 um 10:36 schrieb Richard Weinberger:
Am 11.03.2015 um 03:30 schrieb Chen, Hanxiao:
> @@ -826,8 +829,25 @@ static int lxcContainerMountBasicFS(bool userns_enabled) > bool bindOverReadonly; > virLXCBasicMountInfo const *mnt = &lxcBasicMounts[i]; > > + /* When enable userns but disable netns, kernel will > + * forbid us doing a new fresh mount for sysfs. > + * So we had to do a bind mount for sysfs instead. > + */ > + if (userns_enabled && netns_disabled && > + STREQ(mnt->src, "sysfs")) { > + if (VIR_STRDUP(mnt_src, "/sys") < 0) { > + goto cleanup; > + }
This is clearly broken and looks very untested to me.
It's broken now. But when I submitted this patch last year, it's not.
Are you sure? Just built libvirt v1.2.6-222-ga86b621, head is commit a86b6215a74b1feb2667204e214fbfd2f7decc5c Author: Chen Hanxiao <chenhanxiao@cn.fujitsu.com> Date: Mon Jul 14 18:01:51 2014 +0800
LXC: create a bind mount for sysfs when enable userns but disable netns
/sys is still an empty directory but as at this time (most likely due to another bug) libvirt was able to create /sys/fs/cgroup and mounted groups there. But no sysfs at all is at /sys.
I mean, how is this supposed to work? You bind mount /sys over /sys...
Any further comments on that?
It just looks impossible for it to work in this way
That's also my impression.
Therefore containers without their own network namespace currently don't work and have never worked as expected.
No, it is only a problem if userns is used. If userns is not used then they do work
Shall we revert commit a86b6215a74b and try to bind mount before the pivot_root()?
Not sure if that works with userns is active either. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

Am 19.03.2015 um 18:28 schrieb Daniel P. Berrange:
On Thu, Mar 19, 2015 at 06:04:57PM +0100, Richard Weinberger wrote:
Am 19.03.2015 um 17:58 schrieb Daniel P. Berrange:
On Thu, Mar 19, 2015 at 05:54:32PM +0100, Richard Weinberger wrote:
Am 11.03.2015 um 10:36 schrieb Richard Weinberger:
Am 11.03.2015 um 03:30 schrieb Chen, Hanxiao:
>> @@ -826,8 +829,25 @@ static int lxcContainerMountBasicFS(bool userns_enabled) >> bool bindOverReadonly; >> virLXCBasicMountInfo const *mnt = &lxcBasicMounts[i]; >> >> + /* When enable userns but disable netns, kernel will >> + * forbid us doing a new fresh mount for sysfs. >> + * So we had to do a bind mount for sysfs instead. >> + */ >> + if (userns_enabled && netns_disabled && >> + STREQ(mnt->src, "sysfs")) { >> + if (VIR_STRDUP(mnt_src, "/sys") < 0) { >> + goto cleanup; >> + } > > This is clearly broken and looks very untested to me. > It's broken now. But when I submitted this patch last year, it's not.
Are you sure? Just built libvirt v1.2.6-222-ga86b621, head is commit a86b6215a74b1feb2667204e214fbfd2f7decc5c Author: Chen Hanxiao <chenhanxiao@cn.fujitsu.com> Date: Mon Jul 14 18:01:51 2014 +0800
LXC: create a bind mount for sysfs when enable userns but disable netns
/sys is still an empty directory but as at this time (most likely due to another bug) libvirt was able to create /sys/fs/cgroup and mounted groups there. But no sysfs at all is at /sys.
I mean, how is this supposed to work? You bind mount /sys over /sys...
Any further comments on that?
It just looks impossible for it to work in this way
That's also my impression.
Therefore containers without their own network namespace currently don't work and have never worked as expected.
No, it is only a problem if userns is used. If userns is not used then they do work
Agreed.
Shall we revert commit a86b6215a74b and try to bind mount before the pivot_root()?
Not sure if that works with userns is active either.
Fact is that commit a86b6215a74 is broken. We could also refuse to create container with userns enabled but netns disabled... Thanks, //richard

-----Original Message----- From: Richard Weinberger [mailto:richard@nod.at] Sent: Friday, March 20, 2015 1:41 AM To: Daniel P. Berrange Cc: Chen, Hanxiao/陈 晗霄; libvir-list@redhat.com Subject: Re: [libvirt] [PATCH] LXC: create a bind mount for sysfs when enable userns but disable netns
Am 19.03.2015 um 18:28 schrieb Daniel P. Berrange:
On Thu, Mar 19, 2015 at 06:04:57PM +0100, Richard Weinberger wrote:
Am 19.03.2015 um 17:58 schrieb Daniel P. Berrange:
On Thu, Mar 19, 2015 at 05:54:32PM +0100, Richard Weinberger wrote:
Am 11.03.2015 um 10:36 schrieb Richard Weinberger:
Am 11.03.2015 um 03:30 schrieb Chen, Hanxiao: >>> @@ -826,8 +829,25 @@ static int lxcContainerMountBasicFS(bool userns_enabled) >>> bool bindOverReadonly; >>> virLXCBasicMountInfo const *mnt = &lxcBasicMounts[i]; >>> >>> + /* When enable userns but disable netns, kernel will >>> + * forbid us doing a new fresh mount for sysfs. >>> + * So we had to do a bind mount for sysfs instead. >>> + */ >>> + if (userns_enabled && netns_disabled && >>> + STREQ(mnt->src, "sysfs")) { >>> + if (VIR_STRDUP(mnt_src, "/sys") < 0) { >>> + goto cleanup; >>> + } >> >> This is clearly broken and looks very untested to me. >> > It's broken now. > But when I submitted this patch last year, it's not.
Are you sure? Just built libvirt v1.2.6-222-ga86b621, head is commit a86b6215a74b1feb2667204e214fbfd2f7decc5c Author: Chen Hanxiao <chenhanxiao@cn.fujitsu.com> Date: Mon Jul 14 18:01:51 2014 +0800
LXC: create a bind mount for sysfs when enable userns but disable netns
/sys is still an empty directory but as at this time (most likely due to another bug) libvirt was able to create /sys/fs/cgroup and mounted groups there. But no sysfs at all is at /sys.
I mean, how is this supposed to work? You bind mount /sys over /sys...
Any further comments on that?
It just looks impossible for it to work in this way
That's also my impression.
Therefore containers without their own network namespace currently don't work and have never worked as expected.
No, it is only a problem if userns is used. If userns is not used then they do work
Agreed.
That's what I tried to do. Sorry for my mistake.
Shall we revert commit a86b6215a74b and try to bind mount before the pivot_root()?
Not sure if that works with userns is active either.
Fact is that commit a86b6215a74 is broken. We could also refuse to create container with userns enabled but netns disabled...
I think we should refuse it too, rather than do something to work around. Dan, what's your opinion? Regards, - Chen

On Fri, Mar 20, 2015 at 02:14:04AM +0000, Chen, Hanxiao wrote:
-----Original Message----- From: Richard Weinberger [mailto:richard@nod.at] Sent: Friday, March 20, 2015 1:41 AM To: Daniel P. Berrange Cc: Chen, Hanxiao/陈 晗霄; libvir-list@redhat.com Subject: Re: [libvirt] [PATCH] LXC: create a bind mount for sysfs when enable userns but disable netns
Am 19.03.2015 um 18:28 schrieb Daniel P. Berrange:
On Thu, Mar 19, 2015 at 06:04:57PM +0100, Richard Weinberger wrote:
Am 19.03.2015 um 17:58 schrieb Daniel P. Berrange:
On Thu, Mar 19, 2015 at 05:54:32PM +0100, Richard Weinberger wrote:
Am 11.03.2015 um 10:36 schrieb Richard Weinberger: > Am 11.03.2015 um 03:30 schrieb Chen, Hanxiao: >>>> @@ -826,8 +829,25 @@ static int lxcContainerMountBasicFS(bool userns_enabled) >>>> bool bindOverReadonly; >>>> virLXCBasicMountInfo const *mnt = &lxcBasicMounts[i]; >>>> >>>> + /* When enable userns but disable netns, kernel will >>>> + * forbid us doing a new fresh mount for sysfs. >>>> + * So we had to do a bind mount for sysfs instead. >>>> + */ >>>> + if (userns_enabled && netns_disabled && >>>> + STREQ(mnt->src, "sysfs")) { >>>> + if (VIR_STRDUP(mnt_src, "/sys") < 0) { >>>> + goto cleanup; >>>> + } >>> >>> This is clearly broken and looks very untested to me. >>> >> It's broken now. >> But when I submitted this patch last year, it's not. > > Are you sure? > Just built libvirt v1.2.6-222-ga86b621, head is > commit a86b6215a74b1feb2667204e214fbfd2f7decc5c > Author: Chen Hanxiao <chenhanxiao@cn.fujitsu.com> > Date: Mon Jul 14 18:01:51 2014 +0800 > > LXC: create a bind mount for sysfs when enable userns but disable netns > > /sys is still an empty directory but as at this time (most likely due to another bug) > libvirt was able to create /sys/fs/cgroup and mounted groups there. > But no sysfs at all is at /sys. > > I mean, how is this supposed to work? You bind mount /sys over /sys...
Any further comments on that?
It just looks impossible for it to work in this way
That's also my impression.
Therefore containers without their own network namespace currently don't work and have never worked as expected.
No, it is only a problem if userns is used. If userns is not used then they do work
Agreed.
That's what I tried to do. Sorry for my mistake.
Shall we revert commit a86b6215a74b and try to bind mount before the pivot_root()?
Not sure if that works with userns is active either.
Fact is that commit a86b6215a74 is broken. We could also refuse to create container with userns enabled but netns disabled...
I think we should refuse it too, rather than do something to work around. Dan, what's your opinion?
Yes, if we are unable to figure out how to make this work, then we should report VIR_ERR_CONFIG_UNSUPPORTED for the combination of private userns + shared netns Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|
participants (7)
-
Chen Hanxiao
-
Chen, Hanxiao
-
chenhanxiao@cn.fujitsu.com
-
Daniel P. Berrange
-
Gao feng
-
Richard Weinberger
-
Richard Weinberger