[libvirt] [PATCH] Allow use of file images for LXC container filesystems

From: "Daniel P. Berrange" <berrange@redhat.com> A previous commit gave the LXC driver the ability to mount block devices for the container filesystem. Through use of the loopback device functionality, we can build on this to support use of plain file images for LXC filesytems. By setting the LO_FLAGS_AUTOCLEAR flag we can ensure that the loop device automatically disappears when the container dies / shuts down * src/lxc/lxc_container.c: Raise error if we see a file based filesystem, since it should have been turned into a loopback device already * src/lxc/lxc_controller.c: Rewrite any filesystems of type=file, into type=block, by binding the file image to a free loop device --- src/lxc/lxc_container.c | 5 ++ src/lxc/lxc_controller.c | 176 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 179 insertions(+), 2 deletions(-) diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index f6ab407..bf772e5 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -797,6 +797,11 @@ static int lxcContainerMountFS(virDomainFSDefPtr fs, if (lxcContainerMountFSBlock(fs, srcprefix) < 0) return -1; break; + case VIR_DOMAIN_FS_TYPE_FILE: + lxcError(VIR_ERR_INTERNAL_ERROR, + _("Unexpected filesystem type %s"), + virDomainFSTypeToString(fs->type)); + break; default: lxcError(VIR_ERR_CONFIG_UNSUPPORTED, _("Cannot mount filesystem type %s"), diff --git a/src/lxc/lxc_controller.c b/src/lxc/lxc_controller.c index 8848ae2..45b4c70 100644 --- a/src/lxc/lxc_controller.c +++ b/src/lxc/lxc_controller.c @@ -39,6 +39,8 @@ #include <getopt.h> #include <sys/mount.h> #include <locale.h> +#include <linux/loop.h> +#include <dirent.h> #if HAVE_CAPNG # include <cap-ng.h> @@ -63,6 +65,160 @@ struct cgroup_device_policy { int minor; }; + +static int lxcGetLoopFD(char **devname) +{ + int fd = -1; + DIR *dh = NULL; + struct dirent *de; + char *looppath; + struct loop_info64 lo; + + VIR_DEBUG("Looking for loop devices in /dev"); + + if (!(dh = opendir("/dev"))) { + 
virReportSystemError(errno, "%s", + _("Unable to read /dev")); + goto cleanup; + } + + while ((de = readdir(dh)) != NULL) { + if (!STRPREFIX(de->d_name, "loop")) + continue; + + if (virAsprintf(&looppath, "/dev/%s", de->d_name) < 0) { + virReportOOMError(); + goto cleanup; + } + + VIR_DEBUG("Checking up on device %s", looppath); + if ((fd = open(looppath, O_RDWR)) < 0) { + virReportSystemError(errno, + _("Unable to open %s"), looppath); + goto cleanup; + } + + if (ioctl(fd, LOOP_GET_STATUS64, &lo) < 0) { + /* Got a free device, return the fd */ + if (errno == ENXIO) + goto cleanup; + + VIR_FORCE_CLOSE(fd); + virReportSystemError(errno, + _("Unable to get loop status on %s"), + looppath); + goto cleanup; + } + + /* Oh well, try the next device */ + VIR_FORCE_CLOSE(fd); + VIR_FREE(looppath); + } + + lxcError(VIR_ERR_INTERNAL_ERROR, "%s", + _("Unable to find a free loop device in /dev")); + +cleanup: + if (fd != -1) { + VIR_DEBUG("Got free loop device %s %d", looppath, fd); + *devname = looppath; + } else { + VIR_DEBUG("No free loop devices available"); + VIR_FREE(looppath); + } + if (dh) + closedir(dh); + return fd; +} + +static int lxcSetupLoopDevice(virDomainFSDefPtr fs) +{ + int lofd = -1; + int fsfd = -1; + struct loop_info64 lo; + char *loname = NULL; + int ret = -1; + + if ((lofd = lxcGetLoopFD(&loname)) < 0) + return -1; + + memset(&lo, 0, sizeof(lo)); + lo.lo_flags = LO_FLAGS_AUTOCLEAR; + + if ((fsfd = open(fs->src, O_RDWR)) < 0) { + virReportSystemError(errno, + _("Unable to open %s"), fs->src); + goto cleanup; + } + + if (ioctl(lofd, LOOP_SET_FD, fsfd) < 0) { + virReportSystemError(errno, + _("Unable to attach %s to loop device"), + fs->src); + goto cleanup; + } + + if (ioctl(lofd, LOOP_SET_STATUS64, &lo) < 0) { + virReportSystemError(errno, "%s", + _("Unable to mark loop device as autoclear")); + + if (ioctl(lofd, LOOP_CLR_FD, 0) < 0) + VIR_WARN("Unable to detach %s from loop device", fs->src); + goto cleanup; + } + + VIR_DEBUG("Attached loop device %s %d 
to %s", fs->src, lofd, loname); + /* + * We now change it into a block device type, so that + * the rest of container setup 'just works' + */ + fs->type = VIR_DOMAIN_FS_TYPE_BLOCK; + VIR_FREE(fs->src); + fs->src = loname; + loname = NULL; + + ret = 0; + +cleanup: + VIR_FREE(loname); + VIR_FORCE_CLOSE(fsfd); + if (ret == -1) + VIR_FORCE_CLOSE(lofd); + return lofd; +} + + +static int lxcSetupLoopDevices(virDomainDefPtr def, size_t *nloopDevs, int **loopDevs) +{ + size_t i; + int ret = -1; + + for (i = 0 ; i < def->nfss ; i++) { + int fd; + + if (def->fss[i]->type != VIR_DOMAIN_FS_TYPE_FILE) + continue; + + fd = lxcSetupLoopDevice(def->fss[i]); + if (fd < 0) + goto cleanup; + + VIR_DEBUG("Saving loop fd %d", fd); + if (VIR_REALLOC_N(*loopDevs, *nloopDevs+1) < 0) { + VIR_FORCE_CLOSE(fd); + virReportOOMError(); + goto cleanup; + } + (*loopDevs)[*nloopDevs++] = fd; + } + + VIR_DEBUG("Setup all loop devices"); + ret = 0; + +cleanup: + return ret; +} + /** * lxcSetContainerResources * @def: pointer to virtual machine structure @@ -641,6 +797,9 @@ lxcControllerRun(virDomainDefPtr def, virDomainFSDefPtr root; char *devpts = NULL; char *devptmx = NULL; + size_t nloopDevs = 0; + int *loopDevs = NULL; + size_t i; if (socketpair(PF_UNIX, SOCK_STREAM, 0, control) < 0) { virReportSystemError(errno, "%s", @@ -654,6 +813,9 @@ lxcControllerRun(virDomainDefPtr def, goto cleanup; } + if (lxcSetupLoopDevices(def, &nloopDevs, &loopDevs) < 0) + goto cleanup; + root = virDomainGetRootFilesystem(def); if (lxcSetContainerResources(def) < 0) @@ -778,8 +940,14 @@ lxcControllerRun(virDomainDefPtr def, goto cleanup; } - /* Now the container is running, there's no need for us to keep - any elevated capabilities */ + /* Now the container is fully setup... */ + + /* ...we can close the loop devices... 
*/ + + for (i = 0 ; i < nloopDevs ; i++) + VIR_FORCE_CLOSE(loopDevs[i]); + + /* ...and reduce our privileges */ if (lxcControllerClearCapabilities() < 0) goto cleanup; @@ -803,6 +971,10 @@ cleanup: VIR_FORCE_CLOSE(containerhandshake[0]); VIR_FORCE_CLOSE(containerhandshake[1]); + for (i = 0 ; i < nloopDevs ; i++) + VIR_FORCE_CLOSE(loopDevs[i]); + VIR_FREE(loopDevs); + if (container > 1) { int status; kill(container, SIGTERM); -- 1.7.6

On 08/04/2011 09:41 AM, Daniel P. Berrange wrote:
From: "Daniel P. Berrange"<berrange@redhat.com>
A previous commit gave the LXC driver the ability to mount block devices for the container filesystem. Through use of the loopback device functionality, we can build on this to support use of plain file images for LXC filesystems.
By setting the LO_FLAGS_AUTOCLEAR flag we can ensure that the loop device automatically disappears when the container dies / shuts down
* src/lxc/lxc_container.c: Raise error if we see a file based filesystem, since it should have been turned into a loopback device already * src/lxc/lxc_controller.c: Rewrite any filesystems of type=file, into type=block, by binding the file image to a free loop device
Neat trick. If I understand correctly, 'dumpxml' for an lxc domain using one of these loopback files will have the live xml showing the loopback device created, while the configured xml still shows the original file. But I don't see anything that reverts type=block back into type=file when doing a VIR_DOMAIN_XML_INACTIVE, other than the fact that live and configured xml are stored in separate domain def objects. I'm worried that you might be missing some changes to domain_conf.[hc] to correctly manipulate actual vs. configured setup all within the context of live xml. This is a similar problem to the recent <actual> for networks - we must track which loopback block device was tied to the filesystem, in case libvirtd gets restarted before the lxc domain goes away, but we also don't want to tie ourselves to that device when the lxc domain reboots (rather, each boot should create a new loopback device, opened on the original file). On the other hand, VIR_DOMAIN_XML_INACTIVE normally means you will only see the configured setup, and is only ever used on active configuration by virDomain[Managed]Save, which are currently unsupported with lxc. And the LO_FLAGS_AUTOCLEAR ends the loopback device as soon as the lxc domain quits, which is a convenient way to handle a lot of cleanup. So while I didn't find any coding bugs in this patch, I need some reassurance that we are handling things correctly from the VIR_DOMAIN_XML_INACTIVE aspect before I give an ack. -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org

On Thu, Aug 04, 2011 at 11:55:21AM -0600, Eric Blake wrote:
On 08/04/2011 09:41 AM, Daniel P. Berrange wrote:
From: "Daniel P. Berrange"<berrange@redhat.com>
A previous commit gave the LXC driver the ability to mount block devices for the container filesystem. Through use of the loopback device functionality, we can build on this to support use of plain file images for LXC filesystems.
By setting the LO_FLAGS_AUTOCLEAR flag we can ensure that the loop device automatically disappears when the container dies / shuts down
* src/lxc/lxc_container.c: Raise error if we see a file based filesystem, since it should have been turned into a loopback device already * src/lxc/lxc_controller.c: Rewrite any filesystems of type=file, into type=block, by binding the file image to a free loop device
Neat trick.
If I understand correctly, 'dumpxml' for an lxc domain using one of these loopback files will have the live xml showing the loopback device created, while the configured xml still shows the original file. But I don't see anything that reverts type=block back into type=file when doing a VIR_DOMAIN_XML_INACTIVE, other than the fact that live and configured xml are stored in separate domain def objects.
I'm worried that you might be missing some changes to domain_conf.[hc] to correctly manipulate actual vs. configured setup all within the context of live xml. This is a similar problem to the recent <actual> for networks - we must track which loopback block device was tied to the filesystem, in case libvirtd gets restarted before the lxc domain goes away, but we also don't want to tie ourselves to that device when the lxc domain reboots (rather, each boot should create a new loopback device, opened on the original file).
On the other hand, VIR_DOMAIN_XML_INACTIVE normally means you will only see the configured setup, and is only ever used on active configuration by virDomain[Managed]Save, which are currently unsupported with lxc. And the LO_FLAGS_AUTOCLEAR ends the loopback device as soon as the lxc domain quits, which is a convenient way to handle a lot of cleanup.
So while I didn't find any coding bugs in this patch, I need some reassurance that we are handling things correctly from the VIR_DOMAIN_XML_INACTIVE aspect before I give an ack.
The live & inactive XML documents both live in the libvirtd daemon context. The lxc_controller/lxc_container files both execute in the libvirt_lxc helper process context. So any changes made to the virDomainDefPtr here are not visible anywhere else. Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 08/04/2011 09:41 AM, Daniel P. Berrange wrote:
From: "Daniel P. Berrange"<berrange@redhat.com>
A previous commit gave the LXC driver the ability to mount block devices for the container filesystem. Through use of the loopback device functionality, we can build on this to support use of plain file images for LXC filesystems.
By setting the LO_FLAGS_AUTOCLEAR flag we can ensure that the loop device automatically disappears when the container dies / shuts down
+static int lxcSetupLoopDevice(virDomainFSDefPtr fs) +{ + int lofd = -1; + int fsfd = -1; + struct loop_info64 lo; + char *loname = NULL; + int ret = -1; + + if ((lofd = lxcGetLoopFD(&loname))< 0) + return -1; + + memset(&lo, 0, sizeof(lo)); + lo.lo_flags = LO_FLAGS_AUTOCLEAR;
The kernel headers available on RHEL 5 or CentOS 5 lack this enum value, which causes compilation to fail if you are trying to build lxc support for these older systems. When I have some time, I will probably just write a patch that adds a configure.ac probe for LO_FLAGS_AUTOCLEAR, and where it is missing, make any attempt to do plan9 file system passthrough to lxc guests fail since they are unsupported on these old kernels. You should still be able to use the other aspects of lxc that don't involve file system passthrough, although obviously lxc isn't getting much testing on these older systems. -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org

On Tue, Sep 27, 2011 at 05:27:42PM -0600, Eric Blake wrote:
On 08/04/2011 09:41 AM, Daniel P. Berrange wrote:
From: "Daniel P. Berrange"<berrange@redhat.com>
A previous commit gave the LXC driver the ability to mount block devices for the container filesystem. Through use of the loopback device functionality, we can build on this to support use of plain file images for LXC filesystems.
By setting the LO_FLAGS_AUTOCLEAR flag we can ensure that the loop device automatically disappears when the container dies / shuts down
+static int lxcSetupLoopDevice(virDomainFSDefPtr fs) +{ + int lofd = -1; + int fsfd = -1; + struct loop_info64 lo; + char *loname = NULL; + int ret = -1; + + if ((lofd = lxcGetLoopFD(&loname))< 0) + return -1; + + memset(&lo, 0, sizeof(lo)); + lo.lo_flags = LO_FLAGS_AUTOCLEAR;
The kernel headers available on RHEL 5 or CentOS 5 lack this enum value, which causes compilation to fail if you are trying to build lxc support for these older systems. When I have some time, I will probably just write a patch that adds a configure.ac probe for LO_FLAGS_AUTOCLEAR, and where it is missing, make any attempt to do plan9 file system passthrough to lxc guests fail since they are unsupported on these old kernels. You should still be able to use the other aspects of lxc that don't involve file system passthrough, although obviously lxc isn't getting much testing on these older systems.
plan9 FS is only a KVM thing. This code should never be compiled on RHEL5, since it does not have any kernel container namespace support at all. So if we have a new enough kernel for LXC, we should have LO_FLAGS_AUTOCLEAR. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 09/28/2011 02:02 AM, Daniel P. Berrange wrote:
The kernel headers available on RHEL 5 or CentOS 5 lack this enum value, which causes compilation to fail if you are trying to build lxc support for these older systems.
This code shouldn't ever be being compiled on RHEL5, since it does not have any kernel container namespace support at all. So if we have a new enough kernel for LXC, we should have LO_FLAGS_AUTOCLEAR
Which code - all of LXC, or just the code using LO_FLAGS_AUTOCLEAR (that is, code related to lxc namespace support)? Because using just the stock ./autogen.sh on RHEL 5 currently tries to build lxc, and because LO_FLAGS_AUTOCLEAR is missing, it is failing to compile. -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org

On Wed, Sep 28, 2011 at 10:46:12AM -0600, Eric Blake wrote:
On 09/28/2011 02:02 AM, Daniel P. Berrange wrote:
The kernel headers available on RHEL 5 or CentOS 5 lack this enum value, which causes compilation to fail if you are trying to build lxc support for these older systems.
This code shouldn't ever be being compiled on RHEL5, since it does not have any kernel container namespace support at all. So if we have a new enough kernel for LXC, we should have LO_FLAGS_AUTOCLEAR
Which code - all of LXC, or just the code using LO_FLAGS_AUTOCLEAR (that is, code related to lxc namespace support)? Because using just the stock ./autogen.sh on RHEL 5 currently tries to build lxc, and because LO_FLAGS_AUTOCLEAR is missing, it is failing to compile.
All of LXC. Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 09/28/2011 11:38 AM, Daniel P. Berrange wrote:
On Wed, Sep 28, 2011 at 10:46:12AM -0600, Eric Blake wrote:
On 09/28/2011 02:02 AM, Daniel P. Berrange wrote:
The kernel headers available on RHEL 5 or CentOS 5 lack this enum value, which causes compilation to fail if you are trying to build lxc support for these older systems.
This code shouldn't ever be being compiled on RHEL5, since it does not have any kernel container namespace support at all. So if we have a new enough kernel for LXC, we should have LO_FLAGS_AUTOCLEAR
Which code - all of LXC, or just the code using LO_FLAGS_AUTOCLEAR (that is, code related to lxc namespace support)? Because using just the stock ./autogen.sh on RHEL 5 currently tries to build lxc, and because LO_FLAGS_AUTOCLEAR is missing, it is failing to compile.
All of LXC.
OK, that makes my configure.ac patch easier - probe for LO_FLAGS_AUTOCLEAR as a prereq for --with-lxc. Patch coming up soon. -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org

Per the discussion here, LXC on RHEL 5 makes no sense. https://www.redhat.com/archives/libvir-list/2011-September/msg01169.html * configure.ac (with_lxc): Reject RHEL 5.x LXC as too old. --- Pushing under the build-breaker rule. I tested that this has no impact on RHEL 6 or Fedora 14, but that on RHEL 5 it properly skips building the entire lxc driver, letting the build get further (it still died, but on a html file creation error rather than compilation-related). configure.ac | 3 ++- 1 files changed, 2 insertions(+), 1 deletions(-) diff --git a/configure.ac b/configure.ac index 6a0936a..5753c08 100644 --- a/configure.ac +++ b/configure.ac @@ -687,8 +687,9 @@ fi if test "$with_lxc" = "yes" || test "$with_lxc" = "check"; then AC_TRY_LINK([ #include <sched.h> + #include <linux/loop.h> ], [ - unshare (1); + unshare (!LO_FLAGS_AUTOCLEAR); ], [ with_lxc=yes ], [ -- 1.7.4.4
participants (2)
-
Daniel P. Berrange
-
Eric Blake