[libvirt] PATCH: 0/7: Re-factor LXC driver

This is a long overdue followup to my previous set of patches to make the LXC driver use the new domain XML apis. Getting this merged is a blocker for the new libvirt release, now we've turned on LXC by default. Likewise we need to get OpenVZ fully ported to new XML before release too. The previous posting was here: http://www.redhat.com/archives/libvir-list/2008-July/msg00182.html Regards, Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

This patch does some simple re-factoring of the way the TTYs and control socket are handled to reduce the amount of state stored in the lxc_vm_t structure, in preparation for the switchover to the generic domain handling APIs. lxc_conf.c | 1 lxc_conf.h | 12 --- lxc_container.c | 60 +++++++-------- lxc_container.h | 14 +++ lxc_driver.c | 213 ++++++++++++++++---------------------------------------- 5 files changed, 104 insertions(+), 196 deletions(-) Daniel diff -r 63b8398c302e src/lxc_conf.c --- a/src/lxc_conf.c Mon Jul 14 12:18:23 2008 +0100 +++ b/src/lxc_conf.c Tue Jul 15 11:55:48 2008 +0100 @@ -1041,7 +1041,6 @@ void lxcFreeVM(lxc_vm_t *vm) { lxcFreeVMDef(vm->def); - VIR_FREE(vm->containerTty); VIR_FREE(vm); } diff -r 63b8398c302e src/lxc_conf.h --- a/src/lxc_conf.h Mon Jul 14 12:18:23 2008 +0100 +++ b/src/lxc_conf.h Tue Jul 15 11:55:48 2008 +0100 @@ -35,12 +35,6 @@ #define LXC_MAX_XML_LENGTH 16384 #define LXC_MAX_ERROR_LEN 1024 #define LXC_DOMAIN_TYPE "lxc" -#define LXC_PARENT_SOCKET 0 -#define LXC_CONTAINER_SOCKET 1 - -/* messages between parent and container */ -typedef char lxc_message_t; -#define LXC_CONTINUE_MSG 'c' /* types of networks for containers */ enum lxc_net_type { @@ -98,12 +92,6 @@ char configFileBase[PATH_MAX]; char ttyPidFile[PATH_MAX]; - - int parentTty; - int containerTtyFd; - char *containerTty; - - int sockpair[2]; lxc_vm_def_t *def; diff -r 63b8398c302e src/lxc_container.c --- a/src/lxc_container.c Mon Jul 14 12:18:23 2008 +0100 +++ b/src/lxc_container.c Tue Jul 15 11:55:48 2008 +0100 @@ -33,7 +33,6 @@ #include <unistd.h> #include "lxc_container.h" -#include "lxc_conf.h" #include "util.h" #include "memory.h" #include "veth.h" @@ -83,10 +82,11 @@ * * Returns 0 on success or -1 in case of error */ -static int lxcSetContainerStdio(const char *ttyName) +static int lxcSetContainerStdio(const char *ttyPath) { int rc = -1; int ttyfd; + int open_max, i; if (setsid() < 0) { lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, @@ -94,10 +94,10 @@ goto error_out; } - ttyfd = open(ttyName, O_RDWR|O_NOCTTY); + ttyfd = open(ttyPath, O_RDWR|O_NOCTTY); if (ttyfd < 0) { lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("open(%s) failed: %s"), ttyName, strerror(errno)); + _("open(%s) failed: %s"), ttyPath, strerror(errno)); goto error_out; } @@ -107,7 +107,12 @@ goto cleanup; } - close(0); close(1); close(2); + /* Just in case someone forget to set FD_CLOEXEC, explicitly + * close all FDs before executing the container */ + open_max = sysconf (_SC_OPEN_MAX); + for (i = 0; i < open_max; i++) + if (i != ttyfd) + close(i); if (dup2(ttyfd, 0) < 0) { lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, @@ -144,12 +149,11 @@ * * Returns 0 on success or -1 in case of error */ -static int lxcExecWithTty(lxc_vm_t *vm) +static int lxcExecWithTty(lxc_vm_def_t *vmDef, char *ttyPath) { int rc = -1; - lxc_vm_def_t *vmDef = vm->def; - if(lxcSetContainerStdio(vm->containerTty) < 0) { + if(lxcSetContainerStdio(ttyPath) < 0) { goto exit_with_error; } @@ -161,7 +165,7 @@ /** * lxcWaitForContinue: - * @vm: Pointer to vm structure + * @monitor: monitor FD from parent * * This function will wait for the container continue message from the * parent process. It will send this message on the socket pair stored in @@ -169,31 +173,23 @@ * * Returns 0 on success or -1 in case of error */ -static int lxcWaitForContinue(lxc_vm_t *vm) +static int lxcWaitForContinue(int monitor) { - int rc = -1; lxc_message_t msg; int readLen; - readLen = saferead(vm->sockpair[LXC_CONTAINER_SOCKET], &msg, sizeof(msg)); - if (readLen != sizeof(msg)) { + readLen = saferead(monitor, &msg, sizeof(msg)); + if (readLen != sizeof(msg) || + msg != LXC_CONTINUE_MSG) { lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, _("Failed to read the container continue message: %s"), strerror(errno)); - goto error_out; + return -1; } DEBUG0("Received container continue message"); - close(vm->sockpair[LXC_PARENT_SOCKET]); - vm->sockpair[LXC_PARENT_SOCKET] = -1; - close(vm->sockpair[LXC_CONTAINER_SOCKET]); - vm->sockpair[LXC_CONTAINER_SOCKET] = -1; - - rc = 0; - -error_out: - return rc; + return 0; } /** @@ -204,12 +200,12 @@ * * Returns 0 on success or nonzero in case of error */ -static int lxcEnableInterfaces(const lxc_vm_t *vm) +static int lxcEnableInterfaces(const lxc_vm_def_t *def) { int rc = 0; const lxc_net_def_t *net; - for (net = vm->def->nets; net; net = net->next) { + for (net = def->nets; net; net = net->next) { DEBUG("Enabling %s", net->containerVeth); rc = vethInterfaceUpOrDown(net->containerVeth, 1); if (0 != rc) { @@ -218,7 +214,7 @@ } /* enable lo device only if there were other net devices */ - if (vm->def->nets) + if (def->nets) rc = vethInterfaceUpOrDown("lo", 1); error_out: @@ -237,11 +233,11 @@ * * Returns 0 on success or -1 in case of error */ -int lxcChild( void *argv ) +int lxcChild( void *data ) { int rc = -1; - lxc_vm_t *vm = (lxc_vm_t *)argv; - lxc_vm_def_t *vmDef = vm->def; + lxc_child_argv_t *argv = data; + lxc_vm_def_t *vmDef = argv->config; lxc_mount_t *curMount; int i; @@ -278,16 +274,16 @@ } /* Wait for interface devices to show up */ - if (0 != (rc = lxcWaitForContinue(vm))) { + if (0 != (rc = lxcWaitForContinue(argv->monitor))) { goto cleanup; } /* enable interfaces */ - if (0 != (rc = lxcEnableInterfaces(vm))) { + if (0 != (rc = lxcEnableInterfaces(vmDef))) { goto cleanup; } - rc = lxcExecWithTty(vm); + rc = lxcExecWithTty(vmDef, argv->ttyPath); /* this function will only return if an error occured */ cleanup: diff -r 63b8398c302e src/lxc_container.h --- a/src/lxc_container.h Mon Jul 14 12:18:23 2008 +0100 +++ b/src/lxc_container.h Tue Jul 15 11:55:48 2008 +0100 @@ -24,7 +24,21 @@ #ifndef LXC_CONTAINER_H #define LXC_CONTAINER_H +#include "lxc_conf.h" + #ifdef WITH_LXC + +typedef struct __lxc_child_argv lxc_child_argv_t; +struct __lxc_child_argv { + lxc_vm_def_t *config; + int monitor; + char *ttyPath; +}; + +/* messages between parent and container */ +typedef char lxc_message_t; +#define LXC_CONTINUE_MSG 'c' + /* Function declarations */ int lxcChild( void *argv ); diff -r 63b8398c302e src/lxc_driver.c --- a/src/lxc_driver.c Mon Jul 14 12:18:23 2008 +0100 +++ b/src/lxc_driver.c Tue Jul 15 11:55:48 2008 +0100 @@ -561,27 +561,23 @@ /** * lxcSendContainerContinue: - * @vm: pointer to virtual machine structure + * @monitor: FD for communicating with child * * Sends the continue message via the socket pair stored in the vm * structure. * * Returns 0 on success or -1 in case of error */ -static int lxcSendContainerContinue(const lxc_vm_t *vm) +static int lxcSendContainerContinue(virConnectPtr conn, + int monitor) { int rc = -1; lxc_message_t msg = LXC_CONTINUE_MSG; int writeCount = 0; - if (NULL == vm) { - goto error_out; - } - - writeCount = safewrite(vm->sockpair[LXC_PARENT_SOCKET], &msg, - sizeof(msg)); + writeCount = safewrite(monitor, &msg, sizeof(msg)); if (writeCount != sizeof(msg)) { - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, _("unable to send container continue message: %s"), strerror(errno)); goto error_out; @@ -605,12 +601,15 @@ */ static int lxcStartContainer(virConnectPtr conn, lxc_driver_t* driver, - lxc_vm_t *vm) + lxc_vm_t *vm, + int monitor, + char *ttyPath) { int rc = -1; int flags; int stacksize = getpagesize() * 4; char *stack, *stacktop; + lxc_child_argv_t args = { vm->def, monitor, ttyPath }; /* allocate a stack for the container */ if (VIR_ALLOC_N(stack, stacksize) < 0) { @@ -625,7 +624,7 @@ if (vm->def->nets != NULL) flags |= CLONE_NEWNET; - vm->def->id = clone(lxcChild, stacktop, flags, (void *)vm); + vm->def->id = clone(lxcChild, stacktop, flags, &args); DEBUG("clone() returned, %d", vm->def->id); @@ -643,117 +642,9 @@ return rc; } -/** - * lxcPutTtyInRawMode: - * @conn: pointer to connection - * @ttyDev: file descriptor for tty - * - * Sets tty attributes via cfmakeraw() - * - * Returns 0 on success or -1 in case of error - */ -static int lxcPutTtyInRawMode(virConnectPtr conn, int ttyDev) -{ - int rc = -1; - - struct termios ttyAttr; - - if (tcgetattr(ttyDev, &ttyAttr) < 0) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - "tcgetattr() failed: %s", strerror(errno)); - goto cleanup; - } - - cfmakeraw(&ttyAttr); - - if (tcsetattr(ttyDev, TCSADRAIN, &ttyAttr) < 0) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - "tcsetattr failed: %s", strerror(errno)); - goto cleanup; - } - - rc = 0; - -cleanup: - return rc; -} /** - * lxcSetupTtyTunnel: - * @conn: pointer to connection - * @vmDef: pointer to virtual machine definition structure - * @ttyDev: pointer to int. On success will be set to fd for master - * end of tty - * - * Opens and configures the parent side tty - * - * Returns 0 on success or -1 in case of error - */ -static int lxcSetupTtyTunnel(virConnectPtr conn, - lxc_vm_def_t *vmDef, - int* ttyDev) -{ - int rc = -1; - char *ptsStr; - - if (0 < strlen(vmDef->tty)) { - *ttyDev = posix_openpt(O_RDWR|O_NOCTTY|O_NONBLOCK); - if (*ttyDev < 0) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - "open() tty failed: %s", strerror(errno)); - goto setup_complete; - } - - rc = grantpt(*ttyDev); - if (rc < 0) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - "grantpt() failed: %s", strerror(errno)); - goto setup_complete; - } - - rc = unlockpt(*ttyDev); - if (rc < 0) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - "unlockpt() failed: %s", strerror(errno)); - goto setup_complete; - } - - /* get the name and print it to stdout */ - ptsStr = ptsname(*ttyDev); - if (ptsStr == NULL) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - "ptsname() failed"); - goto setup_complete; - } - /* This value needs to be stored in the container configuration file */ - VIR_FREE(vmDef->tty); - if (!(vmDef->tty = strdup(ptsStr))) { - lxcError(conn, NULL, VIR_ERR_NO_MEMORY, - _("unable to get storage for vm tty name")); - goto setup_complete; - } - - /* Enter raw mode, so all characters are passed directly to child */ - if (lxcPutTtyInRawMode(conn, *ttyDev) < 0) { - goto setup_complete; - } - - } else { - *ttyDev = -1; - } - - rc = 0; - -setup_complete: - if((0 != rc) && (*ttyDev > 0)) { - close(*ttyDev); - } - - return rc; -} - -/** - * lxcSetupContainerTty: + * lxcOpenTty: * @conn: pointer to connection * @ttymaster: pointer to int. On success, set to fd for master end * @ttyName: On success, will point to string slave end of tty. Caller @@ -763,12 +654,12 @@ * * Returns 0 on success or -1 in case of error */ -static int lxcSetupContainerTty(virConnectPtr conn, - int *ttymaster, - char **ttyName) +static int lxcOpenTty(virConnectPtr conn, + int *ttymaster, + char **ttyName, + int rawmode) { int rc = -1; - char tempTtyName[PATH_MAX]; *ttymaster = posix_openpt(O_RDWR|O_NOCTTY|O_NONBLOCK); if (*ttymaster < 0) { @@ -783,27 +674,43 @@ goto cleanup; } - if (0 != ptsname_r(*ttymaster, tempTtyName, sizeof(tempTtyName))) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("ptsname_r failed: %s"), strerror(errno)); - goto cleanup; + if (rawmode) { + struct termios ttyAttr; + if (tcgetattr(*ttymaster, &ttyAttr) < 0) { + lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + "tcgetattr() failed: %s", strerror(errno)); + goto cleanup; + } + + cfmakeraw(&ttyAttr); + + if (tcsetattr(*ttymaster, TCSADRAIN, &ttyAttr) < 0) { + lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + "tcsetattr failed: %s", strerror(errno)); + goto cleanup; + } } - if (VIR_ALLOC_N(*ttyName, strlen(tempTtyName) + 1) < 0) { - lxcError(conn, NULL, VIR_ERR_NO_MEMORY, - _("unable to allocate container name string")); - goto cleanup; + if (ttyName) { + char tempTtyName[PATH_MAX]; + if (0 != ptsname_r(*ttymaster, tempTtyName, sizeof(tempTtyName))) { + lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + _("ptsname_r failed: %s"), strerror(errno)); + goto cleanup; + } + + if ((*ttyName = strdup(tempTtyName)) == NULL) { + lxcError(conn, NULL, VIR_ERR_NO_MEMORY, NULL); + goto cleanup; + } } - - strcpy(*ttyName, tempTtyName); rc = 0; cleanup: - if (0 != rc) { - if (-1 != *ttymaster) { - close(*ttymaster); - } + if (rc != 0 && + *ttymaster != -1) { + close(*ttymaster); } return rc; @@ -989,15 +896,18 @@ lxc_vm_t * vm) { int rc = -1; - lxc_vm_def_t *vmDef = vm->def; + int sockpair[2]; + int containerTty, parentTty; + char *containerTtyPath = NULL; /* open parent tty */ - if (lxcSetupTtyTunnel(conn, vmDef, &vm->parentTty) < 0) { + VIR_FREE(vm->def->tty); + if (lxcOpenTty(conn, &parentTty, &vm->def->tty, 1) < 0) { goto cleanup; } /* open container tty */ - if (lxcSetupContainerTty(conn, &(vm->containerTtyFd), &(vm->containerTty)) < 0) { + if (lxcOpenTty(conn, &containerTty, &containerTtyPath, 0) < 0) { goto cleanup; } @@ -1011,15 +921,15 @@ if (vm->pid == 0) { /* child process calls forward routine */ - lxcTtyForward(vm->parentTty, vm->containerTtyFd); + lxcTtyForward(parentTty, containerTty); } if (lxcStoreTtyPid(driver, vm)) { DEBUG0("unable to store tty pid"); } - close(vm->parentTty); - close(vm->containerTtyFd); + close(parentTty); + close(containerTty); if (0 != (rc = lxcSetupInterfaces(conn, vm))) { goto cleanup; @@ -1027,7 +937,7 @@ /* create a socket pair to send continue message to the container once */ /* we've completed the post clone configuration */ - if (0 != socketpair(PF_UNIX, SOCK_STREAM, 0, vm->sockpair)) { + if (0 != socketpair(PF_UNIX, SOCK_STREAM, 0, sockpair)) { lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, _("sockpair failed: %s"), strerror(errno)); goto cleanup; @@ -1035,7 +945,9 @@ /* check this rc */ - rc = lxcStartContainer(conn, driver, vm); + rc = lxcStartContainer(conn, driver, vm, + sockpair[1], + containerTtyPath); if (rc != 0) goto cleanup; @@ -1043,7 +955,7 @@ if (rc != 0) goto cleanup; - rc = lxcSendContainerContinue(vm); + rc = lxcSendContainerContinue(conn, sockpair[0]); if (rc != 0) goto cleanup; @@ -1052,10 +964,9 @@ driver->nactivevms++; cleanup: - close(vm->sockpair[LXC_PARENT_SOCKET]); - vm->sockpair[LXC_PARENT_SOCKET] = -1; - close(vm->sockpair[LXC_CONTAINER_SOCKET]); - vm->sockpair[LXC_CONTAINER_SOCKET] = -1; + close(sockpair[0]); + close(sockpair[1]); + VIR_FREE(containerTtyPath); return rc; } -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

"Daniel P. Berrange" <berrange@redhat.com> wrote:
This patch does some simple re-factoring of the way the TTYs and control socket are handled to reduce the amount of state stored in the lxc_vm_t structure, in preparation for the switchover to the generic domain handling APIs. ... diff -r 63b8398c302e src/lxc_driver.c --- a/src/lxc_driver.c Mon Jul 14 12:18:23 2008 +0100 +++ b/src/lxc_driver.c Tue Jul 15 11:55:48 2008 +0100 ... @@ -989,15 +896,18 @@ lxc_vm_t * vm) { int rc = -1; - lxc_vm_def_t *vmDef = vm->def; + int sockpair[2]; ... + if (lxcOpenTty(conn, &parentTty, &vm->def->tty, 1) < 0) { goto cleanup; }
/* open container tty */ - if (lxcSetupContainerTty(conn, &(vm->containerTtyFd), &(vm->containerTty)) < 0) { + if (lxcOpenTty(conn, &containerTty, &containerTtyPath, 0) < 0) { goto cleanup; }
...
+ if (0 != socketpair(PF_UNIX, SOCK_STREAM, 0, sockpair)) { ... cleanup: - close(vm->sockpair[LXC_PARENT_SOCKET]); - vm->sockpair[LXC_PARENT_SOCKET] = -1; - close(vm->sockpair[LXC_CONTAINER_SOCKET]); - vm->sockpair[LXC_CONTAINER_SOCKET] = -1; + close(sockpair[0]); + close(sockpair[1]); + VIR_FREE(containerTtyPath);
return rc; }
All looks fine except for the possibility that the cleanup code can close undefined sockpair[0,1]. The obvious fix is to initialize them to -1 and not close in that case. Oh, and the new name, "monitor" (new struct member and local/param in several functions) would be more readable as "monitor_fd".

On Mon, Aug 11, 2008 at 12:25:52PM +0200, Jim Meyering wrote:
"Daniel P. Berrange" <berrange@redhat.com> wrote:
This patch does some simple re-factoring of the way the TTYs and control socket are handled to reduce the amount of state stored in the lxc_vm_t structure, in preparation for the switchover to the generic domain handling APIs. ... diff -r 63b8398c302e src/lxc_driver.c --- a/src/lxc_driver.c Mon Jul 14 12:18:23 2008 +0100 +++ b/src/lxc_driver.c Tue Jul 15 11:55:48 2008 +0100 ... @@ -989,15 +896,18 @@ lxc_vm_t * vm) { int rc = -1; - lxc_vm_def_t *vmDef = vm->def; + int sockpair[2]; ... + if (lxcOpenTty(conn, &parentTty, &vm->def->tty, 1) < 0) { goto cleanup; }
/* open container tty */ - if (lxcSetupContainerTty(conn, &(vm->containerTtyFd), &(vm->containerTty)) < 0) { + if (lxcOpenTty(conn, &containerTty, &containerTtyPath, 0) < 0) { goto cleanup; }
...
+ if (0 != socketpair(PF_UNIX, SOCK_STREAM, 0, sockpair)) { ... cleanup: - close(vm->sockpair[LXC_PARENT_SOCKET]); - vm->sockpair[LXC_PARENT_SOCKET] = -1; - close(vm->sockpair[LXC_CONTAINER_SOCKET]); - vm->sockpair[LXC_CONTAINER_SOCKET] = -1; + close(sockpair[0]); + close(sockpair[1]); + VIR_FREE(containerTtyPath);
return rc; }
All looks fine except for the possibility that the cleanup code can close undefined sockpair[0,1]. The obvious fix is to initialize them to -1 and not close in that case.
Yep, I've made that change & committed this.
Oh, and the new name, "monitor" (new struct member and local/param in several functions) would be more readable as "monitor_fd".
The struct member will be going away in later re-factoring so I've not changed this naming Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

"Daniel P. Berrange" <berrange@redhat.com> wrote:
This patch does some simple re-factoring of the way the TTYs and control socket are handled to reduce the amount of state stored in the lxc_vm_t structure, in preparation for the switchover to the generic domain handling APIs.
One more thing: ...
diff -r 63b8398c302e src/lxc_container.c --- a/src/lxc_container.c Mon Jul 14 12:18:23 2008 +0100 +++ b/src/lxc_container.c Tue Jul 15 11:55:48 2008 +0100 ... - close(0); close(1); close(2); + /* Just in case someone forget to set FD_CLOEXEC, explicitly + * close all FDs before executing the container */ + open_max = sysconf (_SC_OPEN_MAX); + for (i = 0; i < open_max; i++) + if (i != ttyfd) + close(i);
Do you really need to close all file descriptors > 2 ? I seem to recall that an application doing this caused trouble when it closed a file descriptor (opened via the shell that I was using for log output. I think this might have caused trouble also when I used valgrind with its --log-* options on that program.

On Mon, Aug 11, 2008 at 12:50:46PM +0200, Jim Meyering wrote:
"Daniel P. Berrange" <berrange@redhat.com> wrote:
This patch does some simple re-factoring of the way the TTYs and control socket are handled to reduce the amount of state stored in the lxc_vm_t structure, in preparation for the switchover to the generic domain handling APIs.
One more thing:
...
diff -r 63b8398c302e src/lxc_container.c --- a/src/lxc_container.c Mon Jul 14 12:18:23 2008 +0100 +++ b/src/lxc_container.c Tue Jul 15 11:55:48 2008 +0100 ... - close(0); close(1); close(2); + /* Just in case someone forget to set FD_CLOEXEC, explicitly + * close all FDs before executing the container */ + open_max = sysconf (_SC_OPEN_MAX); + for (i = 0; i < open_max; i++) + if (i != ttyfd) + close(i);
Do you really need to close all file descriptors > 2 ? I seem to recall that an application doing this caused trouble when it closed a file descriptor (opened via the shell that I was using for log output.
This is important to ensuring no file descriptors are leaked into the container we run because that would be a potential security problem. In any case this code will be replaced by a call to virExec() by a later patch in this series. Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

The lxc_driver.c file contains quite a large amount of code, serving two reasonably well separated purposes. First there is the direct implemntation of each of the libvirt driver APIs. Second there is the code to spawn a container and a controller for forwarding I/O to/from the PTYs. This patch attempts to re-arrange the code across files to better reflect the split in functionality. The general idea is thus: - lxc_driver.c: implementation of the libvirt driver APIs - lxc_container.c: code for creating containers - lxc_controller.c: code for managing an active container So this entails the following re-arrangement: - All calls to clone() move into lxc_container.c. In particular there is a lxcContainerAvailable() method to querying capabilities of current kernel, and the lxcContainerStart() method for actually starting a new container. - The I/O forwarding code moves into lxc_controller.c with a lxcControllerMain() function containing the epoll() event loop. The container code previously would pass the 'init' string path for the container's init program to /bin/sh for execution. This is bad because it assumes /bin/sh exists in the container's root, and more seriously does no escaping, so allows arbitrary shell code to run. So this also switches to passing the 'init' string directly to execve(). If we want to support passing args to the container startup program, we need explicit representation of the args in the XML, so we can safely pass them to execve() via the argv[] array. Aside from that, this patch should have no functional change to the way containers run. b/src/lxc_controller.c | 205 +++++++++++++++++++++++++++++ b/src/lxc_controller.h | 33 ++++ src/Makefile.am | 1 src/lxc_container.c | 220 +++++++++++++++++++++++-------- src/lxc_container.h | 19 +- src/lxc_driver.c | 345 ++----------------------------------------------- 6 files changed, 434 insertions(+), 389 deletions(-) Daniel diff -r 6a666b5713b4 src/Makefile.am --- a/src/Makefile.am Tue Aug 05 16:50:40 2008 +0100 +++ b/src/Makefile.am Tue Aug 05 16:50:43 2008 +0100 @@ -64,6 +64,7 @@ openvz_conf.c openvz_conf.h \ openvz_driver.c openvz_driver.h \ lxc_driver.c lxc_driver.h \ + lxc_controller.c lxc_controller.h \ lxc_conf.c lxc_conf.h \ lxc_container.c lxc_container.h \ veth.c veth.h \ diff -r 6a666b5713b4 src/lxc_container.c --- a/src/lxc_container.c Tue Aug 05 16:50:40 2008 +0100 +++ b/src/lxc_container.c Tue Aug 05 16:50:43 2008 +0100 @@ -30,6 +30,7 @@ #include <stdlib.h> #include <sys/ioctl.h> #include <sys/mount.h> +#include <sys/wait.h> #include <unistd.h> #include "lxc_container.h" @@ -40,49 +41,69 @@ #define DEBUG(fmt,...) VIR_DEBUG(__FILE__, fmt, __VA_ARGS__) #define DEBUG0(msg) VIR_DEBUG(__FILE__, "%s", msg) +/* + * GLibc headers are behind the kernel, so we define these + * constants if they're not present already. + */ + +#ifndef CLONE_NEWPID +#define CLONE_NEWPID 0x20000000 +#endif +#ifndef CLONE_NEWUTS +#define CLONE_NEWUTS 0x04000000 +#endif +#ifndef CLONE_NEWUSER +#define CLONE_NEWUSER 0x10000000 +#endif +#ifndef CLONE_NEWIPC +#define CLONE_NEWIPC 0x08000000 +#endif +#ifndef CLONE_NEWNET +#define CLONE_NEWNET 0x40000000 /* New network namespace */ +#endif + +/* messages between parent and container */ +typedef char lxc_message_t; +#define LXC_CONTINUE_MSG 'c' + +typedef struct __lxc_child_argv lxc_child_argv_t; +struct __lxc_child_argv { + lxc_vm_def_t *config; + int monitor; + char *ttyPath; +}; + + /** - * lxcExecContainerInit: + * lxcContainerExecInit: * @vmDef: Ptr to vm definition structure * - * Exec the container init string. The container init will replace then + * Exec the container init string. The container init will replace then * be running in the current process * - * Returns 0 on success or -1 in case of error + * Does not return */ -static int lxcExecContainerInit(const lxc_vm_def_t *vmDef) +static int lxcContainerExecInit(const lxc_vm_def_t *vmDef) { - int rc = -1; - char* execString; - size_t execStringLen = strlen(vmDef->init) + 1 + 5; + const char *const argv[] = { + vmDef->init, + NULL, + }; - if (VIR_ALLOC_N(execString, execStringLen) < 0) { - lxcError(NULL, NULL, VIR_ERR_NO_MEMORY, - _("failed to calloc memory for init string: %s"), - strerror(errno)); - goto error_out; - } - - strcpy(execString, "exec "); - strcat(execString, vmDef->init); - - execl("/bin/sh", "sh", "-c", execString, (char*)NULL); - lxcError(NULL, NULL, VIR_ERR_NO_MEMORY, - _("execl failed to exec init: %s"), strerror(errno)); - -error_out: - exit(rc); + return execve(argv[0], (char **)argv, NULL); } /** - * lxcSetContainerStdio: - * @ttyName: Name of tty to set as the container console + * lxcContainerSetStdio: + * @control: the conrol FD + * @ttyPath: Name of tty to set as the container console * * Sets the given tty as the primary conosole for the container as well as * stdout, stdin and stderr. * * Returns 0 on success or -1 in case of error */ -static int lxcSetContainerStdio(const char *ttyPath) +static int lxcContainerSetStdio(int control, const char *ttyPath) { int rc = -1; int ttyfd; @@ -111,7 +132,7 @@ * close all FDs before executing the container */ open_max = sysconf (_SC_OPEN_MAX); for (i = 0; i < open_max; i++) - if (i != ttyfd) + if (i != ttyfd && i != control) close(i); if (dup2(ttyfd, 0) < 0) { @@ -142,30 +163,38 @@ } /** - * lxcExecWithTty: - * @vm: Ptr to vm structure + * lxcContainerSendContinue: + * @monitor: control FD to child * - * Sets container console and stdio and then execs container init + * Sends the continue message via the socket pair stored in the vm + * structure. * * Returns 0 on success or -1 in case of error */ -static int lxcExecWithTty(lxc_vm_def_t *vmDef, char *ttyPath) +int lxcContainerSendContinue(virConnectPtr conn, + int control) { int rc = -1; + lxc_message_t msg = LXC_CONTINUE_MSG; + int writeCount = 0; - if(lxcSetContainerStdio(ttyPath) < 0) { - goto exit_with_error; + writeCount = safewrite(control, &msg, sizeof(msg)); + if (writeCount != sizeof(msg)) { + lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + _("unable to send container continue message: %s"), + strerror(errno)); + goto error_out; } - lxcExecContainerInit(vmDef); + rc = 0; -exit_with_error: - exit(rc); +error_out: + return rc; } /** - * lxcWaitForContinue: - * @monitor: monitor FD from parent + * lxcContainerWaitForContinue: + * @control: control FD from parent * * This function will wait for the container continue message from the * parent process. It will send this message on the socket pair stored in @@ -173,12 +202,12 @@ * * Returns 0 on success or -1 in case of error */ -static int lxcWaitForContinue(int monitor) +static int lxcContainerWaitForContinue(int control) { lxc_message_t msg; int readLen; - readLen = saferead(monitor, &msg, sizeof(msg)); + readLen = saferead(control, &msg, sizeof(msg)); if (readLen != sizeof(msg) || msg != LXC_CONTINUE_MSG) { lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, @@ -186,6 +215,7 @@ strerror(errno)); return -1; } + close(control); DEBUG0("Received container continue message"); @@ -200,7 +230,7 @@ * * Returns 0 on success or nonzero in case of error */ -static int lxcEnableInterfaces(const lxc_vm_def_t *def) +static int lxcContainerEnableInterfaces(const lxc_vm_def_t *def) { int rc = 0; const lxc_net_def_t *net; @@ -233,7 +263,7 @@ * * Returns 0 on success or -1 in case of error */ -int lxcChild( void *data ) +static int lxcContainerChild( void *data ) { int rc = -1; lxc_child_argv_t *argv = data; @@ -244,7 +274,7 @@ if (NULL == vmDef) { lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, _("lxcChild() passed invalid vm definition")); - goto cleanup; + return -1; } /* handle the bind mounts first before doing anything else that may */ @@ -260,7 +290,7 @@ lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, _("failed to mount %s at %s for container: %s"), curMount->source, curMount->target, strerror(errno)); - goto cleanup; + return -1; } } @@ -270,24 +300,106 @@ lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, _("failed to mount /proc for container: %s"), strerror(errno)); - goto cleanup; + return -1; } + if (lxcContainerSetStdio(argv->monitor, argv->ttyPath) < 0) + return -1; + /* Wait for interface devices to show up */ - if (0 != (rc = lxcWaitForContinue(argv->monitor))) { - goto cleanup; + if (lxcContainerWaitForContinue(argv->monitor) < 0) + return -1; + + /* enable interfaces */ + if (lxcContainerEnableInterfaces(vmDef) < 0) + return -1; + + /* this function will only return if an error occured */ + return lxcContainerExecInit(vmDef); +} + +/** + * lxcContainerStart: + * @conn: pointer to connection + * @driver: pointer to driver structure + * @vm: pointer to virtual machine structure + * + * Starts a container process by calling clone() with the namespace flags + * + * Returns PID of container on success or -1 in case of error + */ +int lxcContainerStart(virConnectPtr conn, + lxc_vm_def_t *def, + int control, + char *ttyPath) +{ + pid_t pid; + int flags; + int stacksize = getpagesize() * 4; + char *stack, *stacktop; + lxc_child_argv_t args = { def, control, ttyPath }; + + /* allocate a stack for the container */ + if (VIR_ALLOC_N(stack, stacksize) < 0) { + lxcError(conn, NULL, VIR_ERR_NO_MEMORY, + _("unable to allocate container stack")); + return -1; + } + stacktop = stack + stacksize; + + flags = CLONE_NEWPID|CLONE_NEWNS|CLONE_NEWUTS|CLONE_NEWUSER|CLONE_NEWIPC|SIGCHLD; + + if (def->nets != NULL) + flags |= CLONE_NEWNET; + + pid = clone(lxcContainerChild, stacktop, flags, &args); + VIR_FREE(stack); + DEBUG("clone() returned, %d", pid); + + if (pid < 0) { + lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + _("clone() failed, %s"), strerror(errno)); + return -1; } - /* enable interfaces */ - if (0 != (rc = lxcEnableInterfaces(vmDef))) { - goto cleanup; + return pid; +} + +static int lxcContainerDummyChild(void *argv ATTRIBUTE_UNUSED) +{ + _exit(0); +} + +int lxcContainerAvailable(int features) +{ + int flags = CLONE_NEWPID|CLONE_NEWNS|CLONE_NEWUTS|CLONE_NEWUSER| + CLONE_NEWIPC|SIGCHLD; + int cpid; + char *childStack; + char *stack; + int childStatus; + + if (features & LXC_CONTAINER_FEATURE_NET) + flags |= CLONE_NEWNET; + + if (VIR_ALLOC_N(stack, getpagesize() * 4) < 0) { + DEBUG0("Unable to allocate stack"); + return -1; } - rc = lxcExecWithTty(vmDef, argv->ttyPath); - /* this function will only return if an error occured */ + childStack = stack + (getpagesize() * 4); -cleanup: - return rc; + cpid = clone(lxcContainerDummyChild, childStack, flags, NULL); + VIR_FREE(stack); + if (cpid < 0) { + DEBUG("clone call returned %s, container support is not enabled", + strerror(errno)); + return -1; + } else { + waitpid(cpid, &childStatus, 0); + } + + return 0; } #endif /* WITH_LXC */ diff -r 6a666b5713b4 src/lxc_container.h --- a/src/lxc_container.h Tue Aug 05 16:50:40 2008 +0100 +++ b/src/lxc_container.h Tue Aug 05 16:50:43 2008 +0100 @@ -28,20 +28,19 @@ #ifdef WITH_LXC -typedef struct __lxc_child_argv lxc_child_argv_t; -struct __lxc_child_argv { - lxc_vm_def_t *config; - int monitor; - char *ttyPath; +enum { + LXC_CONTAINER_FEATURE_NET = (1 << 0), }; -/* messages between parent and container */ -typedef char lxc_message_t; -#define LXC_CONTINUE_MSG 'c' +int lxcContainerSendContinue(virConnectPtr conn, + int control); +int lxcContainerStart(virConnectPtr conn, + lxc_vm_def_t *def, + int control, + char *ttyPath); -/* Function declarations */ -int lxcChild( void *argv ); +int lxcContainerAvailable(int features); #endif /* LXC_DRIVER_H */ diff -r 6a666b5713b4 src/lxc_controller.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/lxc_controller.c Tue Aug 05 16:50:43 2008 +0100 @@ -0,0 +1,205 @@ +/* + * Copyright IBM Corp. 2008 + * + * lxc_controller.c: linux container process controller + * + * Authors: + * David L. Leskovec <dlesko at linux.vnet.ibm.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <config.h> + +#ifdef WITH_LXC + +#include <sys/epoll.h> +#include <unistd.h> + +#include "internal.h" +#include "util.h" + +#include "lxc_conf.h" +#include "lxc_controller.h" + + +#define DEBUG(fmt,...) VIR_DEBUG(__FILE__, fmt, __VA_ARGS__) + +/** + * lxcFdForward: + * @readFd: file descriptor to read + * @writeFd: file desriptor to write + * + * Reads 1 byte of data from readFd and writes to writeFd. + * + * Returns 0 on success, EAGAIN if returned on read, or -1 in case of error + */ +static int lxcFdForward(int readFd, int writeFd) +{ + int rc = -1; + char buf[2]; + + if (1 != (saferead(readFd, buf, 1))) { + if (EAGAIN == errno) { + rc = EAGAIN; + goto cleanup; + } + + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("read of fd %d failed: %s"), readFd, strerror(errno)); + goto cleanup; + } + + if (1 != (safewrite(writeFd, buf, 1))) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("write to fd %d failed: %s"), writeFd, strerror(errno)); + goto cleanup; + } + + rc = 0; + +cleanup: + return rc; +} + +typedef struct _lxcTtyForwardFd_t { + int fd; + int active; +} lxcTtyForwardFd_t; + +/** + * lxcTtyForward: + * @appPty: Open fd for application facing Pty + * @contPty: Open fd for container facing Pty + * + * Forwards traffic between fds. Data read from appPty will be written to contPty + * This process loops forever. + * This uses epoll in edge triggered mode to avoid a hard loop on POLLHUP + * events when the user disconnects the virsh console via ctrl-] + * + * Returns 0 on success or -1 in case of error + */ +int lxcControllerMain(int appPty, int contPty) +{ + int rc = -1; + int epollFd; + struct epoll_event epollEvent; + int numEvents; + int numActive = 0; + lxcTtyForwardFd_t fdArray[2]; + int timeout = -1; + int curFdOff = 0; + int writeFdOff = 0; + + fdArray[0].fd = appPty; + fdArray[0].active = 0; + fdArray[1].fd = contPty; + fdArray[1].active = 0; + + /* create the epoll fild descriptor */ + epollFd = epoll_create(2); + if (0 > epollFd) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("epoll_create(2) failed: %s"), strerror(errno)); + goto cleanup; + } + + /* add the file descriptors the epoll fd */ + memset(&epollEvent, 0x00, sizeof(epollEvent)); + epollEvent.events = EPOLLIN|EPOLLET; /* edge triggered */ + epollEvent.data.fd = appPty; + epollEvent.data.u32 = 0; /* fdArray position */ + if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, appPty, &epollEvent)) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("epoll_ctl(appPty) failed: %s"), strerror(errno)); + goto cleanup; + } + epollEvent.data.fd = contPty; + epollEvent.data.u32 = 1; /* fdArray position */ + if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, contPty, &epollEvent)) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("epoll_ctl(contPty) failed: %s"), strerror(errno)); + goto cleanup; + } + + while (1) { + /* if active fd's, return if no events, else wait forever */ + timeout = (numActive > 0) ? 0 : -1; + numEvents = epoll_wait(epollFd, &epollEvent, 1, timeout); + if (0 < numEvents) { + if (epollEvent.events & EPOLLIN) { + curFdOff = epollEvent.data.u32; + if (!fdArray[curFdOff].active) { + fdArray[curFdOff].active = 1; + ++numActive; + } + + } else if (epollEvent.events & EPOLLHUP) { + DEBUG("EPOLLHUP from fd %d", epollEvent.data.fd); + continue; + } else { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("error event %d"), epollEvent.events); + goto cleanup; + } + + } else if (0 == numEvents) { + if (2 == numActive) { + /* both fds active, toggle between the two */ + curFdOff ^= 1; + } else { + /* only one active, if current is active, use it, else it */ + /* must be the other one (ie. curFd just went inactive) */ + curFdOff = fdArray[curFdOff].active ? curFdOff : curFdOff ^ 1; + } + + } else { + if (EINTR == errno) { + continue; + } + + /* error */ + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("epoll_wait() failed: %s"), strerror(errno)); + goto cleanup; + + } + + if (0 < numActive) { + writeFdOff = curFdOff ^ 1; + rc = lxcFdForward(fdArray[curFdOff].fd, fdArray[writeFdOff].fd); + + if (EAGAIN == rc) { + /* this fd no longer has data, set it as inactive */ + --numActive; + fdArray[curFdOff].active = 0; + } else if (-1 == rc) { + goto cleanup; + } + + } + + } + + rc = 0; + +cleanup: + close(appPty); + close(contPty); + close(epollFd); + return rc; +} + +#endif diff -r 6a666b5713b4 src/lxc_controller.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/lxc_controller.h Tue Aug 05 16:50:43 2008 +0100 @@ -0,0 +1,33 @@ +/* + * Copyright IBM Corp. 2008 + * + * lxc_controller.h: linux container process controller + * + * Authors: + * David L. Leskovec <dlesko at linux.vnet.ibm.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef LXC_CONTROLLER_H +#define LXC_CONTROLLER_H + +#ifdef WITH_LXC + +int lxcControllerMain(int appPty, int contPty); + +#endif /* WITH_LXC */ + +#endif /* LXC_CONTROLLER_H */ diff -r 6a666b5713b4 src/lxc_driver.c --- a/src/lxc_driver.c Tue Aug 05 16:50:40 2008 +0100 +++ b/src/lxc_driver.c Tue Aug 05 16:50:43 2008 +0100 @@ -26,7 +26,6 @@ #ifdef WITH_LXC #include <fcntl.h> -#include <sys/epoll.h> #include <sched.h> #include <sys/utsname.h> #include <stdbool.h> @@ -39,6 +38,7 @@ #include "lxc_conf.h" #include "lxc_container.h" #include "lxc_driver.h" +#include "lxc_controller.h" #include "driver.h" #include "internal.h" #include "memory.h" @@ -52,77 +52,19 @@ #define DEBUG(fmt,...) VIR_DEBUG(__FILE__, fmt, __VA_ARGS__) #define DEBUG0(msg) VIR_DEBUG(__FILE__, "%s", msg) -/* - * GLibc headers are behind the kernel, so we define these - * constants if they're not present already. - */ - -#ifndef CLONE_NEWPID -#define CLONE_NEWPID 0x20000000 -#endif -#ifndef CLONE_NEWUTS -#define CLONE_NEWUTS 0x04000000 -#endif -#ifndef CLONE_NEWUSER -#define CLONE_NEWUSER 0x10000000 -#endif -#ifndef CLONE_NEWIPC -#define CLONE_NEWIPC 0x08000000 -#endif -#ifndef CLONE_NEWNET -#define CLONE_NEWNET 0x40000000 /* New network namespace */ -#endif static int lxcStartup(void); static int lxcShutdown(void); static lxc_driver_t *lxc_driver = NULL; /* Functions */ -static int lxcDummyChild( void *argv ATTRIBUTE_UNUSED ) -{ - exit(0); -} - -static int lxcCheckContainerSupport(int extra_flags) -{ - int rc = 0; - int flags = CLONE_NEWPID|CLONE_NEWNS|CLONE_NEWUTS|CLONE_NEWUSER| - CLONE_NEWIPC|SIGCHLD|extra_flags; - int cpid; - char *childStack; - char *stack; - int childStatus; - - if (VIR_ALLOC_N(stack, getpagesize() * 4) < 0) { - DEBUG0("Unable to allocate stack"); - rc = -1; - goto check_complete; - } - - childStack = stack + (getpagesize() * 4); - - cpid = clone(lxcDummyChild, childStack, flags, NULL); - if ((0 > cpid) && (EINVAL == errno)) { - DEBUG0("clone call returned EINVAL, container support is not enabled"); - rc = -1; - } else { - waitpid(cpid, &childStatus, 0); - } - - VIR_FREE(stack); - -check_complete: - return rc; -} static const char *lxcProbe(void) { -#ifdef __linux__ - if (0 == lxcCheckContainerSupport(0)) { - return("lxc:///"); - } -#endif - return(NULL); + if (lxcContainerAvailable(0) < 0) + return NULL; + + return("lxc:///"); } static virDrvOpenStatus lxcOpen(virConnectPtr conn, @@ -559,89 +501,6 @@ return 0; } -/** - * lxcSendContainerContinue: - * @monitor: FD for communicating with child - * - * Sends the continue message via the socket pair stored in the vm - * structure. - * - * Returns 0 on success or -1 in case of error - */ -static int lxcSendContainerContinue(virConnectPtr conn, - int monitor) -{ - int rc = -1; - lxc_message_t msg = LXC_CONTINUE_MSG; - int writeCount = 0; - - writeCount = safewrite(monitor, &msg, sizeof(msg)); - if (writeCount != sizeof(msg)) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("unable to send container continue message: %s"), - strerror(errno)); - goto error_out; - } - - rc = 0; - -error_out: - return rc; -} - -/** - * lxcStartContainer: - * @conn: pointer to connection - * @driver: pointer to driver structure - * @vm: pointer to virtual machine structure - * - * Starts a container process by calling clone() with the namespace flags - * - * Returns 0 on success or -1 in case of error - */ -static int lxcStartContainer(virConnectPtr conn, - lxc_driver_t* driver, - lxc_vm_t *vm, - int monitor, - char *ttyPath) -{ - int rc = -1; - int flags; - int stacksize = getpagesize() * 4; - char *stack, *stacktop; - lxc_child_argv_t args = { vm->def, monitor, ttyPath }; - - /* allocate a stack for the container */ - if (VIR_ALLOC_N(stack, stacksize) < 0) { - lxcError(conn, NULL, VIR_ERR_NO_MEMORY, - _("unable to allocate container stack")); - goto error_exit; - } - stacktop = stack + stacksize; - - flags = CLONE_NEWPID|CLONE_NEWNS|CLONE_NEWUTS|CLONE_NEWUSER|CLONE_NEWIPC|SIGCHLD; - - if (vm->def->nets != NULL) - flags |= CLONE_NEWNET; - - vm->def->id = clone(lxcChild, stacktop, flags, &args); - - DEBUG("clone() returned, %d", vm->def->id); - - if (vm->def->id < 0) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("clone() failed, %s"), strerror(errno)); - goto error_exit; - } - - lxcSaveConfig(NULL, driver, vm, vm->def); - - rc = 0; - -error_exit: - return rc; -} - /** * lxcOpenTty: @@ -716,170 +575,6 @@ return rc; } -/** - * lxcFdForward: - * @readFd: file descriptor to read - * @writeFd: file desriptor to write - * - * Reads 1 byte of data from readFd and writes to writeFd. - * - * Returns 0 on success, EAGAIN if returned on read, or -1 in case of error - */ -static int lxcFdForward(int readFd, int writeFd) -{ - int rc = -1; - char buf[2]; - - if (1 != (saferead(readFd, buf, 1))) { - if (EAGAIN == errno) { - rc = EAGAIN; - goto cleanup; - } - - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("read of fd %d failed: %s"), readFd, strerror(errno)); - goto cleanup; - } - - if (1 != (safewrite(writeFd, buf, 1))) { - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("write to fd %d failed: %s"), writeFd, strerror(errno)); - goto cleanup; - } - - rc = 0; - -cleanup: - return rc; -} - -typedef struct _lxcTtyForwardFd_t { - int fd; - bool active; -} lxcTtyForwardFd_t; - -/** - * lxcTtyForward: - * @fd1: Open fd - * @fd1: Open fd - * - * Forwards traffic between fds. Data read from fd1 will be written to fd2 - * This process loops forever. - * This uses epoll in edge triggered mode to avoid a hard loop on POLLHUP - * events when the user disconnects the virsh console via ctrl-] - * - * Returns 0 on success or -1 in case of error - */ -static int lxcTtyForward(int fd1, int fd2) -{ - int rc = -1; - int epollFd; - struct epoll_event epollEvent; - int numEvents; - int numActive = 0; - lxcTtyForwardFd_t fdArray[2]; - int timeout = -1; - int curFdOff = 0; - int writeFdOff = 0; - - fdArray[0].fd = fd1; - fdArray[0].active = false; - fdArray[1].fd = fd2; - fdArray[1].active = false; - - /* create the epoll fild descriptor */ - epollFd = epoll_create(2); - if (0 > epollFd) { - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("epoll_create(2) failed: %s"), strerror(errno)); - goto cleanup; - } - - /* add the file descriptors the epoll fd */ - memset(&epollEvent, 0x00, sizeof(epollEvent)); - epollEvent.events = EPOLLIN|EPOLLET; /* edge triggered */ - epollEvent.data.fd = fd1; - epollEvent.data.u32 = 0; /* fdArray position */ - if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, fd1, &epollEvent)) { - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("epoll_ctl(fd1) failed: %s"), strerror(errno)); - goto cleanup; - } - epollEvent.data.fd = fd2; - epollEvent.data.u32 = 1; /* fdArray position */ - if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, fd2, &epollEvent)) { - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("epoll_ctl(fd2) failed: %s"), strerror(errno)); - goto cleanup; - } - - while (1) { - /* if active fd's, return if no events, else wait forever */ - timeout = (numActive > 0) ? 0 : -1; - numEvents = epoll_wait(epollFd, &epollEvent, 1, timeout); - if (0 < numEvents) { - if (epollEvent.events & EPOLLIN) { - curFdOff = epollEvent.data.u32; - if (!fdArray[curFdOff].active) { - fdArray[curFdOff].active = true; - ++numActive; - } - - } else if (epollEvent.events & EPOLLHUP) { - DEBUG("EPOLLHUP from fd %d", epollEvent.data.fd); - continue; - } else { - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("error event %d"), epollEvent.events); - goto cleanup; - } - - } else if (0 == numEvents) { - if (2 == numActive) { - /* both fds active, toggle between the two */ - curFdOff ^= 1; - } else { - /* only one active, if current is active, use it, else it */ - /* must be the other one (ie. curFd just went inactive) */ - curFdOff = fdArray[curFdOff].active ? curFdOff : curFdOff ^ 1; - } - - } else { - if (EINTR == errno) { - continue; - } - - /* error */ - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("epoll_wait() failed: %s"), strerror(errno)); - goto cleanup; - - } - - if (0 < numActive) { - writeFdOff = curFdOff ^ 1; - rc = lxcFdForward(fdArray[curFdOff].fd, fdArray[writeFdOff].fd); - - if (EAGAIN == rc) { - /* this fd no longer has data, set it as inactive */ - --numActive; - fdArray[curFdOff].active = false; - } else if (-1 == rc) { - goto cleanup; - } - - } - - } - - rc = 0; - -cleanup: - close(fd1); - close(fd2); - close(epollFd); - exit(rc); -} /** * lxcVmStart: @@ -921,7 +616,7 @@ if (vm->pid == 0) { /* child process calls forward routine */ - lxcTtyForward(parentTty, containerTty); + lxcControllerMain(parentTty, containerTty); } if (lxcStoreTtyPid(driver, vm)) { @@ -945,17 +640,19 @@ /* check this rc */ - rc = lxcStartContainer(conn, driver, vm, - sockpair[1], - containerTtyPath); - if (rc != 0) + vm->def->id = lxcContainerStart(conn, + vm->def, + sockpair[1], + containerTtyPath); + if (vm->def->id == -1) goto cleanup; + lxcSaveConfig(conn, driver, vm, vm->def); rc = lxcMoveInterfacesToNetNs(conn, vm); if (rc != 0) goto cleanup; - rc = lxcSendContainerContinue(conn, sockpair[0]); + rc = lxcContainerSendContinue(conn, sockpair[0]); if (rc != 0) goto cleanup; @@ -1196,16 +893,15 @@ { const char *argv[] = {"ip", "link", "set", "lo", "netns", "-1", NULL}; int ip_rc; - int user_netns = 0; - int kern_netns = 0; - if (virRun(NULL, (char **)argv, &ip_rc) == 0) - user_netns = WIFEXITED(ip_rc) && (WEXITSTATUS(ip_rc) != 255); + if (virRun(NULL, (char **)argv, &ip_rc) < 0 || + !(WIFEXITED(ip_rc) && (WEXITSTATUS(ip_rc) != 255))) + return 0; - if (lxcCheckContainerSupport(CLONE_NEWNET) == 0) - kern_netns = 1; + if (lxcContainerAvailable(LXC_CONTAINER_FEATURE_NET) < 0) + return 0; - return kern_netns && user_netns; + return 1; } static int lxcStartup(void) @@ -1222,9 +918,8 @@ } /* Check that this is a container enabled kernel */ - if(0 != lxcCheckContainerSupport(0)) { + if(lxcContainerAvailable(0) < 0) return -1; - } lxc_driver->have_netns = lxcCheckNetNsSupport(); -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

"Daniel P. Berrange" <berrange@redhat.com> wrote:
The lxc_driver.c file contains quite a large amount of code, serving two reasonably well separated purposes. First there is the direct implemntation of each of the libvirt driver APIs. Second there is the code to spawn a container and a controller for forwarding I/O to/from the PTYs. This patch attempts to re-arrange the code across files to better reflect the split in functionality. The general idea is thus:
- lxc_driver.c: implementation of the libvirt driver APIs - lxc_container.c: code for creating containers - lxc_controller.c: code for managing an active container
So this entails the following re-arrangement:
Most changes were straight function copies. In the few cases where there were nontrivial changes, they looked fine, though it was tedious to extract the diffs. I liked the s/0/false/ and s/1/true/ changes. It'd be nice to make more use of the "bool" type in libvirt. ACK

On Mon, Aug 11, 2008 at 07:16:20PM +0200, Jim Meyering wrote:
"Daniel P. Berrange" <berrange@redhat.com> wrote:
The lxc_driver.c file contains quite a large amount of code, serving two reasonably well separated purposes. First there is the direct implemntation of each of the libvirt driver APIs. Second there is the code to spawn a container and a controller for forwarding I/O to/from the PTYs. This patch attempts to re-arrange the code across files to better reflect the split in functionality. The general idea is thus:
- lxc_driver.c: implementation of the libvirt driver APIs - lxc_container.c: code for creating containers - lxc_controller.c: code for managing an active container
So this entails the following re-arrangement:
Most changes were straight function copies. In the few cases where there were nontrivial changes, they looked fine, though it was tedious to extract the diffs.
Ok I've committed this patch too now. Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

Every LXC container has 2 processes, the leader of the actual container and a helper process to do the I/O forwarding. The LXC driver is written to allow libvirtd to be restarted without needing to shutdown the active containers. For this it uses a PID file in /var/run/libvirt/lxc/NAME.pid The driver also uses SIGCHLD to detect when a container has terminated. Here-in lies the problem - if you restart libvirtd, the container process gets re-parented to init, and so libvirtd will never get any further SIGCHLD signals for it. This patch attempts to address this problem by changing the relationship between libvirtd and the container processes. Instad of there being two processes which are siblings, the I/O process becomes the parent of the actual container. So the general idea is: - libvirtd LXC driver spawns a 'controller' process - this immediately double-forks itself into the background, making itself process leader, changing its root directory to /, and redirecting its stdin to /dev/null and its stdout/err to /var/log/libvirt/lxc/NAME.log. - The 'controller' process inherits a UNIX domain socket from libvirtd which has had bind() called against /var/run/libvirt/lxc/NAME.sock - Once it has backgrounded itself, the controller calls accept() on the socket, blocking until a client connects. If this fails for any reason the controller exits. - THe 'controller' also writes a file to /var/run/libvirt/lxc/NAME.pid containing its PID. - Immediately after forking the 'controller' process, the libvirtd LXC driver, calls connect() on /var/run/libvirt/lxc/NAME.sock. And does a blocking read of sizeof(pid_t) bytes. - They now do a handshake, consisting of simply sending & receving a single byte. This basically is to ensure the libvirt driver blocks until the controller has finished writing its PID file At this point in time, the libvirtd LXC driver knows what the controller process' PID is. This becomes the 'ID' of the virDomain object associated with this. If anything goes wrong from here-on, the libvirtd LXC driver also now knows what PID it has to kill off. The UNIX socket to the controller process is kept open, and registered with the libvirtd event loop for POLLHUP events. This means the LXC driver can get notification when the controller terminates, without needing to rely on SIGCHLD events. - Now the 'controller' has told libvirtd what its PID is, it goes ahead and starts the real 'container' process. - When the 'container' is up and running, the controller goes into the event loop where it handles I/O from the PTYs. It also keeps an eye out for SIGHUP on the client socket to the libvirtd daemon. - If the client goes away, it'll know it needs to accept() a new client (ie the libvirtd daemon starting up again). Upon a new client connecting it'll do the PID handshake again, so that libvirtd knows what the container ID is again. - When libvirtd starts up, it reads the container configs from /etc/libvirt/lxc and for each one, tries to connect to /var/run/libvirt/lxc/NAME.sock, and read the PID file. This lets it figure out which containers are running. Notice that throughout this, libvirtd's LXC driver doesn't need to know the PID of the actual container process - only that of the controller process. Think of the controller as serving the equivalent role of QEMU in context of KVM. QEMU provides the backend device model for KVM. When QEMU dies, the guest domain goes away. Well the controller provides the 'backend' device model for the container. - though in this case it really only the text console backend. When the controller dies, the container goes away. So architecturally the LXC driver is now very closely aligned with the QEMU driver. The main differences are that LXC can handle libvirtd restarts, and that the LXC driver simply forks() the controller. One could imagine these distinctions going away - the QEMU driver can get a restart capability in much the same way as the LXC restart works. We may want to make the controller process a proper standalone binary which can be directly exec'd, rather than just forked off from libvirtd. lxc_conf.c | 195 ---------------- lxc_conf.h | 12 lxc_container.c | 39 +-- lxc_container.h | 8 lxc_controller.c | 349 +++++++++++++++++++++++++++- lxc_controller.h | 12 lxc_driver.c | 662 ++++++++++++++++++++++++++----------------------------- util.c | 158 +++++++++++++ util.h | 13 + 9 files changed, 870 insertions(+), 578 deletions(-) Daniel diff -r 8093fb566748 src/lxc_conf.c --- a/src/lxc_conf.c Fri Aug 01 14:47:33 2008 +0100 +++ b/src/lxc_conf.c Tue Aug 05 12:13:24 2008 +0100 @@ -833,25 +833,24 @@ strncpy(vm->configFileBase, file, PATH_MAX); vm->configFile[PATH_MAX-1] = '\0'; - if (lxcLoadTtyPid(driver, vm) < 0) { - DEBUG0("failed to load tty pid"); - } - return vm; } int lxcLoadDriverConfig(lxc_driver_t *driver) { /* Set the container configuration directory */ - driver->configDir = strdup(SYSCONF_DIR "/libvirt/lxc"); - if (NULL == driver->configDir) { - lxcError(NULL, NULL, VIR_ERR_NO_MEMORY, "configDir"); - return -1; - } - - driver->stateDir = strdup(LOCAL_STATE_DIR "/run/libvirt/lxc"); + if ((driver->configDir = strdup(SYSCONF_DIR "/libvirt/lxc")) == NULL) + goto no_memory; + if ((driver->stateDir = strdup(LOCAL_STATE_DIR "/run/libvirt/lxc")) == NULL) + goto no_memory; + if ((driver->logDir = strdup(LOCAL_STATE_DIR "/log/libvirt/lxc")) == NULL) + goto no_memory; return 0; + +no_memory: + lxcError(NULL, NULL, VIR_ERR_NO_MEMORY, "configDir"); + return -1; } int lxcLoadContainerConfigFile(lxc_driver_t *driver, @@ -1012,9 +1011,7 @@ curNet = vmdef->nets; while (curNet) { nextNet = curNet->next; - printf("Freeing %s:%s\n", curNet->parentVeth, curNet->containerVeth); VIR_FREE(curNet->parentVeth); - VIR_FREE(curNet->containerVeth); VIR_FREE(curNet->txName); VIR_FREE(curNet); curNet = nextNet; @@ -1106,176 +1103,4 @@ return 0; } -/** - * lxcStoreTtyPid: - * @driver: pointer to driver - * @vm: Ptr to VM - * - * Stores the pid of the tty forward process contained in vm->pid - * LOCAL_STATE_DIR/run/libvirt/lxc/{container_name}.pid - * - * Returns 0 on success or -1 in case of error - */ -int lxcStoreTtyPid(const lxc_driver_t *driver, lxc_vm_t *vm) -{ - int rc = -1; - int fd; - FILE *file = NULL; - - if (vm->ttyPidFile[0] == 0x00) { - if ((rc = virFileMakePath(driver->stateDir))) { - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("cannot create lxc state directory %s: %s"), - driver->stateDir, strerror(rc)); - goto error_out; - } - - if (virFileBuildPath(driver->stateDir, vm->def->name, ".pid", - vm->ttyPidFile, PATH_MAX) < 0) { - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("cannot construct tty pid file path")); - goto error_out; - } - } - - if ((fd = open(vm->ttyPidFile, - O_WRONLY | O_CREAT | O_TRUNC, - S_IRUSR | S_IWUSR)) < 0) { - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("cannot create tty pid file %s: %s"), - vm->ttyPidFile, strerror(errno)); - goto error_out; - } - - if (!(file = fdopen(fd, "w"))) { - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("cannot fdopen tty pid file %s: %s"), - vm->ttyPidFile, strerror(errno)); - - if (close(fd) < 0) { - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("failed to close tty pid file %s: %s"), - vm->ttyPidFile, strerror(errno)); - } - - goto error_out; - } - - if (fprintf(file, "%d", vm->pid) < 0) { - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("cannot write tty pid file %s: %s"), - vm->ttyPidFile, strerror(errno)); - - goto fclose_error_out; - } - - rc = 0; - -fclose_error_out: - if (fclose(file) < 0) { - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("failed to close tty pid file %s: %s"), - vm->ttyPidFile, strerror(errno)); - } - -error_out: - return rc; -} - -/** - * lxcLoadTtyPid: - * @driver: pointer to driver - * @vm: Ptr to VM - * - * Loads the pid of the tty forward process from the pid file. - * LOCAL_STATE_DIR/run/libvirt/lxc/{container_name}.pid - * - * Returns - * > 0 - pid of tty process - * 0 - no tty pid file - * -1 - error - */ -int lxcLoadTtyPid(const lxc_driver_t *driver, lxc_vm_t *vm) -{ - int rc = -1; - FILE *file; - - if (vm->ttyPidFile[0] == 0x00) { - if ((rc = virFileMakePath(driver->stateDir))) { - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("cannot create lxc state directory %s: %s"), - driver->stateDir, strerror(rc)); - goto cleanup; - } - - if (virFileBuildPath(driver->stateDir, vm->def->name, ".pid", - vm->ttyPidFile, PATH_MAX) < 0) { - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("cannot construct tty pid file path")); - goto cleanup; - } - } - - if (!(file = fopen(vm->ttyPidFile, "r"))) { - if (ENOENT == errno) { - rc = 0; - goto cleanup; - } - - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("cannot open tty pid file %s: %s"), - vm->ttyPidFile, strerror(errno)); - goto cleanup; - } - - if (fscanf(file, "%d", &(vm->pid)) < 0) { - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("cannot read tty pid file %s: %s"), - vm->ttyPidFile, strerror(errno)); - goto cleanup; - } - - if (fclose(file) < 0) { - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("failed to close tty pid file %s: %s"), - vm->ttyPidFile, strerror(errno)); - goto cleanup; - } - - rc = vm->pid; - - cleanup: - return rc; -} - -/** - * lxcDeleteTtyPid: - * @vm: Ptr to VM - * - * Unlinks the tty pid file for the vm - * LOCAL_STATE_DIR/run/libvirt/lxc/{container_name}.pid - * - * Returns on 0 success or -1 in case of error - */ -int lxcDeleteTtyPidFile(const lxc_vm_t *vm) -{ - if (vm->ttyPidFile[0] == 0x00) { - goto no_file; - } - - if (unlink(vm->ttyPidFile) < 0) { - if (errno == ENOENT) { - goto no_file; - } - - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("cannot remove ttyPidFile %s: %s"), vm->ttyPidFile, - strerror(errno)); - return -1; - } - -no_file: - return 0; -} - #endif /* WITH_LXC */ diff -r 8093fb566748 src/lxc_conf.h --- a/src/lxc_conf.h Fri Aug 01 14:47:33 2008 +0100 +++ b/src/lxc_conf.h Tue Aug 05 12:13:24 2008 +0100 @@ -46,7 +46,6 @@ struct __lxc_net_def { int type; char *parentVeth; /* veth device in parent namespace */ - char *containerVeth; /* veth device in container namespace */ char *txName; /* bridge or network name */ lxc_net_def_t *next; @@ -87,11 +86,10 @@ struct __lxc_vm { int pid; int state; + int monitor; char configFile[PATH_MAX]; char configFileBase[PATH_MAX]; - - char ttyPidFile[PATH_MAX]; lxc_vm_def_t *def; @@ -103,8 +101,9 @@ lxc_vm_t *vms; int nactivevms; int ninactivevms; - char* configDir; - char* stateDir; + char *configDir; + char *stateDir; + char *logDir; int have_netns; }; @@ -154,9 +153,6 @@ lxc_driver_t *driver, const char *configFile, const char *name); -int lxcStoreTtyPid(const lxc_driver_t *driver, lxc_vm_t *vm); -int lxcLoadTtyPid(const lxc_driver_t *driver, lxc_vm_t *vm); -int lxcDeleteTtyPidFile(const lxc_vm_t *vm); void lxcError(virConnectPtr conn, virDomainPtr dom, diff -r 8093fb566748 src/lxc_container.c --- a/src/lxc_container.c Fri Aug 01 14:47:33 2008 +0100 +++ b/src/lxc_container.c Tue Aug 05 12:13:24 2008 +0100 @@ -69,6 +69,8 @@ typedef struct __lxc_child_argv lxc_child_argv_t; struct __lxc_child_argv { lxc_vm_def_t *config; + int nveths; + char **veths; int monitor; char *ttyPath; }; @@ -171,8 +173,7 @@ * * Returns 0 on success or -1 in case of error */ -int lxcContainerSendContinue(virConnectPtr conn, - int control) +int lxcContainerSendContinue(int control) { int rc = -1; lxc_message_t msg = LXC_CONTINUE_MSG; @@ -180,7 +181,7 @@ writeCount = safewrite(control, &msg, sizeof(msg)); if (writeCount != sizeof(msg)) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, _("unable to send container continue message: %s"), strerror(errno)); goto error_out; @@ -230,21 +231,21 @@ * * Returns 0 on success or nonzero in case of error */ -static int lxcContainerEnableInterfaces(const lxc_vm_def_t *def) +static int lxcContainerEnableInterfaces(int nveths, + char **veths) { - int rc = 0; - const lxc_net_def_t *net; + int rc = 0, i; - for (net = def->nets; net; net = net->next) { - DEBUG("Enabling %s", net->containerVeth); - rc = vethInterfaceUpOrDown(net->containerVeth, 1); + for (i = 0 ; i < nveths ; i++) { + DEBUG("Enabling %s", veths[i]); + rc = vethInterfaceUpOrDown(veths[i], 1); if (0 != rc) { goto error_out; } } /* enable lo device only if there were other net devices */ - if (def->nets) + if (veths) rc = vethInterfaceUpOrDown("lo", 1); error_out: @@ -311,7 +312,7 @@ return -1; /* enable interfaces */ - if (lxcContainerEnableInterfaces(vmDef) < 0) + if (lxcContainerEnableInterfaces(argv->nveths, argv->veths) < 0) return -1; /* this function will only return if an error occured */ @@ -320,7 +321,6 @@ /** * lxcContainerStart: - * @conn: pointer to connection * @driver: pointer to driver structure * @vm: pointer to virtual machine structure * @@ -328,8 +328,9 @@ * * Returns PID of container on success or -1 in case of error */ -int lxcContainerStart(virConnectPtr conn, - lxc_vm_def_t *def, +int lxcContainerStart(lxc_vm_def_t *def, + int nveths, + char **veths, int control, char *ttyPath) { @@ -337,12 +338,11 @@ int flags; int stacksize = getpagesize() * 4; char *stack, *stacktop; - lxc_child_argv_t args = { def, control, ttyPath }; + lxc_child_argv_t args = { def, nveths, veths, control, ttyPath }; /* allocate a stack for the container */ if (VIR_ALLOC_N(stack, stacksize) < 0) { - lxcError(conn, NULL, VIR_ERR_NO_MEMORY, - _("unable to allocate container stack")); + lxcError(NULL, NULL, VIR_ERR_NO_MEMORY, NULL); return -1; } stacktop = stack + stacksize; @@ -357,7 +357,7 @@ DEBUG("clone() returned, %d", pid); if (pid < 0) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, _("clone() failed, %s"), strerror(errno)); return -1; } @@ -379,8 +379,9 @@ char *stack; int childStatus; - if (features & LXC_CONTAINER_FEATURE_NET) + if (features & LXC_CONTAINER_FEATURE_NET) { flags |= CLONE_NEWNET; + } if (VIR_ALLOC_N(stack, getpagesize() * 4) < 0) { DEBUG0("Unable to allocate stack"); diff -r 8093fb566748 src/lxc_container.h --- a/src/lxc_container.h Fri Aug 01 14:47:33 2008 +0100 +++ b/src/lxc_container.h Tue Aug 05 12:13:24 2008 +0100 @@ -32,11 +32,11 @@ LXC_CONTAINER_FEATURE_NET = (1 << 0), }; -int lxcContainerSendContinue(virConnectPtr conn, - int control); +int lxcContainerSendContinue(int control); -int lxcContainerStart(virConnectPtr conn, - lxc_vm_def_t *def, +int lxcContainerStart(lxc_vm_def_t *def, + int nveths, + char **veths, int control, char *ttyPath); diff -r 8093fb566748 src/lxc_controller.c --- a/src/lxc_controller.c Fri Aug 01 14:47:33 2008 +0100 +++ b/src/lxc_controller.c Tue Aug 05 12:13:24 2008 +0100 @@ -26,16 +26,59 @@ #ifdef WITH_LXC #include <sys/epoll.h> +#include <sys/wait.h> +#include <sys/socket.h> #include <unistd.h> +#include <paths.h> +#include <fcntl.h> #include "internal.h" #include "util.h" #include "lxc_conf.h" +#include "lxc_container.h" #include "lxc_controller.h" +#include "veth.h" +#include "memory.h" +#include "util.h" #define DEBUG(fmt,...) VIR_DEBUG(__FILE__, fmt, __VA_ARGS__) +#define DEBUG0(msg) VIR_DEBUG(__FILE__, "%s", msg) + + +#define LXC_HANDSHAKE_REQUEST 'm' +#define LXC_HANDSHAKE_REPLY 'r' + +int lxcControllerClientHandshake(int monitor) +{ + char c; + if (saferead(monitor, &c, sizeof(c)) != sizeof(c)) + return -1; + if (c != LXC_HANDSHAKE_REQUEST) { + errno = EINVAL; + return -1; + } + c = LXC_HANDSHAKE_REPLY; + if (safewrite(monitor, &c, sizeof(c)) != sizeof(c)) + return -1; + return 0; +} + +static int lxcControllerServerHandshake(int monitor) +{ + char c = LXC_HANDSHAKE_REQUEST; + if (safewrite(monitor, &c, sizeof(c)) != sizeof(c)) + return -1; + if (saferead(monitor, &c, sizeof(c)) != sizeof(c)) + return -1; + if (c != LXC_HANDSHAKE_REPLY) { + errno = EINVAL; + return -1; + } + return 0; +} + /** * lxcFdForward: @@ -91,7 +134,10 @@ * * Returns 0 on success or -1 in case of error */ -int lxcControllerMain(int appPty, int contPty) +static int lxcControllerMain(int monitor, + int client, + int appPty, + int contPty) { int rc = -1; int epollFd; @@ -120,15 +166,29 @@ memset(&epollEvent, 0x00, sizeof(epollEvent)); epollEvent.events = EPOLLIN|EPOLLET; /* edge triggered */ epollEvent.data.fd = appPty; - epollEvent.data.u32 = 0; /* fdArray position */ if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, appPty, &epollEvent)) { lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, _("epoll_ctl(appPty) failed: %s"), strerror(errno)); goto cleanup; } epollEvent.data.fd = contPty; - epollEvent.data.u32 = 1; /* fdArray position */ if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, contPty, &epollEvent)) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("epoll_ctl(contPty) failed: %s"), strerror(errno)); + goto cleanup; + } + + epollEvent.events = EPOLLIN; + epollEvent.data.fd = monitor; + if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, monitor, &epollEvent)) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("epoll_ctl(contPty) failed: %s"), strerror(errno)); + goto cleanup; + } + + epollEvent.events = EPOLLHUP; + epollEvent.data.fd = client; + if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, client, &epollEvent)) { lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, _("epoll_ctl(contPty) failed: %s"), strerror(errno)); goto cleanup; @@ -138,23 +198,46 @@ /* if active fd's, return if no events, else wait forever */ timeout = (numActive > 0) ? 0 : -1; numEvents = epoll_wait(epollFd, &epollEvent, 1, timeout); - if (0 < numEvents) { - if (epollEvent.events & EPOLLIN) { - curFdOff = epollEvent.data.u32; - if (!fdArray[curFdOff].active) { - fdArray[curFdOff].active = 1; - ++numActive; + if (numEvents > 0) { + if (epollEvent.data.fd == monitor) { + int fd = accept(monitor, NULL, 0); + if (client != -1 || /* Already connected, so kick new one out */ + lxcControllerServerHandshake(client) < 0) { + close(fd); + continue; } - - } else if (epollEvent.events & EPOLLHUP) { - DEBUG("EPOLLHUP from fd %d", epollEvent.data.fd); - continue; + client = fd; + epollEvent.events = EPOLLHUP; + epollEvent.data.fd = client; + if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, client, &epollEvent)) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("epoll_ctl(contPty) failed: %s"), strerror(errno)); + goto cleanup; + } + } else if (client != -1 && epollEvent.data.fd == client) { + if (0 > epoll_ctl(epollFd, EPOLL_CTL_DEL, client, &epollEvent)) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("epoll_ctl(contPty) failed: %s"), strerror(errno)); + goto cleanup; + } + close(client); + client = -1; } else { - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("error event %d"), epollEvent.events); - goto cleanup; + if (epollEvent.events & EPOLLIN) { + curFdOff = epollEvent.data.fd == appPty ? 0 : 1; + if (!fdArray[curFdOff].active) { + fdArray[curFdOff].active = 1; + ++numActive; + } + } else if (epollEvent.events & EPOLLHUP) { + DEBUG("EPOLLHUP from fd %d", epollEvent.data.fd); + continue; + } else { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("error event %d"), epollEvent.events); + goto cleanup; + } } - } else if (0 == numEvents) { if (2 == numActive) { /* both fds active, toggle between the two */ @@ -202,4 +285,236 @@ return rc; } + + +/** + * lxcControllerMoveInterfaces + * @nveths: number of interfaces + * @veths: interface names + * @container: pid of container + * + * Moves network interfaces into a container's namespace + * + * Returns 0 on success or -1 in case of error + */ +static int lxcControllerMoveInterfaces(int nveths, + char **veths, + pid_t container) +{ + int i; + for (i = 0 ; i < nveths ; i++) + if (moveInterfaceToNetNs(veths[i], container) < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to move interface %s to ns %d"), + veths[i], container); + return -1; + } + + return 0; +} + + +/** + * lxcCleanupInterfaces: + * @conn: pointer to connection + * @vm: pointer to virtual machine structure + * + * Cleans up the container interfaces by deleting the veth device pairs. + * + * Returns 0 on success or -1 in case of error + */ +static int lxcControllerCleanupInterfaces(int nveths, + char **veths) +{ + int i; + for (i = 0 ; i < nveths ; i++) + if (vethDelete(veths[i]) < 0) + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to delete veth: %s"), veths[i]); + /* will continue to try to cleanup any other interfaces */ + + return 0; +} + + +static int +lxcControllerRun(const char *stateDir, + lxc_vm_def_t *def, + int nveths, + char **veths, + int monitor, + int client, + int appPty) +{ + int rc = -1; + int control[2] = { -1, -1}; + int containerPty; + char *containerPtyPath; + pid_t container = -1; + + if (socketpair(PF_UNIX, SOCK_STREAM, 0, control) < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("sockpair failed: %s"), strerror(errno)); + goto cleanup; + } + + if (virFileOpenTty(&containerPty, + &containerPtyPath, + 0) < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to allocate tty: %s"), strerror(errno)); + goto cleanup; + } + + if ((container = lxcContainerStart(def, + nveths, + veths, + control[1], + containerPtyPath)) < 0) + goto cleanup; + close(control[1]); + control[1] = -1; + + if (lxcControllerMoveInterfaces(nveths, veths, container) < 0) + goto cleanup; + + if (lxcContainerSendContinue(control[0]) < 0) + goto cleanup; + + rc = lxcControllerMain(monitor, client, appPty, containerPty); + +cleanup: + if (control[0] != -1) + close(control[0]); + if (control[1] != -1) + close(control[1]); + VIR_FREE(containerPtyPath); + if (containerPty != -1) + close(containerPty); + + kill(container, SIGTERM); + waitpid(container, NULL, 0); + lxcControllerCleanupInterfaces(nveths, veths); + virFileDeletePid(stateDir, def->name); + return -1; +} + + +int lxcControllerStart(const char *stateDir, + lxc_vm_def_t *def, + int nveths, + char **veths, + int monitor, + int appPty, + int logfd) +{ + pid_t pid; + int rc; + int status, null; + int open_max, i; + int client; + + if ((pid = fork()) < 0) + return -1; + + if (pid > 0) { + /* Original caller waits for first child to exit */ + while (1) { + rc = waitpid(pid, &status, 0); + if (rc < 0) { + if (errno == EINTR) + continue; + return -1; + } + if (rc != pid) { + fprintf(stderr, + _("Unexpected pid %d != %d from waitpid\n"), + rc, pid); + return -1; + } + if (WIFEXITED(status) && + WEXITSTATUS(status) == 0) + return 0; + else { + fprintf(stderr, + _("Unexpected status %d from pid %d\n"), + status, pid); + return -1; + } + } + } + + /* First child is running here */ + + if (chdir("/") < 0) { + fprintf(stderr, _("Unable to change to root dir: %s\n"), + strerror(errno)); + _exit(-1); + } + + if (setsid() < 0) { + fprintf(stderr, _("Unable to become session leader: %s\n"), + strerror(errno)); + _exit(-1); + } + + if ((null = open(_PATH_DEVNULL, O_RDONLY)) < 0) { + fprintf(stderr, _("Unable to open %s: %s\n"), + _PATH_DEVNULL, strerror(errno)); + _exit(-1); + } + + open_max = sysconf (_SC_OPEN_MAX); + for (i = 0; i < open_max; i++) + if (i != appPty && + i != monitor && + i != logfd && + i != null) + close(i); + + if (dup2(null, STDIN_FILENO) < 0 || + dup2(logfd, STDOUT_FILENO) < 0 || + dup2(logfd, STDERR_FILENO) < 0) { + fprintf(stderr, _("Unable to redirect stdio: %s\n"), + strerror(errno)); + _exit(-1); + } + + close(null); + close(logfd); + + /* Now fork the real controller process */ + if ((pid = fork()) < 0) { + fprintf(stderr, _("Unable to fork controller: %s\n"), + strerror(errno)); + _exit(-1); + } + + if (pid > 0) { + /* First child now exits */ + _exit(0); + } + + /* This is real controller running... */ + if ((rc = virFileWritePid(stateDir, def->name, getpid())) != 0) { + fprintf(stderr, _("Unable to write pid file: %s\n"), + strerror(rc)); + _exit(-1); + } + + /* Accept initial client which is the libvirtd daemon */ + if ((client = accept(monitor, NULL, 0)) < 0 || + lxcControllerServerHandshake(client) < 0) { + fprintf(stderr, _("Failed connection from LXC driver: %s\n"), + strerror(errno)); + _exit(-1); + } + + /* Controlling libvirtd LXC driver now knows + what our PID is, and is able to cleanup after + us from now on */ + _exit(lxcControllerRun(stateDir, def, nveths, veths, monitor, client, appPty)); +} + + #endif diff -r 8093fb566748 src/lxc_controller.h --- a/src/lxc_controller.h Fri Aug 01 14:47:33 2008 +0100 +++ b/src/lxc_controller.h Tue Aug 05 12:13:24 2008 +0100 @@ -26,7 +26,17 @@ #ifdef WITH_LXC -int lxcControllerMain(int appPty, int contPty); +#include "lxc_conf.h" + +int lxcControllerClientHandshake(int monitor); + +int lxcControllerStart(const char *stateDir, + lxc_vm_def_t *def, + int nveths, + char **veths, + int monitor, + int appPty, + int logfd); #endif /* WITH_LXC */ diff -r 8093fb566748 src/lxc_driver.c --- a/src/lxc_driver.c Fri Aug 01 14:47:33 2008 +0100 +++ b/src/lxc_driver.c Tue Aug 05 12:13:24 2008 +0100 @@ -31,22 +31,23 @@ #include <stdbool.h> #include <string.h> #include <sys/types.h> -#include <termios.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <sys/poll.h> #include <unistd.h> #include <wait.h> +#include "internal.h" #include "lxc_conf.h" #include "lxc_container.h" #include "lxc_driver.h" #include "lxc_controller.h" -#include "driver.h" -#include "internal.h" #include "memory.h" #include "util.h" -#include "memory.h" #include "bridge.h" -#include "qemu_conf.h" #include "veth.h" +#include "event.h" + /* debug macros */ #define DEBUG(fmt,...) VIR_DEBUG(__FILE__, fmt, __VA_ARGS__) @@ -284,8 +285,6 @@ vm->configFile[0] = '\0'; - lxcDeleteTtyPidFile(vm); - lxcRemoveInactiveVM(driver, vm); return 0; @@ -339,10 +338,59 @@ return lxcGenerateXML(dom->conn, driver, vm, vm->def); } + +/** + * lxcVmCleanup: + * @vm: Ptr to VM to clean up + * + * waitpid() on the container process. kill and wait the tty process + * This is called by boh lxcDomainDestroy and lxcSigHandler when a + * container exits. + * + * Returns 0 on success or -1 in case of error + */ +static int lxcVMCleanup(virConnectPtr conn, + lxc_driver_t *driver, + lxc_vm_t * vm) +{ + int rc = -1; + int waitRc; + int childStatus = -1; + + while (((waitRc = waitpid(vm->pid, &childStatus, 0)) == -1) && + errno == EINTR); + + if ((waitRc != vm->pid) && (errno != ECHILD)) { + lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + _("waitpid failed to wait for container %d: %d %s"), + vm->pid, waitRc, strerror(errno)); + } + + rc = 0; + + if (WIFEXITED(childStatus)) { + rc = WEXITSTATUS(childStatus); + DEBUG("container exited with rc: %d", rc); + } + + virEventRemoveHandle(vm->monitor); + close(vm->monitor); + + virFileDeletePid(driver->stateDir, vm->def->name); + + vm->state = VIR_DOMAIN_SHUTOFF; + vm->pid = -1; + vm->def->id = -1; + vm->monitor = -1; + driver->nactivevms--; + driver->ninactivevms++; + + return rc; +} + /** * lxcSetupInterfaces: - * @conn: pointer to connection - * @vm: pointer to virtual machine structure + * @def: pointer to virtual machine structure * * Sets up the container interfaces by creating the veth device pairs and * attaching the parent end to the appropriate bridge. The container end @@ -351,24 +399,21 @@ * Returns 0 on success or -1 in case of error */ static int lxcSetupInterfaces(virConnectPtr conn, - lxc_vm_t *vm) + lxc_vm_def_t *def, + int *nveths, + char ***veths) { int rc = -1; - lxc_driver_t *driver = conn->privateData; - struct qemud_driver *networkDriver = - (struct qemud_driver *)(conn->networkPrivateData); - lxc_net_def_t *net = vm->def->nets; - char* bridge; + lxc_net_def_t *net; + char *bridge = NULL; char parentVeth[PATH_MAX] = ""; char containerVeth[PATH_MAX] = ""; + brControl *brctl = NULL; - if ((vm->def->nets != NULL) && (driver->have_netns == 0)) { - lxcError(conn, NULL, VIR_ERR_NO_SUPPORT, - _("System lacks NETNS support")); + if (brInit(&brctl) != 0) return -1; - } - for (net = vm->def->nets; net; net = net->next) { + for (net = def->nets; net; net = net->next) { if (LXC_NET_NETWORK == net->type) { virNetworkPtr network = virNetworkLookupByName(conn, net->txName); if (!network) { @@ -378,7 +423,6 @@ bridge = virNetworkGetBridgeName(network); virNetworkFree(network); - } else { bridge = net->txName; } @@ -394,9 +438,6 @@ if (NULL != net->parentVeth) { strcpy(parentVeth, net->parentVeth); } - if (NULL != net->containerVeth) { - strcpy(containerVeth, net->containerVeth); - } DEBUG("parentVeth: %s, containerVeth: %s", parentVeth, containerVeth); if (0 != (rc = vethCreate(parentVeth, PATH_MAX, containerVeth, PATH_MAX))) { lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, @@ -406,24 +447,18 @@ if (NULL == net->parentVeth) { net->parentVeth = strdup(parentVeth); } - if (NULL == net->containerVeth) { - net->containerVeth = strdup(containerVeth); - } + if (VIR_REALLOC_N(*veths, (*nveths)+1) < 0) + goto error_exit; + if (((*veths)[(*nveths)++] = strdup(containerVeth)) == NULL) + goto error_exit; - if ((NULL == net->parentVeth) || (NULL == net->containerVeth)) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + if (NULL == net->parentVeth) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, _("failed to allocate veth names")); goto error_exit; } - if (!(networkDriver->brctl) && (rc = brInit(&(networkDriver->brctl)))) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("cannot initialize bridge support: %s"), - strerror(rc)); - goto error_exit; - } - - if (0 != (rc = brAddInterface(networkDriver->brctl, bridge, parentVeth))) { + if (0 != (rc = brAddInterface(brctl, bridge, parentVeth))) { lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, _("failed to add %s device to %s: %s"), parentVeth, @@ -433,7 +468,7 @@ } if (0 != (rc = vethInterfaceUpOrDown(parentVeth, 1))) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, _("failed to enable parent ns veth device: %d"), rc); goto error_exit; } @@ -443,136 +478,151 @@ rc = 0; error_exit: + brShutdown(brctl); return rc; } -/** - * lxcMoveInterfacesToNetNs: - * @conn: pointer to connection - * @vm: pointer to virtual machine structure - * - * Starts a container process by calling clone() with the namespace flags - * - * Returns 0 on success or -1 in case of error - */ -static int lxcMoveInterfacesToNetNs(virConnectPtr conn, - const lxc_vm_t *vm) +static int lxcMonitorServer(virConnectPtr conn, + lxc_driver_t * driver, + lxc_vm_t *vm) { - int rc = -1; - lxc_net_def_t *net; + char *sockpath = NULL; + int fd; + struct sockaddr_un addr; - for (net = vm->def->nets; net; net = net->next) { - if (0 != moveInterfaceToNetNs(net->containerVeth, vm->def->id)) { + if (asprintf(&sockpath, "%s/%s.sock", + driver->stateDir, vm->def->name) < 0) { + lxcError(conn, NULL, VIR_ERR_NO_MEMORY, NULL); + return -1; + } + + if ((fd = socket(PF_UNIX, SOCK_STREAM, 0)) < 0) { + lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to create server socket: %s"), + strerror(errno)); + goto error; + } + + unlink(sockpath); + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + strncpy(addr.sun_path, sockpath, sizeof(addr.sun_path)); + + if (bind(fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) { + lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to bind server socket: %s"), + strerror(errno)); + goto error; + } + if (listen(fd, 30 /* backlog */ ) < 0) { + lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to listen server socket: %s"), + strerror(errno)); + goto error; + return (-1); + } + + VIR_FREE(sockpath); + return fd; + +error: + VIR_FREE(sockpath); + if (fd != -1) + close(fd); + return -1; +} + +static int lxcMonitorClient(virConnectPtr conn, + lxc_driver_t * driver, + lxc_vm_t *vm) +{ + char *sockpath = NULL; + int fd; + struct sockaddr_un addr; + + if (asprintf(&sockpath, "%s/%s.sock", + driver->stateDir, vm->def->name) < 0) { + lxcError(conn, NULL, VIR_ERR_NO_MEMORY, NULL); + return -1; + } + + if ((fd = socket(PF_UNIX, SOCK_STREAM, 0)) < 0) { + lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to create client socket: %s"), + strerror(errno)); + goto error; + } + + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + strncpy(addr.sun_path, sockpath, sizeof(addr.sun_path)); + + if (connect(fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) { + lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to connect to client socket: %s"), + strerror(errno)); + goto error; + } + + if (lxcControllerClientHandshake(fd) < 0) { + lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to handshake with client: %s"), + strerror(errno)); + goto error; + } + + VIR_FREE(sockpath); + return fd; + +error: + VIR_FREE(sockpath); + if (fd != -1) + close(fd); + return -1; +} + + +static int lxcVmTerminate(virConnectPtr conn, + lxc_driver_t *driver, + lxc_vm_t *vm, + int signum) +{ + if (signum == 0) + signum = SIGINT; + + if (kill(vm->pid, signum) < 0) { + if (errno != ESRCH) { lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("failed to move interface %s to ns %d"), - net->containerVeth, vm->def->id); - goto error_exit; + _("failed to kill pid %d: %s"), + vm->pid, strerror(errno)); + return -1; } } - rc = 0; + vm->state = VIR_DOMAIN_SHUTDOWN; -error_exit: - return rc; + return lxcVMCleanup(conn, driver, vm); } -/** - * lxcCleanupInterfaces: - * @conn: pointer to connection - * @vm: pointer to virtual machine structure - * - * Cleans up the container interfaces by deleting the veth device pairs. - * - * Returns 0 on success or -1 in case of error - */ -static int lxcCleanupInterfaces(const lxc_vm_t *vm) +static void lxcMonitorEvent(int fd, + int events ATTRIBUTE_UNUSED, + void *data) { - int rc = -1; - lxc_net_def_t *net; + lxc_driver_t *driver = data; + lxc_vm_t *vm = driver->vms; - for (net = vm->def->nets; net; net = net->next) { - if (0 != (rc = vethDelete(net->parentVeth))) { - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("failed to delete veth: %s"), net->parentVeth); - /* will continue to try to cleanup any other interfaces */ - } + while (vm) { + if (vm->monitor == fd) + break; + vm = vm->next; + } + if (!vm) { + virEventRemoveHandle(fd); + return; } - return 0; -} - - -/** - * lxcOpenTty: - * @conn: pointer to connection - * @ttymaster: pointer to int. On success, set to fd for master end - * @ttyName: On success, will point to string slave end of tty. Caller - * must free when done (such as in lxcFreeVM). - * - * Opens and configures container tty. - * - * Returns 0 on success or -1 in case of error - */ -static int lxcOpenTty(virConnectPtr conn, - int *ttymaster, - char **ttyName, - int rawmode) -{ - int rc = -1; - - *ttymaster = posix_openpt(O_RDWR|O_NOCTTY|O_NONBLOCK); - if (*ttymaster < 0) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("posix_openpt failed: %s"), strerror(errno)); - goto cleanup; - } - - if (unlockpt(*ttymaster) < 0) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("unlockpt failed: %s"), strerror(errno)); - goto cleanup; - } - - if (rawmode) { - struct termios ttyAttr; - if (tcgetattr(*ttymaster, &ttyAttr) < 0) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - "tcgetattr() failed: %s", strerror(errno)); - goto cleanup; - } - - cfmakeraw(&ttyAttr); - - if (tcsetattr(*ttymaster, TCSADRAIN, &ttyAttr) < 0) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - "tcsetattr failed: %s", strerror(errno)); - goto cleanup; - } - } - - if (ttyName) { - char tempTtyName[PATH_MAX]; - if (0 != ptsname_r(*ttymaster, tempTtyName, sizeof(tempTtyName))) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("ptsname_r failed: %s"), strerror(errno)); - goto cleanup; - } - - if ((*ttyName = strdup(tempTtyName)) == NULL) { - lxcError(conn, NULL, VIR_ERR_NO_MEMORY, NULL); - goto cleanup; - } - } - - rc = 0; - -cleanup: - if (rc != 0 && - *ttymaster != -1) { - close(*ttymaster); - } - - return rc; + if (lxcVmTerminate(NULL, driver, vm, SIGINT) < 0) + virEventRemoveHandle(fd); } @@ -590,81 +640,106 @@ lxc_driver_t * driver, lxc_vm_t * vm) { - int rc = -1; - int sockpair[2]; - int containerTty, parentTty; - char *containerTtyPath = NULL; + int rc = -1, i; + int monitor; + int parentTty; + char *logfile = NULL; + int logfd = -1; + int nveths = 0; + char **veths = NULL; + + if (virFileMakePath(driver->logDir) < 0) { + lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + _("cannot create log directory %s: %s"), + driver->logDir, strerror(rc)); + return -1; + } + + if (asprintf(&logfile, "%s/%s.log", + driver->logDir, vm->def->name) < 0) { + lxcError(conn, NULL, VIR_ERR_NO_MEMORY, NULL); + return -1; + } + + if ((monitor = lxcMonitorServer(conn, driver, vm)) < 0) + goto cleanup; /* open parent tty */ VIR_FREE(vm->def->tty); - if (lxcOpenTty(conn, &parentTty, &vm->def->tty, 1) < 0) { - goto cleanup; - } - - /* open container tty */ - if (lxcOpenTty(conn, &containerTty, &containerTtyPath, 0) < 0) { - goto cleanup; - } - - /* fork process to handle the tty io forwarding */ - if ((vm->pid = fork()) < 0) { + if (virFileOpenTty(&parentTty, &vm->def->tty, 1) < 0) { lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("unable to fork tty forwarding process: %s"), + _("failed to allocate tty: %s"), strerror(errno)); goto cleanup; } - if (vm->pid == 0) { - /* child process calls forward routine */ - lxcControllerMain(parentTty, containerTty); - } + if (lxcSetupInterfaces(conn, vm->def, &nveths, &veths) != 0) + goto cleanup; - if (lxcStoreTtyPid(driver, vm)) { - DEBUG0("unable to store tty pid"); - } - - close(parentTty); - close(containerTty); - - if (0 != (rc = lxcSetupInterfaces(conn, vm))) { + if ((logfd = open(logfile, O_WRONLY | O_TRUNC | O_CREAT, + S_IRUSR|S_IWUSR)) < 0) { + lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to open %s: %s"), logfile, + strerror(errno)); goto cleanup; } - /* create a socket pair to send continue message to the container once */ - /* we've completed the post clone configuration */ - if (0 != socketpair(PF_UNIX, SOCK_STREAM, 0, sockpair)) { + if (lxcControllerStart(driver->stateDir, + vm->def, nveths, veths, + monitor, parentTty, logfd) < 0) + goto cleanup; + /* Close the server side of the monitor, now owned + * by the controller process */ + close(monitor); + monitor = -1; + + /* Connect to the controller as a client *first* because + * this will block until the child has written their + * pid file out to disk */ + if ((vm->monitor = lxcMonitorClient(conn, driver, vm)) < 0) + goto cleanup; + + /* And get its pid */ + if ((rc = virFileReadPid(driver->stateDir, vm->def->name, &vm->pid)) != 0) { lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("sockpair failed: %s"), strerror(errno)); + _("Failed to read pid file %s/%s.pid: %s"), + driver->stateDir, vm->def->name, strerror(errno)); + rc = -1; goto cleanup; } - /* check this rc */ - - vm->def->id = lxcContainerStart(conn, - vm->def, - sockpair[1], - containerTtyPath); - if (vm->def->id == -1) - goto cleanup; - lxcSaveConfig(conn, driver, vm, vm->def); - - rc = lxcMoveInterfacesToNetNs(conn, vm); - if (rc != 0) - goto cleanup; - - rc = lxcContainerSendContinue(conn, sockpair[0]); - if (rc != 0) - goto cleanup; - + vm->def->id = vm->pid; vm->state = VIR_DOMAIN_RUNNING; driver->ninactivevms--; driver->nactivevms++; + if (virEventAddHandle(vm->monitor, + POLLERR | POLLHUP, + lxcMonitorEvent, + driver) < 0) { + lxcVmTerminate(conn, driver, vm, 0); + goto cleanup; + } + + rc = 0; + cleanup: - close(sockpair[0]); - close(sockpair[1]); - VIR_FREE(containerTtyPath); - + for (i = 0 ; i < nveths ; i++) { + if (rc != 0) + vethDelete(veths[i]); + VIR_FREE(veths[i]); + } + if (monitor != -1) + close(monitor); + if (rc != 0 && vm->monitor != -1) { + close(vm->monitor); + vm->monitor = -1; + } + if (parentTty != -1) + close(parentTty); + if (logfd != -1) + close(logfd); + VIR_FREE(logfile); return rc; } @@ -752,105 +827,18 @@ */ static int lxcDomainShutdown(virDomainPtr dom) { - int rc = -1; lxc_driver_t *driver = (lxc_driver_t*)dom->conn->privateData; lxc_vm_t *vm = lxcFindVMByID(driver, dom->id); if (!vm) { lxcError(dom->conn, dom, VIR_ERR_INVALID_DOMAIN, _("no domain with id %d"), dom->id); - goto error_out; + return -1; } - if (0 > (kill(vm->def->id, SIGINT))) { - if (ESRCH != errno) { - lxcError(dom->conn, dom, VIR_ERR_INTERNAL_ERROR, - _("sending SIGTERM failed: %s"), strerror(errno)); - - goto error_out; - } - } - - vm->state = VIR_DOMAIN_SHUTDOWN; - - rc = 0; - -error_out: - return rc; + return lxcVmTerminate(dom->conn, driver, vm, 0); } -/** - * lxcVmCleanup: - * @vm: Ptr to VM to clean up - * - * waitpid() on the container process. kill and wait the tty process - * This is called by boh lxcDomainDestroy and lxcSigHandler when a - * container exits. - * - * Returns 0 on success or -1 in case of error - */ -static int lxcVMCleanup(lxc_driver_t *driver, lxc_vm_t * vm) -{ - int rc = -1; - int waitRc; - int childStatus = -1; - - /* if this fails, we'll continue. it will report any errors */ - lxcCleanupInterfaces(vm); - - while (((waitRc = waitpid(vm->def->id, &childStatus, 0)) == -1) && - errno == EINTR); - - if ((waitRc != vm->def->id) && (errno != ECHILD)) { - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("waitpid failed to wait for container %d: %d %s"), - vm->def->id, waitRc, strerror(errno)); - goto kill_tty; - } - - rc = 0; - - if (WIFEXITED(childStatus)) { - rc = WEXITSTATUS(childStatus); - DEBUG("container exited with rc: %d", rc); - } - -kill_tty: - if (2 > vm->pid) { - DEBUG("not killing tty process with pid %d", vm->pid); - goto tty_error_out; - } - - if (0 > (kill(vm->pid, SIGKILL))) { - if (ESRCH != errno) { - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("sending SIGKILL to tty process failed: %s"), - strerror(errno)); - - goto tty_error_out; - } - } - - while (((waitRc = waitpid(vm->pid, &childStatus, 0)) == -1) && - errno == EINTR); - - if ((waitRc != vm->pid) && (errno != ECHILD)) { - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("waitpid failed to wait for tty %d: %d %s"), - vm->pid, waitRc, strerror(errno)); - } - -tty_error_out: - vm->state = VIR_DOMAIN_SHUTOFF; - vm->pid = -1; - lxcDeleteTtyPidFile(vm); - vm->def->id = -1; - driver->nactivevms--; - driver->ninactivevms++; - lxcSaveConfig(NULL, driver, vm, vm->def); - - return rc; - } /** * lxcDomainDestroy: @@ -862,31 +850,16 @@ */ static int lxcDomainDestroy(virDomainPtr dom) { - int rc = -1; lxc_driver_t *driver = (lxc_driver_t*)dom->conn->privateData; lxc_vm_t *vm = lxcFindVMByID(driver, dom->id); if (!vm) { lxcError(dom->conn, dom, VIR_ERR_INVALID_DOMAIN, _("no domain with id %d"), dom->id); - goto error_out; + return -1; } - if (0 > (kill(vm->def->id, SIGKILL))) { - if (ESRCH != errno) { - lxcError(dom->conn, dom, VIR_ERR_INTERNAL_ERROR, - _("sending SIGKILL failed: %s"), strerror(errno)); - - goto error_out; - } - } - - vm->state = VIR_DOMAIN_SHUTDOWN; - - rc = lxcVMCleanup(driver, vm); - -error_out: - return rc; + return lxcVmTerminate(dom->conn, driver, vm, SIGKILL); } static int lxcCheckNetNsSupport(void) @@ -907,6 +880,7 @@ static int lxcStartup(void) { uid_t uid = getuid(); + lxc_vm_t *vm; /* Check that the user is root */ if (0 != uid) { @@ -935,6 +909,36 @@ return -1; } + vm = lxc_driver->vms; + while (vm) { + int rc; + if ((vm->monitor = lxcMonitorClient(NULL, lxc_driver, vm)) < 0) { + vm = vm->next; + continue; + } + + /* Read pid from controller */ + if ((rc = virFileReadPid(lxc_driver->stateDir, vm->def->name, &vm->pid)) != 0) { + close(vm->monitor); + vm->monitor = -1; + vm = vm->next; + continue; + } + + if (vm->pid != 0) { + vm->def->id = vm->pid; + vm->state = VIR_DOMAIN_RUNNING; + lxc_driver->ninactivevms--; + lxc_driver->nactivevms++; + } else { + vm->def->id = -1; + close(vm->monitor); + vm->monitor = -1; + } + + vm = vm->next; + } + return 0; } @@ -942,6 +946,7 @@ { VIR_FREE(driver->configDir); VIR_FREE(driver->stateDir); + VIR_FREE(driver->logDir); VIR_FREE(driver); } @@ -976,37 +981,6 @@ /* Otherwise we're happy to deal with a shutdown */ return 0; -} - -/** - * lxcSigHandler: - * @siginfo: Pointer to siginfo_t structure - * - * Handles signals received by libvirtd. Currently this is used to - * catch SIGCHLD from an exiting container. - * - * Returns 0 on success or -1 in case of error - */ -static int lxcSigHandler(siginfo_t *siginfo) -{ - int rc = -1; - lxc_vm_t *vm; - - if (siginfo->si_signo == SIGCHLD) { - vm = lxcFindVMByID(lxc_driver, siginfo->si_pid); - - if (NULL == vm) { - DEBUG("Ignoring SIGCHLD from non-container process %d\n", - siginfo->si_pid); - goto cleanup; - } - - rc = lxcVMCleanup(lxc_driver, vm); - - } - -cleanup: - return rc; } @@ -1079,7 +1053,7 @@ lxcShutdown, NULL, /* reload */ lxcActive, - lxcSigHandler + NULL, }; int lxcRegister(void) diff -r 8093fb566748 src/util.c --- a/src/util.c Fri Aug 01 14:47:33 2008 +0100 +++ b/src/util.c Tue Aug 05 12:13:24 2008 +0100 @@ -37,6 +37,7 @@ #include <sys/wait.h> #endif #include <string.h> +#include <termios.h> #include "c-ctype.h" #ifdef HAVE_PATHS_H @@ -556,6 +557,163 @@ return 0; } + +int virFileOpenTty(int *ttymaster, + char **ttyName, + int rawmode) +{ + int rc = -1; + + if ((*ttymaster = posix_openpt(O_RDWR|O_NOCTTY|O_NONBLOCK)) < 0) + goto cleanup; + + if (unlockpt(*ttymaster) < 0) + goto cleanup; + + if (grantpt(*ttymaster) < 0) + goto cleanup; + + if (rawmode) { + struct termios ttyAttr; + if (tcgetattr(*ttymaster, &ttyAttr) < 0) + goto cleanup; + + cfmakeraw(&ttyAttr); + + if (tcsetattr(*ttymaster, TCSADRAIN, &ttyAttr) < 0) + goto cleanup; + } + + if (ttyName) { + char tempTtyName[PATH_MAX]; + if (ptsname_r(*ttymaster, tempTtyName, sizeof(tempTtyName)) < 0) + goto cleanup; + + if ((*ttyName = strdup(tempTtyName)) == NULL) { + errno = ENOMEM; + goto cleanup; + } + } + + rc = 0; + +cleanup: + if (rc != 0 && + *ttymaster != -1) { + close(*ttymaster); + } + + return rc; + +} + + +int virFileWritePid(const char *dir, + const char *name, + pid_t pid) +{ + int rc; + int fd; + FILE *file = NULL; + char *pidfile = NULL; + + if ((rc = virFileMakePath(dir))) { + goto cleanup; + } + + if (asprintf(&pidfile, "%s/%s.pid", dir, name) < 0) { + rc = ENOMEM; + goto cleanup; + } + + if ((fd = open(pidfile, + O_WRONLY | O_CREAT | O_TRUNC, + S_IRUSR | S_IWUSR)) < 0) { + rc = errno; + goto cleanup; + } + + if (!(file = fdopen(fd, "w"))) { + rc = errno; + close(fd); + goto cleanup; + } + + if (fprintf(file, "%d", pid) < 0) { + rc = errno; + goto cleanup; + } + + rc = 0; + +cleanup: + if (file && + fclose(file) < 0) { + rc = errno; + } + + VIR_FREE(pidfile); + return rc; +} + +int virFileReadPid(const char *dir, + const char *name, + pid_t *pid) +{ + int rc; + FILE *file; + char *pidfile = NULL; + *pid = 0; + if (asprintf(&pidfile, "%s/%s.pid", dir, name) < 0) { + rc = ENOMEM; + goto cleanup; + } + + if (!(file = fopen(pidfile, "r"))) { + rc = errno; + goto cleanup; + } + + if (fscanf(file, "%d", pid) != 1) { + rc = EINVAL; + goto cleanup; + } + + if (fclose(file) < 0) { + rc = errno; + goto cleanup; + } + + rc = 0; + + cleanup: + VIR_FREE(pidfile); + return rc; +} + +int virFileDeletePid(const char *dir, + const char *name) +{ + int rc = 0; + char *pidfile = NULL; + + if (asprintf(&pidfile, "%s/%s.pid", dir, name) < 0) { + rc = errno; + goto cleanup; + } + + if (unlink(pidfile) < 0 && + errno != ENOENT) { + rc = errno; + } + +cleanup: + VIR_FREE(pidfile); + return rc; +} + + + /* Like strtol, but produce an "int" result, and check more carefully. Return 0 upon success; return -1 to indicate failure. When END_PTR is NULL, the byte after the final valid digit must be NUL. diff -r 8093fb566748 src/util.h --- a/src/util.h Fri Aug 01 14:47:33 2008 +0100 +++ b/src/util.h Tue Aug 05 12:13:24 2008 +0100 @@ -58,6 +58,19 @@ char *buf, unsigned int buflen); +int virFileOpenTty(int *ttymaster, + char **ttyName, + int rawmode); + + +int virFileWritePid(const char *dir, + const char *name, + pid_t pid); +int virFileReadPid(const char *dir, + const char *name, + pid_t *pid); +int virFileDeletePid(const char *dir, + const char *name); int __virStrToLong_i(char const *s, char **end_ptr, -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

"Daniel P. Berrange" <berrange@redhat.com> wrote: ... [ Nice long explanation. ] It'd be great to put that in the code.
lxc_conf.c | 195 ---------------- lxc_conf.h | 12 lxc_container.c | 39 +-- lxc_container.h | 8 lxc_controller.c | 349 +++++++++++++++++++++++++++- lxc_controller.h | 12 lxc_driver.c | 662 ++++++++++++++++++++++++++----------------------------- util.c | 158 +++++++++++++ util.h | 13 + 9 files changed, 870 insertions(+), 578 deletions(-) ... diff -r 8093fb566748 src/lxc_conf.c diff -r 8093fb566748 src/lxc_conf.h --- a/src/lxc_conf.h Fri Aug 01 14:47:33 2008 +0100 +++ b/src/lxc_conf.h Tue Aug 05 12:13:24 2008 +0100 @@ -46,7 +46,6 @@ struct __lxc_net_def { int type; char *parentVeth; /* veth device in parent namespace */ - char *containerVeth; /* veth device in container namespace */ char *txName; /* bridge or network name */
lxc_net_def_t *next; @@ -87,11 +86,10 @@ struct __lxc_vm { int pid; int state; + int monitor;
I like "monitor_fd" ;-)
diff -r 8093fb566748 src/lxc_container.c --- a/src/lxc_container.c Fri Aug 01 14:47:33 2008 +0100 +++ b/src/lxc_container.c Tue Aug 05 12:13:24 2008 +0100 @@ -69,6 +69,8 @@ typedef struct __lxc_child_argv lxc_child_argv_t; struct __lxc_child_argv { lxc_vm_def_t *config; + int nveths;
From the looks of all uses, this can be an unsigned type. Then readers won't wonder if negative values are significant.
+ char **veths; int monitor; char *ttyPath; }; @@ -171,8 +173,7 @@ * * Returns 0 on success or -1 in case of error */ -int lxcContainerSendContinue(virConnectPtr conn, - int control) +int lxcContainerSendContinue(int control)
I like "control_fd", but hesitated to mention it, since this is a preexisting name.
{ int rc = -1; lxc_message_t msg = LXC_CONTINUE_MSG; @@ -180,7 +181,7 @@
writeCount = safewrite(control, &msg, sizeof(msg)); if (writeCount != sizeof(msg)) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, _("unable to send container continue message: %s"), strerror(errno)); goto error_out; @@ -230,21 +231,21 @@ * * Returns 0 on success or nonzero in case of error */ -static int lxcContainerEnableInterfaces(const lxc_vm_def_t *def) +static int lxcContainerEnableInterfaces(int nveths, + char **veths) { - int rc = 0; - const lxc_net_def_t *net; + int rc = 0, i;
Many style guides recommend against putting multiple ","-separated declarations on the same line... ...
@@ -337,12 +338,11 @@ int flags; int stacksize = getpagesize() * 4; char *stack, *stacktop; - lxc_child_argv_t args = { def, control, ttyPath }; + lxc_child_argv_t args = { def, nveths, veths, control, ttyPath };
If possible, it'd be nice to make the struct "const". ...
@@ -379,8 +379,9 @@ char *stack; int childStatus;
- if (features & LXC_CONTAINER_FEATURE_NET) + if (features & LXC_CONTAINER_FEATURE_NET) { flags |= CLONE_NEWNET; + }
Since you're making a change like this, I suspect it's worth adding a sentence+example to HACKING saying that we prefer to use braces even when there's only one statement. Out of habit, I tend *not* to use braces in that case, but have no trouble adapting. Codifying it might help avoid a little churn. Let me know and I'll be happy to make the change.
diff -r 8093fb566748 src/lxc_controller.c ... -int lxcControllerMain(int appPty, int contPty) +static int lxcControllerMain(int monitor, + int client, + int appPty, + int contPty) { int rc = -1; int epollFd; @@ -120,15 +166,29 @@ memset(&epollEvent, 0x00, sizeof(epollEvent)); epollEvent.events = EPOLLIN|EPOLLET; /* edge triggered */ epollEvent.data.fd = appPty; - epollEvent.data.u32 = 0; /* fdArray position */ if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, appPty, &epollEvent)) { lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, _("epoll_ctl(appPty) failed: %s"), strerror(errno)); goto cleanup; } epollEvent.data.fd = contPty; - epollEvent.data.u32 = 1; /* fdArray position */ if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, contPty, &epollEvent)) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("epoll_ctl(contPty) failed: %s"), strerror(errno)); + goto cleanup; + } + + epollEvent.events = EPOLLIN; + epollEvent.data.fd = monitor; + if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, monitor, &epollEvent)) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("epoll_ctl(contPty) failed: %s"), strerror(errno)); + goto cleanup; + } + + epollEvent.events = EPOLLHUP; + epollEvent.data.fd = client; + if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, client, &epollEvent)) { lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, _("epoll_ctl(contPty) failed: %s"), strerror(errno)); goto cleanup; @@ -138,23 +198,46 @@ /* if active fd's, return if no events, else wait forever */ timeout = (numActive > 0) ? 0 : -1; numEvents = epoll_wait(epollFd, &epollEvent, 1, timeout); - if (0 < numEvents) { - if (epollEvent.events & EPOLLIN) { - curFdOff = epollEvent.data.u32; - if (!fdArray[curFdOff].active) { - fdArray[curFdOff].active = 1; - ++numActive; + if (numEvents > 0) { + if (epollEvent.data.fd == monitor) { + int fd = accept(monitor, NULL, 0); + if (client != -1 || /* Already connected, so kick new one out */
I presume you meant to insert "client = fd;" right after "fd = ...", (or compare fd != -1, and then set client = fd after the "}"), since it looks like "client" must be defined after this if/else chain.
+ lxcControllerServerHandshake(client) < 0) { + close(fd); + continue; } - - } else if (epollEvent.events & EPOLLHUP) { - DEBUG("EPOLLHUP from fd %d", epollEvent.data.fd); - continue; + client = fd; + epollEvent.events = EPOLLHUP; + epollEvent.data.fd = client; + if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, client, &epollEvent)) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("epoll_ctl(contPty) failed: %s"), strerror(errno)); + goto cleanup; + } + } else if (client != -1 && epollEvent.data.fd == client) { ... +static int +lxcControllerRun(const char *stateDir, + lxc_vm_def_t *def, + int nveths, + char **veths, + int monitor, + int client, + int appPty) +{ + int rc = -1; + int control[2] = { -1, -1}; + int containerPty; + char *containerPtyPath; + pid_t container = -1; + + if (socketpair(PF_UNIX, SOCK_STREAM, 0, control) < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("sockpair failed: %s"), strerror(errno)); + goto cleanup; + } + + if (virFileOpenTty(&containerPty, + &containerPtyPath, + 0) < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to allocate tty: %s"), strerror(errno)); + goto cleanup; + } + + if ((container = lxcContainerStart(def, + nveths, + veths, + control[1], + containerPtyPath)) < 0) + goto cleanup; + close(control[1]); + control[1] = -1; + + if (lxcControllerMoveInterfaces(nveths, veths, container) < 0) + goto cleanup; + + if (lxcContainerSendContinue(control[0]) < 0) + goto cleanup; + + rc = lxcControllerMain(monitor, client, appPty, containerPty); + +cleanup: + if (control[0] != -1) + close(control[0]); + if (control[1] != -1) + close(control[1]); + VIR_FREE(containerPtyPath); + if (containerPty != -1) + close(containerPty); + + kill(container, SIGTERM); + waitpid(container, NULL, 0);
It might be worthwhile to detect kill or waitpid failure when rc == 0.
+ lxcControllerCleanupInterfaces(nveths, veths); + virFileDeletePid(stateDir, def->name); + return -1;
I guess this should be "return rc;". Otherwise, this function always returns -1 and "rc" is set but never used.
+} + + +int lxcControllerStart(const char *stateDir, ... diff -r 8093fb566748 src/lxc_driver.c +/** + * lxcVmCleanup: + * @vm: Ptr to VM to clean up + * + * waitpid() on the container process. kill and wait the tty process + * This is called by boh lxcDomainDestroy and lxcSigHandler when a
s/boh/both/
+ * container exits. + * + * Returns 0 on success or -1 in case of error + */ +static int lxcVMCleanup(virConnectPtr conn, + lxc_driver_t *driver, + lxc_vm_t * vm) +{ + int rc = -1; + int waitRc; + int childStatus = -1; + + while (((waitRc = waitpid(vm->pid, &childStatus, 0)) == -1) && + errno == EINTR);
It's easier to recognize this as an empty loop if you give it a comment and put the semicolon on its own line: while (((waitRc = waitpid(vm->pid, &childStatus, 0)) == -1) && errno == EINTR) ; /* empty */ Or give it an empty brace group: while (((waitRc = waitpid(vm->pid, &childStatus, 0)) == -1) && errno == EINTR) { /* empty */ } ...
+static int lxcMonitorServer(virConnectPtr conn, + lxc_driver_t * driver, + lxc_vm_t *vm) { - int rc = -1; - lxc_net_def_t *net; + char *sockpath = NULL; + int fd; + struct sockaddr_un addr;
- for (net = vm->def->nets; net; net = net->next) { - if (0 != moveInterfaceToNetNs(net->containerVeth, vm->def->id)) { + if (asprintf(&sockpath, "%s/%s.sock", + driver->stateDir, vm->def->name) < 0) { + lxcError(conn, NULL, VIR_ERR_NO_MEMORY, NULL); + return -1; + } + + if ((fd = socket(PF_UNIX, SOCK_STREAM, 0)) < 0) { + lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to create server socket: %s"),
It'd be nice to include sockpath in these diagnostics.
+ strerror(errno)); + goto error; + } + + unlink(sockpath);
Report failure other than ENOENT.
+ memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + strncpy(addr.sun_path, sockpath, sizeof(addr.sun_path)); + + if (bind(fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) { + lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to bind server socket: %s"), + strerror(errno)); + goto error; + } + if (listen(fd, 30 /* backlog */ ) < 0) { + lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to listen server socket: %s"), + strerror(errno)); + goto error; + return (-1);
That return is dead code. ...
+static int lxcMonitorClient(virConnectPtr conn, + lxc_driver_t * driver, + lxc_vm_t *vm) +{ + char *sockpath = NULL; + int fd; + struct sockaddr_un addr; + + if (asprintf(&sockpath, "%s/%s.sock", + driver->stateDir, vm->def->name) < 0) { + lxcError(conn, NULL, VIR_ERR_NO_MEMORY, NULL); + return -1; + } + + if ((fd = socket(PF_UNIX, SOCK_STREAM, 0)) < 0) { + lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to create client socket: %s"), + strerror(errno));
Likewise, it'd be nice to mention sockpath in these.
+ goto error; + } + + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + strncpy(addr.sun_path, sockpath, sizeof(addr.sun_path)); + + if (connect(fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) { + lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to connect to client socket: %s"), + strerror(errno)); + goto error; + } ...
I'm quoting the following snippet here (may be out of order), because I spotted the problem only after eliding the original.
+ /* And get its pid */ + if ((rc = virFileReadPid(driver->stateDir, vm->def->name, &vm->pid)) != 0> ) { lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("sockpair failed: %s"), strerror(errno)); + _("Failed to read pid file %s/%s.pid: %s"), + driver->stateDir, vm->def->name, strerror(errno));
That should be "rc", not "errno". Otherwise, diagnosing the virFileReadPid parse error that is intended to map to EINVAL, we'd use some unrelated errno value.
+ rc = -1; goto cleanup; }
int lxcRegister(void) diff -r 8093fb566748 src/util.c --- a/src/util.c Fri Aug 01 14:47:33 2008 +0100 +++ b/src/util.c Tue Aug 05 12:13:24 2008 +0100 @@ -37,6 +37,7 @@ #include <sys/wait.h> #endif #include <string.h> +#include <termios.h> #include "c-ctype.h"
#ifdef HAVE_PATHS_H @@ -556,6 +557,163 @@ return 0; }
+ +int virFileOpenTty(int *ttymaster, + char **ttyName, + int rawmode) +{ + int rc = -1;
As you know, this function is a portability "challenge" ;-) (that's an understatement) I suppose we'll need to guard this function with #ifdef WITH_LXC.
+ if ((*ttymaster = posix_openpt(O_RDWR|O_NOCTTY|O_NONBLOCK)) < 0) + goto cleanup; + + if (unlockpt(*ttymaster) < 0) + goto cleanup; + + if (grantpt(*ttymaster) < 0) + goto cleanup; + + if (rawmode) { + struct termios ttyAttr; + if (tcgetattr(*ttymaster, &ttyAttr) < 0) + goto cleanup; + + cfmakeraw(&ttyAttr);
This is glibc-specific, but that's probably ok, for something that is used only WITH_LXC.
+ + if (tcsetattr(*ttymaster, TCSADRAIN, &ttyAttr) < 0) + goto cleanup; + } + + if (ttyName) { + char tempTtyName[PATH_MAX]; + if (ptsname_r(*ttymaster, tempTtyName, sizeof(tempTtyName)) < 0) + goto cleanup; + + if ((*ttyName = strdup(tempTtyName)) == NULL) { + errno = ENOMEM; + goto cleanup; + } + } + + rc = 0; + +cleanup: + if (rc != 0 && + *ttymaster != -1) { + close(*ttymaster); + } + + return rc; + +} + + +int virFileWritePid(const char *dir, + const char *name, + pid_t pid) +{ + int rc; + int fd; + FILE *file = NULL; + char *pidfile = NULL; + + if ((rc = virFileMakePath(dir))) { + goto cleanup; + } + + if (asprintf(&pidfile, "%s/%s.pid", dir, name) < 0) { + rc = ENOMEM; + goto cleanup; + } + + if ((fd = open(pidfile, + O_WRONLY | O_CREAT | O_TRUNC, + S_IRUSR | S_IWUSR)) < 0) { + rc = errno; + goto cleanup; + } + + if (!(file = fdopen(fd, "w"))) { + rc = errno; + close(fd); + goto cleanup; + } + + if (fprintf(file, "%d", pid) < 0) { + rc = errno; + goto cleanup; + } + + rc = 0; + +cleanup: + if (file && + fclose(file) < 0) { + rc = errno; + } + + VIR_FREE(pidfile); + return rc; +} + +int virFileReadPid(const char *dir, + const char *name, + pid_t *pid) +{ + int rc; + FILE *file; + char *pidfile = NULL; + *pid = 0; + if (asprintf(&pidfile, "%s/%s.pid", dir, name) < 0) { + rc = ENOMEM; + goto cleanup; + } + + if (!(file = fopen(pidfile, "r"))) { + rc = errno; + goto cleanup; + } + + if (fscanf(file, "%d", pid) != 1) { + rc = EINVAL; + goto cleanup; + }
[save this for a rainy day] fscanf is not robust. It does the job (and is almost always fine), but fails to diagnose inputs like "938bogus suffix" and silently wraps e.g., 4294967296 to 0. Then there's the possibility that pid_t is not the same size as "int". An alternative would be to read two "tokens" (e.g., with getdelim), ensuring you get EOF on the second read, convert the first token to an integer using virStrToLong_ui, then ensure the resulting value is fits in a pid_t variable.
+ if (fclose(file) < 0) { + rc = errno; + goto cleanup; + } + + rc = 0; + + cleanup: + VIR_FREE(pidfile); + return rc; +} + +int virFileDeletePid(const char *dir, + const char *name) +{ + int rc = 0; + char *pidfile = NULL; + + if (asprintf(&pidfile, "%s/%s.pid", dir, name) < 0) { + rc = errno; + goto cleanup; + } + + if (unlink(pidfile) < 0 && + errno != ENOENT) { + rc = errno;
I'd put the errno test on the previous line. Otherwise, it looks deceptively like "errno != ..." is the beginning of a statement. This is part of the rationale for putting operators like "&&" at the beginning of each continued line, rather than at the end of the preceding one.
+ } + +cleanup: + VIR_FREE(pidfile); + return rc; +} + + + /* Like strtol, but produce an "int" result, and check more carefully. Return 0 upon success; return -1 to indicate failure. When END_PTR is NULL, the byte after the final valid digit must be NUL.

On Tue, Aug 12, 2008 at 11:07:00AM +0200, Jim Meyering wrote:
"Daniel P. Berrange" <berrange@redhat.com> wrote:
diff -r 8093fb566748 src/lxc_conf.c diff -r 8093fb566748 src/lxc_conf.h --- a/src/lxc_conf.h Fri Aug 01 14:47:33 2008 +0100 +++ b/src/lxc_conf.h Tue Aug 05 12:13:24 2008 +0100 @@ -46,7 +46,6 @@ struct __lxc_net_def { int type; char *parentVeth; /* veth device in parent namespace */ - char *containerVeth; /* veth device in container namespace */ char *txName; /* bridge or network name */
lxc_net_def_t *next; @@ -87,11 +86,10 @@ struct __lxc_vm { int pid; int state; + int monitor;
I like "monitor_fd" ;-)
This struct goes away in the next patch.
diff -r 8093fb566748 src/lxc_container.c --- a/src/lxc_container.c Fri Aug 01 14:47:33 2008 +0100 +++ b/src/lxc_container.c Tue Aug 05 12:13:24 2008 +0100 @@ -69,6 +69,8 @@ typedef struct __lxc_child_argv lxc_child_argv_t; struct __lxc_child_argv { lxc_vm_def_t *config; + int nveths;
From the looks of all uses, this can be an unsigned type.
Yes, I've updated this to be unsigned throughout.
@@ -230,21 +231,21 @@ * * Returns 0 on success or nonzero in case of error */ -static int lxcContainerEnableInterfaces(const lxc_vm_def_t *def) +static int lxcContainerEnableInterfaces(int nveths, + char **veths) { - int rc = 0; - const lxc_net_def_t *net; + int rc = 0, i;
Many style guides recommend against putting multiple ","-separated declarations on the same line...
I've split this onto 2 declarations anyway, because 'i' can now be unsigned.
@@ -337,12 +338,11 @@ int flags; int stacksize = getpagesize() * 4; char *stack, *stacktop; - lxc_child_argv_t args = { def, control, ttyPath }; + lxc_child_argv_t args = { def, nveths, veths, control, ttyPath };
If possible, it'd be nice to make the struct "const".
Doesn't help much, because then I have to cast-away the constness when passing it into clone().
@@ -379,8 +379,9 @@ char *stack; int childStatus;
- if (features & LXC_CONTAINER_FEATURE_NET) + if (features & LXC_CONTAINER_FEATURE_NET) { flags |= CLONE_NEWNET; + }
Since you're making a change like this, I suspect it's worth adding a sentence+example to HACKING saying that we prefer to use braces even when there's only one statement. Out of habit, I tend *not* to use braces in that case, but have no trouble adapting. Codifying it might help avoid a little churn.
No, that's a bogus change - I don't like to have {} for single line statements - its just unneccessary noise.
@@ -138,23 +198,46 @@ /* if active fd's, return if no events, else wait forever */ timeout = (numActive > 0) ? 0 : -1; numEvents = epoll_wait(epollFd, &epollEvent, 1, timeout); - if (0 < numEvents) { - if (epollEvent.events & EPOLLIN) { - curFdOff = epollEvent.data.u32; - if (!fdArray[curFdOff].active) { - fdArray[curFdOff].active = 1; - ++numActive; + if (numEvents > 0) { + if (epollEvent.data.fd == monitor) { + int fd = accept(monitor, NULL, 0); + if (client != -1 || /* Already connected, so kick new one out */
I presume you meant to insert "client = fd;" right after "fd = ...", (or compare fd != -1, and then set client = fd after the "}"), since it looks like "client" must be defined after this if/else chain.
Yes, there should have been a 'client = fd;' statement after the conditional.
+ kill(container, SIGTERM); + waitpid(container, NULL, 0);
It might be worthwhile to detect kill or waitpid failure when rc == 0.
Any more to the point, it should not be invoked at all when container == -1, because that kills off every process on your system except for init. Yes, that's happened to me several times while debugging this :-)
+ * container exits. + * + * Returns 0 on success or -1 in case of error + */ +static int lxcVMCleanup(virConnectPtr conn, + lxc_driver_t *driver, + lxc_vm_t * vm) +{ + int rc = -1; + int waitRc; + int childStatus = -1; + + while (((waitRc = waitpid(vm->pid, &childStatus, 0)) == -1) && + errno == EINTR);
It's easier to recognize this as an empty loop if you give it a comment and put the semicolon on its own line:
while (((waitRc = waitpid(vm->pid, &childStatus, 0)) == -1) && errno == EINTR) ; /* empty */
Made this change.
+ goto error; + } + + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + strncpy(addr.sun_path, sockpath, sizeof(addr.sun_path)); + + if (connect(fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) { + lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to connect to client socket: %s"), + strerror(errno)); + goto error; + } ...
I'm quoting the following snippet here (may be out of order), because I spotted the problem only after eliding the original.
+ /* And get its pid */ + if ((rc = virFileReadPid(driver->stateDir, vm->def->name, &vm->pid)) != 0> ) { lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("sockpair failed: %s"), strerror(errno)); + _("Failed to read pid file %s/%s.pid: %s"), + driver->stateDir, vm->def->name, strerror(errno));
That should be "rc", not "errno". Otherwise, diagnosing the virFileReadPid parse error that is intended to map to EINVAL, we'd use some unrelated errno value.
Oh yes, changed that to 'rc'.
int lxcRegister(void) diff -r 8093fb566748 src/util.c --- a/src/util.c Fri Aug 01 14:47:33 2008 +0100 +++ b/src/util.c Tue Aug 05 12:13:24 2008 +0100 @@ -37,6 +37,7 @@ #include <sys/wait.h> #endif #include <string.h> +#include <termios.h> #include "c-ctype.h"
#ifdef HAVE_PATHS_H @@ -556,6 +557,163 @@ return 0; }
+ +int virFileOpenTty(int *ttymaster, + char **ttyName, + int rawmode) +{ + int rc = -1;
As you know, this function is a portability "challenge" ;-) (that's an understatement) I suppose we'll need to guard this function with #ifdef WITH_LXC.
The util.c file is intended to be generic code so I like to avoid making it conditional based on WITH_LXC. Instead I've changed it to be #ifdef __linux__, and left an empty stub which just reports an error for the non-Linux case. This follows the pattern we use with virExec too. Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

The re-architecting of the LXC controller/container process relationship in the previous patch removed the last obstacle to switching over to the generic domain XML routines. So this patch switches the driver over. First the vast majority of lxc_conf.h/c is simply deleted - this is all redundant when using the domain_conf.h APIs. Then, all references to lxc_vm_t are changed to virDomainObj, and lxc_vm_def_t switches to virDomainDef. Finally the LXC driver registers its capabilities data. For this I have chosen an OS type of 'exe', since the 'operating system' we're running in the container is just any plain executable process. lxc_conf.c | 1052 +------------------------------------------------------ lxc_conf.h | 121 ------ lxc_container.c | 23 - lxc_container.h | 2 lxc_controller.c | 4 lxc_controller.h | 2 lxc_driver.c | 289 ++++++++------- 7 files changed, 215 insertions(+), 1278 deletions(-) Daniel diff -r 72ff3545be62 src/lxc_conf.c --- a/src/lxc_conf.c Tue Aug 05 16:50:44 2008 +0100 +++ b/src/lxc_conf.c Tue Aug 05 16:50:49 2008 +0100 @@ -27,22 +27,8 @@ #ifdef WITH_LXC -#include <dirent.h> -#include <errno.h> -#include <fcntl.h> -#include <string.h> -#include <unistd.h> +#include <sys/utsname.h> -#include <libxml/parser.h> -#include <libxml/tree.h> -#include <libxml/uri.h> -#include <libxml/xpath.h> - -#include "buf.h" -#include "util.h" -#include "uuid.h" -#include "xml.h" -#include "memory.h" #include "lxc_conf.h" /* debug macros */ @@ -54,12 +40,12 @@ const char *fmt, ...) { va_list args; - char errorMessage[LXC_MAX_ERROR_LEN]; + char errorMessage[1024]; const char *codeErrorMessage; if (fmt) { va_start(args, fmt); - vsnprintf(errorMessage, LXC_MAX_ERROR_LEN-1, fmt, args); + vsnprintf(errorMessage, sizeof(errorMessage)-1, fmt, args); va_end(args); } else { errorMessage[0] = '\0'; @@ -71,769 +57,40 @@ codeErrorMessage, errorMessage); } -/** - * lxcParseInterfaceXML: - * @conn: pointer to connection - * @nodePtr: pointer to xml node structure - * @vm: pointer to net definition structure to fill in - * - * Parses the XML for a network interface and places the configuration - * in the given structure. - * - * Returns 0 on success or -1 in case of error - */ -static int lxcParseInterfaceXML(virConnectPtr conn, xmlNodePtr nodePtr, - lxc_net_def_t *netDef) +virCapsPtr lxcCapsInit(void) { - int rc = -1; - xmlNodePtr cur; - xmlChar *type = NULL; - xmlChar *parentIfName = NULL; - xmlChar *network = NULL; - xmlChar *bridge = NULL; - xmlChar *macaddr = NULL; + struct utsname utsname; + virCapsPtr caps; + virCapsGuestPtr guest; - netDef->type = LXC_NET_NETWORK; + uname(&utsname); - type = xmlGetProp(nodePtr, BAD_CAST "type"); - if (type != NULL) { - if (xmlStrEqual(type, BAD_CAST "network")) { - netDef->type = LXC_NET_NETWORK; - } - else if (xmlStrEqual(type, BAD_CAST "bridge")) { - netDef->type = LXC_NET_BRIDGE; - } - else { - lxcError(conn, NULL, VIR_ERR_XML_ERROR, - _("invalid interface type: %s"), type); - goto error_out; - } - } + if ((caps = virCapabilitiesNew(utsname.machine, + 0, 0)) == NULL) + goto no_memory; - cur = nodePtr->children; - for (cur = nodePtr->children; cur != NULL; cur = cur->next) { - if (cur->type == XML_ELEMENT_NODE) { - DEBUG("cur->name: %s", (char*)(cur->name)); - if ((macaddr == NULL) && - (xmlStrEqual(cur->name, BAD_CAST "mac"))) { - macaddr = xmlGetProp(cur, BAD_CAST "address"); - } else if ((network == NULL) && - (netDef->type == LXC_NET_NETWORK) && - (xmlStrEqual(cur->name, BAD_CAST "source"))) { - network = xmlGetProp(cur, BAD_CAST "network"); - parentIfName = xmlGetProp(cur, BAD_CAST "dev"); - } else if ((bridge == NULL) && - (netDef->type == LXC_NET_BRIDGE) && - (xmlStrEqual(cur->name, BAD_CAST "source"))) { - bridge = xmlGetProp(cur, BAD_CAST "bridge"); - } else if ((parentIfName == NULL) && - (xmlStrEqual(cur->name, BAD_CAST "target"))) { - parentIfName = xmlGetProp(cur, BAD_CAST "dev"); - } - } - } + if ((guest = virCapabilitiesAddGuest(caps, + "exe", + utsname.machine, + sizeof(int) == 4 ? 32 : 8, + NULL, + NULL, + 0, + NULL)) == NULL) + goto no_memory; - if (netDef->type == LXC_NET_NETWORK) { - if (network == NULL) { - lxcError(conn, NULL, VIR_ERR_XML_ERROR, - _("No <source> 'network' attribute specified with <interface type='network'/>")); - goto error_out; - } + if (virCapabilitiesAddGuestDomain(guest, + "lxc", + NULL, + NULL, + 0, + NULL) == NULL) + goto no_memory; + return caps; - netDef->txName = strdup((char *)network); - if (NULL == netDef->txName) { - lxcError(conn, NULL, VIR_ERR_NO_MEMORY, - _("No storage for network name")); - goto error_out; - } - - } else if (netDef->type == LXC_NET_BRIDGE) { - if (bridge == NULL) { - lxcError(conn, NULL, VIR_ERR_XML_ERROR, - _("No <source> 'bridge' attribute specified with <interface type='bridge'/>")); - goto error_out; - } - - netDef->txName = strdup((char *)bridge); - if (NULL == netDef->txName) { - lxcError(conn, NULL, VIR_ERR_NO_MEMORY, - _("No storage for bridge name")); - goto error_out; - } - } - - if (parentIfName != NULL) { - DEBUG("set netDef->parentVeth: %s", netDef->parentVeth); - netDef->parentVeth = strdup((char *)parentIfName); - if (NULL == netDef->parentVeth) { - lxcError(conn, NULL, VIR_ERR_NO_MEMORY, - _("No storage for parent veth device name")); - goto error_out; - } - } else { - netDef->parentVeth = NULL; - DEBUG0("set netDef->parentVeth: NULL"); - } - - rc = 0; - -error_out: - xmlFree(macaddr); - xmlFree(network); - xmlFree(bridge); - xmlFree(parentIfName); - - return rc; -} - -/** - * lxcParseDomainInterfaces: - * @conn: pointer to connection - * @nets: on success, points to an list of net def structs - * @contextPtr: pointer to xml context - * - * Parses the domain network interfaces and returns the information in a list - * - * Returns 0 on success or -1 in case of error - */ -static int lxcParseDomainInterfaces(virConnectPtr conn, - lxc_net_def_t **nets, - xmlXPathContextPtr contextPtr) -{ - int rc = -1; - int i; - lxc_net_def_t *netDef; - lxc_net_def_t *prevDef = NULL; - int numNets = 0; - xmlNodePtr *list; - int res; - - DEBUG0("parsing nets"); - - res = virXPathNodeSet(conn, "/domain/devices/interface", contextPtr, &list); - if (res > 0) { - for (i = 0; i < res; ++i) { - netDef = calloc(1, sizeof(lxc_net_def_t)); - if (NULL == netDef) { - lxcError(conn, NULL, VIR_ERR_NO_MEMORY, - _("No storage for net def structure")); - free(list); - goto parse_complete; - } - - rc = lxcParseInterfaceXML(conn, list[i], netDef); - if (0 > rc) { - DEBUG("failed parsing a net: %d", rc); - - free(netDef); - free(list); - goto parse_complete; - } - - DEBUG0("parsed a net"); - - /* set the linked list pointers */ - numNets++; - netDef->next = NULL; - if (0 == i) { - *nets = netDef; - } else { - prevDef->next = netDef; - } - prevDef = netDef; - } - free(list); - } - - rc = numNets; - -parse_complete: - DEBUG("parsed %d nets", rc); - return rc; -} - -static int lxcParseMountXML(virConnectPtr conn, xmlNodePtr nodePtr, - lxc_mount_t *lxcMount) -{ - xmlChar *fsType = NULL; - xmlNodePtr curNode; - xmlChar *mountSource = NULL; - xmlChar *mountTarget = NULL; - int strLen; - int rc = -1; - - fsType = xmlGetProp(nodePtr, BAD_CAST "type"); - if (NULL == fsType) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("missing filesystem type")); - goto error; - } - - if (xmlStrEqual(fsType, BAD_CAST "mount") == 0) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("invalid filesystem type")); - goto error; - } - - for (curNode = nodePtr->children; - NULL != curNode; - curNode = curNode->next) { - if (curNode->type != XML_ELEMENT_NODE) { - continue; - } - - if ((mountSource == NULL) && - (xmlStrEqual(curNode->name, BAD_CAST "source"))) { - mountSource = xmlGetProp(curNode, BAD_CAST "dir"); - } else if ((mountTarget == NULL) && - (xmlStrEqual(curNode->name, BAD_CAST "target"))) { - mountTarget = xmlGetProp(curNode, BAD_CAST "dir"); - } - } - - if (mountSource == NULL) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("missing mount source")); - goto error; - } - - strLen = xmlStrlen(mountSource); - if ((strLen > (PATH_MAX-1)) || (0 == strLen)) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("empty or invalid mount source")); - goto error; - } - - strncpy(lxcMount->source, (char *)mountSource, strLen); - lxcMount->source[strLen] = '\0'; - - if (mountTarget == NULL) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("missing mount target")); - goto error; - } - - strLen = xmlStrlen(mountTarget); - if ((strLen > (PATH_MAX-1)) || (0 == strLen)) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("empty or invalid mount target")); - goto error; - } - - strncpy(lxcMount->target, (char *)mountTarget, strLen); - lxcMount->target[strLen] = '\0'; - - rc = 0; - -error: - xmlFree(fsType); - xmlFree(mountSource); - xmlFree(mountTarget); - - return rc; -} - -static int lxcParseDomainName(virConnectPtr conn, char **name, - xmlXPathContextPtr contextPtr) -{ - char *res; - - res = virXPathString(conn, "string(/domain/name[1])", contextPtr); - if (res == NULL) { - lxcError(conn, NULL, VIR_ERR_NO_NAME, NULL); - return(-1); - } - - *name = res; - return(0); -} - -static int lxcParseDomainUUID(virConnectPtr conn, unsigned char *uuid, - xmlXPathContextPtr contextPtr) -{ - char *res; - - res = virXPathString(conn, "string(/domain/uuid[1])", contextPtr); - if (res == NULL) { - if (virUUIDGenerate(uuid)) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("failed to generate uuid")); - return(-1); - } - } else { - if (virUUIDParse(res, uuid) < 0) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("invalid uuid element")); - VIR_FREE(res); - return(-1); - } - VIR_FREE(res); - } - return(0); -} - -static int lxcParseDomainMounts(virConnectPtr conn, - lxc_mount_t **mounts, - xmlXPathContextPtr contextPtr) -{ - int rc = -1; - int i; - lxc_mount_t *mountObj; - lxc_mount_t *prevObj = NULL; - int nmounts = 0; - xmlNodePtr *list; - int res; - - res = virXPathNodeSet(conn, "/domain/devices/filesystem", contextPtr, &list); - if (res > 0) { - for (i = 0; i < res; ++i) { - if (VIR_ALLOC(mountObj) < 0) { - lxcError(conn, NULL, VIR_ERR_NO_MEMORY, "mount"); - goto parse_complete; - } - - rc = lxcParseMountXML(conn, list[i], mountObj); - if (0 > rc) { - VIR_FREE(mountObj); - goto parse_complete; - } - - /* set the linked list pointers */ - nmounts++; - mountObj->next = NULL; - if (0 == i) { - *mounts = mountObj; - } else { - prevObj->next = mountObj; - } - prevObj = mountObj; - } - VIR_FREE(list); - } - - rc = nmounts; - -parse_complete: - return rc; -} - -static int lxcParseDomainInit(virConnectPtr conn, char** init, - xmlXPathContextPtr contextPtr) -{ - char *res; - - res = virXPathString(conn, "string(/domain/os/init[1])", contextPtr); - if (res == NULL) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("invalid or missing init element")); - return(-1); - } - - if (strlen(res) >= PATH_MAX - 1) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("init string too long")); - VIR_FREE(res); - return(-1); - } - - *init = res; - - return(0); -} - - -static int lxcParseDomainTty(virConnectPtr conn, char **tty, xmlXPathContextPtr contextPtr) -{ - char *res; - - res = virXPathString(conn, "string(/domain/devices/console[1]/@tty)", contextPtr); - if (res == NULL) { - /* make sure the tty string is empty */ - *tty = strdup(""); - if (*tty == NULL) { - lxcError(conn, NULL, VIR_ERR_NO_MEMORY, NULL); - return(-1); - } - } else { - *tty = res; - } - - return(0); -} - -static int lxcParseDomainMemory(virConnectPtr conn, int* memory, xmlXPathContextPtr contextPtr) -{ - long res; - int rc; - - rc = virXPathLong(conn, "string(/domain/memory[1])", contextPtr, &res); - if ((rc == -2) || ((rc == 0) && (res <= 0))) { - *memory = -1; - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("invalid memory value")); - } else if (rc < 0) { - /* not an error, default to an invalid value so it's not used */ - *memory = -1; - } else { - *memory = (int) res; - } - return(0); -} - -static lxc_vm_def_t * lxcParseXML(virConnectPtr conn, xmlDocPtr docPtr) -{ - xmlNodePtr rootNodePtr = NULL; - xmlXPathContextPtr contextPtr = NULL; - xmlChar *xmlProp = NULL; - lxc_vm_def_t *containerDef; - - if (VIR_ALLOC(containerDef) < 0) { - lxcError(conn, NULL, VIR_ERR_NO_MEMORY, "containerDef"); - return NULL; - } - - /* Prepare parser / xpath context */ - rootNodePtr = xmlDocGetRootElement(docPtr); - if ((rootNodePtr == NULL) || - (!xmlStrEqual(rootNodePtr->name, BAD_CAST "domain"))) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("invalid root element")); - goto error; - } - - contextPtr = xmlXPathNewContext(docPtr); - if (contextPtr == NULL) { - lxcError(conn, NULL, VIR_ERR_NO_MEMORY, "context"); - goto error; - } - - /* Verify the domain type is linuxcontainer */ - if (!(xmlProp = xmlGetProp(rootNodePtr, BAD_CAST "type"))) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("missing domain type")); - goto error; - } - - if (!(xmlStrEqual(xmlProp, BAD_CAST LXC_DOMAIN_TYPE))) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("invalid domain type")); - goto error; - } - VIR_FREE(xmlProp); - - if ((xmlProp = xmlGetProp(rootNodePtr, BAD_CAST "id"))) { - if (0 > virStrToLong_i((char*)xmlProp, NULL, 10, &(containerDef->id))) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("invalid domain id")); - goto error; - } - - /* verify the container process still exists */ - if (1 != lxcCheckContainerProcess(containerDef)) { - containerDef->id = -1; - } - - } else { - containerDef->id = -1; - } - VIR_FREE(xmlProp); - - if (lxcParseDomainName(conn, &(containerDef->name), contextPtr) < 0) { - goto error; - } - - if (lxcParseDomainInit(conn, &(containerDef->init), contextPtr) < 0) { - goto error; - } - - if (lxcParseDomainUUID(conn, containerDef->uuid, contextPtr) < 0) { - goto error; - } - - containerDef->nmounts = lxcParseDomainMounts(conn, &(containerDef->mounts), - contextPtr); - if (0 > containerDef->nmounts) { - goto error; - } - - containerDef->numNets = lxcParseDomainInterfaces(conn, - &(containerDef->nets), - contextPtr); - if (0 > containerDef->numNets) { - goto error; - } - - if (lxcParseDomainTty(conn, &(containerDef->tty), contextPtr) < 0) { - goto error; - } - - if (lxcParseDomainMemory(conn, &(containerDef->maxMemory), contextPtr) < 0) { - goto error; - } - - xmlXPathFreeContext(contextPtr); - - return containerDef; - - error: - VIR_FREE(xmlProp); - xmlXPathFreeContext(contextPtr); - lxcFreeVMDef(containerDef); - +no_memory: + virCapabilitiesFree(caps); return NULL; -} - - -lxc_vm_def_t * lxcParseVMDef(virConnectPtr conn, - const char* xmlString, - const char* fileName) -{ - xmlDocPtr xml; - lxc_vm_def_t *containerDef; - - xml = xmlReadDoc(BAD_CAST xmlString, - fileName ? fileName : "domain.xml", - NULL, XML_PARSE_NOENT | - XML_PARSE_NONET | XML_PARSE_NOERROR | - XML_PARSE_NOWARNING); - if (!xml) { - lxcError(conn, NULL, VIR_ERR_XML_ERROR, NULL); - return NULL; - } - - containerDef = lxcParseXML(conn, xml); - - xmlFreeDoc(xml); - - return containerDef; -} - -lxc_vm_t * lxcAssignVMDef(virConnectPtr conn, - lxc_driver_t *driver, - lxc_vm_def_t *def) -{ - lxc_vm_t *vm = NULL; - - if ((vm = lxcFindVMByName(driver, def->name))) { - if (!lxcIsActiveVM(vm)) { - lxcFreeVMDef(vm->def); - vm->def = def; - } else { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("Can't redefine active VM with name %s"), def->name); - return NULL; - } - - return vm; - } - - if (VIR_ALLOC(vm) < 0) { - lxcError(conn, NULL, VIR_ERR_NO_MEMORY, "vm"); - return NULL; - } - - vm->pid = -1; - vm->def = def; - vm->next = driver->vms; - - driver->vms = vm; - - if (lxcIsActiveVM(vm)) { - vm->state = VIR_DOMAIN_RUNNING; - driver->nactivevms++; - } else { - vm->state = VIR_DOMAIN_SHUTOFF; - driver->ninactivevms++; - } - - return vm; -} - -/** - * lxcCheckContainerProcess: - * @def: Ptr to VM definition - * - * Checks if the container process (stored at def->id is running - * - * Returns on success or -1 in case of error - * 0 - no process with id vm->def->id - * 1 - container process exists - * -1 - error - */ -int lxcCheckContainerProcess(lxc_vm_def_t *def) -{ - int rc = -1; - - if (1 < def->id) { - if (-1 == kill(def->id, 0)) { - if (ESRCH == errno) { - rc = 0; - DEBUG("pid %d no longer exists", def->id); - goto done; - } - - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("error checking container process: %d %s"), - def->id, strerror(errno)); - goto done; - } - - DEBUG("pid %d still exists", def->id); - rc = 1; - goto done; - } - - rc = 0; - -done: - return rc; -} - -void lxcRemoveInactiveVM(lxc_driver_t *driver, - lxc_vm_t *vm) -{ - lxc_vm_t *prevVm = NULL; - lxc_vm_t *curVm; - - for (curVm = driver->vms; - (curVm != vm) && (NULL != curVm); - curVm = curVm->next) { - prevVm = curVm; - } - - if (curVm) { - if (prevVm) { - prevVm->next = curVm->next; - } else { - driver->vms = curVm->next; - } - - driver->ninactivevms--; - } - - lxcFreeVM(vm); -} - -/* Save a container's config data into a persistent file */ -int lxcSaveConfig(virConnectPtr conn, - lxc_driver_t *driver, - lxc_vm_t *vm, - lxc_vm_def_t *def) -{ - int rc = -1; - char *xmlDef; - int fd = -1; - int amtToWrite; - - if (!(xmlDef = lxcGenerateXML(conn, driver, vm, def))) { - return -1; - } - - if ((fd = open(vm->configFile, - O_WRONLY | O_CREAT | O_TRUNC, - S_IRUSR | S_IWUSR )) < 0) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("cannot create config file %s: %s"), - vm->configFile, strerror(errno)); - goto cleanup; - } - - amtToWrite = strlen(xmlDef); - if (safewrite(fd, xmlDef, amtToWrite) < 0) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("cannot write config file %s: %s"), - vm->configFile, strerror(errno)); - goto cleanup; - } - - if (close(fd) < 0) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("cannot save config file %s: %s"), - vm->configFile, strerror(errno)); - goto cleanup; - } - - rc = 0; - - cleanup: - if (fd != -1) { - close(fd); - } - - VIR_FREE(xmlDef); - - return rc; -} - -int lxcSaveVMDef(virConnectPtr conn, - lxc_driver_t *driver, - lxc_vm_t *vm, - lxc_vm_def_t *def) -{ - int rc = -1; - - if (vm->configFile[0] == '\0') { - if ((rc = virFileMakePath(driver->configDir))) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("cannot create config directory %s: %s"), - driver->configDir, strerror(rc)); - goto save_complete; - } - - if (virFileBuildPath(driver->configDir, vm->def->name, ".xml", - vm->configFile, PATH_MAX) < 0) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("cannot construct config file path")); - goto save_complete; - } - - strncpy(vm->configFileBase, vm->def->name, PATH_MAX-1); - strncat(vm->configFileBase, ".xml", PATH_MAX - strlen(vm->def->name)-1); - - } - - rc = lxcSaveConfig(conn, driver, vm, def); - -save_complete: - return rc; -} - - - -static lxc_vm_t * lxcLoadConfig(lxc_driver_t *driver, - const char *file, - const char *fullFilePath, - const char *xmlData) -{ - lxc_vm_def_t *containerDef; - lxc_vm_t * vm; - - containerDef = lxcParseVMDef(NULL, xmlData, file); - if (NULL == containerDef) { - DEBUG0("Error parsing container config"); - return NULL; - } - - if (!virFileMatchesNameSuffix(file, containerDef->name, ".xml")) { - DEBUG0("Container name does not match config file name"); - lxcFreeVMDef(containerDef); - return NULL; - } - - vm = lxcAssignVMDef(NULL, driver, containerDef); - if (NULL == vm) { - DEBUG0("Failed to load container config"); - lxcFreeVMDef(containerDef); - return NULL; - } - - strncpy(vm->configFile, fullFilePath, PATH_MAX); - vm->configFile[PATH_MAX-1] = '\0'; - - strncpy(vm->configFileBase, file, PATH_MAX); - vm->configFile[PATH_MAX-1] = '\0'; - - return vm; } int lxcLoadDriverConfig(lxc_driver_t *driver) @@ -853,254 +110,5 @@ return -1; } -int lxcLoadContainerConfigFile(lxc_driver_t *driver, - const char *file) -{ - int rc = -1; - char tempPath[PATH_MAX]; - char* xmlData; - - rc = virFileBuildPath(driver->configDir, file, NULL, tempPath, - PATH_MAX); - if (0 > rc) { - DEBUG0("config file name too long"); - goto load_complete; - } - - if ((rc = virFileReadAll(tempPath, LXC_MAX_XML_LENGTH, &xmlData)) < 0) { - goto load_complete; - } - - lxcLoadConfig(driver, file, tempPath, xmlData); - - VIR_FREE(xmlData); - -load_complete: - return rc; -} - -int lxcLoadContainerInfo(lxc_driver_t *driver) -{ - int rc = -1; - DIR *dir; - struct dirent *dirEntry; - - if (!(dir = opendir(driver->configDir))) { - if (ENOENT == errno) { - /* no config dir => no containers */ - rc = 0; - } else { - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("failed to open config directory %s: %s"), - driver->configDir, strerror(errno)); - } - - goto load_complete; - } - - while ((dirEntry = readdir(dir))) { - if (dirEntry->d_name[0] == '.') { - continue; - } - - if (!virFileHasSuffix(dirEntry->d_name, ".xml")) { - continue; - } - - lxcLoadContainerConfigFile(driver, dirEntry->d_name); - } - - closedir(dir); - - rc = 0; - -load_complete: - return rc; -} - -/* Generate an XML document describing the vm's configuration */ -char *lxcGenerateXML(virConnectPtr conn, - lxc_driver_t *driver ATTRIBUTE_UNUSED, - lxc_vm_t *vm, - lxc_vm_def_t *def) -{ - virBuffer buf = VIR_BUFFER_INITIALIZER; - unsigned char *uuid; - char uuidstr[VIR_UUID_STRING_BUFLEN]; - lxc_mount_t *mount; - lxc_net_def_t *net; - - if (lxcIsActiveVM(vm)) - virBufferVSprintf(&buf, "<domain type='%s' id='%d'>\n", - LXC_DOMAIN_TYPE, vm->def->id); - else - virBufferVSprintf(&buf, "<domain type='%s'>\n", - LXC_DOMAIN_TYPE); - - virBufferVSprintf(&buf, " <name>%s</name>\n", def->name); - - uuid = def->uuid; - virUUIDFormat(uuid, uuidstr); - virBufferVSprintf(&buf, " <uuid>%s</uuid>\n", uuidstr); - virBufferAddLit(&buf, " <os>\n"); - virBufferVSprintf(&buf, " <init>%s</init>\n", def->init); - virBufferAddLit(&buf, " </os>\n"); - virBufferVSprintf(&buf, " <memory>%d</memory>\n", def->maxMemory); - virBufferAddLit(&buf, " <devices>\n"); - - /* loop adding mounts */ - for (mount = def->mounts; mount; mount = mount->next) { - virBufferAddLit(&buf, " <filesystem type='mount'>\n"); - virBufferVSprintf(&buf, " <source dir='%s'/>\n", - mount->source); - virBufferVSprintf(&buf, " <target dir='%s'/>\n", - mount->target); - virBufferAddLit(&buf, " </filesystem>\n"); - } - - /* loop adding nets */ - for (net = def->nets; net; net = net->next) { - if (net->type == LXC_NET_NETWORK) { - virBufferAddLit(&buf, " <interface type='network'>\n"); - virBufferVSprintf(&buf, " <source network='%s'/>\n", - net->txName); - } else { - virBufferAddLit(&buf, " <interface type='bridge'>\n"); - virBufferVSprintf(&buf, " <source bridge='%s'/>\n", - net->txName); - } - - if (NULL != net->parentVeth) { - virBufferVSprintf(&buf, " <target dev='%s'/>\n", - net->parentVeth); - } - - virBufferAddLit(&buf, " </interface>\n"); - - } - - virBufferVSprintf(&buf, " <console tty='%s'/>\n", def->tty); - virBufferAddLit(&buf, " </devices>\n"); - virBufferAddLit(&buf, "</domain>\n"); - - if (virBufferError(&buf)) { - lxcError(conn, NULL, VIR_ERR_NO_MEMORY,_("allocate buffer")); - return NULL; - } - - return virBufferContentAndReset(&buf); -} - -void lxcFreeVMDef(lxc_vm_def_t *vmdef) -{ - lxc_mount_t *curMount; - lxc_mount_t *nextMount; - lxc_net_def_t *curNet; - lxc_net_def_t *nextNet; - - if (vmdef == NULL) - return; - - curMount = vmdef->mounts; - while (curMount) { - nextMount = curMount->next; - VIR_FREE(curMount); - curMount = nextMount; - } - - curNet = vmdef->nets; - while (curNet) { - nextNet = curNet->next; - VIR_FREE(curNet->parentVeth); - VIR_FREE(curNet->txName); - VIR_FREE(curNet); - curNet = nextNet; - } - - VIR_FREE(vmdef->name); - VIR_FREE(vmdef->init); - VIR_FREE(vmdef->tty); - VIR_FREE(vmdef); -} - -void lxcFreeVMs(lxc_vm_t *vms) -{ - lxc_vm_t *curVm = vms; - lxc_vm_t *nextVm; - - while (curVm) { - nextVm = curVm->next; - lxcFreeVM(curVm); - curVm = nextVm; - } -} - -void lxcFreeVM(lxc_vm_t *vm) -{ - lxcFreeVMDef(vm->def); - VIR_FREE(vm); -} - -lxc_vm_t *lxcFindVMByID(const lxc_driver_t *driver, int id) -{ - lxc_vm_t *vm; - - for (vm = driver->vms; vm; vm = vm->next) { - if (lxcIsActiveVM(vm) && (vm->def->id == id)) { - return vm; - } - - } - - return NULL; -} - -lxc_vm_t *lxcFindVMByUUID(const lxc_driver_t *driver, - const unsigned char *uuid) -{ - lxc_vm_t *vm; - - for (vm = driver->vms; vm; vm = vm->next) { - if (!memcmp(vm->def->uuid, uuid, VIR_UUID_BUFLEN)) { - return vm; - } - } - - return NULL; -} - -lxc_vm_t *lxcFindVMByName(const lxc_driver_t *driver, - const char *name) -{ - lxc_vm_t *vm; - - for (vm = driver->vms; vm; vm = vm->next) { - if (STREQ(vm->def->name, name)) { - return vm; - } - } - - return NULL; -} - -int lxcDeleteConfig(virConnectPtr conn, - lxc_driver_t *driver ATTRIBUTE_UNUSED, - const char *configFile, - const char *name) -{ - if (!configFile[0]) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("no config file for %s"), name); - return -1; - } - - if (unlink(configFile) < 0) { - lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, - _("cannot remove config for %s"), name); - return -1; - } - - return 0; -} #endif /* WITH_LXC */ diff -r 72ff3545be62 src/lxc_conf.h --- a/src/lxc_conf.h Tue Aug 05 16:50:44 2008 +0100 +++ b/src/lxc_conf.h Tue Aug 05 16:50:49 2008 +0100 @@ -29,130 +29,23 @@ #ifdef WITH_LXC #include "internal.h" - -/* Defines */ -#define LXC_MAX_TTY_NAME 32 -#define LXC_MAX_XML_LENGTH 16384 -#define LXC_MAX_ERROR_LEN 1024 -#define LXC_DOMAIN_TYPE "lxc" - -/* types of networks for containers */ -enum lxc_net_type { - LXC_NET_NETWORK, - LXC_NET_BRIDGE -}; - -typedef struct __lxc_net_def lxc_net_def_t; -struct __lxc_net_def { - int type; - char *parentVeth; /* veth device in parent namespace */ - char *txName; /* bridge or network name */ - - lxc_net_def_t *next; -}; - -typedef struct __lxc_mount lxc_mount_t; -struct __lxc_mount { - char source[PATH_MAX]; /* user's directory */ - char target[PATH_MAX]; - - lxc_mount_t *next; -}; - -typedef struct __lxc_vm_def lxc_vm_def_t; -struct __lxc_vm_def { - unsigned char uuid[VIR_UUID_BUFLEN]; - char* name; - int id; - - /* init command string */ - char *init; - - int maxMemory; - - /* mounts - list of mount structs */ - int nmounts; - lxc_mount_t *mounts; - - /* tty device */ - char *tty; - - /* network devices */ - int numNets; - lxc_net_def_t *nets; -}; - -typedef struct __lxc_vm lxc_vm_t; -struct __lxc_vm { - int pid; - int state; - int monitor; - - char configFile[PATH_MAX]; - char configFileBase[PATH_MAX]; - - lxc_vm_def_t *def; - - lxc_vm_t *next; -}; +#include "domain_conf.h" +#include "capabilities.h" typedef struct __lxc_driver lxc_driver_t; struct __lxc_driver { - lxc_vm_t *vms; - int nactivevms; - int ninactivevms; + virCapsPtr caps; + + virDomainObjPtr domains; char *configDir; + char *autostartDir; char *stateDir; char *logDir; int have_netns; }; -/* Types and structs */ - -/* Inline Functions */ -static inline int lxcIsActiveVM(lxc_vm_t *vm) -{ - return vm->def->id != -1; -} - -/* Function declarations */ -lxc_vm_def_t * lxcParseVMDef(virConnectPtr conn, - const char* xmlString, - const char* fileName); -int lxcSaveVMDef(virConnectPtr conn, - lxc_driver_t *driver, - lxc_vm_t *vm, - lxc_vm_def_t *def); int lxcLoadDriverConfig(lxc_driver_t *driver); -int lxcSaveConfig(virConnectPtr conn, - lxc_driver_t *driver, - lxc_vm_t *vm, - lxc_vm_def_t *def); -int lxcLoadContainerInfo(lxc_driver_t *driver); -int lxcLoadContainerConfigFile(lxc_driver_t *driver, - const char *file); -lxc_vm_t * lxcAssignVMDef(virConnectPtr conn, - lxc_driver_t *driver, - lxc_vm_def_t *def); -char *lxcGenerateXML(virConnectPtr conn, - lxc_driver_t *driver, - lxc_vm_t *vm, - lxc_vm_def_t *def); -lxc_vm_t *lxcFindVMByID(const lxc_driver_t *driver, int id); -lxc_vm_t *lxcFindVMByUUID(const lxc_driver_t *driver, - const unsigned char *uuid); -lxc_vm_t *lxcFindVMByName(const lxc_driver_t *driver, - const char *name); -int lxcCheckContainerProcess(lxc_vm_def_t *vm); -void lxcRemoveInactiveVM(lxc_driver_t *driver, - lxc_vm_t *vm); -void lxcFreeVMs(lxc_vm_t *vms); -void lxcFreeVM(lxc_vm_t *vm); -void lxcFreeVMDef(lxc_vm_def_t *vmdef); -int lxcDeleteConfig(virConnectPtr conn, - lxc_driver_t *driver, - const char *configFile, - const char *name); +virCapsPtr lxcCapsInit(void); void lxcError(virConnectPtr conn, virDomainPtr dom, diff -r 72ff3545be62 src/lxc_container.c --- a/src/lxc_container.c Tue Aug 05 16:50:44 2008 +0100 +++ b/src/lxc_container.c Tue Aug 05 16:50:49 2008 +0100 @@ -68,7 +68,7 @@ typedef struct __lxc_child_argv lxc_child_argv_t; struct __lxc_child_argv { - lxc_vm_def_t *config; + virDomainDefPtr config; int nveths; char **veths; int monitor; @@ -85,10 +85,10 @@ * * Does not return */ -static int lxcContainerExecInit(const lxc_vm_def_t *vmDef) +static int lxcContainerExecInit(virDomainDefPtr vmDef) { const char *const argv[] = { - vmDef->init, + vmDef->os.init, NULL, }; @@ -268,8 +268,8 @@ { int rc = -1; lxc_child_argv_t *argv = data; - lxc_vm_def_t *vmDef = argv->config; - lxc_mount_t *curMount; + virDomainDefPtr vmDef = argv->config; + virDomainFSDefPtr curMount; int i; if (NULL == vmDef) { @@ -280,17 +280,20 @@ /* handle the bind mounts first before doing anything else that may */ /* then access those mounted dirs */ - curMount = vmDef->mounts; + curMount = vmDef->fss; for (i = 0; curMount; curMount = curMount->next) { - rc = mount(curMount->source, - curMount->target, + // XXX fix + if (curMount->type != VIR_DOMAIN_FS_TYPE_MOUNT) + continue; + rc = mount(curMount->src, + curMount->dst, NULL, MS_BIND, NULL); if (0 != rc) { lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, _("failed to mount %s at %s for container: %s"), - curMount->source, curMount->target, strerror(errno)); + curMount->src, curMount->dst, strerror(errno)); return -1; } } @@ -328,7 +331,7 @@ * * Returns PID of container on success or -1 in case of error */ -int lxcContainerStart(lxc_vm_def_t *def, +int lxcContainerStart(virDomainDefPtr def, int nveths, char **veths, int control, diff -r 72ff3545be62 src/lxc_container.h --- a/src/lxc_container.h Tue Aug 05 16:50:44 2008 +0100 +++ b/src/lxc_container.h Tue Aug 05 16:50:49 2008 +0100 @@ -34,7 +34,7 @@ int lxcContainerSendContinue(int control); -int lxcContainerStart(lxc_vm_def_t *def, +int lxcContainerStart(virDomainDefPtr def, int nveths, char **veths, int control, diff -r 72ff3545be62 src/lxc_controller.c --- a/src/lxc_controller.c Tue Aug 05 16:50:44 2008 +0100 +++ b/src/lxc_controller.c Tue Aug 05 16:50:49 2008 +0100 @@ -339,7 +339,7 @@ static int lxcControllerRun(const char *stateDir, - lxc_vm_def_t *def, + virDomainDefPtr def, int nveths, char **veths, int monitor, @@ -401,7 +401,7 @@ int lxcControllerStart(const char *stateDir, - lxc_vm_def_t *def, + virDomainDefPtr def, int nveths, char **veths, int monitor, diff -r 72ff3545be62 src/lxc_controller.h --- a/src/lxc_controller.h Tue Aug 05 16:50:44 2008 +0100 +++ b/src/lxc_controller.h Tue Aug 05 16:50:49 2008 +0100 @@ -31,7 +31,7 @@ int lxcControllerClientHandshake(int monitor); int lxcControllerStart(const char *stateDir, - lxc_vm_def_t *def, + virDomainDefPtr def, int nveths, char **veths, int monitor, diff -r 72ff3545be62 src/lxc_driver.c --- a/src/lxc_driver.c Tue Aug 05 16:50:44 2008 +0100 +++ b/src/lxc_driver.c Tue Aug 05 16:50:49 2008 +0100 @@ -37,7 +37,6 @@ #include <unistd.h> #include <wait.h> -#include "internal.h" #include "lxc_conf.h" #include "lxc_container.h" #include "lxc_driver.h" @@ -111,7 +110,7 @@ int id) { lxc_driver_t *driver = (lxc_driver_t *)conn->privateData; - lxc_vm_t *vm = lxcFindVMByID(driver, id); + virDomainObjPtr vm = virDomainFindByID(driver->domains, id); virDomainPtr dom; if (!vm) { @@ -131,7 +130,7 @@ const unsigned char *uuid) { lxc_driver_t *driver = (lxc_driver_t *)conn->privateData; - lxc_vm_t *vm = lxcFindVMByUUID(driver, uuid); + virDomainObjPtr vm = virDomainFindByUUID(driver->domains, uuid); virDomainPtr dom; if (!vm) { @@ -151,7 +150,7 @@ const char *name) { lxc_driver_t *driver = (lxc_driver_t *)conn->privateData; - lxc_vm_t *vm = lxcFindVMByName(driver, name); + virDomainObjPtr vm = virDomainFindByName(driver->domains, name); virDomainPtr dom; if (!vm) { @@ -167,90 +166,96 @@ return dom; } -static int lxcListDomains(virConnectPtr conn, int *ids, int nids) -{ +static int lxcListDomains(virConnectPtr conn, int *ids, int nids) { lxc_driver_t *driver = (lxc_driver_t *)conn->privateData; - lxc_vm_t *vm; - int numDoms = 0; - - for (vm = driver->vms; vm && (numDoms < nids); vm = vm->next) { - if (lxcIsActiveVM(vm)) { - ids[numDoms] = vm->def->id; - numDoms++; + virDomainObjPtr vm = driver->domains; + int got = 0; + while (vm && got < nids) { + if (virDomainIsActive(vm)) { + ids[got] = vm->def->id; + got++; } + vm = vm->next; } - - return numDoms; + return got; } - -static int lxcNumDomains(virConnectPtr conn) -{ +static int lxcNumDomains(virConnectPtr conn) { lxc_driver_t *driver = (lxc_driver_t *)conn->privateData; - return driver->nactivevms; + int n = 0; + virDomainObjPtr dom = driver->domains; + while (dom) { + if (virDomainIsActive(dom)) + n++; + dom = dom->next; + } + return n; } static int lxcListDefinedDomains(virConnectPtr conn, - char **const names, int nnames) -{ + char **const names, int nnames) { lxc_driver_t *driver = (lxc_driver_t *)conn->privateData; - lxc_vm_t *vm; - int numDoms = 0; - int i; - - for (vm = driver->vms; vm && (numDoms < nnames); vm = vm->next) { - if (!lxcIsActiveVM(vm)) { - if (!(names[numDoms] = strdup(vm->def->name))) { - lxcError(conn, NULL, VIR_ERR_NO_MEMORY, "names"); + virDomainObjPtr vm = driver->domains; + int got = 0, i; + while (vm && got < nnames) { + if (!virDomainIsActive(vm)) { + if (!(names[got] = strdup(vm->def->name))) { + lxcError(conn, NULL, VIR_ERR_NO_MEMORY, NULL); goto cleanup; } - - numDoms++; + got++; } - + vm = vm->next; } - - return numDoms; + return got; cleanup: - for (i = 0 ; i < numDoms ; i++) { + for (i = 0 ; i < got ; i++) VIR_FREE(names[i]); - } - return -1; } -static int lxcNumDefinedDomains(virConnectPtr conn) -{ +static int lxcNumDefinedDomains(virConnectPtr conn) { lxc_driver_t *driver = (lxc_driver_t *)conn->privateData; - return driver->ninactivevms; + int n = 0; + virDomainObjPtr dom = driver->domains; + while (dom) { + if (!virDomainIsActive(dom)) + n++; + dom = dom->next; + } + return n; } + + static virDomainPtr lxcDomainDefine(virConnectPtr conn, const char *xml) { lxc_driver_t *driver = (lxc_driver_t *)conn->privateData; - lxc_vm_def_t *def; - lxc_vm_t *vm; + virDomainDefPtr def; + virDomainObjPtr vm; virDomainPtr dom; - if (!(def = lxcParseVMDef(conn, xml, NULL))) { + if (!(def = virDomainDefParseString(conn, driver->caps, xml))) return NULL; - } if ((def->nets != NULL) && !(driver->have_netns)) { lxcError(conn, NULL, VIR_ERR_NO_SUPPORT, _("System lacks NETNS support")); - lxcFreeVMDef(def); + virDomainDefFree(def); return NULL; } - if (!(vm = lxcAssignVMDef(conn, driver, def))) { - lxcFreeVMDef(def); + if (!(vm = virDomainAssignDef(conn, &driver->domains, def))) { + virDomainDefFree(def); return NULL; } - if (lxcSaveVMDef(conn, driver, vm, def) < 0) { - lxcRemoveInactiveVM(driver, vm); + if (virDomainSaveConfig(conn, + driver->configDir, + driver->autostartDir, + vm) < 0) { + virDomainRemoveInactive(&driver->domains, vm); return NULL; } @@ -265,7 +270,7 @@ static int lxcDomainUndefine(virDomainPtr dom) { lxc_driver_t *driver = (lxc_driver_t *)dom->conn->privateData; - lxc_vm_t *vm = lxcFindVMByUUID(driver, dom->uuid); + virDomainObjPtr vm = virDomainFindByUUID(driver->domains, dom->uuid); if (!vm) { lxcError(dom->conn, dom, VIR_ERR_INVALID_DOMAIN, @@ -273,19 +278,18 @@ return -1; } - if (lxcIsActiveVM(vm)) { + if (virDomainIsActive(vm)) { lxcError(dom->conn, dom, VIR_ERR_INTERNAL_ERROR, _("cannot delete active domain")); return -1; } - if (lxcDeleteConfig(dom->conn, driver, vm->configFile, vm->def->name) < 0) { + if (virDomainDeleteConfig(dom->conn, vm) <0) return -1; - } vm->configFile[0] = '\0'; - lxcRemoveInactiveVM(driver, vm); + virDomainRemoveInactive(&driver->domains, vm); return 0; } @@ -294,7 +298,7 @@ virDomainInfoPtr info) { lxc_driver_t *driver = (lxc_driver_t *)dom->conn->privateData; - lxc_vm_t *vm = lxcFindVMByUUID(driver, dom->uuid); + virDomainObjPtr vm = virDomainFindByUUID(driver->domains, dom->uuid); if (!vm) { lxcError(dom->conn, dom, VIR_ERR_INVALID_DOMAIN, @@ -304,30 +308,23 @@ info->state = vm->state; - if (!lxcIsActiveVM(vm)) { + if (!virDomainIsActive(vm)) { info->cpuTime = 0; } else { info->cpuTime = 0; } - info->maxMem = vm->def->maxMemory; - info->memory = vm->def->maxMemory; + info->maxMem = vm->def->maxmem; + info->memory = vm->def->memory; info->nrVirtCpu = 1; return 0; } -static char *lxcGetOSType(virDomainPtr dom ATTRIBUTE_UNUSED) -{ - /* Linux containers only run on Linux */ - return strdup("linux"); -} - -static char *lxcDomainDumpXML(virDomainPtr dom, - int flags ATTRIBUTE_UNUSED) +static char *lxcGetOSType(virDomainPtr dom) { lxc_driver_t *driver = (lxc_driver_t *)dom->conn->privateData; - lxc_vm_t *vm = lxcFindVMByUUID(driver, dom->uuid); + virDomainObjPtr vm = virDomainFindByUUID(driver->domains, dom->uuid); if (!vm) { lxcError(dom->conn, dom, VIR_ERR_INVALID_DOMAIN, @@ -335,7 +332,25 @@ return NULL; } - return lxcGenerateXML(dom->conn, driver, vm, vm->def); + return strdup(vm->def->os.type); +} + +static char *lxcDomainDumpXML(virDomainPtr dom, + int flags) +{ + lxc_driver_t *driver = (lxc_driver_t *)dom->conn->privateData; + virDomainObjPtr vm = virDomainFindByUUID(driver->domains, dom->uuid); + + if (!vm) { + lxcError(dom->conn, dom, VIR_ERR_INVALID_DOMAIN, + _("no domain with matching uuid")); + return NULL; + } + + return virDomainDefFormat(dom->conn, + (flags & VIR_DOMAIN_XML_INACTIVE) && + vm->newDef ? vm->newDef : vm->def, + flags); } @@ -350,12 +365,12 @@ * Returns 0 on success or -1 in case of error */ static int lxcVMCleanup(virConnectPtr conn, - lxc_driver_t *driver, - lxc_vm_t * vm) + virDomainObjPtr vm) { int rc = -1; int waitRc; int childStatus = -1; + lxc_driver_t *driver = (lxc_driver_t *)conn->privateData; while (((waitRc = waitpid(vm->pid, &childStatus, 0)) == -1) && errno == EINTR); @@ -382,8 +397,6 @@ vm->pid = -1; vm->def->id = -1; vm->monitor = -1; - driver->nactivevms--; - driver->ninactivevms++; return rc; } @@ -399,12 +412,12 @@ * Returns 0 on success or -1 in case of error */ static int lxcSetupInterfaces(virConnectPtr conn, - lxc_vm_def_t *def, + virDomainDefPtr def, int *nveths, char ***veths) { int rc = -1; - lxc_net_def_t *net; + virDomainNetDefPtr net; char *bridge = NULL; char parentVeth[PATH_MAX] = ""; char containerVeth[PATH_MAX] = ""; @@ -414,8 +427,11 @@ return -1; for (net = def->nets; net; net = net->next) { - if (LXC_NET_NETWORK == net->type) { - virNetworkPtr network = virNetworkLookupByName(conn, net->txName); + switch (net->type) { + case VIR_DOMAIN_NET_TYPE_NETWORK: + { + virNetworkPtr network = virNetworkLookupByName(conn, + net->data.network.name); if (!network) { goto error_exit; } @@ -423,8 +439,11 @@ bridge = virNetworkGetBridgeName(network); virNetworkFree(network); - } else { - bridge = net->txName; + break; + } + case VIR_DOMAIN_NET_TYPE_BRIDGE: + bridge = net->data.bridge.brname; + break; } DEBUG("bridge: %s", bridge); @@ -435,8 +454,8 @@ } DEBUG0("calling vethCreate()"); - if (NULL != net->parentVeth) { - strcpy(parentVeth, net->parentVeth); + if (NULL != net->ifname) { + strcpy(parentVeth, net->ifname); } DEBUG("parentVeth: %s, containerVeth: %s", parentVeth, containerVeth); if (0 != (rc = vethCreate(parentVeth, PATH_MAX, containerVeth, PATH_MAX))) { @@ -444,15 +463,15 @@ _("failed to create veth device pair: %d"), rc); goto error_exit; } - if (NULL == net->parentVeth) { - net->parentVeth = strdup(parentVeth); + if (NULL == net->ifname) { + net->ifname = strdup(parentVeth); } if (VIR_REALLOC_N(*veths, (*nveths)+1) < 0) goto error_exit; if (((*veths)[(*nveths)++] = strdup(containerVeth)) == NULL) goto error_exit; - if (NULL == net->parentVeth) { + if (NULL == net->ifname) { lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, _("failed to allocate veth names")); goto error_exit; @@ -484,7 +503,7 @@ static int lxcMonitorServer(virConnectPtr conn, lxc_driver_t * driver, - lxc_vm_t *vm) + virDomainObjPtr vm) { char *sockpath = NULL; int fd; @@ -534,7 +553,7 @@ static int lxcMonitorClient(virConnectPtr conn, lxc_driver_t * driver, - lxc_vm_t *vm) + virDomainObjPtr vm) { char *sockpath = NULL; int fd; @@ -583,8 +602,7 @@ static int lxcVmTerminate(virConnectPtr conn, - lxc_driver_t *driver, - lxc_vm_t *vm, + virDomainObjPtr vm, int signum) { if (signum == 0) @@ -601,7 +619,7 @@ vm->state = VIR_DOMAIN_SHUTDOWN; - return lxcVMCleanup(conn, driver, vm); + return lxcVMCleanup(conn, vm); } static void lxcMonitorEvent(int fd, @@ -609,7 +627,7 @@ void *data) { lxc_driver_t *driver = data; - lxc_vm_t *vm = driver->vms; + virDomainObjPtr vm = driver->domains; while (vm) { if (vm->monitor == fd) @@ -621,7 +639,7 @@ return; } - if (lxcVmTerminate(NULL, driver, vm, SIGINT) < 0) + if (lxcVmTerminate(NULL, vm, SIGINT) < 0) virEventRemoveHandle(fd); } @@ -638,11 +656,12 @@ */ static int lxcVmStart(virConnectPtr conn, lxc_driver_t * driver, - lxc_vm_t * vm) + virDomainObjPtr vm) { int rc = -1, i; int monitor; int parentTty; + char *parentTtyPath = NULL; char *logfile = NULL; int logfd = -1; int nveths = 0; @@ -665,12 +684,18 @@ goto cleanup; /* open parent tty */ - VIR_FREE(vm->def->tty); - if (virFileOpenTty(&parentTty, &vm->def->tty, 1) < 0) { + if (virFileOpenTty(&parentTty, &parentTtyPath, 1) < 0) { lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, _("failed to allocate tty: %s"), strerror(errno)); goto cleanup; + } + if (vm->def->console && + vm->def->console->type == VIR_DOMAIN_CHR_TYPE_PTY) { + VIR_FREE(vm->def->console->data.file.path); + vm->def->console->data.file.path = parentTtyPath; + } else { + VIR_FREE(parentTtyPath); } if (lxcSetupInterfaces(conn, vm->def, &nveths, &veths) != 0) @@ -710,14 +735,12 @@ vm->def->id = vm->pid; vm->state = VIR_DOMAIN_RUNNING; - driver->ninactivevms--; - driver->nactivevms++; if (virEventAddHandle(vm->monitor, POLLERR | POLLHUP, lxcMonitorEvent, driver) < 0) { - lxcVmTerminate(conn, driver, vm, 0); + lxcVmTerminate(conn, vm, 0); goto cleanup; } @@ -756,7 +779,7 @@ int rc = -1; virConnectPtr conn = dom->conn; lxc_driver_t *driver = (lxc_driver_t *)(conn->privateData); - lxc_vm_t *vm = lxcFindVMByName(driver, dom->name); + virDomainObjPtr vm = virDomainFindByName(driver->domains, dom->name); if (!vm) { lxcError(conn, dom, VIR_ERR_INVALID_DOMAIN, @@ -785,26 +808,20 @@ const char *xml, unsigned int flags ATTRIBUTE_UNUSED) { lxc_driver_t *driver = (lxc_driver_t *)conn->privateData; - lxc_vm_t *vm; - lxc_vm_def_t *def; + virDomainObjPtr vm; + virDomainDefPtr def; virDomainPtr dom = NULL; - if (!(def = lxcParseVMDef(conn, xml, NULL))) { + if (!(def = virDomainDefParseString(conn, driver->caps, xml))) + goto return_point; + + if (!(vm = virDomainAssignDef(conn, &driver->domains, def))) { + virDomainDefFree(def); goto return_point; } - if (!(vm = lxcAssignVMDef(conn, driver, def))) { - lxcFreeVMDef(def); - goto return_point; - } - - if (lxcSaveVMDef(conn, driver, vm, def) < 0) { - lxcRemoveInactiveVM(driver, vm); - return NULL; - } - if (lxcVmStart(conn, driver, vm) < 0) { - lxcRemoveInactiveVM(driver, vm); + virDomainRemoveInactive(&driver->domains, vm); goto return_point; } @@ -828,7 +845,7 @@ static int lxcDomainShutdown(virDomainPtr dom) { lxc_driver_t *driver = (lxc_driver_t*)dom->conn->privateData; - lxc_vm_t *vm = lxcFindVMByID(driver, dom->id); + virDomainObjPtr vm = virDomainFindByID(driver->domains, dom->id); if (!vm) { lxcError(dom->conn, dom, VIR_ERR_INVALID_DOMAIN, @@ -836,7 +853,7 @@ return -1; } - return lxcVmTerminate(dom->conn, driver, vm, 0); + return lxcVmTerminate(dom->conn, vm, 0); } @@ -851,7 +868,7 @@ static int lxcDomainDestroy(virDomainPtr dom) { lxc_driver_t *driver = (lxc_driver_t*)dom->conn->privateData; - lxc_vm_t *vm = lxcFindVMByID(driver, dom->id); + virDomainObjPtr vm = virDomainFindByID(driver->domains, dom->id); if (!vm) { lxcError(dom->conn, dom, VIR_ERR_INVALID_DOMAIN, @@ -859,7 +876,7 @@ return -1; } - return lxcVmTerminate(dom->conn, driver, vm, SIGKILL); + return lxcVmTerminate(dom->conn, vm, SIGKILL); } static int lxcCheckNetNsSupport(void) @@ -880,7 +897,7 @@ static int lxcStartup(void) { uid_t uid = getuid(); - lxc_vm_t *vm; + virDomainObjPtr vm; /* Check that the user is root */ if (0 != uid) { @@ -903,13 +920,21 @@ return -1; } - /* Call function to load the container configuration files */ - if (lxcLoadContainerInfo(lxc_driver) < 0) { + if ((lxc_driver->caps = lxcCapsInit()) == NULL) { lxcShutdown(); return -1; } - vm = lxc_driver->vms; + if (virDomainLoadAllConfigs(NULL, + lxc_driver->caps, + &lxc_driver->domains, + lxc_driver->configDir, + lxc_driver->autostartDir) < 0) { + lxcShutdown(); + return -1; + } + + vm = lxc_driver->domains; while (vm) { int rc; if ((vm->monitor = lxcMonitorClient(NULL, lxc_driver, vm)) < 0) { @@ -928,8 +953,6 @@ if (vm->pid != 0) { vm->def->id = vm->pid; vm->state = VIR_DOMAIN_RUNNING; - lxc_driver->ninactivevms--; - lxc_driver->nactivevms++; } else { vm->def->id = -1; close(vm->monitor); @@ -945,6 +968,7 @@ static void lxcFreeDriver(lxc_driver_t *driver) { VIR_FREE(driver->configDir); + VIR_FREE(driver->autostartDir); VIR_FREE(driver->stateDir); VIR_FREE(driver->logDir); VIR_FREE(driver); @@ -952,10 +976,15 @@ static int lxcShutdown(void) { + virDomainObjPtr vm; if (lxc_driver == NULL) return(-1); - lxcFreeVMs(lxc_driver->vms); - lxc_driver->vms = NULL; + vm = lxc_driver->domains; + while (vm) { + virDomainObjPtr next = vm->next; + virDomainObjFree(vm); + vm = next; + } lxcFreeDriver(lxc_driver); lxc_driver = NULL; @@ -971,13 +1000,17 @@ */ static int lxcActive(void) { + virDomainObjPtr dom; + if (lxc_driver == NULL) return(0); - /* If we've any active networks or guests, then we - * mark this driver as active - */ - if (lxc_driver->nactivevms) - return 1; + + dom = lxc_driver->domains; + while (dom) { + if (virDomainIsActive(dom)) + return 1; + dom = dom->next; + } /* Otherwise we're happy to deal with a shutdown */ return 0; -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

"Daniel P. Berrange" <berrange@redhat.com> wrote:
The re-architecting of the LXC controller/container process relationship in the previous patch removed the last obstacle to switching over to the generic domain XML routines. So this patch switches the driver over.
First the vast majority of lxc_conf.h/c is simply deleted - this is all redundant when using the domain_conf.h APIs. Then, all references to lxc_vm_t are changed to virDomainObj, and lxc_vm_def_t switches to virDomainDef. Finally the LXC driver registers its capabilities data. For this I have chosen an OS type of 'exe', since the 'operating system' we're running in the container is just any plain executable process.
lxc_conf.c | 1052 +------------------------------------------------------ lxc_conf.h | 121 ------ lxc_container.c | 23 - lxc_container.h | 2 lxc_controller.c | 4 lxc_controller.h | 2 lxc_driver.c | 289 ++++++++------- 7 files changed, 215 insertions(+), 1278 deletions(-)
All looks fine. ACK. However, please note that it would have been a lot easier/quicker to review if you'd done the mechanical/automatable changes (i.e., the global substitutions like s/lxc_vm_t/virDomainObj/) separately from the others.

On Tue, Aug 12, 2008 at 04:39:05PM +0200, Jim Meyering wrote:
"Daniel P. Berrange" <berrange@redhat.com> wrote:
The re-architecting of the LXC controller/container process relationship in the previous patch removed the last obstacle to switching over to the generic domain XML routines. So this patch switches the driver over.
First the vast majority of lxc_conf.h/c is simply deleted - this is all redundant when using the domain_conf.h APIs. Then, all references to lxc_vm_t are changed to virDomainObj, and lxc_vm_def_t switches to virDomainDef. Finally the LXC driver registers its capabilities data. For this I have chosen an OS type of 'exe', since the 'operating system' we're running in the container is just any plain executable process.
lxc_conf.c | 1052 +------------------------------------------------------ lxc_conf.h | 121 ------ lxc_container.c | 23 - lxc_container.h | 2 lxc_controller.c | 4 lxc_controller.h | 2 lxc_driver.c | 289 ++++++++------- 7 files changed, 215 insertions(+), 1278 deletions(-)
All looks fine. ACK.
However, please note that it would have been a lot easier/quicker to review if you'd done the mechanical/automatable changes (i.e., the global substitutions like s/lxc_vm_t/virDomainObj/) separately from the others.
If i'd separated out the substitution, then the intermediate state between the two patches would not be functional. I prefer to have the changes fully operational at each step so you can bisect change history. Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

This patch removes the configDir and autostartLink paths from the internal virDomainObjPtr struct. Instead they can calculated at time of use. This is to facilitate a following patch, where we want to write a config to an alternative location at some points. This also makes the LXC & QEMU drivers correctly set/use the 'persistent' property. domain_conf.c | 99 +++++++++++++++++++++++++++++----------------------------- domain_conf.h | 8 +--- lxc_driver.c | 40 ++++++++++++++++++----- qemu_driver.c | 71 ++++++++++++++++++++++++++++++----------- 4 files changed, 138 insertions(+), 80 deletions(-) Daniel diff -r a204a9425afd src/domain_conf.c --- a/src/domain_conf.c Tue Aug 05 16:50:49 2008 +0100 +++ b/src/domain_conf.c Tue Aug 05 16:50:51 2008 +0100 @@ -399,8 +399,6 @@ virDomainDefFree(dom->newDef); VIR_FREE(dom->vcpupids); - VIR_FREE(dom->configFile); - VIR_FREE(dom->autostartLink); VIR_FREE(dom); } @@ -2952,31 +2950,23 @@ int virDomainSaveConfig(virConnectPtr conn, const char *configDir, - const char *autostartDir, - virDomainObjPtr dom) + virDomainDefPtr def) { char *xml; + char *configFile = NULL; int fd = -1, ret = -1; size_t towrite; int err; - if (!dom->configFile && - asprintf(&dom->configFile, "%s/%s.xml", - configDir, dom->def->name) < 0) { - dom->configFile = NULL; - virDomainReportError(conn, VIR_ERR_NO_MEMORY, NULL); - goto cleanup; - } - if (!dom->autostartLink && - asprintf(&dom->autostartLink, "%s/%s.xml", - autostartDir, dom->def->name) < 0) { - dom->autostartLink = NULL; + if (asprintf(&configFile, "%s/%s.xml", + configDir, def->name) < 0) { + configFile = NULL; virDomainReportError(conn, VIR_ERR_NO_MEMORY, NULL); goto cleanup; } if (!(xml = virDomainDefFormat(conn, - dom->newDef ? dom->newDef : dom->def, + def, VIR_DOMAIN_XML_SECURE))) goto cleanup; @@ -2987,34 +2977,27 @@ goto cleanup; } - if ((err = virFileMakePath(autostartDir))) { - virDomainReportError(conn, VIR_ERR_INTERNAL_ERROR, - _("cannot create autostart directory %s: %s"), - autostartDir, strerror(err)); - goto cleanup; - } - - if ((fd = open(dom->configFile, + if ((fd = open(configFile, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR )) < 0) { virDomainReportError(conn, VIR_ERR_INTERNAL_ERROR, - _("cannot create config file %s: %s"), - dom->configFile, strerror(errno)); + _("cannot create config file %s: %s"), + configFile, strerror(errno)); goto cleanup; } towrite = strlen(xml); if (safewrite(fd, xml, towrite) < 0) { virDomainReportError(conn, VIR_ERR_INTERNAL_ERROR, - _("cannot write config file %s: %s"), - dom->configFile, strerror(errno)); + _("cannot write config file %s: %s"), + configFile, strerror(errno)); goto cleanup; } if (close(fd) < 0) { virDomainReportError(conn, VIR_ERR_INTERNAL_ERROR, - _("cannot save config file %s: %s"), - dom->configFile, strerror(errno)); + _("cannot save config file %s: %s"), + configFile, strerror(errno)); goto cleanup; } @@ -3072,8 +3055,6 @@ goto error; dom->state = VIR_DOMAIN_SHUTOFF; - dom->configFile = configFile; - dom->autostartLink = autostartLink; dom->autostart = autostart; return dom; @@ -3104,6 +3085,8 @@ } while ((entry = readdir(dir))) { + virDomainObjPtr dom; + if (entry->d_name[0] == '.') continue; @@ -3112,12 +3095,14 @@ /* NB: ignoring errors, so one malformed config doesn't kill the whole process */ - virDomainLoadConfig(conn, - caps, - doms, - configDir, - autostartDir, - entry->d_name); + dom = virDomainLoadConfig(conn, + caps, + doms, + configDir, + autostartDir, + entry->d_name); + if (dom) + dom->persistent = 1; } closedir(dir); @@ -3126,25 +3111,43 @@ } int virDomainDeleteConfig(virConnectPtr conn, - virDomainObjPtr dom) + const char *configDir, + const char *autostartDir, + virDomainObjPtr dom) { - if (!dom->configFile || !dom->autostartLink) { - virDomainReportError(conn, VIR_ERR_INTERNAL_ERROR, - _("no config file for %s"), dom->def->name); - return -1; + char *configFile = NULL, *autostartLink = NULL; + int ret = -1; + + if (asprintf(&configFile, "%s/%s", + configDir, dom->def->name) < 0) { + configFile = NULL; + virDomainReportError(conn, VIR_ERR_NO_MEMORY, NULL); + goto cleanup; + } + if (asprintf(&autostartLink, "%s/%s", + autostartDir, dom->def->name) < 0) { + autostartLink = NULL; + virDomainReportError(conn, VIR_ERR_NO_MEMORY, NULL); + goto cleanup; } /* Not fatal if this doesn't work */ - unlink(dom->autostartLink); + unlink(autostartLink); - if (unlink(dom->configFile) < 0) { + if (unlink(configFile) < 0 && + errno != ENOENT) { virDomainReportError(conn, VIR_ERR_INTERNAL_ERROR, - _("cannot remove config for %s: %s"), + _("cannot remove config for %s: %s"), dom->def->name, strerror(errno)); - return -1; + goto cleanup; } - return 0; + ret = 0; + +cleanup: + VIR_FREE(configFile); + VIR_FREE(autostartLink); + return ret; } #endif /* ! PROXY */ diff -r a204a9425afd src/domain_conf.h --- a/src/domain_conf.h Tue Aug 05 16:50:49 2008 +0100 +++ b/src/domain_conf.h Tue Aug 05 16:50:51 2008 +0100 @@ -409,9 +409,6 @@ unsigned int autostart : 1; unsigned int persistent : 1; - char *configFile; - char *autostartLink; - virDomainDefPtr def; /* The current definition */ virDomainDefPtr newDef; /* New definition to activate at shutdown */ @@ -482,8 +479,7 @@ int virDomainSaveConfig(virConnectPtr conn, const char *configDir, - const char *autostartDir, - virDomainObjPtr dom); + virDomainDefPtr def); virDomainObjPtr virDomainLoadConfig(virConnectPtr conn, virCapsPtr caps, @@ -499,6 +495,8 @@ const char *autostartDir); int virDomainDeleteConfig(virConnectPtr conn, + const char *configDir, + const char *autostartDir, virDomainObjPtr dom); virDomainNetDefPtr virDomainNetDefParseXML(virConnectPtr conn, diff -r a204a9425afd src/lxc_driver.c --- a/src/lxc_driver.c Tue Aug 05 16:50:49 2008 +0100 +++ b/src/lxc_driver.c Tue Aug 05 16:50:51 2008 +0100 @@ -250,11 +250,11 @@ virDomainDefFree(def); return NULL; } + vm->persistent = 1; if (virDomainSaveConfig(conn, driver->configDir, - driver->autostartDir, - vm) < 0) { + vm->newDef ? vm->newDef : vm->def) < 0) { virDomainRemoveInactive(&driver->domains, vm); return NULL; } @@ -284,10 +284,17 @@ return -1; } - if (virDomainDeleteConfig(dom->conn, vm) <0) + if (!vm->persistent) { + lxcError(dom->conn, dom, VIR_ERR_INTERNAL_ERROR, + "%s", _("cannot undefine transient domain")); return -1; + } - vm->configFile[0] = '\0'; + if (virDomainDeleteConfig(dom->conn, + driver->configDir, + driver->autostartDir, + vm) <0) + return -1; virDomainRemoveInactive(&driver->domains, vm); @@ -639,8 +646,13 @@ return; } - if (lxcVmTerminate(NULL, vm, SIGINT) < 0) - virEventRemoveHandle(fd); + lxcVmTerminate(NULL, vm, SIGINT); + + virEventRemoveHandle(fd); + + if (!vm->persistent) + virDomainRemoveInactive(&driver->domains, + vm); } @@ -846,6 +858,7 @@ { lxc_driver_t *driver = (lxc_driver_t*)dom->conn->privateData; virDomainObjPtr vm = virDomainFindByID(driver->domains, dom->id); + int ret; if (!vm) { lxcError(dom->conn, dom, VIR_ERR_INVALID_DOMAIN, @@ -853,7 +866,12 @@ return -1; } - return lxcVmTerminate(dom->conn, vm, 0); + ret = lxcVmTerminate(dom->conn, vm, 0); + if (!vm->persistent) + virDomainRemoveInactive(&driver->domains, + vm); + + return ret; } @@ -869,6 +887,7 @@ { lxc_driver_t *driver = (lxc_driver_t*)dom->conn->privateData; virDomainObjPtr vm = virDomainFindByID(driver->domains, dom->id); + int ret; if (!vm) { lxcError(dom->conn, dom, VIR_ERR_INVALID_DOMAIN, @@ -876,7 +895,12 @@ return -1; } - return lxcVmTerminate(dom->conn, vm, SIGKILL); + ret = lxcVmTerminate(dom->conn, vm, SIGKILL); + if (!vm->persistent) + virDomainRemoveInactive(&driver->domains, + vm); + + return ret; } static int lxcCheckNetNsSupport(void) diff -r a204a9425afd src/qemu_driver.c --- a/src/qemu_driver.c Tue Aug 05 16:50:49 2008 +0100 +++ b/src/qemu_driver.c Tue Aug 05 16:50:51 2008 +0100 @@ -351,7 +351,7 @@ virDomainObjPtr next = vm->next; if (virDomainIsActive(vm)) qemudShutdownVMDaemon(NULL, qemu_driver, vm); - if (!vm->configFile) + if (!vm->persistent) virDomainRemoveInactive(&qemu_driver->domains, vm); vm = next; @@ -1068,7 +1068,7 @@ static int qemudDispatchVMLog(struct qemud_driver *driver, virDomainObjPtr vm, int fd) { if (qemudVMData(driver, vm, fd) < 0) { qemudShutdownVMDaemon(NULL, driver, vm); - if (!vm->configFile) + if (!vm->persistent) virDomainRemoveInactive(&driver->domains, vm); } @@ -1078,7 +1078,7 @@ static int qemudDispatchVMFailure(struct qemud_driver *driver, virDomainObjPtr vm, int fd ATTRIBUTE_UNUSED) { qemudShutdownVMDaemon(NULL, driver, vm); - if (!vm->configFile) + if (!vm->persistent) virDomainRemoveInactive(&driver->domains, vm); return 0; @@ -2138,7 +2138,7 @@ } qemudShutdownVMDaemon(dom->conn, driver, vm); - if (!vm->configFile) + if (!vm->persistent) virDomainRemoveInactive(&driver->domains, vm); @@ -2469,7 +2469,7 @@ /* Shut it down */ qemudShutdownVMDaemon(dom->conn, driver, vm); - if (!vm->configFile) + if (!vm->persistent) virDomainRemoveInactive(&driver->domains, vm); @@ -2761,7 +2761,7 @@ if (ret < 0) { qemudReportError(conn, NULL, NULL, VIR_ERR_OPERATION_FAILED, "%s", _("failed to start VM")); - if (!vm->configFile) + if (!vm->persistent) virDomainRemoveInactive(&driver->domains, vm); return -1; @@ -2867,11 +2867,11 @@ virDomainDefFree(def); return NULL; } + vm->persistent = 1; if (virDomainSaveConfig(conn, driver->configDir, - driver->autostartDir, - vm) < 0) { + vm->newDef ? vm->newDef : vm->def) < 0) { virDomainRemoveInactive(&driver->domains, vm); return NULL; @@ -2898,7 +2898,13 @@ return -1; } - if (virDomainDeleteConfig(dom->conn, vm) < 0) + if (!vm->persistent) { + qemudReportError(dom->conn, dom, NULL, VIR_ERR_INTERNAL_ERROR, + "%s", _("cannot undefine transient domain")); + return -1; + } + + if (virDomainDeleteConfig(dom->conn, driver->configDir, driver->autostartDir, vm) < 0) return -1; virDomainRemoveInactive(&driver->domains, @@ -3024,13 +3030,21 @@ } static int qemudDomainSetAutostart(virDomainPtr dom, - int autostart) { + int autostart) { struct qemud_driver *driver = (struct qemud_driver *)dom->conn->privateData; virDomainObjPtr vm = virDomainFindByUUID(driver->domains, dom->uuid); + char *configFile = NULL, *autostartLink = NULL; + int ret = -1; if (!vm) { qemudReportError(dom->conn, dom, NULL, VIR_ERR_INVALID_DOMAIN, "%s", _("no domain with matching uuid")); + return -1; + } + + if (!vm->persistent) { + qemudReportError(dom->conn, dom, NULL, VIR_ERR_INTERNAL_ERROR, + "%s", _("cannot set autostart for transient domain")); return -1; } @@ -3039,6 +3053,20 @@ if (vm->autostart == autostart) return 0; + if (asprintf(&configFile, "%s/%s.xml", + driver->configDir, vm->def->name) < 0) { + configFile = NULL; + qemudReportError(dom->conn, dom, NULL, VIR_ERR_NO_MEMORY, NULL); + goto cleanup; + } + + if (asprintf(&autostartLink, "%s/%s.xml", + driver->autostartDir, vm->def->name) < 0) { + autostartLink = NULL; + qemudReportError(dom->conn, dom, NULL, VIR_ERR_NO_MEMORY, NULL); + goto cleanup; + } + if (autostart) { int err; @@ -3046,27 +3074,32 @@ qemudReportError(dom->conn, dom, NULL, VIR_ERR_INTERNAL_ERROR, _("cannot create autostart directory %s: %s"), driver->autostartDir, strerror(err)); - return -1; + goto cleanup; } - if (symlink(vm->configFile, vm->autostartLink) < 0) { + if (symlink(configFile, autostartLink) < 0) { qemudReportError(dom->conn, dom, NULL, VIR_ERR_INTERNAL_ERROR, - _("Failed to create symlink '%s' to '%s': %s"), - vm->autostartLink, vm->configFile, strerror(errno)); - return -1; + _("Failed to create symlink '%s to '%s': %s"), + autostartLink, configFile, strerror(errno)); + goto cleanup; } } else { - if (unlink(vm->autostartLink) < 0 && errno != ENOENT && errno != ENOTDIR) { + if (unlink(autostartLink) < 0 && errno != ENOENT && errno != ENOTDIR) { qemudReportError(dom->conn, dom, NULL, VIR_ERR_INTERNAL_ERROR, _("Failed to delete symlink '%s': %s"), - vm->autostartLink, strerror(errno)); - return -1; + autostartLink, strerror(errno)); + goto cleanup; } } vm->autostart = autostart; + ret = 0; - return 0; +cleanup: + VIR_FREE(configFile); + VIR_FREE(autostartLink); + + return ret; } /* This uses the 'info blockstats' monitor command which was -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

"Daniel P. Berrange" <berrange@redhat.com> wrote: ... ACK, modulo two questions:
diff -r a204a9425afd src/domain_conf.c int virDomainDeleteConfig(virConnectPtr conn, - virDomainObjPtr dom) + const char *configDir, + const char *autostartDir, + virDomainObjPtr dom) { - if (!dom->configFile || !dom->autostartLink) { - virDomainReportError(conn, VIR_ERR_INTERNAL_ERROR, - _("no config file for %s"), dom->def->name); - return -1; + char *configFile = NULL, *autostartLink = NULL; + int ret = -1; + + if (asprintf(&configFile, "%s/%s",
Shouldn't that be "%s/%s.xml", as used in at least two other places where configFile is defined? This deserves a tiny helper function, so that the dom->configFile mapping happens in just one place.
+ configDir, dom->def->name) < 0) { + configFile = NULL; ...
#endif /* ! PROXY */ ... diff -r a204a9425afd src/qemu_driver.c ... static int qemudDomainSetAutostart(virDomainPtr dom, - int autostart) { + int autostart) { struct qemud_driver *driver = (struct qemud_driver *)dom->conn->privateData; virDomainObjPtr vm = virDomainFindByUUID(driver->domains, dom->uuid); + char *configFile = NULL, *autostartLink = NULL; + int ret = -1;
if (!vm) { qemudReportError(dom->conn, dom, NULL, VIR_ERR_INVALID_DOMAIN, "%s", _("no domain with matching uuid")); + return -1; + } + + if (!vm->persistent) { + qemudReportError(dom->conn, dom, NULL, VIR_ERR_INTERNAL_ERROR, + "%s", _("cannot set autostart for transient domain")); return -1; }
@@ -3039,6 +3053,20 @@ if (vm->autostart == autostart) return 0;
+ if (asprintf(&configFile, "%s/%s.xml", + driver->configDir, vm->def->name) < 0) { + configFile = NULL; + qemudReportError(dom->conn, dom, NULL, VIR_ERR_NO_MEMORY, NULL); + goto cleanup; + } + + if (asprintf(&autostartLink, "%s/%s.xml", + driver->autostartDir, vm->def->name) < 0) {
Your 6/7 patch has this: if (asprintf(&autostartLink, "%s/%s", Is the different format deliberate? Maybe a helper function for the dom->autostartLink name, too?
+ autostartLink = NULL; + qemudReportError(dom->conn, dom, NULL, VIR_ERR_NO_MEMORY, NULL); + goto cleanup; + } + ... +cleanup: + VIR_FREE(configFile); + VIR_FREE(autostartLink); + + return ret; }
/* This uses the 'info blockstats' monitor command which was

Internally the drivers track the current live configuration, and the new inactive config for running domains. When the libvirtd process is restarted though, this data is lost for any active LXC domains. This patch makes the LXC driver persist the live config to /var/run/libvirt/lxc/NAME.xml so it can be tracked across restarts It required a small change to the domain XML APis to make the autostart symlink processing optional when deleting a config file domain_conf.c | 24 ++++++++++++------------ lxc_driver.c | 27 +++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 12 deletions(-) Daniel diff -r cf1cf3a1d4d6 src/domain_conf.c --- a/src/domain_conf.c Tue Aug 05 16:50:51 2008 +0100 +++ b/src/domain_conf.c Tue Aug 05 16:50:59 2008 +0100 @@ -1087,13 +1087,11 @@ break; case VIR_DOMAIN_CHR_TYPE_PTY: - /* @path attribute is an output only property - pty is auto-allocted */ - break; - case VIR_DOMAIN_CHR_TYPE_DEV: case VIR_DOMAIN_CHR_TYPE_FILE: case VIR_DOMAIN_CHR_TYPE_PIPE: - if (path == NULL) { + if (path == NULL && + def->type != VIR_DOMAIN_CHR_TYPE_PTY) { virDomainReportError(conn, VIR_ERR_INTERNAL_ERROR, "%s", _("Missing source path attribute for char device")); goto error; @@ -3124,15 +3122,17 @@ virDomainReportError(conn, VIR_ERR_NO_MEMORY, NULL); goto cleanup; } - if (asprintf(&autostartLink, "%s/%s", - autostartDir, dom->def->name) < 0) { - autostartLink = NULL; - virDomainReportError(conn, VIR_ERR_NO_MEMORY, NULL); - goto cleanup; + if (autostartDir) { + if (asprintf(&autostartLink, "%s/%s", + autostartDir, dom->def->name) < 0) { + autostartLink = NULL; + virDomainReportError(conn, VIR_ERR_NO_MEMORY, NULL); + goto cleanup; + } + + /* Not fatal if this doesn't work */ + unlink(autostartLink); } - - /* Not fatal if this doesn't work */ - unlink(autostartLink); if (unlink(configFile) < 0 && errno != ENOENT) { diff -r cf1cf3a1d4d6 src/lxc_driver.c --- a/src/lxc_driver.c Tue Aug 05 16:50:51 2008 +0100 +++ b/src/lxc_driver.c Tue Aug 05 16:50:59 2008 +0100 @@ -399,6 +399,7 @@ close(vm->monitor); virFileDeletePid(driver->stateDir, vm->def->name); + virDomainDeleteConfig(conn, driver->stateDir, NULL, vm); vm->state = VIR_DOMAIN_SHUTOFF; vm->pid = -1; @@ -615,6 +616,12 @@ if (signum == 0) signum = SIGINT; + if (vm->pid <= 0) { + lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + _("invalid PID %d for container"), vm->pid); + return -1; + } + if (kill(vm->pid, signum) < 0) { if (errno != ESRCH) { lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, @@ -741,6 +748,12 @@ lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, _("Failed to read pid file %s/%s.pid: %s"), driver->stateDir, vm->def->name, strerror(errno)); + rc = -1; + goto cleanup; + } + + /* Persist the live configuration */ + if (virDomainSaveConfig(conn, driver->stateDir, vm->def) < 0) { rc = -1; goto cleanup; } @@ -960,6 +973,8 @@ vm = lxc_driver->domains; while (vm) { + char *config = NULL; + virDomainDefPtr tmp; int rc; if ((vm->monitor = lxcMonitorClient(NULL, lxc_driver, vm)) < 0) { vm = vm->next; @@ -972,6 +987,18 @@ vm->monitor = -1; vm = vm->next; continue; + } + + if (asprintf(&config, "%s/%s.xml", + lxc_driver->stateDir, vm->def->name) < 0) + continue; + + /* Try and load the live config */ + tmp = virDomainDefParseFile(NULL, lxc_driver->caps, config); + VIR_FREE(config); + if (tmp) { + vm->newDef = vm->def; + vm->def = tmp; } if (vm->pid != 0) { -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

"Daniel P. Berrange" <berrange@redhat.com> wrote:
Internally the drivers track the current live configuration, and the new inactive config for running domains. When the libvirtd process is restarted though, this data is lost for any active LXC domains. This patch makes the LXC driver persist the live config to /var/run/libvirt/lxc/NAME.xml so it can be tracked across restarts
It required a small change to the domain XML APis to make the autostart symlink processing optional when deleting a config file ...
Looks fine to me. ACK
diff -r cf1cf3a1d4d6 src/lxc_driver.c --- a/src/lxc_driver.c Tue Aug 05 16:50:51 2008 +0100 +++ b/src/lxc_driver.c Tue Aug 05 16:50:59 2008 +0100 ... @@ -960,6 +973,8 @@
vm = lxc_driver->domains; while (vm) { + char *config = NULL; + virDomainDefPtr tmp;
The initialization of config looks unnecessary. The only other change I'd make would be to move both declarations "down" to first use. Then the context-challenged reader doesn't have to wonder what, if anything, happens to those variables between declaration and first use.
int rc; if ((vm->monitor = lxcMonitorClient(NULL, lxc_driver, vm)) < 0) { vm = vm->next; @@ -972,6 +987,18 @@ vm->monitor = -1; vm = vm->next; continue; + } +
char *config;
+ if (asprintf(&config, "%s/%s.xml", + lxc_driver->stateDir, vm->def->name) < 0) + continue; + + /* Try and load the live config */ + tmp = virDomainDefParseFile(NULL, lxc_driver->caps, config);
virDomainDefPtr tmp = virDomainDefParseFile(NULL, lxc_driver->caps, config);
+ VIR_FREE(config); + if (tmp) { + vm->newDef = vm->def; + vm->def = tmp; }
if (vm->pid != 0) {

The LXC driver currently allows custom mount points to be setup inside the container. This only works for non-root mount points. You cannot replace the entire root filesystem. This patch adds support for replacing the entire root filesystem, thus allowing the use of LXC containers as a 'better chroot than chroot'. Well, with one minor flaw - the Linux kernel currently has no device namespace virtualization, so the admin inside the container can just do a 'mknod' and access the real devices of the host. So for now this patch doesn't make LXC containers secure, but a future kernel release will enable it to be secure. lxc_container.c | 253 ++++++++++++++++++++++++++++++++++++++++++++++++-------- util.c | 12 +- 2 files changed, 226 insertions(+), 39 deletions(-) Daniel diff -r eaa42985aed4 src/lxc_container.c --- a/src/lxc_container.c Tue Aug 05 16:50:59 2008 +0100 +++ b/src/lxc_container.c Tue Aug 05 16:51:14 2008 +0100 @@ -1,10 +1,12 @@ /* * Copyright IBM Corp. 2008 + * Copyright Red Hat 2008 * * lxc_container.c: file description * * Authors: * David L. Leskovec <dlesko at linux.vnet.ibm.com> + * Daniel P. Berrange <berrange@redhat.com> * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -28,10 +30,18 @@ #include <fcntl.h> #include <limits.h> #include <stdlib.h> +#include <stdio.h> #include <sys/ioctl.h> #include <sys/mount.h> #include <sys/wait.h> #include <unistd.h> +#include <mntent.h> + +/* Yes, we want linux private one, for _syscall2() macro */ +#include <linux/unistd.h> + +/* For MS_MOVE */ +#include <linux/fs.h> #include "lxc_container.h" #include "util.h" @@ -105,23 +115,15 @@ * * Returns 0 on success or -1 in case of error */ -static int lxcContainerSetStdio(int control, const char *ttyPath) +static int lxcContainerSetStdio(int control, int ttyfd) { int rc = -1; - int ttyfd; int open_max, i; if (setsid() < 0) { lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, _("setsid failed: %s"), strerror(errno)); - goto error_out; - } - - ttyfd = open(ttyPath, O_RDWR|O_NOCTTY); - if (ttyfd < 0) { - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("open(%s) failed: %s"), ttyPath, strerror(errno)); - goto error_out; + goto cleanup; } if (ioctl(ttyfd, TIOCSCTTY, NULL) < 0) { @@ -159,8 +161,6 @@ cleanup: close(ttyfd); - -error_out: return rc; } @@ -223,6 +223,7 @@ return 0; } + /** * lxcEnableInterfaces: * @vm: Pointer to vm structure @@ -252,6 +253,20 @@ return rc; } + +//_syscall2(int, pivot_root, char *, newroot, const char *, oldroot) +extern int pivot_root(const char * new_root,const char * put_old); + +static int lxcContainerChildMountSort(const void *a, const void *b) +{ + const char **sa = (const char**)a; + const char **sb = (const char**)b; + + /* Delibrately reversed args - we need to unmount deepest + children first */ + return strcmp(*sb, *sa); +} + /** * lxcChild: * @argv: Pointer to container arguments @@ -269,8 +284,8 @@ int rc = -1; lxc_child_argv_t *argv = data; virDomainDefPtr vmDef = argv->config; - virDomainFSDefPtr curMount; - int i; + virDomainFSDefPtr tmp, root = NULL; + int ttyfd, i; if (NULL == vmDef) { lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, @@ -278,36 +293,210 @@ return -1; } +#if 0 + ttyfd = open(argv->ttyPath, O_RDWR|O_NOCTTY); + if (ttyfd < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("open(%s) failed: %s"), argv->ttyPath, strerror(errno)); + return -1; + } +#endif + /* handle the bind mounts first before doing anything else that may */ /* then access those mounted dirs */ - curMount = vmDef->fss; - for (i = 0; curMount; curMount = curMount->next) { - // XXX fix - if (curMount->type != VIR_DOMAIN_FS_TYPE_MOUNT) + for (tmp = vmDef->fss; tmp && !root; tmp = tmp->next) { + if (tmp->type != VIR_DOMAIN_FS_TYPE_MOUNT) continue; - rc = mount(curMount->src, - curMount->dst, - NULL, - MS_BIND, - NULL); - if (0 != rc) { + if (STREQ(tmp->dst, "/")) + root = tmp; + } + + if (root) { + char *oldroot; + struct mntent *mntent; + char **mounts = NULL; + int nmounts = 0; + FILE *procmnt; + struct { + int maj; + int min; + const char *path; + } devs[] = { + { 1, 3, "/dev/null" }, + { 1, 5, "/dev/zero" }, + { 1, 7, "/dev/full" }, + { 5, 1, "/dev/console" }, + }; + + /* Got a FS mapped to /, we're going the pivot_root + approach to do a better-chroot-than-chroot */ + + /* this is based on this thread http://lkml.org/lkml/2008/3/5/29 */ + + /* First step is to ensure the new root itself is + a mount point */ + if (mount(root->src, root->src, NULL, MS_BIND, NULL) < 0) { lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("failed to mount %s at %s for container: %s"), - curMount->src, curMount->dst, strerror(errno)); + _("failed to bind new root %s: %s"), + root->src, strerror(errno)); + return -1; + } + + if (asprintf(&oldroot, "%s/.oldroot", root->src) < 0) { + lxcError(NULL, NULL, VIR_ERR_NO_MEMORY, NULL); + return -1; + } + + if (virFileMakePath(oldroot) < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to create %s: %s"), + oldroot, strerror(errno)); + return -1; + } + + /* The old root directory will live at /.oldroot after + * this and will soon be unmounted completely */ + if (pivot_root(root->src, oldroot) < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to pivot root %s to %s: %s"), + oldroot, root->src, strerror(errno)); + return -1; + } + + /* CWD is undefined after pivot_root, so go to / */ + if (chdir("/") < 0) { + return -1; + } + + if (virFileMakePath("/proc") < 0 || + mount("none", "/proc", "proc", 0, NULL) < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to mount /proc for container: %s"), + strerror(errno)); + return -1; + } + if (virFileMakePath("/dev") < 0 || + mount("none", "/dev", "tmpfs", 0, NULL) < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to mount /dev tmpfs for container: %s"), + strerror(errno)); + return -1; + } + /* Move old devpts into container, since we have to + connect to the master ptmx which was opened in + the parent. + XXX This sucks, we need to figure out how to get our + own private devpts for isolation + */ + if (virFileMakePath("/dev/pts") < 0 || + mount("/.oldroot/dev/pts", "/dev/pts", NULL, + MS_MOVE, NULL) < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to move /dev/pts into container: %s"), + strerror(errno)); + return -1; + } + + /* Populate /dev/ with a few important bits */ + umask(0); + for (i = 0 ; i < ARRAY_CARDINALITY(devs) ; i++) { + dev_t dev = makedev(devs[i].maj, devs[i].min); + if (mknod(devs[i].path, + 0777 | S_IFCHR, + dev) < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to make device %s: %s"), + devs[i].path, strerror(errno)); + return -1; + } + } + umask(0700); + + /* Pull in rest of container's mounts */ + for (tmp = vmDef->fss; tmp; tmp = tmp->next) { + char *src; + if (STREQ(tmp->dst, "/")) + continue; + // XXX fix + if (tmp->type != VIR_DOMAIN_FS_TYPE_MOUNT) + continue; + + if (asprintf(&src, "/.oldroot/%s", tmp->src) < 0) + return -1; + + if (virFileMakePath(tmp->dst) < 0 || + mount(src, tmp->dst, NULL, MS_BIND, NULL) < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to mount %s at %s for container: %s"), + tmp->src, tmp->dst, strerror(errno)); + return -1; + } + VIR_FREE(src); + } + + if (!(procmnt = setmntent("/proc/mounts", "r"))) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to read /proc/mounts: %s"), + strerror(errno)); + return -1; + } + while ((mntent = getmntent(procmnt)) != NULL) { + if (!STRPREFIX(mntent->mnt_dir, "/.oldroot")) + continue; + if (VIR_REALLOC_N(mounts, nmounts+1) < 0) + return -1; + mounts[nmounts++] = strdup(mntent->mnt_dir); + } + endmntent(procmnt); + + qsort(mounts, nmounts, sizeof(mounts[0]), + lxcContainerChildMountSort); + + for (i = 0 ; i < nmounts ; i++) { + if (umount(mounts[i]) < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to unmount %s: %s"), + mounts[i], strerror(errno)); + return -1; + } + } + } else { + /* Nothing mapped to /, we're using the main root, + but with extra stuff mapped in */ + for (tmp = vmDef->fss; tmp; tmp = tmp->next) { + // XXX fix + if (tmp->type != VIR_DOMAIN_FS_TYPE_MOUNT) + continue; + rc = mount(tmp->src, + tmp->dst, + NULL, + MS_BIND, + NULL); + if (0 != rc) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to mount %s at %s for container: %s"), + tmp->src, tmp->dst, strerror(errno)); + return -1; + } + } + + /* mount /proc */ + if (mount("lxcproc", "/proc", "proc", 0, NULL) < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to mount /proc for container: %s"), + strerror(errno)); return -1; } } - /* mount /proc */ - rc = mount("lxcproc", "/proc", "proc", 0, NULL); - if (0 != rc) { + ttyfd = open(argv->ttyPath, O_RDWR|O_NOCTTY); + if (ttyfd < 0) { lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("failed to mount /proc for container: %s"), - strerror(errno)); + _("open(%s) failed: %s"), argv->ttyPath, strerror(errno)); return -1; } - if (lxcContainerSetStdio(argv->monitor, argv->ttyPath) < 0) + if (lxcContainerSetStdio(argv->monitor, ttyfd) < 0) return -1; /* Wait for interface devices to show up */ diff -r eaa42985aed4 src/util.c --- a/src/util.c Tue Aug 05 16:50:59 2008 +0100 +++ b/src/util.c Tue Aug 05 16:51:14 2008 +0100 @@ -524,13 +524,11 @@ if (!(p = strrchr(parent, '/'))) return EINVAL; - if (p == parent) - return EPERM; - - *p = '\0'; - - if ((err = virFileMakePath(parent))) - return err; + if (p != parent) { + *p = '\0'; + if ((err = virFileMakePath(parent))) + return err; + } if (mkdir(path, 0777) < 0 && errno != EEXIST) return errno; -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

"Daniel P. Berrange" <berrange@redhat.com> wrote: ...
-static int lxcContainerSetStdio(int control, const char *ttyPath) +static int lxcContainerSetStdio(int control, int ttyfd) {
Hi Dan, Not a big deal, but since ttyfd is now an input to this function, it'd be less surprising if the caller were to close it. Probably not worth changing...
int rc = -1; - int ttyfd; int open_max, i;
if (setsid() < 0) { lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, _("setsid failed: %s"), strerror(errno)); - goto error_out; - } - - ttyfd = open(ttyPath, O_RDWR|O_NOCTTY); - if (ttyfd < 0) { - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("open(%s) failed: %s"), ttyPath, strerror(errno)); - goto error_out; + goto cleanup; }
if (ioctl(ttyfd, TIOCSCTTY, NULL) < 0) { @@ -159,8 +161,6 @@
cleanup: close(ttyfd); - -error_out: return rc; }
@@ -223,6 +223,7 @@ return 0; }
+ /** * lxcEnableInterfaces: * @vm: Pointer to vm structure @@ -252,6 +253,20 @@ return rc; }
...
+ if (root) { + char *oldroot; + struct mntent *mntent; + char **mounts = NULL; + int nmounts = 0; + FILE *procmnt;
This can be "const":
+ struct { + int maj; + int min; + const char *path; + } devs[] = { + { 1, 3, "/dev/null" }, + { 1, 5, "/dev/zero" }, + { 1, 7, "/dev/full" }, + { 5, 1, "/dev/console" },
Might be good to add /dev/random and /dev/urandom. I recently had trouble on a system where udev malfunctioned, and some /dev/{u,}random-requiring libs/services failed in unusual ways. Also, how about adding permission bits to the table, so that console doesn't end up being mode 0777: struct { int maj; int min; mode_t mode, const char *path; } const devs[] = { { 1, 3, 0666, "/dev/null" }, { 1, 5, 0666, "/dev/zero" }, { 1, 7, 0666, "/dev/full" }, { 5, 1, 0600, "/dev/console" }, { 1, 8, 0666, "/dev/random" }, { 1, 9, 0666, "/dev/urandom" }, };
+ if (virFileMakePath("/dev/pts") < 0 || + mount("/.oldroot/dev/pts", "/dev/pts", NULL, + MS_MOVE, NULL) < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to move /dev/pts into container: %s"), + strerror(errno)); + return -1; + } + + /* Populate /dev/ with a few important bits */ + umask(0);
In principle, it's better never to change umask. Otherwise, when multi-threaded, that temporarily-cleared umask could hose a file-creation operation in another thread -- and it'd be a race, so probably hard to reproduce.
+ for (i = 0 ; i < ARRAY_CARDINALITY(devs) ; i++) { + dev_t dev = makedev(devs[i].maj, devs[i].min); + if (mknod(devs[i].path, + 0777 | S_IFCHR,
Dropping the umask, you might do s/0777/0/ here, and then call chmod to set each mode to devs[i].mode
+ dev) < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to make device %s: %s"), + devs[i].path, strerror(errno));
Returning here would leave umask set to 0. Another reason not to change it in the first place.
+ return -1; + } + } + umask(0700);
When you do change umask, be sure to restore to the previous value.
+ /* Pull in rest of container's mounts */ + for (tmp = vmDef->fss; tmp; tmp = tmp->next) { + char *src; + if (STREQ(tmp->dst, "/")) + continue; + // XXX fix + if (tmp->type != VIR_DOMAIN_FS_TYPE_MOUNT) + continue; + + if (asprintf(&src, "/.oldroot/%s", tmp->src) < 0) + return -1; + + if (virFileMakePath(tmp->dst) < 0 || + mount(src, tmp->dst, NULL, MS_BIND, NULL) < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to mount %s at %s for container: %s"), + tmp->src, tmp->dst, strerror(errno));
Call VIR_FREE(src) here to avoid a leak.
+ return -1; + } + VIR_FREE(src); + } + + if (!(procmnt = setmntent("/proc/mounts", "r"))) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to read /proc/mounts: %s"), + strerror(errno)); + return -1; + } + while ((mntent = getmntent(procmnt)) != NULL) { + if (!STRPREFIX(mntent->mnt_dir, "/.oldroot")) + continue; + if (VIR_REALLOC_N(mounts, nmounts+1) < 0)
Call endmntent(procmnt) here, to avoid a potential file descriptor leak.
+ return -1;
+ mounts[nmounts++] = strdup(mntent->mnt_dir);
A failed strdup here would lead to segfault via qsort's comparator.
+ } + endmntent(procmnt); + + qsort(mounts, nmounts, sizeof(mounts[0]), + lxcContainerChildMountSort); + + for (i = 0 ; i < nmounts ; i++) { + if (umount(mounts[i]) < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to unmount %s: %s"), + mounts[i], strerror(errno)); + return -1;
Maybe try to unmount all of them before returning, even if an earlier one fails? Also, be sure to free "mounts" before returning.
+ } + } + } else { ...

On Wed, Aug 06, 2008 at 11:00:05AM +0200, Jim Meyering wrote:
"Daniel P. Berrange" <berrange@redhat.com> wrote: ...
-static int lxcContainerSetStdio(int control, const char *ttyPath) +static int lxcContainerSetStdio(int control, int ttyfd) {
Hi Dan,
Not a big deal, but since ttyfd is now an input to this function, it'd be less surprising if the caller were to close it. Probably not worth changing...
Yes, that does make sense actually - I'll change it.
+ struct { + int maj; + int min; + const char *path; + } devs[] = { + { 1, 3, "/dev/null" }, + { 1, 5, "/dev/zero" }, + { 1, 7, "/dev/full" }, + { 5, 1, "/dev/console" },
Might be good to add /dev/random and /dev/urandom. I recently had trouble on a system where udev malfunctioned, and some /dev/{u,}random-requiring libs/services failed in unusual ways.
Also, how about adding permission bits to the table, so that console doesn't end up being mode 0777:
Yes, that's a good idea. I'll add permissions & random/urandom.
+ if (virFileMakePath("/dev/pts") < 0 || + mount("/.oldroot/dev/pts", "/dev/pts", NULL, + MS_MOVE, NULL) < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to move /dev/pts into container: %s"), + strerror(errno)); + return -1; + } + + /* Populate /dev/ with a few important bits */ + umask(0);
In principle, it's better never to change umask. Otherwise, when multi-threaded, that temporarily-cleared umask could hose a file-creation operation in another thread -- and it'd be a race, so probably hard to reproduce.
In this particular scenario changing umask is safe because this code is running inside the container namespace in PID 1 - which at this point is the container's equivalent of the initrd, just doing some basic setup before running the 'init' process.
+ for (i = 0 ; i < ARRAY_CARDINALITY(devs) ; i++) { + dev_t dev = makedev(devs[i].maj, devs[i].min); + if (mknod(devs[i].path, + 0777 | S_IFCHR,
Dropping the umask, you might do s/0777/0/ here, and then call chmod to set each mode to devs[i].mode
Yep, if we include permissions in the table of devs, then we need to chmod anyway, so can do away with changing umaks anyway
+ } + endmntent(procmnt); + + qsort(mounts, nmounts, sizeof(mounts[0]), + lxcContainerChildMountSort); + + for (i = 0 ; i < nmounts ; i++) { + if (umount(mounts[i]) < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to unmount %s: %s"), + mounts[i], strerror(errno)); + return -1;
Maybe try to unmount all of them before returning, even if an earlier one fails?
Also, be sure to free "mounts" before returning.
In general there's no need todo that cleanup in this particular method, because if any part of this initialization fails, the container will exit and the kernel will cleanup all the mounts, and of course memory will be free'd. That said I'll add the free of 'mounts' anyway just as good practice. Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

DB> This is a long overdue followup to my previous set of patches to DB> make the LXC driver use the new domain XML apis. Have you been able to test this with NET_NS support enabled yet? I am hitting the same issue I was before. On starting an LXC domain with a network interface, the daemon finishes the action and reports a good status to virsh. The domain fails to start, with the following in the per-domain log file: DEBUG: lxc_container.c: lxcContainerStart (clone() returned, 8082) libvir: Linux Container error : internal error read of fd 6 failed: Input/output error DEBUG: veth.c: vethDelete (veth: veth2) However, now, the daemon crashes well after leaving the LXC driver's domain startup process. From the timing, I'd guess it's in some event (or SIGCHLD) handler, but I can't reproduce it in gdb to get more information. This looks to be identical in root cause to what I was seeing with the previous set of patches, and was unable to figure out why file descriptor 6 was getting prematurely closed. I'll get started on debugging it again tomorrow, but it might be good if you can reproduce it as well :) -- Dan Smith IBM Linux Technology Center Open Hypervisor Team email: danms@us.ibm.com

On Tue, Aug 05, 2008 at 02:17:27PM -0700, Dan Smith wrote:
DB> This is a long overdue followup to my previous set of patches to DB> make the LXC driver use the new domain XML apis.
Have you been able to test this with NET_NS support enabled yet?
I am hitting the same issue I was before. On starting an LXC domain with a network interface, the daemon finishes the action and reports a good status to virsh. The domain fails to start, with the following in the per-domain log file:
DEBUG: lxc_container.c: lxcContainerStart (clone() returned, 8082) libvir: Linux Container error : internal error read of fd 6 failed: Input/output error DEBUG: veth.c: vethDelete (veth: veth2)
However, now, the daemon crashes well after leaving the LXC driver's domain startup process. From the timing, I'd guess it's in some event (or SIGCHLD) handler, but I can't reproduce it in gdb to get more information.
That's not good. There should be no SIGCHLD's anywhere now - the container double-forks into the background. Its probably some error in the cleanup path - valgrind may help if gdb fails.
This looks to be identical in root cause to what I was seeing with the previous set of patches, and was unable to figure out why file descriptor 6 was getting prematurely closed. I'll get started on debugging it again tomorrow, but it might be good if you can reproduce it as well :)
I've unfortunately still not got a kernel build with all the neccessary NET_NS bits added on. If you can capture an strace it can probably help me diagnose the problem strace -f -ff -o net.log -s 30000 -p `pid of libvirtd` IIRC, you said removing the bit of code which closed all open file descriptor would make it work again ? If you can grab an strace with that change in too I can compare and figure out where we're going wrong. Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

On Wed, Aug 06, 2008 at 10:41:42AM +0100, Daniel P. Berrange wrote:
On Tue, Aug 05, 2008 at 02:17:27PM -0700, Dan Smith wrote:
DB> This is a long overdue followup to my previous set of patches to DB> make the LXC driver use the new domain XML apis.
Have you been able to test this with NET_NS support enabled yet?
I am hitting the same issue I was before. On starting an LXC domain with a network interface, the daemon finishes the action and reports a good status to virsh. The domain fails to start, with the following in the per-domain log file:
DEBUG: lxc_container.c: lxcContainerStart (clone() returned, 8082) libvir: Linux Container error : internal error read of fd 6 failed: Input/output error DEBUG: veth.c: vethDelete (veth: veth2)
However, now, the daemon crashes well after leaving the LXC driver's domain startup process. From the timing, I'd guess it's in some event (or SIGCHLD) handler, but I can't reproduce it in gdb to get more information.
That's not good. There should be no SIGCHLD's anywhere now - the container double-forks into the background. Its probably some error in the cleanup path - valgrind may help if gdb fails.
This looks to be identical in root cause to what I was seeing with the previous set of patches, and was unable to figure out why file descriptor 6 was getting prematurely closed. I'll get started on debugging it again tomorrow, but it might be good if you can reproduce it as well :)
I've unfortunately still not got a kernel build with all the neccessary NET_NS bits added on.
Turns out this wasn't needed. Simply commenting out use of the CLONE_NETNS flag and using a currently 2.6.26 kernel is sufficient to expose the container failing to start, and the libvirtd daemon crashing. So I'll track these down and fix it Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|
participants (3)
-
Dan Smith
-
Daniel P. Berrange
-
Jim Meyering