The -mm tree has Daniel Lezcano's patch changing the handling of
sys_reboot in a non-init pidns. That means that, with that
support, (a) it is safe to grant CAP_SYS_BOOT to a container, and
(b) it's possible to distinguish between reboot and shutdown.
I've implemented partial support of this for libvirt in the patch
below. If Daniel's patch is not in the running kernel, then
CAP_SYS_BOOT will be dropped for the container. Otherwise, it will
be kept. When the container's init exits and the exit is determined
to be a shutdown, the container is terminated.
However, I didn't know how to properly do the reboot part.
The patch below shows how to detect it (and sets the static bool
wantreboot to true in that case), but I didn't know quite what to
do with that. It looks like the code flow between lxcControllerRun
and lxcControllerMain would need to be changed a bit so that we
could re-run the lxcContainerStart() without causing the
monitor.serverFD (or whichever pipe sends monitor events to
lxc_driver.c to trigger autodestroy) to be closed.
So for now I'm sending this patch, hoping the sorcerers on this
list can hook reboot up as well, or show me the best way to do it.
thanks,
-serge
Subject: [PATCH 1/1] lxc: handle shutdown (and detect, but mis-handle reboot)
If Daniel Lezcano's pidns reboot patch is in the kernel, then don't
drop CAP_SYS_BOOT. When the container calls shutdown, terminate the
container.
This patch detects when the container wanted to reboot, but goes
ahead and terminates the container because I don't know how to best
structure the code to support restarting a container that wanted
to reboot.
Signed-off-by: Serge Hallyn <serge.hallyn(a)canonical.com>
---
src/lxc/lxc_container.c | 13 ++++--
src/lxc/lxc_container.h | 3 +-
src/lxc/lxc_controller.c | 97 ++++++++++++++++++++++++++++++++++++++++++++--
3 files changed, 103 insertions(+), 10 deletions(-)
diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c
index e93fda5..793cb19 100644
--- a/src/lxc/lxc_container.c
+++ b/src/lxc/lxc_container.c
@@ -102,6 +102,7 @@ struct __lxc_child_argv {
char **ttyPaths;
size_t nttyPaths;
int handshakefd;
+ bool dropreboot;
};
@@ -1216,7 +1217,7 @@ static int lxcContainerSetupMounts(virDomainDefPtr vmDef,
* It removes some capabilities that could be dangerous to
* host system, since they are not currently "containerized"
*/
-static int lxcContainerDropCapabilities(void)
+static int lxcContainerDropCapabilities(bool dropreboot)
{
#if HAVE_CAPNG
int ret;
@@ -1226,11 +1227,11 @@ static int lxcContainerDropCapabilities(void)
if ((ret = capng_updatev(CAPNG_DROP,
CAPNG_EFFECTIVE | CAPNG_PERMITTED |
CAPNG_INHERITABLE | CAPNG_BOUNDING_SET,
- CAP_SYS_BOOT, /* No use of reboot */
CAP_SYS_MODULE, /* No kernel module loading */
CAP_SYS_TIME, /* No changing the clock */
CAP_AUDIT_CONTROL, /* No messing with auditing status */
CAP_MAC_ADMIN, /* No messing with LSM config */
+ dropreboot ? CAP_SYS_BOOT : -1, /* No use of reboot? */
-1 /* sentinal */)) < 0) {
lxcError(VIR_ERR_INTERNAL_ERROR,
_("Failed to remove capabilities: %d"), ret);
@@ -1343,7 +1344,7 @@ static int lxcContainerChild( void *data )
}
/* drop a set of root capabilities */
- if (lxcContainerDropCapabilities() < 0)
+ if (lxcContainerDropCapabilities(argv->dropreboot) < 0)
goto cleanup;
if (lxcContainerSendContinue(argv->handshakefd) < 0) {
@@ -1416,6 +1417,7 @@ const char *lxcContainerGetAlt32bitArch(const char *arch)
* @veths: interface names
* @control: control FD to the container
* @ttyPath: path of tty to set as the container console
+ * @dropreboot: do we need to drop CAP_SYS_BOOT
*
* Starts a container process by calling clone() with the namespace flags
*
@@ -1428,7 +1430,8 @@ int lxcContainerStart(virDomainDefPtr def,
int control,
int handshakefd,
char **ttyPaths,
- size_t nttyPaths)
+ size_t nttyPaths,
+ bool dropreboot)
{
pid_t pid;
int cflags;
@@ -1436,7 +1439,7 @@ int lxcContainerStart(virDomainDefPtr def,
char *stack, *stacktop;
lxc_child_argv_t args = { def, securityDriver,
nveths, veths, control,
- ttyPaths, nttyPaths, handshakefd};
+ ttyPaths, nttyPaths, handshakefd, dropreboot};
/* allocate a stack for the container */
if (VIR_ALLOC_N(stack, stacksize) < 0) {
diff --git a/src/lxc/lxc_container.h b/src/lxc/lxc_container.h
index 77fb9b2..15738c8 100644
--- a/src/lxc/lxc_container.h
+++ b/src/lxc/lxc_container.h
@@ -56,7 +56,8 @@ int lxcContainerStart(virDomainDefPtr def,
int control,
int handshakefd,
char **ttyPaths,
- size_t nttyPaths);
+ size_t nttyPaths,
+ bool dropreboot);
int lxcContainerAvailable(int features);
diff --git a/src/lxc/lxc_controller.c b/src/lxc/lxc_controller.c
index 8f336f5..e9aa904 100644
--- a/src/lxc/lxc_controller.c
+++ b/src/lxc/lxc_controller.c
@@ -708,6 +708,7 @@ ignorable_accept_errno(int errnum)
}
static bool quit = false;
+static bool wantreboot = false;
static virMutex lock;
static int sigpipe[2];
@@ -721,12 +722,33 @@ static void lxcSignalChildIO(int watch ATTRIBUTE_UNUSED,
int events ATTRIBUTE_UNUSED, void *opaque)
{
char buf[1];
- int ret;
+ int ret, status;
int *container = opaque;
ignore_value(read(sigpipe[0], buf, 1));
- ret = waitpid(-1, NULL, WNOHANG);
+ ret = waitpid(-1, &status, WNOHANG);
if (ret == *container) {
+ if (WIFSIGNALED(status)) {
+ switch(WTERMSIG(status)) {
+ case SIGINT: /* halt */
+ VIR_DEBUG("XXX Container halting");
+ virMutexLock(&lock);
+ quit = true;
+ virMutexUnlock(&lock);
+ VIR_DEBUG("XXX set quit to true");
+ return;
+ case SIGHUP: /* reboot */
+ VIR_DEBUG("XXX Container rebooting");
+ virMutexLock(&lock);
+ wantreboot = true;
+ virMutexUnlock(&lock);
+ VIR_DEBUG("XXX set wantreboot true (i'm pid %d)",
getpid());
+ return;
+ default:
+ VIR_DEBUG("XXX unknown exit status for init: %d\n",
WTERMSIG(status));
+ break;
+ }
+ }
virMutexLock(&lock);
quit = true;
virMutexUnlock(&lock);
@@ -1082,6 +1104,62 @@ error:
virMutexUnlock(&lock);
}
+#include <sys/reboot.h>
+#include <linux/reboot.h>
+
+/*
+ * Child entry point used by container_reboot_is_supported(), which
+ * clone()s this function into a new pid namespace.
+ *
+ * @arg points to an int holding the reboot() command to issue; the
+ * caller passes LINUX_REBOOT_CMD_CAD_ON or LINUX_REBOOT_CMD_CAD_OFF.
+ *
+ * reboot() with such a command will return -EINVAL
+ * in a child pid namespace if container reboot support exists.
+ * Otherwise, it will either succeed or return -EPERM.
+ *
+ * Returns 1 if reboot() failed with errno set to EINVAL (support
+ * detected), 0 in every other case.  The caller reads this value back
+ * through WEXITSTATUS() on the child's wait status.
+ */
+static int container_reboot_supported(void *arg)
+{
+ int *cmd = arg;
+ int ret;
+
+ ret = reboot(*cmd);
+ if (ret == -1 && errno == EINVAL)
+ return 1;
+ return 0;
+}
+
+/*
+ * Probe whether the running kernel supports reboot() from inside a
+ * non-init pid namespace.
+ *
+ * The current value of /proc/sys/kernel/ctrl-alt-del is read and mapped
+ * to the matching LINUX_REBOOT_CMD_CAD_ON / LINUX_REBOOT_CMD_CAD_OFF
+ * command (presumably so that, if the kernel lacks the support and the
+ * call succeeds, the host setting is left as it was).  A child is then
+ * cloned into a new pid namespace where container_reboot_supported()
+ * issues that command and reports the outcome via its exit status.
+ *
+ * Returns 1 if container reboot support was detected, 0 if it was not
+ * or if the probe itself could not be carried out.
+ */
+static int container_reboot_is_supported(void)
+{
+    FILE *f = fopen("/proc/sys/kernel/ctrl-alt-del", "r");
+    int ret, cmd, v;
+    long stack_size = 4096;
+    /* clone() takes the top of the child stack.  Do the arithmetic on
+     * char * since arithmetic on void * is only a GNU extension. */
+    char *stack = (char *)alloca(stack_size) + stack_size;
+    int status;
+    pid_t pid;
+
+    if (!f) {
+        VIR_DEBUG("failed to open /proc/sys/kernel/ctrl-alt-del");
+        return 0;
+    }
+
+    ret = fscanf(f, "%d", &v);
+    fclose(f);
+    if (ret != 1) {
+        VIR_DEBUG("Failed to read /proc/sys/kernel/ctrl-alt-del");
+        return 0;
+    }
+    cmd = v ? LINUX_REBOOT_CMD_CAD_ON : LINUX_REBOOT_CMD_CAD_OFF;
+
+    pid = clone(container_reboot_supported, stack, CLONE_NEWPID | SIGCHLD, &cmd);
+    if (pid < 0) {
+        VIR_DEBUG("failed to clone\n");
+        return 0;
+    }
+
+    /* Reap exactly the probe child (not whichever child happens to exit
+     * first), retrying if the wait is interrupted by a signal. */
+    do {
+        ret = waitpid(pid, &status, 0);
+    } while (ret < 0 && errno == EINTR);
+    if (ret < 0) {
+        VIR_DEBUG("unexpected wait error: %m\n");
+        return 0;
+    }
+
+    /* WEXITSTATUS() is only meaningful when the child exited normally;
+     * a child killed by a signal must not be mistaken for success. */
+    if (!WIFEXITED(status) || WEXITSTATUS(status) != 1)
+        return 0;
+
+    return 1;
+}
/**
* lxcControllerMain
@@ -1214,13 +1292,19 @@ static int lxcControllerMain(int serverFd,
}
virMutexLock(&lock);
- while (!quit) {
+ while (!quit && !wantreboot) {
virMutexUnlock(&lock);
if (virEventRunDefaultImpl() < 0)
goto cleanup;
virMutexLock(&lock);
}
virMutexUnlock(&lock);
+ VIR_DEBUG("XXX (pid %d) container is done", getpid());
+ if (wantreboot) {
+ rc = 0;
+ VIR_DEBUG("XXX (pid %d) wantreboot is true", getpid());
+ //goto cleanup2;
+ }
err = virGetLastError();
if (!err || err->code == VIR_ERR_OK)
@@ -1385,6 +1469,7 @@ lxcControllerRun(virDomainDefPtr def,
size_t nloopDevs = 0;
int *loopDevs = NULL;
size_t i;
+ bool dropreboot = true;
if (VIR_ALLOC_N(containerTtyFDs, nttyFDs) < 0) {
virReportOOMError();
@@ -1542,6 +1627,7 @@ lxcControllerRun(virDomainDefPtr def,
if (lxcSetPersonality(def) < 0)
goto cleanup;
+ dropreboot = !container_reboot_is_supported();
if ((container = lxcContainerStart(def,
securityDriver,
@@ -1550,7 +1636,8 @@ lxcControllerRun(virDomainDefPtr def,
control[1],
containerhandshake[1],
containerTtyPaths,
- nttyFDs)) < 0)
+ nttyFDs,
+ dropreboot)) < 0)
goto cleanup;
VIR_FORCE_CLOSE(control[1]);
VIR_FORCE_CLOSE(containerhandshake[1]);
@@ -1603,7 +1690,9 @@ lxcControllerRun(virDomainDefPtr def,
}
}
+ VIR_DEBUG("XXX starting lxcControllerMain (i'm pid %d)\n", getpid());
rc = lxcControllerMain(monitor, client, ttyFDs, containerTtyFDs, nttyFDs,
container);
+ VIR_DEBUG("XXX lxcControllerMain returned %d (i'm pid %d)\n", rc,
getpid());
monitor = client = -1;
cleanup:
--
1.7.9