[libvirt] [PATCH 0 of 2] Add cgroup manipulation and LXC driver support

Revision 3 of the cgroups patch, with the round 2 comments addressed.

On Fri, Oct 03, 2008 at 08:40:22AM -0700, Dan Smith wrote:
Revision 3 of the cgroups patch, with the round 2 comments addressed.
Fine for me +1 Daniel -- Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/ daniel@veillard.com | Rpmfind RPM search engine http://rpmfind.net/ http://veillard.com/ | virtualization library http://libvirt.org/

This patch adds src/cgroup.{c,h} with support for creating and manipulating cgroups. All groups created with the internal API are forced under $mount/libvirt/ to keep everything together. The first time a group is created, the libvirt directory is also created, and the settings from the root are inherited. The code creates groups in all mounts requires to get memory and devices functionality. When setting a value, the appropriate mount is determined and the value is set there. I have tested this with all controllers mounted in a single location, as well as all of them mounted separately. Changes: - Handle multiple mount points - Add more abstract internal API, per recent discussion - Consider absence of memory or devices controllers as "no cgroup support" diff -r 444e2614d0a2 -r 5590af0c8c3e src/cgroup.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/cgroup.c Fri Oct 03 08:06:52 2008 -0700 @@ -0,0 +1,762 @@ +/* + * cgroup.c: Tools for managing cgroups + * + * Copyright IBM Corp. 2008 + * + * See COPYING.LIB for the License of this software + * + * Authors: + * Dan Smith <danms@us.ibm.com> + */ +#include <config.h> + +#include <stdio.h> +#include <stdint.h> +#include <inttypes.h> +#include <mntent.h> +#include <fcntl.h> +#include <string.h> +#include <errno.h> +#include <stdlib.h> +#include <stdbool.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <libgen.h> + +#include "internal.h" +#include "util.h" +#include "memory.h" +#include "cgroup.h" + +#define DEBUG(fmt,...) VIR_DEBUG(__FILE__, fmt, __VA_ARGS__) +#define DEBUG0(msg) VIR_DEBUG(__FILE__, "%s", msg) + +#define CGROUP_MAX_VAL 512 + +struct virCgroup { + char *path; +}; + +const char *supported_controllers[] = { + "memory", + "devices", + NULL +}; + +/** + * virCgroupFree: + * + * @group: The group structure to free + */ +void virCgroupFree(virCgroupPtr *group) +{ + if (*group != NULL) { + VIR_FREE((*group)->path); + VIR_FREE(*group); + *group = NULL; + } +} + +static virCgroupPtr virCgroupGetMount(const char *controller) +{ + FILE *mounts; + struct mntent entry; + char buf[CGROUP_MAX_VAL]; + virCgroupPtr root = NULL; + + if (VIR_ALLOC(root) != 0) + return NULL; + + mounts = fopen("/proc/mounts", "r"); + if (mounts == NULL) { + DEBUG0("Unable to open /proc/mounts: %m"); + goto err; + } + + while (getmntent_r(mounts, &entry, buf, sizeof(buf)) != NULL) { + if (STREQ(entry.mnt_type, "cgroup") && + (strstr(entry.mnt_opts, controller))) { + root->path = strdup(entry.mnt_dir); + break; + } + } + + DEBUG("Mount for %s is %s\n", controller, root->path); + + if (root->path == NULL) { + DEBUG0("Did not find cgroup mount"); + goto err; + } + + fclose(mounts); + + return root; +err: + virCgroupFree(&root); + + return NULL; +} + +/** + * virCgroupHaveSupport: + * + * Returns 0 if support is present, negative if not + */ +int virCgroupHaveSupport(void) +{ + virCgroupPtr root; + int i; + + for (i = 0; supported_controllers[i] != NULL; i++) { + root = virCgroupGetMount(supported_controllers[i]); + if (root == NULL) + return -1; + virCgroupFree(&root); + } + + return 0; +} + +static int virCgroupPathOfGroup(const char *group, + const char *controller, + char **path) +{ + virCgroupPtr root = NULL; + int rc = 0; + + root = virCgroupGetMount(controller); + if (root == NULL) { + rc = -ENOTDIR; + goto out; + } + + if (asprintf(path, "%s/%s", root->path, group) == -1) + rc = -ENOMEM; +out: + virCgroupFree(&root); + + return rc; +} + +static int virCgroupPathOf(const char *grppath, + const char *key, + char **path) +{ + virCgroupPtr root; + int rc = 0; + char *controller = NULL; + + if (strchr(key, '.') == NULL) + return -EINVAL; + + if (sscanf(key, "%a[^.]", &controller) != 1) + return -EINVAL; + + root = virCgroupGetMount(controller); + if (root == NULL) { + rc = -ENOTDIR; + goto out; + } + + if (asprintf(path, "%s/%s/%s", root->path, grppath, key) == -1) + rc = -ENOMEM; +out: + virCgroupFree(&root); + VIR_FREE(controller); + + return rc; +} + +static int virCgroupSetValueStr(virCgroupPtr group, + const char *key, + const char *value) +{ + int fd = -1; + int rc = 0; + char *keypath = NULL; + + rc = virCgroupPathOf(group->path, key, &keypath); + if (rc != 0) + return rc; + + fd = open(keypath, O_WRONLY); + if (fd < 0) { + DEBUG("Unable to open %s: %m", keypath); + rc = -ENOENT; + goto out; + } + + DEBUG("Writing '%s' to '%s'", value, keypath); + + rc = safewrite(fd, value, strlen(value)); + if (rc < 0) { + DEBUG("Failed to write value '%s': %m", value); + rc = -errno; + goto out; + } else if (rc != strlen(value)) { + DEBUG("Short write of value '%s'", value); + rc = -ENOSPC; + goto out; + } + + rc = 0; +out: + VIR_FREE(keypath); + close(fd); + + return rc; +} + +static int virCgroupSetValueU64(virCgroupPtr group, + const char *key, + uint64_t value) +{ + char *strval = NULL; + int rc; + + if (asprintf(&strval, "%" PRIu64, value) == -1) + return -ENOMEM; + + rc = virCgroupSetValueStr(group, key, strval); + + VIR_FREE(strval); + + return rc; +} + +#if 0 +/* This is included for completeness, but not yet used */ + +static int virCgroupSetValueI64(virCgroupPtr group, + const char *key, + int64_t value) +{ + char *strval = NULL; + int rc; + + if (asprintf(&strval, "%" PRIi64, value) == -1) + return -ENOMEM; + + rc = virCgroupSetValueStr(group, key, strval); + + VIR_FREE(strval); + + return rc; +} + +static int virCgroupGetValueStr(virCgroupPtr group, + const char *key, + char **value) +{ + int fd = -1; + int rc; + char *keypath = NULL; + char buf[CGROUP_MAX_VAL]; + + memset(buf, 0, sizeof(buf)); + + rc = virCgroupPathOf(group->path, key, &keypath); + if (rc != 0) { + DEBUG("No path of %s, %s", group->path, key); + return rc; + } + + fd = open(keypath, O_RDONLY); + if (fd < 0) { + DEBUG("Unable to open %s: %m", keypath); + rc = -ENOENT; + goto out; + } + + rc = saferead(fd, buf, sizeof(buf)); + if (rc < 0) { + DEBUG("Failed to read %s: %m\n", keypath); + rc = -errno; + goto out; + } else if (rc == 0) { + DEBUG("Short read of %s\n", keypath); + rc = -EIO; + goto out; + } + + *value = strdup(buf); + if (*value == NULL) { + rc = -ENOMEM; + goto out; + } + + rc = 0; +out: + VIR_FREE(keypath); + close(fd); + + return rc; +} + +static int virCgroupGetValueU64(virCgroupPtr group, + const char *key, + uint64_t *value) +{ + char *strval = NULL; + int rc = 0; + + rc = virCgroupGetValueStr(group, key, &strval); + if (rc != 0) + goto out; + + if (sscanf(strval, "%" SCNu64, value) != 1) + rc = -EINVAL; +out: + VIR_FREE(strval); + + return rc; +} + +static int virCgroupGetValueI64(virCgroupPtr group, + const char *key, + int64_t *value) +{ + char *strval = NULL; + int rc = 0; + + rc = virCgroupGetValueStr(group, key, &strval); + if (rc != 0) + goto out; + + if (sscanf(strval, "%" SCNi64, value) != 1) + rc = -EINVAL; +out: + VIR_FREE(strval); + + return rc; +} +#endif + +static int _virCgroupInherit(const char *path, + const char *key) +{ + int rc = 0; + int fd = -1; + char buf[CGROUP_MAX_VAL]; + char *keypath = NULL; + char *pkeypath = NULL; + + memset(buf, 0, sizeof(buf)); + + if (asprintf(&keypath, "%s/%s", path, key) == -1) { + rc = -ENOMEM; + goto out; + } + + if (access(keypath, F_OK) != 0) { + DEBUG("Group %s has no key %s\n", path, key); + goto out; + } + + if (asprintf(&pkeypath, "%s/../%s", path, key) == -1) { + rc = -ENOMEM; + VIR_FREE(keypath); + goto out; + } + + fd = open(pkeypath, O_RDONLY); + if (fd < 0) { + rc = -errno; + goto out; + } + + if (saferead(fd, buf, sizeof(buf)) <= 0) { + rc = -errno; + goto out; + } + + close(fd); + + fd = open(keypath, O_WRONLY); + if (fd < 0) { + rc = -errno; + goto out; + } + + if (safewrite(fd, buf, strlen(buf)) != strlen(buf)) { + rc = -errno; + goto out; + } + +out: + VIR_FREE(keypath); + VIR_FREE(pkeypath); + close(fd); + + return rc; +} + +static int virCgroupInherit(const char *grppath) +{ + int i; + int rc = 0; + const char *inherit_values[] = { + "cpuset.cpus", + "cpuset.mems", + NULL + }; + + for (i = 0; inherit_values[i] != NULL; i++) { + const char *key = inherit_values[i]; + + rc = _virCgroupInherit(grppath, key); + if (rc != 0) { + DEBUG("inherit of %s failed\n", key); + break; + } + } + + return rc; +} + +static int virCgroupMakeGroup(const char *name) +{ + int i; + int rc = 0; + + for (i = 0; supported_controllers[i] != NULL; i++) { + char *path = NULL; + virCgroupPtr root; + + root = virCgroupGetMount(supported_controllers[i]); + if (root == NULL) + continue; + + rc = virCgroupPathOfGroup(name, supported_controllers[i], &path); + if (rc != 0) { + virCgroupFree(&root); + break; + } + + virCgroupFree(&root); + + if (access(path, F_OK) != 0) { + if (mkdir(path, 0655) < 0) { + rc = -errno; + VIR_FREE(path); + break; + } + virCgroupInherit(path); + } + + VIR_FREE(path); + } + + return rc; +} + +static int virCgroupRoot(virCgroupPtr *root) +{ + int rc = 0; + char *grppath = NULL; + + if (VIR_ALLOC((*root)) != 0) { + rc = -ENOMEM; + goto out; + } + + (*root)->path = strdup("libvirt"); + if ((*root)->path == NULL) { + rc = -ENOMEM; + goto out; + } + + rc = virCgroupMakeGroup((*root)->path); +out: + if (rc != 0) + virCgroupFree(root); + VIR_FREE(grppath); + + return rc; +} + +static int virCgroupNew(virCgroupPtr *parent, + const char *group, + virCgroupPtr *newgroup) +{ + int rc = 0; + char *typpath = NULL; + + *newgroup = NULL; + + if (*parent == NULL) { + rc = virCgroupRoot(parent); + if (rc != 0) + goto err; + } + + if (VIR_ALLOC((*newgroup)) != 0) { + rc = -ENOMEM; + goto err; + } + + rc = asprintf(&((*newgroup)->path), + "%s/%s", + (*parent)->path, + group); + if (rc == -1) { + rc = -ENOMEM; + goto err; + } + + rc = 0; + + return rc; +err: + virCgroupFree(newgroup); + *newgroup = NULL; + + VIR_FREE(typpath); + + return rc; +} + +static int virCgroupOpen(virCgroupPtr parent, + const char *group, + virCgroupPtr *newgroup) +{ + int rc = 0; + char *grppath = NULL; + bool free_parent = (parent == NULL); + + rc = virCgroupNew(&parent, group, newgroup); + if (rc != 0) + goto err; + + if (free_parent) + virCgroupFree(&parent); + + rc = virCgroupPathOfGroup((*newgroup)->path, + supported_controllers[0], + &grppath); + if (rc != 0) + goto err; + + if (access(grppath, F_OK) != 0) { + rc = -ENOENT; + goto err; + } + + return rc; +err: + virCgroupFree(newgroup); + *newgroup = NULL; + + return rc; +} + +static int virCgroupCreate(virCgroupPtr parent, + const char *group, + virCgroupPtr *newgroup) +{ + int rc = 0; + bool free_parent = (parent == NULL); + + rc = virCgroupNew(&parent, group, newgroup); + if (rc != 0) { + DEBUG0("Unable to allocate new virCgroup structure"); + goto err; + } + + rc = virCgroupMakeGroup((*newgroup)->path); + if (rc != 0) + goto err; + + if (free_parent) + virCgroupFree(&parent); + + return rc; +err: + virCgroupFree(newgroup); + *newgroup = NULL; + + if (free_parent) + virCgroupFree(&parent); + + return rc; +} + +/** + * virCgroupRemove: + * + * @group: The group to be removed + * + * Returns: 0 on success + */ +int virCgroupRemove(virCgroupPtr group) +{ + int rc = 0; + int i; + char *grppath = NULL; + + for (i = 0; supported_controllers[i] != NULL; i++) { + if (virCgroupPathOfGroup(group->path, + supported_controllers[i], + &grppath) != 0) + continue; + + if (rmdir(grppath) != 0) + rc = -errno; + + VIR_FREE(grppath); + } + + return rc; +} + +/** + * virCgroupAddTask: + * + * @group: The cgroup to add a task to + * @pid: The pid of the task to add + * + * Returns: 0 on success + */ +int virCgroupAddTask(virCgroupPtr group, pid_t pid) +{ + int rc = 0; + int fd = -1; + int i; + char *grppath = NULL; + char *taskpath = NULL; + char *pidstr = NULL; + + for (i = 0; supported_controllers[i] != NULL; i++) { + rc = virCgroupPathOfGroup(group->path, + supported_controllers[i], + &grppath); + if (rc != 0) + goto done; + + if (asprintf(&taskpath, "%s/tasks", grppath) == -1) { + rc = -ENOMEM; + goto done; + } + + fd = open(taskpath, O_WRONLY); + if (fd < 0) { + rc = -errno; + goto done; + } + + if (asprintf(&pidstr, "%lu", (unsigned long)pid) == -1) { + rc = -ENOMEM; + goto done; + } + + if (write(fd, pidstr, strlen(pidstr)) <= 0) { + rc = -errno; + goto done; + } + + done: + VIR_FREE(grppath); + VIR_FREE(taskpath); + VIR_FREE(pidstr); + close(fd); + + if (rc != 0) + break; + } + + return rc; +} + +/** + * virCgroupForDomain: + * + * @def: Domain definition to create cgroup for + * @driverName: Classification of this domain type (e.g., xen, qemu, lxc) + * @group: Pointer to returned virCgroupPtr + * + * Returns 0 on success + */ +int virCgroupForDomain(virDomainDefPtr def, + const char *driverName, + virCgroupPtr *group) +{ + int rc; + virCgroupPtr typegrp = NULL; + + rc = virCgroupOpen(NULL, driverName, &typegrp); + if (rc == -ENOENT) { + rc = virCgroupCreate(NULL, driverName, &typegrp); + if (rc != 0) + goto out; + } else if (rc != 0) + goto out; + + rc = virCgroupOpen(typegrp, def->name, group); + if (rc == -ENOENT) + rc = virCgroupCreate(typegrp, def->name, group); +out: + virCgroupFree(&typegrp); + + return rc; +} + +/** + * virCgroupSetMemory: + * + * @group: The cgroup to change memory for + * @kb: The memory amount in kilobytes + * + * Returns: 0 on success + */ +int virCgroupSetMemory(virCgroupPtr group, unsigned long kb) +{ + return virCgroupSetValueU64(group, + "memory.limit_in_bytes", + kb << 10); +} + +/** + * virCgroupDenyAllDevices: + * + * @group: The cgroup to deny devices for + * + * Returns: 0 on success + */ +int virCgroupDenyAllDevices(virCgroupPtr group) +{ + return virCgroupSetValueStr(group, + "devices.deny", + "a"); +} + +/** + * virCgroupAllowDevice: + * + * @group: The cgroup to allow a device for + * @type: The device type (i.e., 'c' or 'b') + * @major: The major number of the device + * @minor: The minor number of the device + * + * Returns: 0 on success + */ +int virCgroupAllowDevice(virCgroupPtr group, + char type, + int major, + int minor) +{ + int rc; + char *devstr = NULL; + + if (asprintf(&devstr, "%c %i:%i rwm", type, major, minor) == -1) { + rc = -ENOMEM; + goto out; + } + + rc = virCgroupSetValueStr(group, + "devices.allow", + devstr); +out: + VIR_FREE(devstr); + + return rc; +} diff -r 444e2614d0a2 -r 5590af0c8c3e src/cgroup.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/cgroup.h Fri Oct 03 08:06:52 2008 -0700 @@ -0,0 +1,53 @@ +/* + * cgroup.h: Interface to tools for managing cgroups + * + * Copyright IBM Corp. 2008 + * + * See COPYING.LIB for the License of this software + * + * Authors: + * Dan Smith <danms@us.ibm.com> + */ + +#ifndef CGROUP_H +#define CGROUP_H + +#include <stdint.h> + +struct virCgroup; +typedef struct virCgroup *virCgroupPtr; + +#include "domain_conf.h" + +#define VIR_CG_DEV_MAJ_MEMORY 1 +#define VIR_CG_DEV_MIN_NULL 3 +#define VIR_CG_DEV_MIN_ZERO 5 +#define VIR_CG_DEV_MIN_FULL 7 +#define VIR_CG_DEV_MIN_RANDOM 8 +#define VIR_CG_DEV_MIN_URANDOM 9 + +#define VIR_CG_DEV_MAJ_TTY 5 +#define VIR_CG_DEV_MIN_CONSOLE 1 + +int virCgroupHaveSupport(void); + +int virCgroupForDomain(virDomainDefPtr def, + const char *driverName, + virCgroupPtr *group); + +int virCgroupAddTask(virCgroupPtr group, pid_t pid); + +int virCgroupSetMemory(virCgroupPtr group, unsigned long kb); + +int virCgroupDenyAllDevices(virCgroupPtr group); + +int virCgroupAllowDevice(virCgroupPtr group, + char type, + int major, + int minor); + +int virCgroupRemove(virCgroupPtr group); + +void virCgroupFree(virCgroupPtr *group); + +#endif /* CGROUP_H */

This patch adds code to the controller to set up a cgroup named after the domain name, set the memory limit, and restrict devices. It also adds bits to lxc_driver to properly clean up the cgroup on domain death. If virCgroupHaveSupport() says that no support is available, then we just allow the domain creation to proceed as it did before without resource controls in place. Changes: - Use device number constants from cgroup.h - Be sure to remove the cgroup on the failure path - Use the more abstract API diff -r 5590af0c8c3e -r e827f165c23b src/Makefile.am --- a/src/Makefile.am Fri Oct 03 08:06:52 2008 -0700 +++ b/src/Makefile.am Fri Oct 03 08:38:57 2008 -0700 @@ -90,13 +90,15 @@ lxc_conf.c lxc_conf.h \ lxc_container.c lxc_container.h \ lxc_driver.c lxc_driver.h \ - veth.c veth.h + veth.c veth.h \ + cgroup.c cgroup.h LXC_CONTROLLER_SOURCES = \ lxc_conf.c lxc_conf.h \ lxc_container.c lxc_container.h \ lxc_controller.c \ - veth.c veth.h + veth.c veth.h \ + cgroup.c cgroup.h OPENVZ_DRIVER_SOURCES = \ openvz_conf.c openvz_conf.h \ diff -r 5590af0c8c3e -r e827f165c23b src/lxc_controller.c --- a/src/lxc_controller.c Fri Oct 03 08:06:52 2008 -0700 +++ b/src/lxc_controller.c Fri Oct 03 08:38:57 2008 -0700 @@ -42,12 +42,82 @@ #include "veth.h" #include "memory.h" #include "util.h" - +#include "cgroup.h" #define DEBUG(fmt,...) VIR_DEBUG(__FILE__, fmt, __VA_ARGS__) #define DEBUG0(msg) VIR_DEBUG(__FILE__, "%s", msg) int debugFlag = 0; + +struct cgroup_device_policy { + char type; + int major; + int minor; +}; + +/** + * lxcSetContainerResources + * @def: pointer to virtual machine structure + * + * Creates a cgroup for the container, moves the task inside, + * and sets resource limits + * + * Returns 0 on success or -1 in case of error + */ +static int lxcSetContainerResources(virDomainDefPtr def) +{ + virCgroupPtr cgroup; + int rc = -1; + int i; + struct cgroup_device_policy devices[] = { + {'c', VIR_CG_DEV_MAJ_MEMORY, VIR_CG_DEV_MIN_NULL}, + {'c', VIR_CG_DEV_MAJ_MEMORY, VIR_CG_DEV_MIN_ZERO}, + {'c', VIR_CG_DEV_MAJ_MEMORY, VIR_CG_DEV_MIN_FULL}, + {'c', VIR_CG_DEV_MAJ_MEMORY, VIR_CG_DEV_MIN_RANDOM}, + {'c', VIR_CG_DEV_MAJ_MEMORY, VIR_CG_DEV_MIN_URANDOM}, + {'c', VIR_CG_DEV_MAJ_TTY, VIR_CG_DEV_MIN_CONSOLE}, + {0, 0, 0}}; + + if (virCgroupHaveSupport() != 0) + return 0; /* Not supported, so claim success */ + + rc = virCgroupForDomain(def, "lxc", &cgroup); + if (rc != 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("Unable to create cgroup for %s\n"), def->name); + return rc; + } + + rc = virCgroupSetMemory(cgroup, def->maxmem); + if (rc != 0) + goto out; + + rc = virCgroupDenyAllDevices(cgroup); + if (rc != 0) + goto out; + + for (i = 0; devices[i].type != 0; i++) { + struct cgroup_device_policy *dev = &devices[i]; + rc = virCgroupAllowDevice(cgroup, + dev->type, + dev->major, + dev->minor); + if (rc != 0) + goto out; + } + + rc = virCgroupAddTask(cgroup, getpid()); +out: + if (rc != 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("Failed to set lxc resources: %s\n"), strerror(-rc)); + virCgroupRemove(cgroup); + } + + virCgroupFree(&cgroup); + + return rc; +} static char*lxcMonitorPath(virDomainDefPtr def) { @@ -394,6 +464,9 @@ if (lxcControllerMoveInterfaces(nveths, veths, container) < 0) goto cleanup; + if (lxcSetContainerResources(def) < 0) + goto cleanup; + if (lxcContainerSendContinue(control[0]) < 0) goto cleanup; diff -r 5590af0c8c3e -r e827f165c23b src/lxc_driver.c --- a/src/lxc_driver.c Fri Oct 03 08:06:52 2008 -0700 +++ b/src/lxc_driver.c Fri Oct 03 08:38:57 2008 -0700 @@ -43,6 +43,7 @@ #include "bridge.h" #include "veth.h" #include "event.h" +#include "cgroup.h" /* debug macros */ @@ -376,6 +377,7 @@ int waitRc; int childStatus = -1; virDomainNetDefPtr net; + virCgroupPtr cgroup; while (((waitRc = waitpid(vm->pid, &childStatus, 0)) == -1) && errno == EINTR) @@ -408,6 +410,11 @@ for (net = vm->def->nets; net; net = net->next) { vethInterfaceUpOrDown(net->ifname, 0); vethDelete(net->ifname); + } + + if (virCgroupForDomain(vm->def, "lxc", &cgroup) == 0) { + virCgroupRemove(cgroup); + virCgroupFree(&cgroup); } return rc;

On Fri, Oct 03, 2008 at 08:40:24AM -0700, Dan Smith wrote:
This patch adds code to the controller to set up a cgroup named after the domain name, set the memory limit, and restrict devices. It also adds bits to lxc_driver to properly clean up the cgroup on domain death.
If virCgroupHaveSupport() says that no support is available, then we just allow the domain creation to proceed as it did before without resource controls in place.
+ struct cgroup_device_policy devices[] = { + {'c', VIR_CG_DEV_MAJ_MEMORY, VIR_CG_DEV_MIN_NULL}, + {'c', VIR_CG_DEV_MAJ_MEMORY, VIR_CG_DEV_MIN_ZERO}, + {'c', VIR_CG_DEV_MAJ_MEMORY, VIR_CG_DEV_MIN_FULL}, + {'c', VIR_CG_DEV_MAJ_MEMORY, VIR_CG_DEV_MIN_RANDOM}, + {'c', VIR_CG_DEV_MAJ_MEMORY, VIR_CG_DEV_MIN_URANDOM}, + {'c', VIR_CG_DEV_MAJ_TTY, VIR_CG_DEV_MIN_CONSOLE}, + {0, 0, 0}};
You're going to hate me for suggesting more changes, but.... This list of devices is currently duplicated in two places - once here where we set permissions, and again when we actually create the container and populate its /dev/ in lxc_container.c. Could do with a master list of device nodes used by both. Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

DB> This list of devices is currently duplicated in two places - once DB> here where we set permissions, and again when we actually create DB> the container and populate its /dev/ in lxc_container.c. Could do DB> with a master list of device nodes used by both. So does that mean they should be somewhere other than cgroup.h? If not, I think it might be appropriate to make that change in a separate patch, since populating /dev isn't really cgroup-related. -- Dan Smith IBM Linux Technology Center Open Hypervisor Team email: danms@us.ibm.com

On Fri, Oct 03, 2008 at 08:45:00AM -0700, Dan Smith wrote:
DB> This list of devices is currently duplicated in two places - once DB> here where we set permissions, and again when we actually create DB> the container and populate its /dev/ in lxc_container.c. Could do DB> with a master list of device nodes used by both.
So does that mean they should be somewhere other than cgroup.h?
I'd think lxc_config.h or lcx_container.h would be a more natural place for the constants, since both the .c files which would use them as lxc_XXXX driver files. Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

DB> I'd think lxc_config.h or lcx_container.h would be a more natural DB> place for the constants, since both the .c files which would use DB> them as lxc_XXXX driver files. Okay, I moved them to lxc_container.h and updated the device numbers in lxc_container.c to match. Shall I spam the list with another set? :) -- Dan Smith IBM Linux Technology Center Open Hypervisor Team email: danms@us.ibm.com

On Fri, Oct 03, 2008 at 09:06:49AM -0700, Dan Smith wrote:
DB> I'd think lxc_config.h or lcx_container.h would be a more natural DB> place for the constants, since both the .c files which would use DB> them as lxc_XXXX driver files.
Okay, I moved them to lxc_container.h and updated the device numbers in lxc_container.c to match.
Shall I spam the list with another set? :)
IMHO, no need, pushing to CVS and fixing things from there will be simpler, Daniel -- Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/ daniel@veillard.com | Rpmfind RPM search engine http://rpmfind.net/ http://veillard.com/ | virtualization library http://libvirt.org/

DV> IMHO, no need, pushing to CVS and fixing things from there will be DV> simpler, Done, thanks! -- Dan Smith IBM Linux Technology Center Open Hypervisor Team email: danms@us.ibm.com

On Fri, Oct 03, 2008 at 08:40:24AM -0700, Dan Smith wrote:
This patch adds code to the controller to set up a cgroup named after the domain name, set the memory limit, and restrict devices. It also adds bits to lxc_driver to properly clean up the cgroup on domain death.
The device whitelisting is all very nice, but we completely forgot / ignored the fact that there's nothing stopping a container mounting the cgroups device controller and giving itself the device access we just took away :-) The kernel code says /* * Modify the whitelist using allow/deny rules. * CAP_SYS_ADMIN is needed for this. It's at least separate from CAP_MKNOD * so we can give a container CAP_MKNOD to let it create devices but not * modify the whitelist. * It seems likely we'll want to add a CAP_CONTAINER capability to allow * us to also grant CAP_SYS_ADMIN to containers without giving away the * device whitelist controls, but for now we'll stick with CAP_SYS_ADMIN * * Taking rules away is always allowed (given CAP_SYS_ADMIN). Granting * new access is only allowed if you're in the top-level cgroup, or your * parent cgroup has the access you're asking for. */ So, looks like we need to explicitly set the capabilities of containers to either mask out CAP_SYS_ADMIN from libvirtd's set, or construct an explicit capability whitelist Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

DB> The device whitelisting is all very nice, but we completely forgot DB> / ignored the fact that there's nothing stopping a container DB> mounting the cgroups device controller and giving itself the DB> device access we just took away :-) Ah, interesting. DB> So, looks like we need to explicitly set the capabilities of DB> containers to either mask out CAP_SYS_ADMIN from libvirtd's set, DB> or construct an explicit capability whitelist Yeah, I guess so. I'll start looking into this :) -- Dan Smith IBM Linux Technology Center Open Hypervisor Team email: danms@us.ibm.com

On Thu, Oct 16, 2008 at 02:07:20PM +0100, Daniel P. Berrange wrote:
On Fri, Oct 03, 2008 at 08:40:24AM -0700, Dan Smith wrote:
This patch adds code to the controller to set up a cgroup named after the domain name, set the memory limit, and restrict devices. It also adds bits to lxc_driver to properly clean up the cgroup on domain death.
The device whitelisting is all very nice, but we completely forgot / ignored the fact that there's nothing stopping a container mounting the cgroups device controller and giving itself the device access we just took away :-)
The kernel code says
/* * Modify the whitelist using allow/deny rules. * CAP_SYS_ADMIN is needed for this. It's at least separate from CAP_MKNOD * so we can give a container CAP_MKNOD to let it create devices but not * modify the whitelist. * It seems likely we'll want to add a CAP_CONTAINER capability to allow * us to also grant CAP_SYS_ADMIN to containers without giving away the * device whitelist controls, but for now we'll stick with CAP_SYS_ADMIN * * Taking rules away is always allowed (given CAP_SYS_ADMIN). Granting * new access is only allowed if you're in the top-level cgroup, or your * parent cgroup has the access you're asking for. */
That last paragraph actually suggest another possible aproach which won't require messing with capabilities. Consider a hierarchy of cgroups $ROOT-CGROUP | ... | +- libvirtd | +- lxc | +- $VM-NAME +- $VM-NAME +- $VM-NAME libvirtd itself can be in any cgroup, beit the root one, or a child of the root. If libvirtd is in the root it can grant whatever it likes to cgroups for guests. If libvirt is not in root, then assuming its parent has access to the device it can also grant devices as needed. To prevent a LXC containre from giving itself device acess we need to make sure either, it doesn't have CAP_SYS_ADMIN, or make sure its parent cgroup doesn't have access. Fortunately we already have a cgroup in the heirarchy between libvirtd's cgroup & the one the VM sits. ie the cgroup named after the driver - 'lxc' in this case. No process ever lives in this group - we're just using it for filesystem namespace uniqueness. So if that source code comment is correct, all we need todo is set a deny-all rule in that intermediate 'lxc' cgroup, and then containers will not be able to get access back, even if they have CAP_SYS_ADMIN Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

DB> So if that source code comment is correct, all we need todo is set DB> a deny-all rule in that intermediate 'lxc' cgroup, and then DB> containers will not be able to get access back, even if they have DB> CAP_SYS_ADMIN Even if I make the per-driver group have a deny-all policy, I can still add arbitrary items to devices.allow and gain access from a subgroup. So, I think we're going to need to restrict CAP_SYS_ADMIN if we really want isolation, but I'm not sure what else that is likely to break. -- Dan Smith IBM Linux Technology Center Open Hypervisor Team email: danms@us.ibm.com
participants (3)
-
Dan Smith
-
Daniel P. Berrange
-
Daniel Veillard