[libvirt] [PATCH 0 of 2] Add cgroup manipulation and LXC driver support

This is my updated set for cgroup manipulation. I think I have addressed all of the concerns and comments so far. There are no major caveats I'm aware of with this round.

This patch adds src/cgroup.{c,h} with support for creating and manipulating cgroups. All groups created with the internal API are forced under $mount/libvirt/ to keep everything together. The first time a group is created, the libvirt directory is also created, and the settings from the root are inherited. The code creates groups in all mounts requires to get memory and devices functionality. When setting a value, the appropriate mount is determined and the value is set there. I have tested this with all controllers mounted in a single location, as well as all of them mounted separately. Changes: - Handle multiple mount points - Add more abstract internal API, per recent discussion - Consider absence of memory or devices controllers as "no cgroup support" diff -r 444e2614d0a2 -r c33c6d6369e4 src/cgroup.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/cgroup.c Wed Oct 01 13:16:03 2008 -0700 @@ -0,0 +1,705 @@ +/* + * cgroup.c: Tools for managing cgroups + * + * Copyright IBM Corp. 2008 + * + * See COPYING.LIB for the License of this software + * + * Authors: + * Dan Smith <danms@us.ibm.com> + */ +#include <config.h> + +#include <stdio.h> +#include <stdint.h> +#include <inttypes.h> +#include <mntent.h> +#include <fcntl.h> +#include <string.h> +#include <errno.h> +#include <stdlib.h> +#include <stdbool.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <libgen.h> + +#include "internal.h" +#include "util.h" +#include "memory.h" +#include "cgroup.h" + +#define DEBUG(fmt,...) VIR_DEBUG(__FILE__, fmt, __VA_ARGS__) +#define DEBUG0(msg) VIR_DEBUG(__FILE__, "%s", msg) + +#define CGROUP_MAX_VAL 512 + +struct virCgroup { + char *path; +}; + +const char *supported_controllers[] = { + "memory", + "devices", + NULL +}; + +void virCgroupFree(virCgroupPtr *group) +{ + if (*group != NULL) { + VIR_FREE((*group)->path); + VIR_FREE(*group); + *group = NULL; + } +} + +static virCgroupPtr virCgroupGetMount(const char *controller) +{ + FILE *mounts; + struct mntent entry; + char buf[CGROUP_MAX_VAL]; + virCgroupPtr root = NULL; + + if (VIR_ALLOC(root) != 0) + return NULL; + + mounts = fopen("/proc/mounts", "r"); + if (mounts == NULL) { + DEBUG0("Unable to open /proc/mounts: %m"); + goto err; + } + + while (getmntent_r(mounts, &entry, buf, sizeof(buf)) != NULL) { + if (STREQ(entry.mnt_type, "cgroup") && + (strstr(entry.mnt_opts, controller))) { + root->path = strdup(entry.mnt_dir); + break; + } + } + + DEBUG("Mount for %s is %s\n", controller, root->path); + + if (root->path == NULL) { + DEBUG0("Did not find cgroup mount"); + goto err; + } + + fclose(mounts); + + return root; +err: + virCgroupFree(&root); + + return NULL; +} + +int virCgroupHaveSupport(void) +{ + virCgroupPtr root; + int i; + + for (i = 0; supported_controllers[i] != NULL; i++) { + root = virCgroupGetMount(supported_controllers[i]); + if (root == NULL) + return -1; + virCgroupFree(&root); + } + + return 0; +} + +static int virCgroupPathOfGroup(const char *group, + const char *controller, + char **path) +{ + virCgroupPtr root = NULL; + int rc = 0; + + root = virCgroupGetMount(controller); + if (root == NULL) { + rc = -ENOTDIR; + goto out; + } + + if (asprintf(path, "%s/%s", root->path, group) == -1) + rc = -ENOMEM; +out: + virCgroupFree(&root); + + return rc; +} + +static int virCgroupPathOf(const char *grppath, + const char *key, + char **path) +{ + virCgroupPtr root; + int rc = 0; + char *controller = NULL; + + if (strchr(key, '.') == NULL) + return -EINVAL; + + if (sscanf(key, "%a[^.]", &controller) != 1) + return -EINVAL; + + root = virCgroupGetMount(controller); + if (root == NULL) { + rc = -ENOTDIR; + goto out; + } + + if (asprintf(path, "%s/%s/%s", root->path, grppath, key) == -1) + rc = -ENOMEM; +out: + virCgroupFree(&root); + VIR_FREE(controller); + + return rc; +} + +static int virCgroupSetValueStr(virCgroupPtr group, + const char *key, + const char *value) +{ + int fd = -1; + int rc = 0; + char *keypath = NULL; + + rc = virCgroupPathOf(group->path, key, &keypath); + if (rc != 0) + return rc; + + fd = open(keypath, O_WRONLY); + if (fd < 0) { + DEBUG("Unable to open %s: %m", keypath); + rc = -ENOENT; + goto out; + } + + DEBUG("Writing '%s' to '%s'", value, keypath); + + rc = safewrite(fd, value, strlen(value)); + if (rc < 0) { + DEBUG("Failed to write value '%s': %m", value); + rc = -errno; + goto out; + } else if (rc != strlen(value)) { + DEBUG("Short write of value '%s'", value); + rc = -ENOSPC; + goto out; + } + + rc = 0; +out: + VIR_FREE(keypath); + close(fd); + + return rc; +} + +static int virCgroupSetValueU64(virCgroupPtr group, + const char *key, + uint64_t value) +{ + char *strval = NULL; + int rc; + + if (asprintf(&strval, "%" PRIu64, value) == -1) + return -ENOMEM; + + rc = virCgroupSetValueStr(group, key, strval); + + VIR_FREE(strval); + + return rc; +} + +#if 0 +/* This is included for completeness, but not yet used */ + +static int virCgroupSetValueI64(virCgroupPtr group, + const char *key, + int64_t value) +{ + char *strval = NULL; + int rc; + + if (asprintf(&strval, "%" PRIi64, value) == -1) + return -ENOMEM; + + rc = virCgroupSetValueStr(group, key, strval); + + VIR_FREE(strval); + + return rc; +} + +static int virCgroupGetValueStr(virCgroupPtr group, + const char *key, + char **value) +{ + int fd = -1; + int rc; + char *keypath = NULL; + char buf[CGROUP_MAX_VAL]; + + memset(buf, 0, sizeof(buf)); + + rc = virCgroupPathOf(group->path, key, &keypath); + if (rc != 0) { + DEBUG("No path of %s, %s", group->path, key); + return rc; + } + + fd = open(keypath, O_RDONLY); + if (fd < 0) { + DEBUG("Unable to open %s: %m", keypath); + rc = -ENOENT; + goto out; + } + + rc = saferead(fd, buf, sizeof(buf)); + if (rc < 0) { + DEBUG("Failed to read %s: %m\n", keypath); + rc = -errno; + goto out; + } else if (rc == 0) { + DEBUG("Short read of %s\n", keypath); + rc = -EIO; + goto out; + } + + *value = strdup(buf); + if (*value == NULL) { + rc = -ENOMEM; + goto out; + } + + rc = 0; +out: + VIR_FREE(keypath); + close(fd); + + return rc; +} + +static int virCgroupGetValueU64(virCgroupPtr group, + const char *key, + uint64_t *value) +{ + char *strval = NULL; + int rc = 0; + + rc = virCgroupGetValueStr(group, key, &strval); + if (rc != 0) + goto out; + + if (sscanf(strval, "%" SCNu64, value) != 1) + rc = -EINVAL; +out: + VIR_FREE(strval); + + return rc; +} + +static int virCgroupGetValueI64(virCgroupPtr group, + const char *key, + int64_t *value) +{ + char *strval = NULL; + int rc = 0; + + rc = virCgroupGetValueStr(group, key, &strval); + if (rc != 0) + goto out; + + if (sscanf(strval, "%" SCNi64, value) != 1) + rc = -EINVAL; +out: + VIR_FREE(strval); + + return rc; +} +#endif + +static int _virCgroupInherit(const char *path, + const char *key) +{ + int rc = 0; + int fd = -1; + char buf[CGROUP_MAX_VAL]; + char *keypath = NULL; + char *pkeypath = NULL; + + memset(buf, 0, sizeof(buf)); + + if (asprintf(&keypath, "%s/%s", path, key) == -1) { + rc = -ENOMEM; + goto out; + } + + if (access(keypath, F_OK) != 0) { + DEBUG("Group %s has no key %s\n", path, key); + goto out; + } + + if (asprintf(&pkeypath, "%s/../%s", path, key) == -1) { + rc = -ENOMEM; + VIR_FREE(keypath); + goto out; + } + + fd = open(pkeypath, O_RDONLY); + if (fd < 0) { + rc = -errno; + goto out; + } + + if (saferead(fd, buf, sizeof(buf)) <= 0) { + rc = -errno; + goto out; + } + + close(fd); + + fd = open(keypath, O_WRONLY); + if (fd < 0) { + rc = -errno; + goto out; + } + + if (safewrite(fd, buf, strlen(buf)) != strlen(buf)) { + rc = -errno; + goto out; + } + +out: + VIR_FREE(keypath); + VIR_FREE(pkeypath); + close(fd); + + return rc; +} + +static int virCgroupInherit(const char *grppath) +{ + int i; + int rc = 0; + const char *inherit_values[] = { + "cpuset.cpus", + "cpuset.mems", + NULL + }; + + for (i = 0; inherit_values[i] != NULL; i++) { + const char *key = inherit_values[i]; + + rc = _virCgroupInherit(grppath, key); + if (rc != 0) { + DEBUG("inherit of %s failed\n", key); + break; + } + } + + return rc; +} + +static int virCgroupMakeGroup(const char *name) +{ + int i; + int rc = 0; + + for (i = 0; supported_controllers[i] != NULL; i++) { + char *path = NULL; + virCgroupPtr root; + + root = virCgroupGetMount(supported_controllers[i]); + if (root == NULL) + continue; + + rc = virCgroupPathOfGroup(name, supported_controllers[i], &path); + if (rc != 0) { + virCgroupFree(&root); + break; + } + + virCgroupFree(&root); + + if (access(path, F_OK) != 0) { + if (mkdir(path, 0655) < 0) { + rc = -errno; + VIR_FREE(path); + break; + } + virCgroupInherit(path); + } + + VIR_FREE(path); + } + + return rc; +} + +static int virCgroupRoot(virCgroupPtr *root) +{ + int rc = 0; + char *grppath = NULL; + + if (VIR_ALLOC((*root)) != 0) { + rc = -ENOMEM; + goto out; + } + + (*root)->path = strdup("libvirt"); + if ((*root)->path == NULL) { + rc = -ENOMEM; + goto out; + } + + rc = virCgroupMakeGroup((*root)->path); +out: + if (rc != 0) + virCgroupFree(root); + VIR_FREE(grppath); + + return rc; +} + +static int virCgroupNew(virCgroupPtr *parent, + const char *group, + virCgroupPtr *newgroup) +{ + int rc = 0; + char *typpath = NULL; + + *newgroup = NULL; + + if (*parent == NULL) { + rc = virCgroupRoot(parent); + if (rc != 0) + goto err; + } + + if (VIR_ALLOC((*newgroup)) != 0) { + rc = -ENOMEM; + goto err; + } + + rc = asprintf(&((*newgroup)->path), + "%s/%s", + (*parent)->path, + group); + if (rc == -1) { + rc = -ENOMEM; + goto err; + } + + rc = 0; + + return rc; +err: + virCgroupFree(newgroup); + *newgroup = NULL; + + VIR_FREE(typpath); + + return rc; +} + +static int virCgroupOpen(virCgroupPtr parent, + const char *group, + virCgroupPtr *newgroup) +{ + int rc = 0; + char *grppath = NULL; + bool free_parent = (parent == NULL); + + rc = virCgroupNew(&parent, group, newgroup); + if (rc != 0) + goto err; + + if (free_parent) + virCgroupFree(&parent); + + rc = virCgroupPathOfGroup((*newgroup)->path, + supported_controllers[0], + &grppath); + if (rc != 0) + goto err; + + if (access(grppath, F_OK) != 0) { + rc = -ENOENT; + goto err; + } + + return rc; +err: + virCgroupFree(newgroup); + *newgroup = NULL; + + return rc; +} + +static int virCgroupCreate(virCgroupPtr parent, + const char *group, + virCgroupPtr *newgroup) +{ + int rc = 0; + bool free_parent = (parent == NULL); + + rc = virCgroupNew(&parent, group, newgroup); + if (rc != 0) { + DEBUG0("Unable to allocate new virCgroup structure"); + goto err; + } + + rc = virCgroupMakeGroup((*newgroup)->path); + if (rc != 0) + goto err; + + if (free_parent) + virCgroupFree(&parent); + + return rc; +err: + virCgroupFree(newgroup); + *newgroup = NULL; + + if (free_parent) + virCgroupFree(&parent); + + return rc; +} + +int virCgroupRemove(virCgroupPtr group) +{ + int rc = 0; + int i; + char *grppath = NULL; + + for (i = 0; supported_controllers[i] != NULL; i++) { + if (virCgroupPathOfGroup(group->path, + supported_controllers[i], + &grppath) != 0) + continue; + + if (rmdir(grppath) != 0) + rc = -errno; + + VIR_FREE(grppath); + } + + return rc; +} + +int virCgroupAddTask(virCgroupPtr group, pid_t pid) +{ + int rc = 0; + int fd = -1; + int i; + char *grppath = NULL; + char *taskpath = NULL; + char *pidstr = NULL; + + for (i = 0; supported_controllers[i] != NULL; i++) { + rc = virCgroupPathOfGroup(group->path, + supported_controllers[i], + &grppath); + if (rc != 0) + goto done; + + if (asprintf(&taskpath, "%s/tasks", grppath) == -1) { + rc = -ENOMEM; + goto done; + } + + fd = open(taskpath, O_WRONLY); + if (fd < 0) { + rc = -errno; + goto done; + } + + if (asprintf(&pidstr, "%lu", (unsigned long)pid) == -1) { + rc = -ENOMEM; + goto done; + } + + if (write(fd, pidstr, strlen(pidstr)) <= 0) { + rc = -errno; + goto done; + } + + done: + VIR_FREE(grppath); + VIR_FREE(taskpath); + VIR_FREE(pidstr); + close(fd); + + if (rc != 0) + break; + } + + return rc; +} + +int virCgroupForDomain(virDomainDefPtr def, virCgroupPtr *group) +{ + int rc; + virCgroupPtr typegrp = NULL; + const char *typestr = virDomainVirtTypeToString(def->virtType); + + if (typestr == NULL) + return -EINVAL; + + rc = virCgroupOpen(NULL, typestr, &typegrp); + if (rc == -ENOENT) { + rc = virCgroupCreate(NULL, typestr, &typegrp); + if (rc != 0) + goto out; + } else if (rc != 0) + goto out; + + rc = virCgroupOpen(typegrp, def->name, group); + if (rc == -ENOENT) + rc = virCgroupCreate(typegrp, def->name, group); +out: + virCgroupFree(&typegrp); + + return rc; +} + +int virCgroupSetMemory(virCgroupPtr group, unsigned long kb) +{ + return virCgroupSetValueU64(group, + "memory.limit_in_bytes", + kb << 10); +} + +int virCgroupDenyAllDevices(virCgroupPtr group) +{ + return virCgroupSetValueStr(group, + "devices.deny", + "a"); +} + +int virCgroupAllowDevice(virCgroupPtr group, + char type, + int major, + int minor) +{ + int rc; + char *devstr = NULL; + + if (asprintf(&devstr, "%c %i:%i rwm", type, major, minor) == -1) { + rc = -ENOMEM; + goto out; + } + + rc = virCgroupSetValueStr(group, + "devices.allow", + devstr); +out: + VIR_FREE(devstr); + + return rc; +} diff -r 444e2614d0a2 -r c33c6d6369e4 src/cgroup.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/cgroup.h Wed Oct 01 13:16:03 2008 -0700 @@ -0,0 +1,99 @@ +/* + * cgroup.h: Interface to tools for managing cgroups + * + * Copyright IBM Corp. 2008 + * + * See COPYING.LIB for the License of this software + * + * Authors: + * Dan Smith <danms@us.ibm.com> + */ + +#ifndef CGROUP_H +#define CGROUP_H + +#include <stdint.h> + +struct virCgroup; +typedef struct virCgroup *virCgroupPtr; + +#include "domain_conf.h" + +/** + * virCgroupHaveSupport: + * + * Returns 0 if support is present, negative if not + */ +int virCgroupHaveSupport(void); + +/** + * virCgroupForDomain: + * + * @def: Domain definition to create cgroup for + * @group: Pointer to returned virCgroupPtr + * + * Returns 0 on success + */ +int virCgroupForDomain(virDomainDefPtr def, virCgroupPtr *group); + +/** + * virCgroupAddTask: + * + * @group: The cgroup to add a task to + * @pid: The pid of the task to add + * + * Returns: 0 on success + */ +int virCgroupAddTask(virCgroupPtr group, pid_t pid); + +/** + * virCgroupSetMemory: + * + * @group: The cgroup to change memory for + * @kb: The memory amount in kilobytes + * + * Returns: 0 on success + */ +int virCgroupSetMemory(virCgroupPtr group, unsigned long kb); + +/** + * virCgroupDenyAllDevices: + * + * @group: The cgroup to deny devices for + * + * Returns: 0 on success + */ +int virCgroupDenyAllDevices(virCgroupPtr group); + +/** + * virCgroupAllowDevice: + * + * @group: The cgroup to allow a device for + * @type: The device type (i.e., 'c' or 'b') + * @major: The major number of the device + * @minor: The minor number of the device + * + * Returns: 0 on success + */ +int virCgroupAllowDevice(virCgroupPtr group, + char type, + int major, + int minor); + +/** + * virCgroupRemove: + * + * @group: The group to be removed + * + * Returns: 0 on success + */ +int virCgroupRemove(virCgroupPtr group); + +/** + * virCgroupFree: + * + * @group: The group structure to free + */ +void virCgroupFree(virCgroupPtr *group); + +#endif /* CGROUP_H */

On Wed, Oct 01, 2008 at 01:19:03PM -0700, Dan Smith wrote:
This patch adds src/cgroup.{c,h} with support for creating and manipulating cgroups.
All groups created with the internal API are forced under $mount/libvirt/ to keep everything together. The first time a group is created, the libvirt directory is also created, and the settings from the root are inherited.
The code creates groups in all mounts requires to get memory and devices functionality. When setting a value, the appropriate mount is determined and the value is set there. I have tested this with all controllers mounted in a single location, as well as all of them mounted separately.
+int virCgroupForDomain(virDomainDefPtr def, virCgroupPtr *group)
I think we should pass in a 2nd 'const char *driverName' arg here with "lxc" as its value
+{ + int rc; + virCgroupPtr typegrp = NULL; + const char *typestr = virDomainVirtTypeToString(def->virtType);
Rather than rely on this virt type string - virt type is not adding enough uniqueness - can still have a (virttype, name) clash between 2 libvirt drivers. Name is unique per host, per driver, so we'd want to have $CONTROLLER_MOUNT/libvirt/$DRIVER_NAME/$DOMAIN_NAME
+ + if (typestr == NULL) + return -EINVAL; + + rc = virCgroupOpen(NULL, typestr, &typegrp); + if (rc == -ENOENT) { + rc = virCgroupCreate(NULL, typestr, &typegrp); + if (rc != 0) + goto out; + } else if (rc != 0) + goto out; + + rc = virCgroupOpen(typegrp, def->name, group); + if (rc == -ENOENT) + rc = virCgroupCreate(typegrp, def->name, group); +out: + virCgroupFree(&typegrp); + + return rc; +} +
Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

DB> Rather than rely on this virt type string - virt type is not DB> adding enough uniqueness - can still have a (virttype, name) clash DB> between 2 libvirt drivers. Name is unique per host, per driver, so DB> we'd want to have DB> $CONTROLLER_MOUNT/libvirt/$DRIVER_NAME/$DOMAIN_NAME Hmm, I'm not sure I follow. It seems like the virt type offers more uniqueness than driver name. Either way, I have no problem making this change :) -- Dan Smith IBM Linux Technology Center Open Hypervisor Team email: danms@us.ibm.com

On Wed, Oct 01, 2008 at 02:11:24PM -0700, Dan Smith wrote:
DB> Rather than rely on this virt type string - virt type is not DB> adding enough uniqueness - can still have a (virttype, name) clash DB> between 2 libvirt drivers. Name is unique per host, per driver, so DB> we'd want to have
DB> $CONTROLLER_MOUNT/libvirt/$DRIVER_NAME/$DOMAIN_NAME
Hmm, I'm not sure I follow. It seems like the virt type offers more uniqueness than driver name.
Both, Xen and QEMU drivers support a virt type of 'hvm'. If both drivers have a domain called 'foo', then both would end up using a group of $MOUNT/libvirt/hvm/foo Instead of $MOUNT/libvirt/qemu/foo $MOUNT/libvirt/xen/foo Both of which are guarenteed unique Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

DB> Both, Xen and QEMU drivers support a virt type of 'hvm'. Ah, gotcha. I didn't realize that just looking at the enum. I've changed it locally. -- Dan Smith IBM Linux Technology Center Open Hypervisor Team email: danms@us.ibm.com

On Wed, Oct 01, 2008 at 11:10:33PM +0100, Daniel P. Berrange wrote:
On Wed, Oct 01, 2008 at 02:11:24PM -0700, Dan Smith wrote:
DB> Rather than rely on this virt type string - virt type is not DB> adding enough uniqueness - can still have a (virttype, name) clash DB> between 2 libvirt drivers. Name is unique per host, per driver, so DB> we'd want to have
DB> $CONTROLLER_MOUNT/libvirt/$DRIVER_NAME/$DOMAIN_NAME
Hmm, I'm not sure I follow. It seems like the virt type offers more uniqueness than driver name.
Both, Xen and QEMU drivers support a virt type of 'hvm'. If both drivers have a domain called 'foo', then both would end up using a group of
$MOUNT/libvirt/hvm/foo
Opps, I was actually mixing up 'virt type' with 'os type' here. The virt type will /probably/ be unique, but we don't mandate it, so it is probably safest to use the driver name.
Instead of
$MOUNT/libvirt/qemu/foo $MOUNT/libvirt/xen/foo
Both of which are guarenteed unique
Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

On Wed, Oct 01, 2008 at 01:19:03PM -0700, Dan Smith wrote:
This patch adds src/cgroup.{c,h} with support for creating and manipulating cgroups.
All groups created with the internal API are forced under $mount/libvirt/ to keep everything together. The first time a group is created, the libvirt directory is also created, and the settings from the root are inherited.
The code creates groups in all mounts requires to get memory and devices functionality. When setting a value, the appropriate mount is determined and the value is set there. I have tested this with all controllers mounted in a single location, as well as all of them mounted separately.
Changes: - Handle multiple mount points - Add more abstract internal API, per recent discussion - Consider absence of memory or devices controllers as "no cgroup support"
Okay, the patch looks fine to me, things looks simple and without risk. I have ony one stylistic issue, which is that I usually prefer to have the comment about the funtions and its parameter in the C code rather than on the header (keeping it closer to the code means one more easilly update/fix it, and if you use something like ctags to get to the function implementation you see the comment immediately but it's cosmetic). +1 for me, Daniel -- Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/ daniel@veillard.com | Rpmfind RPM search engine http://rpmfind.net/ http://veillard.com/ | virtualization library http://libvirt.org/

DV> I have ony one stylistic issue, which is that I usually prefer to DV> have the comment about the funtions and its parameter in the C DV> code rather than on the header (keeping it closer to the code DV> means one more easilly update/fix it, and if you use something DV> like ctags to get to the function implementation you see the DV> comment immediately but it's cosmetic). Okay, that's a good point. I will just move the comment block to the .c file. Thanks! -- Dan Smith IBM Linux Technology Center Open Hypervisor Team email: danms@us.ibm.com

This patch adds code to the controller to set up a cgroup named after the domain name, set the memory limit, and restrict devices. It also adds bits to lxc_driver to properly clean up the cgroup on domain death. If virCgroupHaveSupport() says that no support is available, then we just allow the domain creation to proceed as it did before without resource controls in place. Changes: - Use the more abstract API diff -r c33c6d6369e4 -r 67d1be5bd856 src/Makefile.am --- a/src/Makefile.am Wed Oct 01 13:16:03 2008 -0700 +++ b/src/Makefile.am Wed Oct 01 13:16:04 2008 -0700 @@ -90,13 +90,15 @@ lxc_conf.c lxc_conf.h \ lxc_container.c lxc_container.h \ lxc_driver.c lxc_driver.h \ - veth.c veth.h + veth.c veth.h \ + cgroup.c cgroup.h LXC_CONTROLLER_SOURCES = \ lxc_conf.c lxc_conf.h \ lxc_container.c lxc_container.h \ lxc_controller.c \ - veth.c veth.h + veth.c veth.h \ + cgroup.c cgroup.h OPENVZ_DRIVER_SOURCES = \ openvz_conf.c openvz_conf.h \ diff -r c33c6d6369e4 -r 67d1be5bd856 src/lxc_controller.c --- a/src/lxc_controller.c Wed Oct 01 13:16:03 2008 -0700 +++ b/src/lxc_controller.c Wed Oct 01 13:16:04 2008 -0700 @@ -42,12 +42,78 @@ #include "veth.h" #include "memory.h" #include "util.h" - +#include "cgroup.h" #define DEBUG(fmt,...) VIR_DEBUG(__FILE__, fmt, __VA_ARGS__) #define DEBUG0(msg) VIR_DEBUG(__FILE__, "%s", msg) int debugFlag = 0; + +struct cgroup_device_policy { + char type; + int major; + int minor; +}; + +/** + * lxcSetContainerResources + * @def: pointer to virtual machine structure + * + * Creates a cgroup for the container, moves the task inside, + * and sets resource limits + * + * Returns 0 on success or -1 in case of error + */ +static int lxcSetContainerResources(virDomainDefPtr def) +{ + virCgroupPtr cgroup; + int rc = -1; + int i; + struct cgroup_device_policy devices[] = { + {'c', 1, 3}, + {'c', 1, 5}, + {'c', 1, 7}, + {'c', 1, 8}, + {'c', 1, 9}, + {'c', 5, 1}, + {0, 0, 0}}; + + if (virCgroupHaveSupport() != 0) + return 0; /* Not supported, so claim success */ + + rc = virCgroupForDomain(def, &cgroup); + if (rc != 0) { + fprintf(stderr, "Unable to create cgroup for %s\n", def->name); + goto out; + } + + rc = virCgroupSetMemory(cgroup, def->maxmem); + if (rc != 0) + goto out; + + rc = virCgroupDenyAllDevices(cgroup); + if (rc != 0) + goto out; + + for (i = 0; devices[i].type != 0; i++) { + struct cgroup_device_policy *dev = &devices[i]; + rc = virCgroupAllowDevice(cgroup, + dev->type, + dev->major, + dev->minor); + if (rc != 0) + goto out; + } + + rc = virCgroupAddTask(cgroup, getpid()); +out: + virCgroupFree(&cgroup); + + if (rc != 0) + fprintf(stderr, "Failed to set lxc resources: %s\n", strerror(-rc)); + + return rc; +} static char*lxcMonitorPath(virDomainDefPtr def) { @@ -394,6 +460,9 @@ if (lxcControllerMoveInterfaces(nveths, veths, container) < 0) goto cleanup; + if (lxcSetContainerResources(def) < 0) + goto cleanup; + if (lxcContainerSendContinue(control[0]) < 0) goto cleanup; diff -r c33c6d6369e4 -r 67d1be5bd856 src/lxc_driver.c --- a/src/lxc_driver.c Wed Oct 01 13:16:03 2008 -0700 +++ b/src/lxc_driver.c Wed Oct 01 13:16:04 2008 -0700 @@ -43,6 +43,7 @@ #include "bridge.h" #include "veth.h" #include "event.h" +#include "cgroup.h" /* debug macros */ @@ -376,6 +377,7 @@ int waitRc; int childStatus = -1; virDomainNetDefPtr net; + virCgroupPtr cgroup; while (((waitRc = waitpid(vm->pid, &childStatus, 0)) == -1) && errno == EINTR) @@ -408,6 +410,11 @@ for (net = vm->def->nets; net; net = net->next) { vethInterfaceUpOrDown(net->ifname, 0); vethDelete(net->ifname); + } + + if (virCgroupForDomain(vm->def, &cgroup) == 0) { + virCgroupRemove(cgroup); + virCgroupFree(&cgroup); } return rc;

On Wed, Oct 01, 2008 at 01:19:04PM -0700, Dan Smith wrote:
This patch adds code to the controller to set up a cgroup named after the domain name, set the memory limit, and restrict devices. It also adds bits to lxc_driver to properly clean up the cgroup on domain death.
If virCgroupHaveSupport() says that no support is available, then we just allow the domain creation to proceed as it did before without resource controls in place.
+ + rc = virCgroupForDomain(def, &cgroup); + if (rc != 0) { + fprintf(stderr, "Unable to create cgroup for %s\n", def->name); + goto out; + }
Should use lxcError() here - as things are setup by virExec(), calls to lxcError() call to virRaiseError() which ultimately calls fprintf(). By using lxcError() can we more easily change logging later if needed.
+ + rc = virCgroupSetMemory(cgroup, def->maxmem); + if (rc != 0) + goto out; + + rc = virCgroupDenyAllDevices(cgroup); + if (rc != 0) + goto out; + + for (i = 0; devices[i].type != 0; i++) { + struct cgroup_device_policy *dev = &devices[i]; + rc = virCgroupAllowDevice(cgroup, + dev->type, + dev->major, + dev->minor); + if (rc != 0) + goto out; + } + + rc = virCgroupAddTask(cgroup, getpid()); +out: + virCgroupFree(&cgroup); + + if (rc != 0) + fprintf(stderr, "Failed to set lxc resources: %s\n", strerror(-rc));
Likewise here use lxcError(). I think we need to virCgroupRemove() in the error path to cleanup a partially constructed cgroup ? Regards, Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

DB> Should use lxcError() here - as things are setup by virExec(), DB> calls to lxcError() call to virRaiseError() which ultimately calls DB> fprintf(). By using lxcError() can we more easily change logging DB> later if needed. Indeed. DB> I think we need to virCgroupRemove() in the error path to cleanup DB> a partially constructed cgroup ? Yep, I'll fix that too. Thanks! -- Dan Smith IBM Linux Technology Center Open Hypervisor Team email: danms@us.ibm.com

On Wed, Oct 01, 2008 at 01:19:04PM -0700, Dan Smith wrote:
This patch adds code to the controller to set up a cgroup named after the domain name, set the memory limit, and restrict devices. It also adds bits to lxc_driver to properly clean up the cgroup on domain death.
If virCgroupHaveSupport() says that no support is available, then we just allow the domain creation to proceed as it did before without resource controls in place.
Okay in addition to Dan feedback on the error handling,
+/** + * lxcSetContainerResources + * @def: pointer to virtual machine structure + * + * Creates a cgroup for the container, moves the task inside, + * and sets resource limits + * + * Returns 0 on success or -1 in case of error + */ +static int lxcSetContainerResources(virDomainDefPtr def) +{ + virCgroupPtr cgroup; + int rc = -1; + int i; + struct cgroup_device_policy devices[] = { + {'c', 1, 3}, + {'c', 1, 5}, + {'c', 1, 7}, + {'c', 1, 8}, + {'c', 1, 9}, + {'c', 5, 1}, + {0, 0, 0}};
creating a domain is always the trickiest part of domain lifecycle and setting up the framework is often something a bit obfuscated, nearly by definition. But if we could associate meaningful constants to the device policies here, this would help people having to improve or debug that code in the future ;-) otherwise looks fine to me, +1 Daniel -- Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/ daniel@veillard.com | Rpmfind RPM search engine http://rpmfind.net/ http://veillard.com/ | virtualization library http://libvirt.org/

DV> creating a domain is always the trickiest part of domain lifecycle DV> and setting up the framework is often something a bit obfuscated, DV> nearly by definition. But if we could associate meaningful DV> constants to the device policies here, this would help people DV> having to improve or debug that code in the future ;-) Okay, I suppose that if the QEMU driver ends up doing something similar, the constants would be reused. Should they go in cgroup.h since they're expected to be used with that interface? -- Dan Smith IBM Linux Technology Center Open Hypervisor Team email: danms@us.ibm.com

On Fri, Oct 03, 2008 at 07:45:24AM -0700, Dan Smith wrote:
DV> creating a domain is always the trickiest part of domain lifecycle DV> and setting up the framework is often something a bit obfuscated, DV> nearly by definition. But if we could associate meaningful DV> constants to the device policies here, this would help people DV> having to improve or debug that code in the future ;-)
Okay, I suppose that if the QEMU driver ends up doing something similar, the constants would be reused.
Should they go in cgroup.h since they're expected to be used with that interface?
Fine by me, but I'm more looking to being able to map semantic to the value than sharing the values at this point, Daniel -- Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/ daniel@veillard.com | Rpmfind RPM search engine http://rpmfind.net/ http://veillard.com/ | virtualization library http://libvirt.org/

On Fri, Oct 03, 2008 at 07:45:24AM -0700, Dan Smith wrote:
DV> creating a domain is always the trickiest part of domain lifecycle DV> and setting up the framework is often something a bit obfuscated, DV> nearly by definition. But if we could associate meaningful DV> constants to the device policies here, this would help people DV> having to improve or debug that code in the future ;-)
Okay, I suppose that if the QEMU driver ends up doing something similar, the constants would be reused.
Possibly - for QEMU we'd need to drive device ACL's off the list of configured disks in the XML - we'd probably just 'stat()' them to lookup their major/minor numbers at time of launch.
Should they go in cgroup.h since they're expected to be used with that interface?
If needing to be shared, I'd suggest util.h since that has a collection of file/filesystem related utility functions. Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|
participants (3)
-
Dan Smith
-
Daniel P. Berrange
-
Daniel Veillard