[PATCH v3 0/5] lxc: Add VCPU features for LXC

This series cover a lots of functionalities to LXC VCPUs. It enables sharing some timer devices between host and LXC guest using `timer` settings. It still has other improvements related to VCPU and LXC such as virtual cpuinfo content based on VCPU settings and some better resource limits. Each patch has the description of the problem and what it is trying to fix. v1-v2: Add Daniel's comments and some cleanups. v2-v3: Remove dependency from patch 4 and 5. Julio Faracco (5): lxc: Add Real Time Clock device into allowed devices lxc: Add HPET device into allowed devices lxc: Replacing default strings definitions by g_autofree statement. lxc: Implement virtual /proc/cpuinfo via LXC fuse lxc: Count max VCPUs based on cpuset.cpus in native config. docs/formatdomain.html.in | 4 +- src/lxc/lxc_cgroup.c | 91 ++++++++- src/lxc/lxc_container.c | 62 ++++-- src/lxc/lxc_container.h | 2 + src/lxc/lxc_controller.c | 187 ++++++++++++------ src/lxc/lxc_fuse.c | 107 ++++++++-- src/lxc/lxc_native.c | 24 ++- .../lxcconf2xml-cpusettune.xml | 2 +- 8 files changed, 368 insertions(+), 111 deletions(-) -- 2.20.1

This commit share host Real Time Clock device (rtc) into LXC containers to support hardware clock. This should be available setting up a `rtc` timer under clock section. Since this option is not emulated, it should be available only for `localtime` clock. This option should be readonly due to security reasons. Before: root# hwclock --verbose hwclock from util-linux 2.32.1 System Time: 1581877557.598365 Trying to open: /dev/rtc0 Trying to open: /dev/rtc Trying to open: /dev/misc/rtc No usable clock interface found. hwclock: Cannot access the Hardware Clock via any known method. Now: root# hwclock 2020-02-16 18:23:55.374134+00:00 root# hwclock -w hwclock: ioctl(RTC_SET_TIME) to /dev/rtc to set the time failed: Permission denied Signed-off-by: Julio Faracco <jcfaracco@gmail.com> --- docs/formatdomain.html.in | 2 +- src/lxc/lxc_cgroup.c | 36 +++++++++++++++++++++ src/lxc/lxc_controller.c | 68 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 105 insertions(+), 1 deletion(-) diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in index 4fef2a0a97..5598bf41b4 100644 --- a/docs/formatdomain.html.in +++ b/docs/formatdomain.html.in @@ -2465,7 +2465,7 @@ being modified, and can be one of "platform" (currently unsupported), "hpet" (libxl, xen, qemu), "kvmclock" (qemu), - "pit" (qemu), "rtc" (qemu), "tsc" (libxl, qemu - + "pit" (qemu), "rtc" (qemu, lxc), "tsc" (libxl, qemu - <span class="since">since 3.2.0</span>), "hypervclock" (qemu - <span class="since">since 1.2.2</span>) or "armvtimer" (qemu - <span class="since">since 6.1.0</span>). diff --git a/src/lxc/lxc_cgroup.c b/src/lxc/lxc_cgroup.c index 4ebe5ef467..6a103055a4 100644 --- a/src/lxc/lxc_cgroup.c +++ b/src/lxc/lxc_cgroup.c @@ -337,6 +337,42 @@ static int virLXCCgroupSetupDeviceACL(virDomainDefPtr def, VIR_CGROUP_DEVICE_RWM) < 0) return -1; + VIR_DEBUG("Allowing timers char devices"); + + /* Sync'ed with Host clock */ + if (def->clock.offset == VIR_DOMAIN_CLOCK_OFFSET_LOCALTIME) { + for (i = 0; i < def->clock.ntimers; i++) { + virDomainTimerDefPtr timer = def->clock.timers[i]; + + switch ((virDomainTimerNameType)timer->name) { + case VIR_DOMAIN_TIMER_NAME_PLATFORM: + case VIR_DOMAIN_TIMER_NAME_TSC: + case VIR_DOMAIN_TIMER_NAME_KVMCLOCK: + case VIR_DOMAIN_TIMER_NAME_HYPERVCLOCK: + case VIR_DOMAIN_TIMER_NAME_PIT: + case VIR_DOMAIN_TIMER_NAME_HPET: + case VIR_DOMAIN_TIMER_NAME_ARMVTIMER: + case VIR_DOMAIN_TIMER_NAME_LAST: + break; + case VIR_DOMAIN_TIMER_NAME_RTC: + if (!timer->present) + break; + + if (virFileExists("/dev/rtc")) { + if (virCgroupAllowDevicePath(cgroup, "/dev/rtc", + VIR_CGROUP_DEVICE_READ, + false) < 0) + return -1; + } else { + VIR_DEBUG("Ignoring non-existent device /dev/rtc"); + } + break; + } + } + } else { + VIR_DEBUG("Ignoring non-localtime clock"); + } + VIR_DEBUG("Device whitelist complete"); return 0; diff --git a/src/lxc/lxc_controller.c b/src/lxc/lxc_controller.c index c3dec0859c..eba6bfe0bf 100644 --- a/src/lxc/lxc_controller.c +++ b/src/lxc/lxc_controller.c @@ -1550,6 +1550,71 @@ static int virLXCControllerPopulateDevices(virLXCControllerPtr ctrl) } +static int +virLXCControllerSetupTimers(virLXCControllerPtr ctrl) +{ + g_autofree char *path = NULL; + size_t i; + struct stat sb; + virDomainDefPtr def = ctrl->def; + + /* Not sync'ed with Host clock */ + if (def->clock.offset != VIR_DOMAIN_CLOCK_OFFSET_LOCALTIME) + return 0; + + for (i = 0; i < def->clock.ntimers; i++) { + dev_t dev; + virDomainTimerDefPtr timer = def->clock.timers[i]; + + switch ((virDomainTimerNameType)timer->name) { + case VIR_DOMAIN_TIMER_NAME_PLATFORM: + case VIR_DOMAIN_TIMER_NAME_TSC: + case VIR_DOMAIN_TIMER_NAME_KVMCLOCK: + case VIR_DOMAIN_TIMER_NAME_HYPERVCLOCK: + case VIR_DOMAIN_TIMER_NAME_PIT: + case VIR_DOMAIN_TIMER_NAME_HPET: + case VIR_DOMAIN_TIMER_NAME_ARMVTIMER: + case VIR_DOMAIN_TIMER_NAME_LAST: + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, + _("unsupported timer type (name) '%s'"), + virDomainTimerNameTypeToString(timer->name)); + return -1; + case VIR_DOMAIN_TIMER_NAME_RTC: + if (!timer->present) + break; + + if (stat("/dev/rtc", &sb) < 0) { + if (errno == EACCES) + return -1; + + virReportSystemError(errno, + _("Path '%s' is not accessible"), + path); + return -1; + } + + path = g_strdup_printf("/%s/%s.dev/%s", LXC_STATE_DIR, + ctrl->def->name, "/rtc"); + + dev = makedev(major(sb.st_rdev), minor(sb.st_rdev)); + if (mknod(path, S_IFCHR, dev) < 0 || + chmod(path, sb.st_mode)) { + virReportSystemError(errno, + _("Failed to make device %s"), + path); + return -1; + } + + if (lxcContainerChown(ctrl->def, path) < 0) + return -1; + break; + } + } + + return 0; +} + + static int virLXCControllerSetupHostdevSubsysUSB(virDomainDefPtr vmDef, virDomainHostdevDefPtr def, @@ -2352,6 +2417,9 @@ virLXCControllerRun(virLXCControllerPtr ctrl) if (virLXCControllerPopulateDevices(ctrl) < 0) goto cleanup; + if (virLXCControllerSetupTimers(ctrl) < 0) + goto cleanup; + if (virLXCControllerSetupAllDisks(ctrl) < 0) goto cleanup; -- 2.20.1

This commit is related to RTC timer device too. HPET is being shared from host device through `localtime` clock. This timer is available creating a new timer using `hpet` name. Signed-off-by: Julio Faracco <jcfaracco@gmail.com> --- docs/formatdomain.html.in | 2 +- src/lxc/lxc_cgroup.c | 17 +++++++++++++---- src/lxc/lxc_controller.c | 33 +++++++++++++++++++++++++++++---- 3 files changed, 43 insertions(+), 9 deletions(-) diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in index 5598bf41b4..8571db89dc 100644 --- a/docs/formatdomain.html.in +++ b/docs/formatdomain.html.in @@ -2464,7 +2464,7 @@ The <code>name</code> attribute selects which timer is being modified, and can be one of "platform" (currently unsupported), - "hpet" (libxl, xen, qemu), "kvmclock" (qemu), + "hpet" (libxl, xen, qemu, lxc), "kvmclock" (qemu), "pit" (qemu), "rtc" (qemu, lxc), "tsc" (libxl, qemu - <span class="since">since 3.2.0</span>), "hypervclock" (qemu - <span class="since">since 1.2.2</span>) or diff --git a/src/lxc/lxc_cgroup.c b/src/lxc/lxc_cgroup.c index 6a103055a4..997a5c3dfa 100644 --- a/src/lxc/lxc_cgroup.c +++ b/src/lxc/lxc_cgroup.c @@ -344,20 +344,19 @@ static int virLXCCgroupSetupDeviceACL(virDomainDefPtr def, for (i = 0; i < def->clock.ntimers; i++) { virDomainTimerDefPtr timer = def->clock.timers[i]; + if (!timer->present) + break; + switch ((virDomainTimerNameType)timer->name) { case VIR_DOMAIN_TIMER_NAME_PLATFORM: case VIR_DOMAIN_TIMER_NAME_TSC: case VIR_DOMAIN_TIMER_NAME_KVMCLOCK: case VIR_DOMAIN_TIMER_NAME_HYPERVCLOCK: case VIR_DOMAIN_TIMER_NAME_PIT: - case VIR_DOMAIN_TIMER_NAME_HPET: case VIR_DOMAIN_TIMER_NAME_ARMVTIMER: case VIR_DOMAIN_TIMER_NAME_LAST: break; case VIR_DOMAIN_TIMER_NAME_RTC: - if (!timer->present) - break; - if (virFileExists("/dev/rtc")) { if (virCgroupAllowDevicePath(cgroup, "/dev/rtc", VIR_CGROUP_DEVICE_READ, @@ -367,6 +366,16 @@ static int virLXCCgroupSetupDeviceACL(virDomainDefPtr def, VIR_DEBUG("Ignoring non-existent device /dev/rtc"); } break; + case VIR_DOMAIN_TIMER_NAME_HPET: + if (virFileExists("/dev/hpet")) { + if (virCgroupAllowDevicePath(cgroup, "/dev/hpet", + VIR_CGROUP_DEVICE_READ, + false) < 0) + return -1; + } else { + VIR_DEBUG("Ignoring non-existent device /dev/hpet"); + } + break; } } } else { diff --git a/src/lxc/lxc_controller.c b/src/lxc/lxc_controller.c index eba6bfe0bf..518967ee83 100644 --- a/src/lxc/lxc_controller.c +++ b/src/lxc/lxc_controller.c @@ -1566,13 +1566,15 @@ virLXCControllerSetupTimers(virLXCControllerPtr ctrl) dev_t dev; virDomainTimerDefPtr timer = def->clock.timers[i]; + if (!timer->present) + continue; + switch ((virDomainTimerNameType)timer->name) { case VIR_DOMAIN_TIMER_NAME_PLATFORM: case VIR_DOMAIN_TIMER_NAME_TSC: case VIR_DOMAIN_TIMER_NAME_KVMCLOCK: case VIR_DOMAIN_TIMER_NAME_HYPERVCLOCK: case VIR_DOMAIN_TIMER_NAME_PIT: - case VIR_DOMAIN_TIMER_NAME_HPET: case VIR_DOMAIN_TIMER_NAME_ARMVTIMER: case VIR_DOMAIN_TIMER_NAME_LAST: virReportError(VIR_ERR_CONFIG_UNSUPPORTED, @@ -1580,9 +1582,6 @@ virLXCControllerSetupTimers(virLXCControllerPtr ctrl) virDomainTimerNameTypeToString(timer->name)); return -1; case VIR_DOMAIN_TIMER_NAME_RTC: - if (!timer->present) - break; - if (stat("/dev/rtc", &sb) < 0) { if (errno == EACCES) return -1; @@ -1605,6 +1604,32 @@ virLXCControllerSetupTimers(virLXCControllerPtr ctrl) return -1; } + if (lxcContainerChown(ctrl->def, path) < 0) + return -1; + break; + case VIR_DOMAIN_TIMER_NAME_HPET: + if (stat("/dev/hpet", &sb) < 0) { + if (errno == EACCES) + return -1; + + virReportSystemError(errno, + _("Path '%s' is not accessible"), + path); + return -1; + } + + path = g_strdup_printf("/%s/%s.dev/%s", LXC_STATE_DIR, + ctrl->def->name, "/hpet"); + + dev = makedev(major(sb.st_rdev), minor(sb.st_rdev)); + if (mknod(path, S_IFCHR, dev) < 0 || + chmod(path, sb.st_mode)) { + virReportSystemError(errno, + _("Failed to make device %s"), + path); + return -1; + } + if (lxcContainerChown(ctrl->def, path) < 0) return -1; break; -- 2.20.1

There are a lots of strings being handled inside some LXC functions. They can be moved to g_autofree to avoid declaring a return value to get proper code cleanups. This commit is changing functions from lxc_{controller,cgroup,fuse} only. Signed-off-by: Julio Faracco <jcfaracco@gmail.com> --- src/lxc/lxc_cgroup.c | 15 +++---- src/lxc/lxc_controller.c | 96 ++++++++++++++-------------------------- src/lxc/lxc_fuse.c | 23 +++------- 3 files changed, 44 insertions(+), 90 deletions(-) diff --git a/src/lxc/lxc_cgroup.c b/src/lxc/lxc_cgroup.c index 997a5c3dfa..d29b65092a 100644 --- a/src/lxc/lxc_cgroup.c +++ b/src/lxc/lxc_cgroup.c @@ -54,8 +54,7 @@ static int virLXCCgroupSetupCpusetTune(virDomainDefPtr def, virCgroupPtr cgroup, virBitmapPtr nodemask) { - int ret = -1; - char *mask = NULL; + g_autofree char *mask = NULL; virDomainNumatuneMemMode mode; if (def->placement_mode != VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO && @@ -66,21 +65,17 @@ static int virLXCCgroupSetupCpusetTune(virDomainDefPtr def, if (virDomainNumatuneGetMode(def->numa, -1, &mode) < 0 || mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) { - ret = 0; - goto cleanup; + return 0; } if (virDomainNumatuneMaybeFormatNodeset(def->numa, nodemask, &mask, -1) < 0) - goto cleanup; + return -1; if (mask && virCgroupSetCpusetMems(cgroup, mask) < 0) - goto cleanup; + return -1; - ret = 0; - cleanup: - VIR_FREE(mask); - return ret; + return 0; } diff --git a/src/lxc/lxc_controller.c b/src/lxc/lxc_controller.c index 518967ee83..c580b17f5f 100644 --- a/src/lxc/lxc_controller.c +++ b/src/lxc/lxc_controller.c @@ -802,8 +802,7 @@ static int virLXCControllerGetNumadAdvice(virLXCControllerPtr ctrl, virBitmapPtr *mask) { virBitmapPtr nodemask = NULL; - char *nodeset = NULL; - int ret = -1; + g_autofree char *nodeset = NULL; /* Get the advisory nodeset from numad if 'placement' of * either <vcpu> or <numatune> is 'auto'. @@ -812,20 +811,17 @@ static int virLXCControllerGetNumadAdvice(virLXCControllerPtr ctrl, nodeset = virNumaGetAutoPlacementAdvice(virDomainDefGetVcpus(ctrl->def), ctrl->def->mem.cur_balloon); if (!nodeset) - goto cleanup; + return -1; VIR_DEBUG("Nodeset returned from numad: %s", nodeset); if (virBitmapParse(nodeset, &nodemask, VIR_DOMAIN_CPUMASK_LEN) < 0) - goto cleanup; + return -1; } - ret = 0; *mask = nodemask; - cleanup: - VIR_FREE(nodeset); - return ret; + return 0; } @@ -1434,9 +1430,8 @@ virLXCControllerSetupUsernsMap(virDomainIdMapEntryPtr map, */ static int virLXCControllerSetupUserns(virLXCControllerPtr ctrl) { - char *uid_map = NULL; - char *gid_map = NULL; - int ret = -1; + g_autofree char *uid_map = NULL; + g_autofree char *gid_map = NULL; /* User namespace is disabled for container */ if (ctrl->def->idmap.nuidmap == 0) { @@ -1450,28 +1445,23 @@ static int virLXCControllerSetupUserns(virLXCControllerPtr ctrl) if (virLXCControllerSetupUsernsMap(ctrl->def->idmap.uidmap, ctrl->def->idmap.nuidmap, uid_map) < 0) - goto cleanup; + return -1; gid_map = g_strdup_printf("/proc/%d/gid_map", ctrl->initpid); if (virLXCControllerSetupUsernsMap(ctrl->def->idmap.gidmap, ctrl->def->idmap.ngidmap, gid_map) < 0) - goto cleanup; + return -1; - ret = 0; - cleanup: - VIR_FREE(uid_map); - VIR_FREE(gid_map); - return ret; + return 0; } static int virLXCControllerSetupDev(virLXCControllerPtr ctrl) { - char *mount_options = NULL; - char *opts = NULL; - char *dev = NULL; - int ret = -1; + g_autofree char *mount_options = NULL; + g_autofree char *opts = NULL; + g_autofree char *dev = NULL; VIR_DEBUG("Setting up /dev/ for container"); @@ -1488,24 +1478,18 @@ static int virLXCControllerSetupDev(virLXCControllerPtr ctrl) opts = g_strdup_printf("mode=755,size=65536%s", mount_options); if (virFileSetupDev(dev, opts) < 0) - goto cleanup; + return -1; if (lxcContainerChown(ctrl->def, dev) < 0) - goto cleanup; + return -1; - ret = 0; - cleanup: - VIR_FREE(opts); - VIR_FREE(mount_options); - VIR_FREE(dev); - return ret; + return 0; } static int virLXCControllerPopulateDevices(virLXCControllerPtr ctrl) { size_t i; - int ret = -1; - char *path = NULL; + g_autofree char *path = NULL; const struct { int maj; int min; @@ -1521,7 +1505,7 @@ static int virLXCControllerPopulateDevices(virLXCControllerPtr ctrl) }; if (virLXCControllerSetupDev(ctrl) < 0) - goto cleanup; + return -1; /* Populate /dev/ with a few important bits */ for (i = 0; i < G_N_ELEMENTS(devs); i++) { @@ -1534,19 +1518,14 @@ static int virLXCControllerPopulateDevices(virLXCControllerPtr ctrl) virReportSystemError(errno, _("Failed to make device %s"), path); - goto cleanup; + return -1; } if (lxcContainerChown(ctrl->def, path) < 0) - goto cleanup; - - VIR_FREE(path); + return -1; } - ret = 0; - cleanup: - VIR_FREE(path); - return ret; + return 0; } @@ -2202,10 +2181,9 @@ virLXCControllerSetupPrivateNS(void) static int virLXCControllerSetupDevPTS(virLXCControllerPtr ctrl) { - char *mount_options = NULL; - char *opts = NULL; - char *devpts = NULL; - int ret = -1; + g_autofree char *mount_options = NULL; + g_autofree char *opts = NULL; + g_autofree char *devpts = NULL; gid_t ptsgid = 5; VIR_DEBUG("Setting up private /dev/pts"); @@ -2220,7 +2198,7 @@ virLXCControllerSetupDevPTS(virLXCControllerPtr ctrl) virReportSystemError(errno, _("Failed to make path %s"), devpts); - goto cleanup; + return -1; } if (ctrl->def->idmap.ngidmap) @@ -2239,26 +2217,20 @@ virLXCControllerSetupDevPTS(virLXCControllerPtr ctrl) virReportSystemError(errno, _("Failed to mount devpts on %s"), devpts); - goto cleanup; + return -1; } if (access(ctrl->devptmx, R_OK) < 0) { virReportSystemError(ENOSYS, "%s", _("Kernel does not support private devpts")); - goto cleanup; + return -1; } if ((lxcContainerChown(ctrl->def, ctrl->devptmx) < 0) || (lxcContainerChown(ctrl->def, devpts) < 0)) - goto cleanup; - - ret = 0; + return -1; - cleanup: - VIR_FREE(opts); - VIR_FREE(devpts); - VIR_FREE(mount_options); - return ret; + return 0; } @@ -2279,8 +2251,7 @@ virLXCControllerSetupConsoles(virLXCControllerPtr ctrl, char **containerTTYPaths) { size_t i; - int ret = -1; - char *ttyHostPath = NULL; + g_autofree char *ttyHostPath = NULL; for (i = 0; i < ctrl->nconsoles; i++) { VIR_DEBUG("Opening tty on private %s", ctrl->devptmx); @@ -2289,20 +2260,17 @@ virLXCControllerSetupConsoles(virLXCControllerPtr ctrl, &containerTTYPaths[i], &ttyHostPath) < 0) { virReportSystemError(errno, "%s", _("Failed to allocate tty")); - goto cleanup; + return -1; } /* Change the owner of tty device to the root user of container */ if (lxcContainerChown(ctrl->def, ttyHostPath) < 0) - goto cleanup; + return -1; VIR_FREE(ttyHostPath); } - ret = 0; - cleanup: - VIR_FREE(ttyHostPath); - return ret; + return 0; } diff --git a/src/lxc/lxc_fuse.c b/src/lxc/lxc_fuse.c index 44f240a0b5..8cfccdd7e0 100644 --- a/src/lxc/lxc_fuse.c +++ b/src/lxc/lxc_fuse.c @@ -40,8 +40,7 @@ static const char *fuse_meminfo_path = "/meminfo"; static int lxcProcGetattr(const char *path, struct stat *stbuf) { - int res; - char *mempath = NULL; + g_autofree char *mempath = NULL; struct stat sb; struct fuse_context *context = fuse_get_context(); virDomainDefPtr def = (virDomainDefPtr)context->private_data; @@ -49,16 +48,12 @@ static int lxcProcGetattr(const char *path, struct stat *stbuf) memset(stbuf, 0, sizeof(struct stat)); mempath = g_strdup_printf("/proc/%s", path); - res = 0; - if (STREQ(path, "/")) { stbuf->st_mode = S_IFDIR | 0755; stbuf->st_nlink = 2; } else if (STREQ(path, fuse_meminfo_path)) { - if (stat(mempath, &sb) < 0) { - res = -errno; - goto cleanup; - } + if (stat(mempath, &sb) < 0) + return -errno; stbuf->st_uid = def->idmap.uidmap ? def->idmap.uidmap[0].target : 0; stbuf->st_gid = def->idmap.gidmap ? def->idmap.gidmap[0].target : 0; @@ -71,12 +66,10 @@ static int lxcProcGetattr(const char *path, struct stat *stbuf) stbuf->st_ctime = sb.st_ctime; stbuf->st_mtime = sb.st_mtime; } else { - res = -ENOENT; + return -ENOENT; } - cleanup: - VIR_FREE(mempath); - return res; + return 0; } static int lxcProcReaddir(const char *path, void *buf, @@ -127,7 +120,7 @@ static int lxcProcReadMeminfo(char *hostpath, virDomainDefPtr def, { int res; FILE *fd = NULL; - char *line = NULL; + g_autofree char *line = NULL; size_t n; struct virLXCMeminfo meminfo; virBuffer buffer = VIR_BUFFER_INITIALIZER; @@ -229,7 +222,6 @@ static int lxcProcReadMeminfo(char *hostpath, virDomainDefPtr def, memcpy(buf, virBufferCurrentContent(new_meminfo), res); cleanup: - VIR_FREE(line); virBufferFreeAndReset(new_meminfo); VIR_FORCE_FCLOSE(fd); return res; @@ -242,7 +234,7 @@ static int lxcProcRead(const char *path G_GNUC_UNUSED, struct fuse_file_info *fi G_GNUC_UNUSED) { int res = -ENOENT; - char *hostpath = NULL; + g_autofree char *hostpath = NULL; struct fuse_context *context = NULL; virDomainDefPtr def = NULL; @@ -256,7 +248,6 @@ static int lxcProcRead(const char *path G_GNUC_UNUSED, res = lxcProcHostRead(hostpath, buf, size, offset); } - VIR_FREE(hostpath); return res; } -- 2.20.1

This commit tries to fix lots of issues related to LXC VCPUs. One of them is related to /proc/cpuinfo content. If only 1 VCPU is set, LXC containers will show all CPUs available for host. The second one is related to CPU share, if an user set only 1 VCPU, the container/process will use all available CPUs. (This is not the case when `cpuset` attribute is declared). So, this commit adds a virtual cpuinfo based on VCPU mapping and it automatically limits the CPU usage according VCPU count. Example (now): LXC container - 8 CPUS with 2 VCPU: lxc-root# stress --cpu 8 On host machine, only CPU 0 and 1 have 100% usage. Signed-off-by: Julio Faracco <jcfaracco@gmail.com> --- src/lxc/lxc_cgroup.c | 31 +++++++++++++++ src/lxc/lxc_container.c | 39 +++++++++++------- src/lxc/lxc_fuse.c | 88 +++++++++++++++++++++++++++++++++++++++-- 3 files changed, 139 insertions(+), 19 deletions(-) diff --git a/src/lxc/lxc_cgroup.c b/src/lxc/lxc_cgroup.c index d29b65092a..912a252473 100644 --- a/src/lxc/lxc_cgroup.c +++ b/src/lxc/lxc_cgroup.c @@ -50,6 +50,34 @@ static int virLXCCgroupSetupCpuTune(virDomainDefPtr def, } +static int virLXCCgroupSetupVcpuAuto(virDomainDefPtr def, + virCgroupPtr cgroup) +{ + size_t i; + int vcpumax; + virBuffer buffer = VIR_BUFFER_INITIALIZER; + virBufferPtr cpuset = &buffer; + + vcpumax = virDomainDefGetVcpusMax(def); + for (i = 0; i < vcpumax; i++) { + virDomainVcpuDefPtr vcpu = virDomainDefGetVcpu(def, i); + /* Cgroup is smart enough to convert numbers separated + * by comma into ranges. Example: "0,1,2,5," -> "0-2,5". + * Libvirt does not need to process it here. */ + if (vcpu) + virBufferAsprintf(cpuset, "%zu,", i); + } + if (virCgroupSetCpusetCpus(cgroup, + virBufferCurrentContent(cpuset)) < 0) { + virBufferFreeAndReset(cpuset); + return -1; + } + + virBufferFreeAndReset(cpuset); + return 0; +} + + static int virLXCCgroupSetupCpusetTune(virDomainDefPtr def, virCgroupPtr cgroup, virBitmapPtr nodemask) @@ -61,6 +89,9 @@ static int virLXCCgroupSetupCpusetTune(virDomainDefPtr def, def->cpumask && virCgroupSetupCpusetCpus(cgroup, def->cpumask) < 0) { return -1; + } else { + /* auto mode for VCPU limits */ + virLXCCgroupSetupVcpuAuto(def, cgroup); } if (virDomainNumatuneGetMode(def->numa, -1, &mode) < 0 || diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index 41efe43a14..88e27f3060 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -997,8 +997,8 @@ static int lxcContainerMountBasicFS(bool userns_enabled, static int lxcContainerMountProcFuse(virDomainDefPtr def, const char *stateDir) { - int ret; - char *meminfo_path = NULL; + g_autofree char *meminfo_path = NULL; + g_autofree char *cpuinfo_path = NULL; VIR_DEBUG("Mount /proc/meminfo stateDir=%s", stateDir); @@ -1006,15 +1006,29 @@ static int lxcContainerMountProcFuse(virDomainDefPtr def, stateDir, def->name); - if ((ret = mount(meminfo_path, "/proc/meminfo", - NULL, MS_BIND, NULL)) < 0) { + if (mount(meminfo_path, "/proc/meminfo", + NULL, MS_BIND, NULL) < 0) { virReportSystemError(errno, _("Failed to mount %s on /proc/meminfo"), meminfo_path); + return -1; } - VIR_FREE(meminfo_path); - return ret; + VIR_DEBUG("Mount /proc/cpuinfo stateDir=%s", stateDir); + + cpuinfo_path = g_strdup_printf("/.oldroot/%s/%s.fuse/cpuinfo", + stateDir, + def->name); + + if (mount(cpuinfo_path, "/proc/cpuinfo", + NULL, MS_BIND, NULL) < 0) { + virReportSystemError(errno, + _("Failed to mount %s on /proc/cpuinfo"), + cpuinfo_path); + return -1; + } + + return 0; } #else static int lxcContainerMountProcFuse(virDomainDefPtr def G_GNUC_UNUSED, @@ -1027,8 +1041,7 @@ static int lxcContainerMountProcFuse(virDomainDefPtr def G_GNUC_UNUSED, static int lxcContainerMountFSDev(virDomainDefPtr def, const char *stateDir) { - int ret = -1; - char *path = NULL; + g_autofree char *path = NULL; int flags = def->idmap.nuidmap ? MS_BIND : MS_MOVE; VIR_DEBUG("Mount /dev/ stateDir=%s", stateDir); @@ -1038,7 +1051,7 @@ static int lxcContainerMountFSDev(virDomainDefPtr def, if (virFileMakePath("/dev") < 0) { virReportSystemError(errno, "%s", _("Cannot create /dev")); - goto cleanup; + return -1; } VIR_DEBUG("Trying to %s %s to /dev", def->idmap.nuidmap ? @@ -1048,14 +1061,10 @@ static int lxcContainerMountFSDev(virDomainDefPtr def, virReportSystemError(errno, _("Failed to mount %s on /dev"), path); - goto cleanup; + return -1; } - ret = 0; - - cleanup: - VIR_FREE(path); - return ret; + return 0; } static int lxcContainerMountFSDevPTS(virDomainDefPtr def, diff --git a/src/lxc/lxc_fuse.c b/src/lxc/lxc_fuse.c index 8cfccdd7e0..402382b4d8 100644 --- a/src/lxc/lxc_fuse.c +++ b/src/lxc/lxc_fuse.c @@ -37,22 +37,23 @@ #if WITH_FUSE static const char *fuse_meminfo_path = "/meminfo"; +static const char *fuse_cpuinfo_path = "/cpuinfo"; static int lxcProcGetattr(const char *path, struct stat *stbuf) { - g_autofree char *mempath = NULL; + g_autofree char *procpath = NULL; struct stat sb; struct fuse_context *context = fuse_get_context(); virDomainDefPtr def = (virDomainDefPtr)context->private_data; memset(stbuf, 0, sizeof(struct stat)); - mempath = g_strdup_printf("/proc/%s", path); + procpath = g_strdup_printf("/proc/%s", path); if (STREQ(path, "/")) { stbuf->st_mode = S_IFDIR | 0755; stbuf->st_nlink = 2; } else if (STREQ(path, fuse_meminfo_path)) { - if (stat(mempath, &sb) < 0) + if (stat(procpath, &sb) < 0) return -errno; stbuf->st_uid = def->idmap.uidmap ? def->idmap.uidmap[0].target : 0; @@ -83,6 +84,7 @@ static int lxcProcReaddir(const char *path, void *buf, filler(buf, ".", NULL, 0); filler(buf, "..", NULL, 0); filler(buf, fuse_meminfo_path + 1, NULL, 0); + filler(buf, fuse_cpuinfo_path + 1, NULL, 0); return 0; } @@ -90,7 +92,8 @@ static int lxcProcReaddir(const char *path, void *buf, static int lxcProcOpen(const char *path G_GNUC_UNUSED, struct fuse_file_info *fi G_GNUC_UNUSED) { - if (STRNEQ(path, fuse_meminfo_path)) + if (STRNEQ(path, fuse_meminfo_path) && + STRNEQ(path, fuse_cpuinfo_path)) return -ENOENT; if ((fi->flags & 3) != O_RDONLY) @@ -227,6 +230,80 @@ static int lxcProcReadMeminfo(char *hostpath, virDomainDefPtr def, return res; } + +static int +lxcProcReadCpuinfoParse(virDomainDefPtr def, char *base, + virBufferPtr new_cpuinfo) +{ + char *procline = NULL; + char *saveptr = base; + size_t cpu; + size_t nvcpu; + size_t curcpu = 0; + bool get_proc = false; + + nvcpu = virDomainDefGetVcpus(def); + while ((procline = strtok_r(NULL, "\n", &saveptr))) { + if (sscanf(procline, "processor\t: %zu", &cpu) == 1) { + virDomainVcpuDefPtr vcpu = virDomainDefGetVcpu(def, cpu); + /* VCPU is mapped */ + if (vcpu) { + if (curcpu == nvcpu) + break; + + if (curcpu > 0) + virBufferAddLit(new_cpuinfo, "\n"); + + virBufferAsprintf(new_cpuinfo, "processor\t: %zu\n", + curcpu); + curcpu++; + get_proc = true; + } else { + get_proc = false; + } + } else { + /* It is not a processor index */ + if (get_proc) + virBufferAsprintf(new_cpuinfo, "%s\n", procline); + } + } + + virBufferAddLit(new_cpuinfo, "\n"); + + return strlen(virBufferCurrentContent(new_cpuinfo)); +} + + +static int lxcProcReadCpuinfo(char *hostpath, virDomainDefPtr def, + char *buf, size_t size, off_t offset) +{ + virBuffer buffer = VIR_BUFFER_INITIALIZER; + virBufferPtr new_cpuinfo = &buffer; + g_autofree char *outbuf = NULL; + int res = -1; + + /* Gather info from /proc/cpuinfo */ + if (virFileReadAll(hostpath, 1024*1024, &outbuf) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("Failed to open %s"), hostpath); + return -1; + } + + /* /proc/cpuinfo does not support fseek */ + if (offset > 0) + return 0; + + res = lxcProcReadCpuinfoParse(def, outbuf, new_cpuinfo); + + if (res > size) + res = size; + memcpy(buf, virBufferCurrentContent(new_cpuinfo), res); + + virBufferFreeAndReset(new_cpuinfo); + return res; +} + + static int lxcProcRead(const char *path G_GNUC_UNUSED, char *buf G_GNUC_UNUSED, size_t size G_GNUC_UNUSED, @@ -246,6 +323,9 @@ static int lxcProcRead(const char *path G_GNUC_UNUSED, if (STREQ(path, fuse_meminfo_path)) { if ((res = lxcProcReadMeminfo(hostpath, def, buf, size, offset)) < 0) res = lxcProcHostRead(hostpath, buf, size, offset); + } else if (STREQ(path, fuse_cpuinfo_path)) { + if ((res = lxcProcReadCpuinfo(hostpath, def, buf, size, offset)) < 0) + res = lxcProcHostRead(hostpath, buf, size, offset); } return res; -- 2.20.1

Native config files sometimes can setup cpuset.cpus to pin some CPUs. Before this, LXC was using a fixed number of 1 VCPU. After this commit, XML definition will generate a dynamic number of VCPUs based on that cgroup attribute. Signed-off-by: Julio Faracco <jcfaracco@gmail.com> --- src/lxc/lxc_container.c | 23 ++++++++++++++++++ src/lxc/lxc_container.h | 2 ++ src/lxc/lxc_native.c | 24 +++++++++++++++++-- .../lxcconf2xml-cpusettune.xml | 2 +- 4 files changed, 48 insertions(+), 3 deletions(-) diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index 88e27f3060..c5788e5c32 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -2487,3 +2487,26 @@ int lxcContainerChown(virDomainDefPtr def, const char *path) return 0; } + + +int lxcContainerGetMaxCpusInCpuset(const char *cpuset) +{ + const char *c = cpuset; + int max_cpu = 0; + + while (c) { + int a, b, ret; + + ret = sscanf(c, "%d-%d", &a, &b); + if (ret == 1) + max_cpu++; + else if (ret == 2) + max_cpu += a > b ? a - b + 1 : b - a + 1; + + if (!(c = strchr(c+1, ','))) + break; + c++; + } + + return max_cpu; +} diff --git a/src/lxc/lxc_container.h b/src/lxc/lxc_container.h index 94a6c5309c..6f112e0667 100644 --- a/src/lxc/lxc_container.h +++ b/src/lxc/lxc_container.h @@ -63,3 +63,5 @@ virArch lxcContainerGetAlt32bitArch(virArch arch); int lxcContainerChown(virDomainDefPtr def, const char *path); bool lxcIsBasicMountLocation(const char *path); + +int lxcContainerGetMaxCpusInCpuset(const char *cpuset); diff --git a/src/lxc/lxc_native.c b/src/lxc/lxc_native.c index 02d2bf33e4..409bf00bd2 100644 --- a/src/lxc/lxc_native.c +++ b/src/lxc/lxc_native.c @@ -993,6 +993,24 @@ lxcSetCpusetTune(virDomainDefPtr def, virConfPtr properties) return 0; } + +static int +lxcGetVCpuMax(virConfPtr properties) +{ + g_autofree char *value = NULL; + int vcpumax = 1; + + if (virConfGetValueString(properties, "lxc.cgroup.cpuset.cpus", + &value) > 0) { + vcpumax = lxcContainerGetMaxCpusInCpuset(value); + if (vcpumax > 0) + return vcpumax; + } + + return vcpumax; +} + + static int lxcBlkioDeviceWalkCallback(const char *name, virConfValuePtr value, void *data) { @@ -1132,6 +1150,7 @@ lxcParseConfigString(const char *config, virDomainDefPtr vmdef = NULL; g_autoptr(virConf) properties = NULL; g_autofree char *value = NULL; + int vcpumax; if (!(properties = virConfReadString(config, VIR_CONF_FLAG_LXC_FORMAT))) return NULL; @@ -1155,10 +1174,11 @@ lxcParseConfigString(const char *config, /* Value not handled by the LXC driver, setting to * minimum required to make XML parsing pass */ - if (virDomainDefSetVcpusMax(vmdef, 1, xmlopt) < 0) + vcpumax = lxcGetVCpuMax(properties); + if (virDomainDefSetVcpusMax(vmdef, vcpumax, xmlopt) < 0) goto error; - if (virDomainDefSetVcpus(vmdef, 1) < 0) + if (virDomainDefSetVcpus(vmdef, vcpumax) < 0) goto error; vmdef->nfss = 0; diff --git a/tests/lxcconf2xmldata/lxcconf2xml-cpusettune.xml b/tests/lxcconf2xmldata/lxcconf2xml-cpusettune.xml index 6df089d00f..a1fec12d9b 100644 --- a/tests/lxcconf2xmldata/lxcconf2xml-cpusettune.xml +++ b/tests/lxcconf2xmldata/lxcconf2xml-cpusettune.xml @@ -3,7 +3,7 @@ <uuid>c7a5fdbd-edaf-9455-926a-d65c16db1809</uuid> <memory unit='KiB'>65536</memory> <currentMemory unit='KiB'>65536</currentMemory> - <vcpu placement='static' cpuset='1-2,5-7'>1</vcpu> + <vcpu placement='static' cpuset='1-2,5-7'>5</vcpu> <numatune> <memory mode='strict' nodeset='1-4'/> </numatune> -- 2.20.1
participants (1)
-
Julio Faracco