[PATCH v4 0/5] lxc: Add VCPU features for LXC

This series cover a lots of functionalities to LXC VCPUs. It enables sharing some timer devices between host and LXC guest using `timer` settings. It still has other improvements related to VCPU and LXC such as virtual cpuinfo content based on VCPU settings and some better resource limits. Each patch has the description of the problem and what it is trying to fix. v1-v2: Add Daniel's comments and some cleanups. v2-v3: Remove dependency from patch 4 and 5. v3-v4: Missing cpuinfo file from Fuse Getattr handler. Julio Faracco (5): lxc: Add Real Time Clock device into allowed devices lxc: Add HPET device into allowed devices lxc: Replacing default strings definitions by g_autofree statement lxc: Implement virtual /proc/cpuinfo via LXC fuse lxc: Count max VCPUs based on cpuset.cpus in native config docs/formatdomain.html.in | 4 +- src/lxc/lxc_cgroup.c | 91 ++++++++- src/lxc/lxc_container.c | 62 ++++-- src/lxc/lxc_container.h | 2 + src/lxc/lxc_controller.c | 187 ++++++++++++------ src/lxc/lxc_fuse.c | 114 +++++++++-- src/lxc/lxc_native.c | 24 ++- .../lxcconf2xml-cpusettune.xml | 2 +- 8 files changed, 374 insertions(+), 112 deletions(-) -- 2.20.1

This commit share host Real Time Clock device (rtc) into LXC containers to support hardware clock. This should be available setting up a `rtc` timer under clock section. Since this option is not emulated, it should be available only for `localtime` clock. This option should be readonly due to security reasons. Before: root# hwclock --verbose hwclock from util-linux 2.32.1 System Time: 1581877557.598365 Trying to open: /dev/rtc0 Trying to open: /dev/rtc Trying to open: /dev/misc/rtc No usable clock interface found. hwclock: Cannot access the Hardware Clock via any known method. Now: root# hwclock 2020-02-16 18:23:55.374134+00:00 root# hwclock -w hwclock: ioctl(RTC_SET_TIME) to /dev/rtc to set the time failed: Permission denied Signed-off-by: Julio Faracco <jcfaracco@gmail.com> --- docs/formatdomain.html.in | 2 +- src/lxc/lxc_cgroup.c | 36 +++++++++++++++++++++ src/lxc/lxc_controller.c | 68 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 105 insertions(+), 1 deletion(-) diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in index 4fef2a0a97..5598bf41b4 100644 --- a/docs/formatdomain.html.in +++ b/docs/formatdomain.html.in @@ -2465,7 +2465,7 @@ being modified, and can be one of "platform" (currently unsupported), "hpet" (libxl, xen, qemu), "kvmclock" (qemu), - "pit" (qemu), "rtc" (qemu), "tsc" (libxl, qemu - + "pit" (qemu), "rtc" (qemu, lxc), "tsc" (libxl, qemu - <span class="since">since 3.2.0</span>), "hypervclock" (qemu - <span class="since">since 1.2.2</span>) or "armvtimer" (qemu - <span class="since">since 6.1.0</span>). diff --git a/src/lxc/lxc_cgroup.c b/src/lxc/lxc_cgroup.c index 4ebe5ef467..6a103055a4 100644 --- a/src/lxc/lxc_cgroup.c +++ b/src/lxc/lxc_cgroup.c @@ -337,6 +337,42 @@ static int virLXCCgroupSetupDeviceACL(virDomainDefPtr def, VIR_CGROUP_DEVICE_RWM) < 0) return -1; + VIR_DEBUG("Allowing timers char devices"); + + /* Sync'ed with Host clock */ + if (def->clock.offset == VIR_DOMAIN_CLOCK_OFFSET_LOCALTIME) { + for (i = 0; i < def->clock.ntimers; i++) { + virDomainTimerDefPtr timer = def->clock.timers[i]; + + switch ((virDomainTimerNameType)timer->name) { + case VIR_DOMAIN_TIMER_NAME_PLATFORM: + case VIR_DOMAIN_TIMER_NAME_TSC: + case VIR_DOMAIN_TIMER_NAME_KVMCLOCK: + case VIR_DOMAIN_TIMER_NAME_HYPERVCLOCK: + case VIR_DOMAIN_TIMER_NAME_PIT: + case VIR_DOMAIN_TIMER_NAME_HPET: + case VIR_DOMAIN_TIMER_NAME_ARMVTIMER: + case VIR_DOMAIN_TIMER_NAME_LAST: + break; + case VIR_DOMAIN_TIMER_NAME_RTC: + if (!timer->present) + break; + + if (virFileExists("/dev/rtc")) { + if (virCgroupAllowDevicePath(cgroup, "/dev/rtc", + VIR_CGROUP_DEVICE_READ, + false) < 0) + return -1; + } else { + VIR_DEBUG("Ignoring non-existent device /dev/rtc"); + } + break; + } + } + } else { + VIR_DEBUG("Ignoring non-localtime clock"); + } + VIR_DEBUG("Device whitelist complete"); return 0; diff --git a/src/lxc/lxc_controller.c b/src/lxc/lxc_controller.c index c3dec0859c..eba6bfe0bf 100644 --- a/src/lxc/lxc_controller.c +++ b/src/lxc/lxc_controller.c @@ -1550,6 +1550,71 @@ static int virLXCControllerPopulateDevices(virLXCControllerPtr ctrl) } +static int +virLXCControllerSetupTimers(virLXCControllerPtr ctrl) +{ + g_autofree char *path = NULL; + size_t i; + struct stat sb; + virDomainDefPtr def = ctrl->def; + + /* Not sync'ed with Host clock */ + if (def->clock.offset != VIR_DOMAIN_CLOCK_OFFSET_LOCALTIME) + return 0; + + for (i = 0; i < def->clock.ntimers; i++) { + dev_t dev; + virDomainTimerDefPtr timer = def->clock.timers[i]; + + switch ((virDomainTimerNameType)timer->name) { + case VIR_DOMAIN_TIMER_NAME_PLATFORM: + case VIR_DOMAIN_TIMER_NAME_TSC: + case VIR_DOMAIN_TIMER_NAME_KVMCLOCK: + case VIR_DOMAIN_TIMER_NAME_HYPERVCLOCK: + case VIR_DOMAIN_TIMER_NAME_PIT: + case VIR_DOMAIN_TIMER_NAME_HPET: + case VIR_DOMAIN_TIMER_NAME_ARMVTIMER: + case VIR_DOMAIN_TIMER_NAME_LAST: + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, + _("unsupported timer type (name) '%s'"), + virDomainTimerNameTypeToString(timer->name)); + return -1; + case VIR_DOMAIN_TIMER_NAME_RTC: + if (!timer->present) + break; + + if (stat("/dev/rtc", &sb) < 0) { + if (errno == EACCES) + return -1; + + virReportSystemError(errno, + _("Path '%s' is not accessible"), + path); + return -1; + } + + path = g_strdup_printf("/%s/%s.dev/%s", LXC_STATE_DIR, + ctrl->def->name, "/rtc"); + + dev = makedev(major(sb.st_rdev), minor(sb.st_rdev)); + if (mknod(path, S_IFCHR, dev) < 0 || + chmod(path, sb.st_mode)) { + virReportSystemError(errno, + _("Failed to make device %s"), + path); + return -1; + } + + if (lxcContainerChown(ctrl->def, path) < 0) + return -1; + break; + } + } + + return 0; +} + + static int virLXCControllerSetupHostdevSubsysUSB(virDomainDefPtr vmDef, virDomainHostdevDefPtr def, @@ -2352,6 +2417,9 @@ virLXCControllerRun(virLXCControllerPtr ctrl) if (virLXCControllerPopulateDevices(ctrl) < 0) goto cleanup; + if (virLXCControllerSetupTimers(ctrl) < 0) + goto cleanup; + if (virLXCControllerSetupAllDisks(ctrl) < 0) goto cleanup; -- 2.20.1

On Mon, Feb 24, 2020 at 11:24:24AM -0300, Julio Faracco wrote:
This commit share host Real Time Clock device (rtc) into LXC containers to support hardware clock. This should be available setting up a `rtc` timer under clock section. Since this option is not emulated, it should be available only for `localtime` clock. This option should be readonly due to security reasons.
Before: root# hwclock --verbose hwclock from util-linux 2.32.1 System Time: 1581877557.598365 Trying to open: /dev/rtc0 Trying to open: /dev/rtc Trying to open: /dev/misc/rtc No usable clock interface found. hwclock: Cannot access the Hardware Clock via any known method.
Now: root# hwclock 2020-02-16 18:23:55.374134+00:00 root# hwclock -w hwclock: ioctl(RTC_SET_TIME) to /dev/rtc to set the time failed: Permission denied
Signed-off-by: Julio Faracco <jcfaracco@gmail.com> --- docs/formatdomain.html.in | 2 +- src/lxc/lxc_cgroup.c | 36 +++++++++++++++++++++ src/lxc/lxc_controller.c | 68 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 105 insertions(+), 1 deletion(-)
diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in index 4fef2a0a97..5598bf41b4 100644 --- a/docs/formatdomain.html.in +++ b/docs/formatdomain.html.in @@ -2465,7 +2465,7 @@ being modified, and can be one of "platform" (currently unsupported), "hpet" (libxl, xen, qemu), "kvmclock" (qemu), - "pit" (qemu), "rtc" (qemu), "tsc" (libxl, qemu - + "pit" (qemu), "rtc" (qemu, lxc), "tsc" (libxl, qemu - <span class="since">since 3.2.0</span>), "hypervclock" (qemu - <span class="since">since 1.2.2</span>) or "armvtimer" (qemu - <span class="since">since 6.1.0</span>). diff --git a/src/lxc/lxc_cgroup.c b/src/lxc/lxc_cgroup.c index 4ebe5ef467..6a103055a4 100644 --- a/src/lxc/lxc_cgroup.c +++ b/src/lxc/lxc_cgroup.c @@ -337,6 +337,42 @@ static int virLXCCgroupSetupDeviceACL(virDomainDefPtr def, VIR_CGROUP_DEVICE_RWM) < 0) return -1;
+ VIR_DEBUG("Allowing timers char devices"); + + /* Sync'ed with Host clock */ + if (def->clock.offset == VIR_DOMAIN_CLOCK_OFFSET_LOCALTIME) { + for (i = 0; i < def->clock.ntimers; i++) { + virDomainTimerDefPtr timer = def->clock.timers[i]; + + switch ((virDomainTimerNameType)timer->name) { + case VIR_DOMAIN_TIMER_NAME_PLATFORM: + case VIR_DOMAIN_TIMER_NAME_TSC: + case VIR_DOMAIN_TIMER_NAME_KVMCLOCK: + case VIR_DOMAIN_TIMER_NAME_HYPERVCLOCK: + case VIR_DOMAIN_TIMER_NAME_PIT: + case VIR_DOMAIN_TIMER_NAME_HPET: + case VIR_DOMAIN_TIMER_NAME_ARMVTIMER: + case VIR_DOMAIN_TIMER_NAME_LAST: + break; + case VIR_DOMAIN_TIMER_NAME_RTC: + if (!timer->present) + break;
This field is not a boolean, it is a tri-state. * -1 == the hypervisor default - historically with LXC this means no /dev/rtc device * 0 == disable it (matches the default for LXC historically) * 1 == enable it So this check needs to be "if (timer->present != 1) "
+ + if (virFileExists("/dev/rtc")) { + if (virCgroupAllowDevicePath(cgroup, "/dev/rtc", + VIR_CGROUP_DEVICE_READ, + false) < 0) + return -1; + } else { + VIR_DEBUG("Ignoring non-existent device /dev/rtc");
We should be raising an error if the user requested RTC and it doesn't exist.
+ } + break; + } + } + } else { + VIR_DEBUG("Ignoring non-localtime clock"); + }
The setup for <timer> elements should be independant of whether or not localtime is set. Ideally we would raise an unsupported config error for containers which don't have localtime set, but there's way too large a risk of causing regressions if we do this. I'd suggest we just don't check the clock offset at all, or at least do the check independantly of the timer setup.
diff --git a/src/lxc/lxc_controller.c b/src/lxc/lxc_controller.c index c3dec0859c..eba6bfe0bf 100644 --- a/src/lxc/lxc_controller.c +++ b/src/lxc/lxc_controller.c @@ -1550,6 +1550,71 @@ static int virLXCControllerPopulateDevices(virLXCControllerPtr ctrl) }
+static int +virLXCControllerSetupTimers(virLXCControllerPtr ctrl) +{ + g_autofree char *path = NULL; + size_t i; + struct stat sb; + virDomainDefPtr def = ctrl->def; + + /* Not sync'ed with Host clock */ + if (def->clock.offset != VIR_DOMAIN_CLOCK_OFFSET_LOCALTIME) + return 0; + + for (i = 0; i < def->clock.ntimers; i++) { + dev_t dev; + virDomainTimerDefPtr timer = def->clock.timers[i]; + + switch ((virDomainTimerNameType)timer->name) { + case VIR_DOMAIN_TIMER_NAME_PLATFORM: + case VIR_DOMAIN_TIMER_NAME_TSC: + case VIR_DOMAIN_TIMER_NAME_KVMCLOCK: + case VIR_DOMAIN_TIMER_NAME_HYPERVCLOCK: + case VIR_DOMAIN_TIMER_NAME_PIT: + case VIR_DOMAIN_TIMER_NAME_HPET: + case VIR_DOMAIN_TIMER_NAME_ARMVTIMER: + case VIR_DOMAIN_TIMER_NAME_LAST: + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, + _("unsupported timer type (name) '%s'"), + virDomainTimerNameTypeToString(timer->name)); + return -1; + case VIR_DOMAIN_TIMER_NAME_RTC: + if (!timer->present) + break;
Same comments as previously.
+ + if (stat("/dev/rtc", &sb) < 0) { + if (errno == EACCES) + return -1;
Why this special case ? You're returning an error here without setting any error message which is certainly not right.
+ + virReportSystemError(errno, + _("Path '%s' is not accessible"), + path); + return -1; + }
No error reporting if /dev/rtc is missing.
+ + path = g_strdup_printf("/%s/%s.dev/%s", LXC_STATE_DIR, + ctrl->def->name, "/rtc"); + + dev = makedev(major(sb.st_rdev), minor(sb.st_rdev)); + if (mknod(path, S_IFCHR, dev) < 0 || + chmod(path, sb.st_mode)) { + virReportSystemError(errno, + _("Failed to make device %s"), + path); + return -1; + } + + if (lxcContainerChown(ctrl->def, path) < 0) + return -1; + break; + } + } + + return 0; +}
Regards, Daniel -- |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o- https://fstop138.berrange.com :| |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|

This commit is related to RTC timer device too. HPET is being shared from host device through `localtime` clock. This timer is available creating a new timer using `hpet` name. Signed-off-by: Julio Faracco <jcfaracco@gmail.com> --- docs/formatdomain.html.in | 2 +- src/lxc/lxc_cgroup.c | 17 +++++++++++++---- src/lxc/lxc_controller.c | 33 +++++++++++++++++++++++++++++---- 3 files changed, 43 insertions(+), 9 deletions(-) diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in index 5598bf41b4..8571db89dc 100644 --- a/docs/formatdomain.html.in +++ b/docs/formatdomain.html.in @@ -2464,7 +2464,7 @@ The <code>name</code> attribute selects which timer is being modified, and can be one of "platform" (currently unsupported), - "hpet" (libxl, xen, qemu), "kvmclock" (qemu), + "hpet" (libxl, xen, qemu, lxc), "kvmclock" (qemu), "pit" (qemu), "rtc" (qemu, lxc), "tsc" (libxl, qemu - <span class="since">since 3.2.0</span>), "hypervclock" (qemu - <span class="since">since 1.2.2</span>) or diff --git a/src/lxc/lxc_cgroup.c b/src/lxc/lxc_cgroup.c index 6a103055a4..997a5c3dfa 100644 --- a/src/lxc/lxc_cgroup.c +++ b/src/lxc/lxc_cgroup.c @@ -344,20 +344,19 @@ static int virLXCCgroupSetupDeviceACL(virDomainDefPtr def, for (i = 0; i < def->clock.ntimers; i++) { virDomainTimerDefPtr timer = def->clock.timers[i]; + if (!timer->present) + break; + switch ((virDomainTimerNameType)timer->name) { case VIR_DOMAIN_TIMER_NAME_PLATFORM: case VIR_DOMAIN_TIMER_NAME_TSC: case VIR_DOMAIN_TIMER_NAME_KVMCLOCK: case VIR_DOMAIN_TIMER_NAME_HYPERVCLOCK: case VIR_DOMAIN_TIMER_NAME_PIT: - case VIR_DOMAIN_TIMER_NAME_HPET: case VIR_DOMAIN_TIMER_NAME_ARMVTIMER: case VIR_DOMAIN_TIMER_NAME_LAST: break; case VIR_DOMAIN_TIMER_NAME_RTC: - if (!timer->present) - break; - if (virFileExists("/dev/rtc")) { if (virCgroupAllowDevicePath(cgroup, "/dev/rtc", VIR_CGROUP_DEVICE_READ, @@ -367,6 +366,16 @@ static int virLXCCgroupSetupDeviceACL(virDomainDefPtr def, VIR_DEBUG("Ignoring non-existent device /dev/rtc"); } break; + case VIR_DOMAIN_TIMER_NAME_HPET: + if (virFileExists("/dev/hpet")) { + if (virCgroupAllowDevicePath(cgroup, "/dev/hpet", + VIR_CGROUP_DEVICE_READ, + false) < 0) + return -1; + } else { + VIR_DEBUG("Ignoring non-existent device /dev/hpet"); + } + break; } } } else { diff --git a/src/lxc/lxc_controller.c b/src/lxc/lxc_controller.c index eba6bfe0bf..518967ee83 100644 --- a/src/lxc/lxc_controller.c +++ b/src/lxc/lxc_controller.c @@ -1566,13 +1566,15 @@ virLXCControllerSetupTimers(virLXCControllerPtr ctrl) dev_t dev; virDomainTimerDefPtr timer = def->clock.timers[i]; + if (!timer->present) + continue; + switch ((virDomainTimerNameType)timer->name) { case VIR_DOMAIN_TIMER_NAME_PLATFORM: case VIR_DOMAIN_TIMER_NAME_TSC: case VIR_DOMAIN_TIMER_NAME_KVMCLOCK: case VIR_DOMAIN_TIMER_NAME_HYPERVCLOCK: case VIR_DOMAIN_TIMER_NAME_PIT: - case VIR_DOMAIN_TIMER_NAME_HPET: case VIR_DOMAIN_TIMER_NAME_ARMVTIMER: case VIR_DOMAIN_TIMER_NAME_LAST: virReportError(VIR_ERR_CONFIG_UNSUPPORTED, @@ -1580,9 +1582,6 @@ virLXCControllerSetupTimers(virLXCControllerPtr ctrl) virDomainTimerNameTypeToString(timer->name)); return -1; case VIR_DOMAIN_TIMER_NAME_RTC: - if (!timer->present) - break; - if (stat("/dev/rtc", &sb) < 0) { if (errno == EACCES) return -1; @@ -1605,6 +1604,32 @@ virLXCControllerSetupTimers(virLXCControllerPtr ctrl) return -1; } + if (lxcContainerChown(ctrl->def, path) < 0) + return -1; + break; + case VIR_DOMAIN_TIMER_NAME_HPET: + if (stat("/dev/hpet", &sb) < 0) { + if (errno == EACCES) + return -1; + + virReportSystemError(errno, + _("Path '%s' is not accessible"), + path); + return -1; + } + + path = g_strdup_printf("/%s/%s.dev/%s", LXC_STATE_DIR, + ctrl->def->name, "/hpet"); + + dev = makedev(major(sb.st_rdev), minor(sb.st_rdev)); + if (mknod(path, S_IFCHR, dev) < 0 || + chmod(path, sb.st_mode)) { + virReportSystemError(errno, + _("Failed to make device %s"), + path); + return -1; + } + if (lxcContainerChown(ctrl->def, path) < 0) return -1; break; -- 2.20.1

On Mon, Feb 24, 2020 at 11:24:25AM -0300, Julio Faracco wrote:
This commit is related to RTC timer device too. HPET is being shared from host device through `localtime` clock. This timer is available creating a new timer using `hpet` name.
Signed-off-by: Julio Faracco <jcfaracco@gmail.com> --- docs/formatdomain.html.in | 2 +- src/lxc/lxc_cgroup.c | 17 +++++++++++++---- src/lxc/lxc_controller.c | 33 +++++++++++++++++++++++++++++---- 3 files changed, 43 insertions(+), 9 deletions(-)
diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in index 5598bf41b4..8571db89dc 100644 --- a/docs/formatdomain.html.in +++ b/docs/formatdomain.html.in @@ -2464,7 +2464,7 @@ The <code>name</code> attribute selects which timer is being modified, and can be one of "platform" (currently unsupported), - "hpet" (libxl, xen, qemu), "kvmclock" (qemu), + "hpet" (libxl, xen, qemu, lxc), "kvmclock" (qemu), "pit" (qemu), "rtc" (qemu, lxc), "tsc" (libxl, qemu - <span class="since">since 3.2.0</span>), "hypervclock" (qemu - <span class="since">since 1.2.2</span>) or diff --git a/src/lxc/lxc_cgroup.c b/src/lxc/lxc_cgroup.c index 6a103055a4..997a5c3dfa 100644 --- a/src/lxc/lxc_cgroup.c +++ b/src/lxc/lxc_cgroup.c @@ -344,20 +344,19 @@ static int virLXCCgroupSetupDeviceACL(virDomainDefPtr def, for (i = 0; i < def->clock.ntimers; i++) { virDomainTimerDefPtr timer = def->clock.timers[i];
+ if (!timer->present) + break; + switch ((virDomainTimerNameType)timer->name) { case VIR_DOMAIN_TIMER_NAME_PLATFORM: case VIR_DOMAIN_TIMER_NAME_TSC: case VIR_DOMAIN_TIMER_NAME_KVMCLOCK: case VIR_DOMAIN_TIMER_NAME_HYPERVCLOCK: case VIR_DOMAIN_TIMER_NAME_PIT: - case VIR_DOMAIN_TIMER_NAME_HPET: case VIR_DOMAIN_TIMER_NAME_ARMVTIMER: case VIR_DOMAIN_TIMER_NAME_LAST: break; case VIR_DOMAIN_TIMER_NAME_RTC: - if (!timer->present) - break;
Instead of moving the check here, just put it in the right place immediately in the previous patch. Note the comment about this not being a boolean, but rather a tri-state.
- if (virFileExists("/dev/rtc")) { if (virCgroupAllowDevicePath(cgroup, "/dev/rtc", VIR_CGROUP_DEVICE_READ, @@ -367,6 +366,16 @@ static int virLXCCgroupSetupDeviceACL(virDomainDefPtr def, VIR_DEBUG("Ignoring non-existent device /dev/rtc"); } break; + case VIR_DOMAIN_TIMER_NAME_HPET: + if (virFileExists("/dev/hpet")) { + if (virCgroupAllowDevicePath(cgroup, "/dev/hpet", + VIR_CGROUP_DEVICE_READ, + false) < 0) + return -1; + } else { + VIR_DEBUG("Ignoring non-existent device /dev/hpet"); + }
Same comment about needing to report an error here.
+ case VIR_DOMAIN_TIMER_NAME_HPET: + if (stat("/dev/hpet", &sb) < 0) { + if (errno == EACCES) + return -1;
Same strange special case missing error message reporting.
+ + virReportSystemError(errno, + _("Path '%s' is not accessible"), + path); + return -1; + }
Regards, Daniel -- |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o- https://fstop138.berrange.com :| |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|

There are a lots of strings being handled inside some LXC functions. They can be moved to g_autofree to avoid declaring a return value to get proper code cleanups. This commit is changing functions from lxc_{controller,cgroup,fuse} only. Signed-off-by: Julio Faracco <jcfaracco@gmail.com> --- src/lxc/lxc_cgroup.c | 15 +++---- src/lxc/lxc_controller.c | 96 ++++++++++++++-------------------------- src/lxc/lxc_fuse.c | 23 +++------- 3 files changed, 44 insertions(+), 90 deletions(-) diff --git a/src/lxc/lxc_cgroup.c b/src/lxc/lxc_cgroup.c index 997a5c3dfa..d29b65092a 100644 --- a/src/lxc/lxc_cgroup.c +++ b/src/lxc/lxc_cgroup.c @@ -54,8 +54,7 @@ static int virLXCCgroupSetupCpusetTune(virDomainDefPtr def, virCgroupPtr cgroup, virBitmapPtr nodemask) { - int ret = -1; - char *mask = NULL; + g_autofree char *mask = NULL; virDomainNumatuneMemMode mode; if (def->placement_mode != VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO && @@ -66,21 +65,17 @@ static int virLXCCgroupSetupCpusetTune(virDomainDefPtr def, if (virDomainNumatuneGetMode(def->numa, -1, &mode) < 0 || mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) { - ret = 0; - goto cleanup; + return 0; } if (virDomainNumatuneMaybeFormatNodeset(def->numa, nodemask, &mask, -1) < 0) - goto cleanup; + return -1; if (mask && virCgroupSetCpusetMems(cgroup, mask) < 0) - goto cleanup; + return -1; - ret = 0; - cleanup: - VIR_FREE(mask); - return ret; + return 0; } diff --git a/src/lxc/lxc_controller.c b/src/lxc/lxc_controller.c index 518967ee83..c580b17f5f 100644 --- a/src/lxc/lxc_controller.c +++ b/src/lxc/lxc_controller.c @@ -802,8 +802,7 @@ static int virLXCControllerGetNumadAdvice(virLXCControllerPtr ctrl, virBitmapPtr *mask) { virBitmapPtr nodemask = NULL; - char *nodeset = NULL; - int ret = -1; + g_autofree char *nodeset = NULL; /* Get the advisory nodeset from numad if 'placement' of * either <vcpu> or <numatune> is 'auto'. @@ -812,20 +811,17 @@ static int virLXCControllerGetNumadAdvice(virLXCControllerPtr ctrl, nodeset = virNumaGetAutoPlacementAdvice(virDomainDefGetVcpus(ctrl->def), ctrl->def->mem.cur_balloon); if (!nodeset) - goto cleanup; + return -1; VIR_DEBUG("Nodeset returned from numad: %s", nodeset); if (virBitmapParse(nodeset, &nodemask, VIR_DOMAIN_CPUMASK_LEN) < 0) - goto cleanup; + return -1; } - ret = 0; *mask = nodemask; - cleanup: - VIR_FREE(nodeset); - return ret; + return 0; } @@ -1434,9 +1430,8 @@ virLXCControllerSetupUsernsMap(virDomainIdMapEntryPtr map, */ static int virLXCControllerSetupUserns(virLXCControllerPtr ctrl) { - char *uid_map = NULL; - char *gid_map = NULL; - int ret = -1; + g_autofree char *uid_map = NULL; + g_autofree char *gid_map = NULL; /* User namespace is disabled for container */ if (ctrl->def->idmap.nuidmap == 0) { @@ -1450,28 +1445,23 @@ static int virLXCControllerSetupUserns(virLXCControllerPtr ctrl) if (virLXCControllerSetupUsernsMap(ctrl->def->idmap.uidmap, ctrl->def->idmap.nuidmap, uid_map) < 0) - goto cleanup; + return -1; gid_map = g_strdup_printf("/proc/%d/gid_map", ctrl->initpid); if (virLXCControllerSetupUsernsMap(ctrl->def->idmap.gidmap, ctrl->def->idmap.ngidmap, gid_map) < 0) - goto cleanup; + return -1; - ret = 0; - cleanup: - VIR_FREE(uid_map); - VIR_FREE(gid_map); - return ret; + return 0; } static int virLXCControllerSetupDev(virLXCControllerPtr ctrl) { - char *mount_options = NULL; - char *opts = NULL; - char *dev = NULL; - int ret = -1; + g_autofree char *mount_options = NULL; + g_autofree char *opts = NULL; + g_autofree char *dev = NULL; VIR_DEBUG("Setting up /dev/ for container"); @@ -1488,24 +1478,18 @@ static int virLXCControllerSetupDev(virLXCControllerPtr ctrl) opts = g_strdup_printf("mode=755,size=65536%s", mount_options); if (virFileSetupDev(dev, opts) < 0) - goto cleanup; + return -1; if (lxcContainerChown(ctrl->def, dev) < 0) - goto cleanup; + return -1; - ret = 0; - cleanup: - VIR_FREE(opts); - VIR_FREE(mount_options); - VIR_FREE(dev); - return ret; + return 0; } static int virLXCControllerPopulateDevices(virLXCControllerPtr ctrl) { size_t i; - int ret = -1; - char *path = NULL; + g_autofree char *path = NULL; const struct { int maj; int min; @@ -1521,7 +1505,7 @@ static int virLXCControllerPopulateDevices(virLXCControllerPtr ctrl) }; if (virLXCControllerSetupDev(ctrl) < 0) - goto cleanup; + return -1; /* Populate /dev/ with a few important bits */ for (i = 0; i < G_N_ELEMENTS(devs); i++) { @@ -1534,19 +1518,14 @@ static int virLXCControllerPopulateDevices(virLXCControllerPtr ctrl) virReportSystemError(errno, _("Failed to make device %s"), path); - goto cleanup; + return -1; } if (lxcContainerChown(ctrl->def, path) < 0) - goto cleanup; - - VIR_FREE(path); + return -1; } - ret = 0; - cleanup: - VIR_FREE(path); - return ret; + return 0; } @@ -2202,10 +2181,9 @@ virLXCControllerSetupPrivateNS(void) static int virLXCControllerSetupDevPTS(virLXCControllerPtr ctrl) { - char *mount_options = NULL; - char *opts = NULL; - char *devpts = NULL; - int ret = -1; + g_autofree char *mount_options = NULL; + g_autofree char *opts = NULL; + g_autofree char *devpts = NULL; gid_t ptsgid = 5; VIR_DEBUG("Setting up private /dev/pts"); @@ -2220,7 +2198,7 @@ virLXCControllerSetupDevPTS(virLXCControllerPtr ctrl) virReportSystemError(errno, _("Failed to make path %s"), devpts); - goto cleanup; + return -1; } if (ctrl->def->idmap.ngidmap) @@ -2239,26 +2217,20 @@ virLXCControllerSetupDevPTS(virLXCControllerPtr ctrl) virReportSystemError(errno, _("Failed to mount devpts on %s"), devpts); - goto cleanup; + return -1; } if (access(ctrl->devptmx, R_OK) < 0) { virReportSystemError(ENOSYS, "%s", _("Kernel does not support private devpts")); - goto cleanup; + return -1; } if ((lxcContainerChown(ctrl->def, ctrl->devptmx) < 0) || (lxcContainerChown(ctrl->def, devpts) < 0)) - goto cleanup; - - ret = 0; + return -1; - cleanup: - VIR_FREE(opts); - VIR_FREE(devpts); - VIR_FREE(mount_options); - return ret; + return 0; } @@ -2279,8 +2251,7 @@ virLXCControllerSetupConsoles(virLXCControllerPtr ctrl, char **containerTTYPaths) { size_t i; - int ret = -1; - char *ttyHostPath = NULL; + g_autofree char *ttyHostPath = NULL; for (i = 0; i < ctrl->nconsoles; i++) { VIR_DEBUG("Opening tty on private %s", ctrl->devptmx); @@ -2289,20 +2260,17 @@ virLXCControllerSetupConsoles(virLXCControllerPtr ctrl, &containerTTYPaths[i], &ttyHostPath) < 0) { virReportSystemError(errno, "%s", _("Failed to allocate tty")); - goto cleanup; + return -1; } /* Change the owner of tty device to the root user of container */ if (lxcContainerChown(ctrl->def, ttyHostPath) < 0) - goto cleanup; + return -1; VIR_FREE(ttyHostPath); } - ret = 0; - cleanup: - VIR_FREE(ttyHostPath); - return ret; + return 0; } diff --git a/src/lxc/lxc_fuse.c b/src/lxc/lxc_fuse.c index 44f240a0b5..8cfccdd7e0 100644 --- a/src/lxc/lxc_fuse.c +++ b/src/lxc/lxc_fuse.c @@ -40,8 +40,7 @@ static const char *fuse_meminfo_path = "/meminfo"; static int lxcProcGetattr(const char *path, struct stat *stbuf) { - int res; - char *mempath = NULL; + g_autofree char *mempath = NULL; struct stat sb; struct fuse_context *context = fuse_get_context(); virDomainDefPtr def = (virDomainDefPtr)context->private_data; @@ -49,16 +48,12 @@ static int lxcProcGetattr(const char *path, struct stat *stbuf) memset(stbuf, 0, sizeof(struct stat)); mempath = g_strdup_printf("/proc/%s", path); - res = 0; - if (STREQ(path, "/")) { stbuf->st_mode = S_IFDIR | 0755; stbuf->st_nlink = 2; } else if (STREQ(path, fuse_meminfo_path)) { - if (stat(mempath, &sb) < 0) { - res = -errno; - goto cleanup; - } + if (stat(mempath, &sb) < 0) + return -errno; stbuf->st_uid = def->idmap.uidmap ? def->idmap.uidmap[0].target : 0; stbuf->st_gid = def->idmap.gidmap ? def->idmap.gidmap[0].target : 0; @@ -71,12 +66,10 @@ static int lxcProcGetattr(const char *path, struct stat *stbuf) stbuf->st_ctime = sb.st_ctime; stbuf->st_mtime = sb.st_mtime; } else { - res = -ENOENT; + return -ENOENT; } - cleanup: - VIR_FREE(mempath); - return res; + return 0; } static int lxcProcReaddir(const char *path, void *buf, @@ -127,7 +120,7 @@ static int lxcProcReadMeminfo(char *hostpath, virDomainDefPtr def, { int res; FILE *fd = NULL; - char *line = NULL; + g_autofree char *line = NULL; size_t n; struct virLXCMeminfo meminfo; virBuffer buffer = VIR_BUFFER_INITIALIZER; @@ -229,7 +222,6 @@ static int lxcProcReadMeminfo(char *hostpath, virDomainDefPtr def, memcpy(buf, virBufferCurrentContent(new_meminfo), res); cleanup: - VIR_FREE(line); virBufferFreeAndReset(new_meminfo); VIR_FORCE_FCLOSE(fd); return res; @@ -242,7 +234,7 @@ static int lxcProcRead(const char *path G_GNUC_UNUSED, struct fuse_file_info *fi G_GNUC_UNUSED) { int res = -ENOENT; - char *hostpath = NULL; + g_autofree char *hostpath = NULL; struct fuse_context *context = NULL; virDomainDefPtr def = NULL; @@ -256,7 +248,6 @@ static int lxcProcRead(const char *path G_GNUC_UNUSED, res = lxcProcHostRead(hostpath, buf, size, offset); } - VIR_FREE(hostpath); return res; } -- 2.20.1

On Mon, Feb 24, 2020 at 11:24:27AM -0300, Julio Faracco wrote:
There are a lots of strings being handled inside some LXC functions. They can be moved to g_autofree to avoid declaring a return value to get proper code cleanups. This commit is changing functions from lxc_{controller,cgroup,fuse} only.
Signed-off-by: Julio Faracco <jcfaracco@gmail.com> --- src/lxc/lxc_cgroup.c | 15 +++---- src/lxc/lxc_controller.c | 96 ++++++++++++++-------------------------- src/lxc/lxc_fuse.c | 23 +++------- 3 files changed, 44 insertions(+), 90 deletions(-)
Reviewed-by: Daniel P. Berrangé <berrange@redhat.com> I'll push this one since its independant of the rest of the series. Regards, Daniel -- |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o- https://fstop138.berrange.com :| |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|

This commit tries to fix lots of issues related to LXC VCPUs. One of them is related to /proc/cpuinfo content. If only 1 VCPU is set, LXC containers will show all CPUs available for host. The second one is related to CPU share, if an user set only 1 VCPU, the container/process will use all available CPUs. (This is not the case when `cpuset` attribute is declared). So, this commit adds a virtual cpuinfo based on VCPU mapping and it automatically limits the CPU usage according VCPU count. Example (now): LXC container - 8 CPUS with 2 VCPU: lxc-root# stress --cpu 8 On host machine, only CPU 0 and 1 have 100% usage. Signed-off-by: Julio Faracco <jcfaracco@gmail.com> --- src/lxc/lxc_cgroup.c | 31 ++++++++++++++ src/lxc/lxc_container.c | 39 ++++++++++------- src/lxc/lxc_fuse.c | 95 ++++++++++++++++++++++++++++++++++++++--- 3 files changed, 145 insertions(+), 20 deletions(-) diff --git a/src/lxc/lxc_cgroup.c b/src/lxc/lxc_cgroup.c index d29b65092a..912a252473 100644 --- a/src/lxc/lxc_cgroup.c +++ b/src/lxc/lxc_cgroup.c @@ -50,6 +50,34 @@ static int virLXCCgroupSetupCpuTune(virDomainDefPtr def, } +static int virLXCCgroupSetupVcpuAuto(virDomainDefPtr def, + virCgroupPtr cgroup) +{ + size_t i; + int vcpumax; + virBuffer buffer = VIR_BUFFER_INITIALIZER; + virBufferPtr cpuset = &buffer; + + vcpumax = virDomainDefGetVcpusMax(def); + for (i = 0; i < vcpumax; i++) { + virDomainVcpuDefPtr vcpu = virDomainDefGetVcpu(def, i); + /* Cgroup is smart enough to convert numbers separated + * by comma into ranges. Example: "0,1,2,5," -> "0-2,5". + * Libvirt does not need to process it here. */ + if (vcpu) + virBufferAsprintf(cpuset, "%zu,", i); + } + if (virCgroupSetCpusetCpus(cgroup, + virBufferCurrentContent(cpuset)) < 0) { + virBufferFreeAndReset(cpuset); + return -1; + } + + virBufferFreeAndReset(cpuset); + return 0; +} + + static int virLXCCgroupSetupCpusetTune(virDomainDefPtr def, virCgroupPtr cgroup, virBitmapPtr nodemask) @@ -61,6 +89,9 @@ static int virLXCCgroupSetupCpusetTune(virDomainDefPtr def, def->cpumask && virCgroupSetupCpusetCpus(cgroup, def->cpumask) < 0) { return -1; + } else { + /* auto mode for VCPU limits */ + virLXCCgroupSetupVcpuAuto(def, cgroup); } if (virDomainNumatuneGetMode(def->numa, -1, &mode) < 0 || diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index 41efe43a14..88e27f3060 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -997,8 +997,8 @@ static int lxcContainerMountBasicFS(bool userns_enabled, static int lxcContainerMountProcFuse(virDomainDefPtr def, const char *stateDir) { - int ret; - char *meminfo_path = NULL; + g_autofree char *meminfo_path = NULL; + g_autofree char *cpuinfo_path = NULL; VIR_DEBUG("Mount /proc/meminfo stateDir=%s", stateDir); @@ -1006,15 +1006,29 @@ static int lxcContainerMountProcFuse(virDomainDefPtr def, stateDir, def->name); - if ((ret = mount(meminfo_path, "/proc/meminfo", - NULL, MS_BIND, NULL)) < 0) { + if (mount(meminfo_path, "/proc/meminfo", + NULL, MS_BIND, NULL) < 0) { virReportSystemError(errno, _("Failed to mount %s on /proc/meminfo"), meminfo_path); + return -1; } - VIR_FREE(meminfo_path); - return ret; + VIR_DEBUG("Mount /proc/cpuinfo stateDir=%s", stateDir); + + cpuinfo_path = g_strdup_printf("/.oldroot/%s/%s.fuse/cpuinfo", + stateDir, + def->name); + + if (mount(cpuinfo_path, "/proc/cpuinfo", + NULL, MS_BIND, NULL) < 0) { + virReportSystemError(errno, + _("Failed to mount %s on /proc/cpuinfo"), + cpuinfo_path); + return -1; + } + + return 0; } #else static int lxcContainerMountProcFuse(virDomainDefPtr def G_GNUC_UNUSED, @@ -1027,8 +1041,7 @@ static int lxcContainerMountProcFuse(virDomainDefPtr def G_GNUC_UNUSED, static int lxcContainerMountFSDev(virDomainDefPtr def, const char *stateDir) { - int ret = -1; - char *path = NULL; + g_autofree char *path = NULL; int flags = def->idmap.nuidmap ? MS_BIND : MS_MOVE; VIR_DEBUG("Mount /dev/ stateDir=%s", stateDir); @@ -1038,7 +1051,7 @@ static int lxcContainerMountFSDev(virDomainDefPtr def, if (virFileMakePath("/dev") < 0) { virReportSystemError(errno, "%s", _("Cannot create /dev")); - goto cleanup; + return -1; } VIR_DEBUG("Trying to %s %s to /dev", def->idmap.nuidmap ? @@ -1048,14 +1061,10 @@ static int lxcContainerMountFSDev(virDomainDefPtr def, virReportSystemError(errno, _("Failed to mount %s on /dev"), path); - goto cleanup; + return -1; } - ret = 0; - - cleanup: - VIR_FREE(path); - return ret; + return 0; } static int lxcContainerMountFSDevPTS(virDomainDefPtr def, diff --git a/src/lxc/lxc_fuse.c b/src/lxc/lxc_fuse.c index 8cfccdd7e0..b2117bfa17 100644 --- a/src/lxc/lxc_fuse.c +++ b/src/lxc/lxc_fuse.c @@ -36,23 +36,29 @@ #if WITH_FUSE +#ifndef CPUINFO_FILE_LEN +# define CPUINFO_FILE_LEN (1024*1024) +#endif + static const char *fuse_meminfo_path = "/meminfo"; +static const char *fuse_cpuinfo_path = "/cpuinfo"; static int lxcProcGetattr(const char *path, struct stat *stbuf) { - g_autofree char *mempath = NULL; + g_autofree char *procpath = NULL; struct stat sb; struct fuse_context *context = fuse_get_context(); virDomainDefPtr def = (virDomainDefPtr)context->private_data; memset(stbuf, 0, sizeof(struct stat)); - mempath = g_strdup_printf("/proc/%s", path); + procpath = g_strdup_printf("/proc/%s", path); if (STREQ(path, "/")) { stbuf->st_mode = S_IFDIR | 0755; stbuf->st_nlink = 2; - } else if (STREQ(path, fuse_meminfo_path)) { - if (stat(mempath, &sb) < 0) + } else if (STREQ(path, fuse_meminfo_path) || + STREQ(path, fuse_cpuinfo_path)) { + if (stat(procpath, &sb) < 0) return -errno; stbuf->st_uid = def->idmap.uidmap ? def->idmap.uidmap[0].target : 0; @@ -83,6 +89,7 @@ static int lxcProcReaddir(const char *path, void *buf, filler(buf, ".", NULL, 0); filler(buf, "..", NULL, 0); filler(buf, fuse_meminfo_path + 1, NULL, 0); + filler(buf, fuse_cpuinfo_path + 1, NULL, 0); return 0; } @@ -90,7 +97,8 @@ static int lxcProcReaddir(const char *path, void *buf, static int lxcProcOpen(const char *path G_GNUC_UNUSED, struct fuse_file_info *fi G_GNUC_UNUSED) { - if (STRNEQ(path, fuse_meminfo_path)) + if (STRNEQ(path, fuse_meminfo_path) && + STRNEQ(path, fuse_cpuinfo_path)) return -ENOENT; if ((fi->flags & 3) != O_RDONLY) @@ -227,6 +235,80 @@ static int lxcProcReadMeminfo(char *hostpath, virDomainDefPtr def, return res; } + +static int +lxcProcReadCpuinfoParse(virDomainDefPtr def, char *base, + virBufferPtr new_cpuinfo) +{ + char *procline = NULL; + char *saveptr = base; + size_t cpu; + size_t nvcpu; + size_t curcpu = 0; + bool get_proc = false; + + nvcpu = virDomainDefGetVcpus(def); + while ((procline = strtok_r(NULL, "\n", &saveptr))) { + if (sscanf(procline, "processor\t: %zu", &cpu) == 1) { + virDomainVcpuDefPtr vcpu = virDomainDefGetVcpu(def, cpu); + /* VCPU is mapped */ + if (vcpu) { + if (curcpu == nvcpu) + break; + + if (curcpu > 0) + virBufferAddLit(new_cpuinfo, "\n"); + + virBufferAsprintf(new_cpuinfo, "processor\t: %zu\n", + curcpu); + curcpu++; + get_proc = true; + } else { + get_proc = false; + } + } else { + /* It is not a processor index */ + if (get_proc) + virBufferAsprintf(new_cpuinfo, "%s\n", procline); + } + } + + virBufferAddLit(new_cpuinfo, "\n"); + + return strlen(virBufferCurrentContent(new_cpuinfo)); +} + + +static int lxcProcReadCpuinfo(char *hostpath, virDomainDefPtr def, + char *buf, size_t size, off_t offset) +{ + virBuffer buffer = VIR_BUFFER_INITIALIZER; + virBufferPtr new_cpuinfo = &buffer; + g_autofree char *outbuf = NULL; + int res = -1; + + /* Gather info from /proc/cpuinfo */ + if (virFileReadAll(hostpath, CPUINFO_FILE_LEN, &outbuf) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("Failed to open %s"), hostpath); + return -1; + } + + /* /proc/cpuinfo does not support fseek */ + if (offset > 0) + return 0; + + res = lxcProcReadCpuinfoParse(def, outbuf, new_cpuinfo); + + if (res > size) + res = size; + memcpy(buf, virBufferCurrentContent(new_cpuinfo), res); + + virBufferFreeAndReset(new_cpuinfo); + return res; +} + + static int lxcProcRead(const char *path G_GNUC_UNUSED, char *buf G_GNUC_UNUSED, size_t size G_GNUC_UNUSED, @@ -246,6 +328,9 @@ static int lxcProcRead(const char *path G_GNUC_UNUSED, if (STREQ(path, fuse_meminfo_path)) { if ((res = lxcProcReadMeminfo(hostpath, def, buf, size, offset)) < 0) res = lxcProcHostRead(hostpath, buf, size, offset); + } else if (STREQ(path, fuse_cpuinfo_path)) { + if ((res = lxcProcReadCpuinfo(hostpath, def, buf, size, offset)) < 0) + res = lxcProcHostRead(hostpath, buf, size, offset); } return res; -- 2.20.1

On Mon, Feb 24, 2020 at 11:24:28AM -0300, Julio Faracco wrote:
This commit tries to fix lots of issues related to LXC VCPUs. One of them is related to /proc/cpuinfo content. If only 1 VCPU is set, LXC containers will show all CPUs available for host. The second one is related to CPU share, if an user set only 1 VCPU, the container/process will use all available CPUs. (This is not the case when `cpuset` attribute is declared). So, this commit adds a virtual cpuinfo based on VCPU mapping and it automatically limits the CPU usage according VCPU count.
Example (now): LXC container - 8 CPUS with 2 VCPU: lxc-root# stress --cpu 8
On host machine, only CPU 0 and 1 have 100% usage.
Signed-off-by: Julio Faracco <jcfaracco@gmail.com> --- src/lxc/lxc_cgroup.c | 31 ++++++++++++++ src/lxc/lxc_container.c | 39 ++++++++++------- src/lxc/lxc_fuse.c | 95 ++++++++++++++++++++++++++++++++++++++--- 3 files changed, 145 insertions(+), 20 deletions(-)
diff --git a/src/lxc/lxc_cgroup.c b/src/lxc/lxc_cgroup.c index d29b65092a..912a252473 100644 --- a/src/lxc/lxc_cgroup.c +++ b/src/lxc/lxc_cgroup.c @@ -50,6 +50,34 @@ static int virLXCCgroupSetupCpuTune(virDomainDefPtr def, }
+static int virLXCCgroupSetupVcpuAuto(virDomainDefPtr def, + virCgroupPtr cgroup) +{ + size_t i; + int vcpumax; + virBuffer buffer = VIR_BUFFER_INITIALIZER; + virBufferPtr cpuset = &buffer; + + vcpumax = virDomainDefGetVcpusMax(def); + for (i = 0; i < vcpumax; i++) { + virDomainVcpuDefPtr vcpu = virDomainDefGetVcpu(def, i); + /* Cgroup is smart enough to convert numbers separated + * by comma into ranges. Example: "0,1,2,5," -> "0-2,5". + * Libvirt does not need to process it here. */ + if (vcpu) + virBufferAsprintf(cpuset, "%zu,", i); + }
If I have 4 containers, each with vcpu==2, then all 4 containers are going to be taskset onto host CPUs 0 & 1, and host CPUs 2,3,4,5,6,7 are going to be unused. This is not viable as a strategy.
+ if (virCgroupSetCpusetCpus(cgroup, + virBufferCurrentContent(cpuset)) < 0) { + virBufferFreeAndReset(cpuset); + return -1; + } + + virBufferFreeAndReset(cpuset); + return 0; +}
+static int +lxcProcReadCpuinfoParse(virDomainDefPtr def, char *base, + virBufferPtr new_cpuinfo) +{ + char *procline = NULL; + char *saveptr = base; + size_t cpu; + size_t nvcpu; + size_t curcpu = 0; + bool get_proc = false; + + nvcpu = virDomainDefGetVcpus(def); + while ((procline = strtok_r(NULL, "\n", &saveptr))) { + if (sscanf(procline, "processor\t: %zu", &cpu) == 1) {
This doesn't work because it is assuming the /proc/cpuinfo data format for x86 architecture. Every architecture puts different info in this file, and only some of them using "processor: NN" as a delimiter for new entries. There's many examples in tests/sysinfodata/ and in tests/virhostcpudata/ showing this problem We had considered taking /proc/cpuinfo in the past, but decided against it. Since this file is non-standard data format varying across arches, well written apps won't actually look at /proc/cpuinfo at all. Instead they'll use /sys/devices/system/cpu instead. I don't think we want to make /proc/cpuinfo be inconsistent with data in sysfs.
+ virDomainVcpuDefPtr vcpu = virDomainDefGetVcpu(def, cpu); + /* VCPU is mapped */ + if (vcpu) { + if (curcpu == nvcpu) + break; + + if (curcpu > 0) + virBufferAddLit(new_cpuinfo, "\n"); + + virBufferAsprintf(new_cpuinfo, "processor\t: %zu\n", + curcpu); + curcpu++; + get_proc = true; + } else { + get_proc = false; + } + } else { + /* It is not a processor index */ + if (get_proc) + virBufferAsprintf(new_cpuinfo, "%s\n", procline); + } + } + + virBufferAddLit(new_cpuinfo, "\n"); + + return strlen(virBufferCurrentContent(new_cpuinfo)); +}
Regards, Daniel -- |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o- https://fstop138.berrange.com :| |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|

Thanks for your comments, Daniel. Well, since you have a consensus about /proc/cpuinfo, I will only resubmit the patches related to timer. Julio Cesar Faracco Em ter., 25 de fev. de 2020 às 08:34, Daniel P. Berrangé <berrange@redhat.com> escreveu:
On Mon, Feb 24, 2020 at 11:24:28AM -0300, Julio Faracco wrote:
This commit tries to fix lots of issues related to LXC VCPUs. One of them is related to /proc/cpuinfo content. If only 1 VCPU is set, LXC containers will show all CPUs available for host. The second one is related to CPU share, if an user set only 1 VCPU, the container/process will use all available CPUs. (This is not the case when `cpuset` attribute is declared). So, this commit adds a virtual cpuinfo based on VCPU mapping and it automatically limits the CPU usage according VCPU count.
Example (now): LXC container - 8 CPUS with 2 VCPU: lxc-root# stress --cpu 8
On host machine, only CPU 0 and 1 have 100% usage.
Signed-off-by: Julio Faracco <jcfaracco@gmail.com> --- src/lxc/lxc_cgroup.c | 31 ++++++++++++++ src/lxc/lxc_container.c | 39 ++++++++++------- src/lxc/lxc_fuse.c | 95 ++++++++++++++++++++++++++++++++++++++--- 3 files changed, 145 insertions(+), 20 deletions(-)
diff --git a/src/lxc/lxc_cgroup.c b/src/lxc/lxc_cgroup.c index d29b65092a..912a252473 100644 --- a/src/lxc/lxc_cgroup.c +++ b/src/lxc/lxc_cgroup.c @@ -50,6 +50,34 @@ static int virLXCCgroupSetupCpuTune(virDomainDefPtr def, }
+static int virLXCCgroupSetupVcpuAuto(virDomainDefPtr def, + virCgroupPtr cgroup) +{ + size_t i; + int vcpumax; + virBuffer buffer = VIR_BUFFER_INITIALIZER; + virBufferPtr cpuset = &buffer; + + vcpumax = virDomainDefGetVcpusMax(def); + for (i = 0; i < vcpumax; i++) { + virDomainVcpuDefPtr vcpu = virDomainDefGetVcpu(def, i); + /* Cgroup is smart enough to convert numbers separated + * by comma into ranges. Example: "0,1,2,5," -> "0-2,5". + * Libvirt does not need to process it here. */ + if (vcpu) + virBufferAsprintf(cpuset, "%zu,", i); + }
If I have 4 containers, each with vcpu==2, then all 4 containers are going to be taskset onto host CPUs 0 & 1, and host CPUs 2,3,4,5,6,7 are going to be unused. This is not viable as a strategy.
+ if (virCgroupSetCpusetCpus(cgroup, + virBufferCurrentContent(cpuset)) < 0) { + virBufferFreeAndReset(cpuset); + return -1; + } + + virBufferFreeAndReset(cpuset); + return 0; +}
+static int +lxcProcReadCpuinfoParse(virDomainDefPtr def, char *base, + virBufferPtr new_cpuinfo) +{ + char *procline = NULL; + char *saveptr = base; + size_t cpu; + size_t nvcpu; + size_t curcpu = 0; + bool get_proc = false; + + nvcpu = virDomainDefGetVcpus(def); + while ((procline = strtok_r(NULL, "\n", &saveptr))) { + if (sscanf(procline, "processor\t: %zu", &cpu) == 1) {
This doesn't work because it is assuming the /proc/cpuinfo data format for x86 architecture. Every architecture puts different info in this file, and only some of them using "processor: NN" as a delimiter for new entries. There's many examples in tests/sysinfodata/ and in tests/virhostcpudata/ showing this problem
We had considered taking /proc/cpuinfo in the past, but decided against it. Since this file is non-standard data format varying across arches, well written apps won't actually look at /proc/cpuinfo at all. Instead they'll use /sys/devices/system/cpu instead. I don't think we want to make /proc/cpuinfo be inconsistent with data in sysfs.
+ virDomainVcpuDefPtr vcpu = virDomainDefGetVcpu(def, cpu); + /* VCPU is mapped */ + if (vcpu) { + if (curcpu == nvcpu) + break; + + if (curcpu > 0) + virBufferAddLit(new_cpuinfo, "\n"); + + virBufferAsprintf(new_cpuinfo, "processor\t: %zu\n", + curcpu); + curcpu++; + get_proc = true; + } else { + get_proc = false; + } + } else { + /* It is not a processor index */ + if (get_proc) + virBufferAsprintf(new_cpuinfo, "%s\n", procline); + } + } + + virBufferAddLit(new_cpuinfo, "\n"); + + return strlen(virBufferCurrentContent(new_cpuinfo)); +}
Regards, Daniel -- |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o- https://fstop138.berrange.com :| |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|

Native config files sometimes can setup cpuset.cpus to pin some CPUs. Before this, LXC was using a fixed number of 1 VCPU. After this commit, XML definition will generate a dynamic number of VCPUs based on that cgroup attribute. Signed-off-by: Julio Faracco <jcfaracco@gmail.com> --- src/lxc/lxc_container.c | 23 ++++++++++++++++++ src/lxc/lxc_container.h | 2 ++ src/lxc/lxc_native.c | 24 +++++++++++++++++-- .../lxcconf2xml-cpusettune.xml | 2 +- 4 files changed, 48 insertions(+), 3 deletions(-) diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index 88e27f3060..c5788e5c32 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -2487,3 +2487,26 @@ int lxcContainerChown(virDomainDefPtr def, const char *path) return 0; } + + +int lxcContainerGetMaxCpusInCpuset(const char *cpuset) +{ + const char *c = cpuset; + int max_cpu = 0; + + while (c) { + int a, b, ret; + + ret = sscanf(c, "%d-%d", &a, &b); + if (ret == 1) + max_cpu++; + else if (ret == 2) + max_cpu += a > b ? a - b + 1 : b - a + 1; + + if (!(c = strchr(c+1, ','))) + break; + c++; + } + + return max_cpu; +} diff --git a/src/lxc/lxc_container.h b/src/lxc/lxc_container.h index 94a6c5309c..6f112e0667 100644 --- a/src/lxc/lxc_container.h +++ b/src/lxc/lxc_container.h @@ -63,3 +63,5 @@ virArch lxcContainerGetAlt32bitArch(virArch arch); int lxcContainerChown(virDomainDefPtr def, const char *path); bool lxcIsBasicMountLocation(const char *path); + +int lxcContainerGetMaxCpusInCpuset(const char *cpuset); diff --git a/src/lxc/lxc_native.c b/src/lxc/lxc_native.c index 02d2bf33e4..409bf00bd2 100644 --- a/src/lxc/lxc_native.c +++ b/src/lxc/lxc_native.c @@ -993,6 +993,24 @@ lxcSetCpusetTune(virDomainDefPtr def, virConfPtr properties) return 0; } + +static int +lxcGetVCpuMax(virConfPtr properties) +{ + g_autofree char *value = NULL; + int vcpumax = 1; + + if (virConfGetValueString(properties, "lxc.cgroup.cpuset.cpus", + &value) > 0) { + vcpumax = lxcContainerGetMaxCpusInCpuset(value); + if (vcpumax > 0) + return vcpumax; + } + + return vcpumax; +} + + static int lxcBlkioDeviceWalkCallback(const char *name, virConfValuePtr value, void *data) { @@ -1132,6 +1150,7 @@ lxcParseConfigString(const char *config, virDomainDefPtr vmdef = NULL; g_autoptr(virConf) properties = NULL; g_autofree char *value = NULL; + int vcpumax; if (!(properties = virConfReadString(config, VIR_CONF_FLAG_LXC_FORMAT))) return NULL; @@ -1155,10 +1174,11 @@ lxcParseConfigString(const char *config, /* Value not handled by the LXC driver, setting to * minimum required to make XML parsing pass */ - if (virDomainDefSetVcpusMax(vmdef, 1, xmlopt) < 0) + vcpumax = lxcGetVCpuMax(properties); + if (virDomainDefSetVcpusMax(vmdef, vcpumax, xmlopt) < 0) goto error; - if (virDomainDefSetVcpus(vmdef, 1) < 0) + if (virDomainDefSetVcpus(vmdef, vcpumax) < 0) goto error; vmdef->nfss = 0; diff --git a/tests/lxcconf2xmldata/lxcconf2xml-cpusettune.xml b/tests/lxcconf2xmldata/lxcconf2xml-cpusettune.xml index 6df089d00f..a1fec12d9b 100644 --- a/tests/lxcconf2xmldata/lxcconf2xml-cpusettune.xml +++ b/tests/lxcconf2xmldata/lxcconf2xml-cpusettune.xml @@ -3,7 +3,7 @@ <uuid>c7a5fdbd-edaf-9455-926a-d65c16db1809</uuid> <memory unit='KiB'>65536</memory> <currentMemory unit='KiB'>65536</currentMemory> - <vcpu placement='static' cpuset='1-2,5-7'>1</vcpu> + <vcpu placement='static' cpuset='1-2,5-7'>5</vcpu> <numatune> <memory mode='strict' nodeset='1-4'/> </numatune> -- 2.20.1

On Mon, Feb 24, 2020 at 11:24:29AM -0300, Julio Faracco wrote:
Native config files sometimes can setup cpuset.cpus to pin some CPUs. Before this, LXC was using a fixed number of 1 VCPU. After this commit, XML definition will generate a dynamic number of VCPUs based on that cgroup attribute.
I'm not sure whether this is worth doing - mapping between VCPU count and CPUset doesn't really work during startup. Historically we've just completely ignored <vcpu> for containers. Ideally that element should have been optional in the XML parser as it is not a good conceptual fit for LXC.
Signed-off-by: Julio Faracco <jcfaracco@gmail.com> --- src/lxc/lxc_container.c | 23 ++++++++++++++++++ src/lxc/lxc_container.h | 2 ++ src/lxc/lxc_native.c | 24 +++++++++++++++++-- .../lxcconf2xml-cpusettune.xml | 2 +- 4 files changed, 48 insertions(+), 3 deletions(-)
diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index 88e27f3060..c5788e5c32 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -2487,3 +2487,26 @@ int lxcContainerChown(virDomainDefPtr def, const char *path)
return 0; } + + +int lxcContainerGetMaxCpusInCpuset(const char *cpuset) +{ + const char *c = cpuset; + int max_cpu = 0; + + while (c) { + int a, b, ret; + + ret = sscanf(c, "%d-%d", &a, &b); + if (ret == 1) + max_cpu++; + else if (ret == 2) + max_cpu += a > b ? a - b + 1 : b - a + 1; + + if (!(c = strchr(c+1, ','))) + break; + c++; + } + + return max_cpu; +} diff --git a/src/lxc/lxc_container.h b/src/lxc/lxc_container.h index 94a6c5309c..6f112e0667 100644 --- a/src/lxc/lxc_container.h +++ b/src/lxc/lxc_container.h @@ -63,3 +63,5 @@ virArch lxcContainerGetAlt32bitArch(virArch arch); int lxcContainerChown(virDomainDefPtr def, const char *path);
bool lxcIsBasicMountLocation(const char *path); + +int lxcContainerGetMaxCpusInCpuset(const char *cpuset); diff --git a/src/lxc/lxc_native.c b/src/lxc/lxc_native.c index 02d2bf33e4..409bf00bd2 100644 --- a/src/lxc/lxc_native.c +++ b/src/lxc/lxc_native.c @@ -993,6 +993,24 @@ lxcSetCpusetTune(virDomainDefPtr def, virConfPtr properties) return 0; }
+ +static int +lxcGetVCpuMax(virConfPtr properties) +{ + g_autofree char *value = NULL; + int vcpumax = 1; + + if (virConfGetValueString(properties, "lxc.cgroup.cpuset.cpus", + &value) > 0) { + vcpumax = lxcContainerGetMaxCpusInCpuset(value); + if (vcpumax > 0) + return vcpumax; + } + + return vcpumax; +} + + static int lxcBlkioDeviceWalkCallback(const char *name, virConfValuePtr value, void *data) { @@ -1132,6 +1150,7 @@ lxcParseConfigString(const char *config, virDomainDefPtr vmdef = NULL; g_autoptr(virConf) properties = NULL; g_autofree char *value = NULL; + int vcpumax;
if (!(properties = virConfReadString(config, VIR_CONF_FLAG_LXC_FORMAT))) return NULL; @@ -1155,10 +1174,11 @@ lxcParseConfigString(const char *config,
/* Value not handled by the LXC driver, setting to * minimum required to make XML parsing pass */ - if (virDomainDefSetVcpusMax(vmdef, 1, xmlopt) < 0) + vcpumax = lxcGetVCpuMax(properties); + if (virDomainDefSetVcpusMax(vmdef, vcpumax, xmlopt) < 0) goto error;
- if (virDomainDefSetVcpus(vmdef, 1) < 0) + if (virDomainDefSetVcpus(vmdef, vcpumax) < 0) goto error;
vmdef->nfss = 0; diff --git a/tests/lxcconf2xmldata/lxcconf2xml-cpusettune.xml b/tests/lxcconf2xmldata/lxcconf2xml-cpusettune.xml index 6df089d00f..a1fec12d9b 100644 --- a/tests/lxcconf2xmldata/lxcconf2xml-cpusettune.xml +++ b/tests/lxcconf2xmldata/lxcconf2xml-cpusettune.xml @@ -3,7 +3,7 @@ <uuid>c7a5fdbd-edaf-9455-926a-d65c16db1809</uuid> <memory unit='KiB'>65536</memory> <currentMemory unit='KiB'>65536</currentMemory> - <vcpu placement='static' cpuset='1-2,5-7'>1</vcpu> + <vcpu placement='static' cpuset='1-2,5-7'>5</vcpu> <numatune> <memory mode='strict' nodeset='1-4'/> </numatune> -- 2.20.1
Regards, Daniel -- |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o- https://fstop138.berrange.com :| |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
participants (2)
-
Daniel P. Berrangé
-
Julio Faracco