[libvirt] [libvirt PATCH 0/8] Fixes for cgroup setting

1/8 ~ 5/8 are refactorings; the real fixes are 6/8 and 7/8. 8/8 is an RFC.

Osier Yang (8):
  qemu: Abstract the code for blkio controller setting into a helper
  qemu: Abstract code for memory controller setting into a helper
  qemu: Abstract code for devices controller setting into a helper
  qemu: Abstract code for cpuset controller setting into a helper
  qemu: Abstract code for the cpu controller setting into a helper
  qemu: Set cpuset.cpus for domain process
  qemu: Prohibit getting the numa parameters if mode is not strict
  qemu: Set cpuset.mems even if the numatune mode is not strict

 src/qemu/qemu_cgroup.c | 564 +++++++++++++++++++++++++++++--------------------
 src/qemu/qemu_driver.c |  11 +-
 2 files changed, 348 insertions(+), 227 deletions(-)

--
1.8.1.4
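(For reviewers skimming the series: each helper added by 1/8 ~ 5/8 follows
the same shape. The sketch below is only an illustration of that pattern,
not code from any of the patches; "Foo" and VIR_CGROUP_CONTROLLER_FOO are
placeholders.)

    static int
    qemuSetupFooCgroup(virDomainObjPtr vm)
    {
        qemuDomainObjPrivatePtr priv = vm->privateData;

        /* Missing controller: silently skip, or report
         * VIR_ERR_CONFIG_UNSUPPORTED if the XML asked for the tuning. */
        if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_FOO))
            return 0;

        /* ... apply the controller-specific settings, returning -1 with
         * an error reported on failure ... */

        return 0;
    }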

---
 src/qemu/qemu_cgroup.c | 90 ++++++++++++++++++++++++++++----------------------
 1 file changed, 50 insertions(+), 40 deletions(-)

diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c
index 9c45b76..6772ab2 100644
--- a/src/qemu/qemu_cgroup.c
+++ b/src/qemu/qemu_cgroup.c
@@ -365,6 +365,53 @@ cleanup:
     return ret;
 }
 
+static int
+qemuSetupBlkioCgroup(virDomainObjPtr vm)
+{
+    qemuDomainObjPrivatePtr priv = vm->privateData;
+    int rc = -1;
+    int i;
+
+    if (!virCgroupHasController(priv->cgroup,
+                                VIR_CGROUP_CONTROLLER_BLKIO)) {
+        if (vm->def->blkio.weight || vm->def->blkio.ndevices) {
+            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                           _("Block I/O tuning is not available on this host"));
+            return -1;
+        } else {
+            return 0;
+        }
+    }
+
+    if (vm->def->blkio.weight != 0) {
+        rc = virCgroupSetBlkioWeight(priv->cgroup, vm->def->blkio.weight);
+        if (rc != 0) {
+            virReportSystemError(-rc,
+                                 _("Unable to set io weight for domain %s"),
+                                 vm->def->name);
+            return -1;
+        }
+    }
+
+    if (vm->def->blkio.ndevices) {
+        for (i = 0; i < vm->def->blkio.ndevices; i++) {
+            virBlkioDeviceWeightPtr dw = &vm->def->blkio.devices[i];
+            if (!dw->weight)
+                continue;
+            rc = virCgroupSetBlkioDeviceWeight(priv->cgroup, dw->path,
+                                               dw->weight);
+            if (rc != 0) {
+                virReportSystemError(-rc,
+                                     _("Unable to set io device weight "
+                                       "for domain %s"),
+                                     vm->def->name);
+                return -1;
+            }
+        }
+    }
+
+    return 0;
+}
 
 int qemuInitCgroup(virQEMUDriverPtr driver,
                    virDomainObjPtr vm,
@@ -475,7 +522,6 @@ cleanup:
     return rc;
 }
 
-
 int qemuSetupCgroup(virQEMUDriverPtr driver,
                     virDomainObjPtr vm,
                     virBitmapPtr nodemask)
@@ -567,7 +613,7 @@ int qemuSetupCgroup(virQEMUDriverPtr driver,
 
     if (vm->def->tpm &&
         (qemuSetupTPMCgroup(vm->def,
-                           vm->def->tpm,
+                            vm->def->tpm,
                             vm) < 0))
         goto cleanup;
 
@@ -577,44 +623,8 @@ int qemuSetupCgroup(virQEMUDriverPtr driver,
         }
     }
 
-    if (vm->def->blkio.weight != 0) {
-        if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_BLKIO)) {
-            rc = virCgroupSetBlkioWeight(priv->cgroup, vm->def->blkio.weight);
-            if (rc != 0) {
-                virReportSystemError(-rc,
-                                     _("Unable to set io weight for domain %s"),
-                                     vm->def->name);
-                goto cleanup;
-            }
-        } else {
-            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
-                           _("Block I/O tuning is not available on this host"));
-            goto cleanup;
-        }
-    }
-
-    if (vm->def->blkio.ndevices) {
-        if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_BLKIO)) {
-            for (i = 0; i < vm->def->blkio.ndevices; i++) {
-                virBlkioDeviceWeightPtr dw = &vm->def->blkio.devices[i];
-                if (!dw->weight)
-                    continue;
-                rc = virCgroupSetBlkioDeviceWeight(priv->cgroup, dw->path,
-                                                   dw->weight);
-                if (rc != 0) {
-                    virReportSystemError(-rc,
-                                         _("Unable to set io device weight "
-                                           "for domain %s"),
-                                         vm->def->name);
-                    goto cleanup;
-                }
-            }
-        } else {
-            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
-                           _("Block I/O tuning is not available on this host"));
-            goto cleanup;
-        }
-    }
+    if (qemuSetupBlkioCgroup(vm) < 0)
+        goto cleanup;
 
     if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_MEMORY)) {
         unsigned long long hard_limit = vm->def->mem.hard_limit;
--
1.8.1.4

---
 src/qemu/qemu_cgroup.c | 120 ++++++++++++++++++++++++++++---------------------
 1 file changed, 68 insertions(+), 52 deletions(-)

diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c
index 6772ab2..8c4bc0f 100644
--- a/src/qemu/qemu_cgroup.c
+++ b/src/qemu/qemu_cgroup.c
@@ -413,6 +413,72 @@ qemuSetupBlkioCgroup(virDomainObjPtr vm)
     return 0;
 }
 
+static int
+qemuSetupMemoryCgroup(virDomainObjPtr vm) {
+    qemuDomainObjPrivatePtr priv = vm->privateData;
+    unsigned long long hard_limit;
+    int rc;
+    int i;
+
+    if (!virCgroupHasController(priv->cgroup,VIR_CGROUP_CONTROLLER_MEMORY)) {
+        if (vm->def->mem.hard_limit != 0 ||
+            vm->def->mem.soft_limit != 0 ||
+            vm->def->mem.swap_hard_limit != 0) {
+            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                           _("Memory cgroup is not available on this host"));
+            return -1;
+        } else {
+            VIR_WARN("Could not autoset a RSS limit for domain %s", vm->def->name);
+            return 0;
+        }
+    }
+
+    hard_limit = vm->def->mem.hard_limit;
+    if (!hard_limit) {
+        /* If there is no hard_limit set, set a reasonable one to avoid
+         * system thrashing caused by exploited qemu. A 'reasonable
+         * limit' has been chosen:
+         *     (1 + k) * (domain memory + total video memory) + (32MB for
+         *     cache per each disk) + F
+         * where k = 0.5 and F = 200MB. The cache for disks is important as
+         * kernel cache on the host side counts into the RSS limit. */
+        hard_limit = vm->def->mem.max_balloon;
+        for (i = 0; i < vm->def->nvideos; i++)
+            hard_limit += vm->def->videos[i]->vram;
+        hard_limit = hard_limit * 1.5 + 204800;
+        hard_limit += vm->def->ndisks * 32768;
+    }
+
+    rc = virCgroupSetMemoryHardLimit(priv->cgroup, hard_limit);
+    if (rc != 0) {
+        virReportSystemError(-rc,
+                             _("Unable to set memory hard limit for domain %s"),
+                             vm->def->name);
+        return -1;
+    }
+
+    if (vm->def->mem.soft_limit != 0) {
+        rc = virCgroupSetMemorySoftLimit(priv->cgroup, vm->def->mem.soft_limit);
+        if (rc != 0) {
+            virReportSystemError(-rc,
+                                 _("Unable to set memory soft limit for domain %s"),
+                                 vm->def->name);
+            return -1;
+        }
+    }
+
+    if (vm->def->mem.swap_hard_limit != 0) {
+        rc = virCgroupSetMemSwapHardLimit(priv->cgroup, vm->def->mem.swap_hard_limit);
+        if (rc != 0) {
+            virReportSystemError(-rc,
+                                 _("Unable to set swap hard limit for domain %s"),
+                                 vm->def->name);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
 int qemuInitCgroup(virQEMUDriverPtr driver,
                    virDomainObjPtr vm,
                    bool startup)
@@ -626,58 +692,8 @@ int qemuSetupCgroup(virQEMUDriverPtr driver,
     if (qemuSetupBlkioCgroup(vm) < 0)
         goto cleanup;
 
-    if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_MEMORY)) {
-        unsigned long long hard_limit = vm->def->mem.hard_limit;
-
-        if (!hard_limit) {
-            /* If there is no hard_limit set, set a reasonable one to avoid
-             * system thrashing caused by exploited qemu. A 'reasonable
-             * limit' has been chosen:
-             *     (1 + k) * (domain memory + total video memory) + (32MB for
-             *     cache per each disk) + F
-             * where k = 0.5 and F = 200MB. The cache for disks is important as
-             * kernel cache on the host side counts into the RSS limit. */
-            hard_limit = vm->def->mem.max_balloon;
-            for (i = 0; i < vm->def->nvideos; i++)
-                hard_limit += vm->def->videos[i]->vram;
-            hard_limit = hard_limit * 1.5 + 204800;
-            hard_limit += vm->def->ndisks * 32768;
-        }
-
-        rc = virCgroupSetMemoryHardLimit(priv->cgroup, hard_limit);
-        if (rc != 0) {
-            virReportSystemError(-rc,
-                                 _("Unable to set memory hard limit for domain %s"),
-                                 vm->def->name);
-            goto cleanup;
-        }
-        if (vm->def->mem.soft_limit != 0) {
-            rc = virCgroupSetMemorySoftLimit(priv->cgroup, vm->def->mem.soft_limit);
-            if (rc != 0) {
-                virReportSystemError(-rc,
-                                     _("Unable to set memory soft limit for domain %s"),
-                                     vm->def->name);
-                goto cleanup;
-            }
-        }
-
-        if (vm->def->mem.swap_hard_limit != 0) {
-            rc = virCgroupSetMemSwapHardLimit(priv->cgroup, vm->def->mem.swap_hard_limit);
-            if (rc != 0) {
-                virReportSystemError(-rc,
-                                     _("Unable to set swap hard limit for domain %s"),
-                                     vm->def->name);
-                goto cleanup;
-            }
-        }
-    } else if (vm->def->mem.hard_limit != 0 ||
-               vm->def->mem.soft_limit != 0 ||
-               vm->def->mem.swap_hard_limit != 0) {
-        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
-                       _("Memory cgroup is not available on this host"));
-    } else {
-        VIR_WARN("Could not autoset a RSS limit for domain %s", vm->def->name);
-    }
+    if (qemuSetupMemoryCgroup(vm) < 0)
+        goto cleanup;
 
     if (vm->def->cputune.shares != 0) {
         if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
--
1.8.1.4
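(A worked example of the heuristic above, with numbers of my own choosing
rather than from the patch: a 4 GiB guest with one 16 MiB-VRAM video device
and two disks. All values are in KiB, as in the code.)

    unsigned long long hard_limit = 4194304;    /* max_balloon: 4 GiB   */
    hard_limit += 16384;                        /* total video memory   */
    hard_limit = hard_limit * 1.5 + 204800;     /* k = 0.5, F = 200MB   */
    hard_limit += 2 * 32768;                    /* 32MB cache per disk  */
    /* => 6586368 KiB, i.e. a ~6.3 GiB RSS limit for a 4 GiB guest */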

---
 src/qemu/qemu_cgroup.c | 195 +++++++++++++++++++++++++++----------------------
 1 file changed, 107 insertions(+), 88 deletions(-)

diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c
index 8c4bc0f..057ddaf 100644
--- a/src/qemu/qemu_cgroup.c
+++ b/src/qemu/qemu_cgroup.c
@@ -479,6 +479,111 @@ qemuSetupMemoryCgroup(virDomainObjPtr vm) {
     return 0;
 }
 
+static int
+qemuSetupDevicesCgroup(virQEMUDriverPtr driver,
+                       virDomainObjPtr vm)
+{
+    qemuDomainObjPrivatePtr priv = vm->privateData;
+    virQEMUDriverConfigPtr cfg = NULL;
+    const char *const *deviceACL = NULL;
+    int rc = -1;
+    int ret = -1;
+    int i;
+
+    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
+        return 0;
+
+    rc = virCgroupDenyAllDevices(priv->cgroup);
+    virDomainAuditCgroup(vm, priv->cgroup, "deny", "all", rc == 0);
+    if (rc != 0) {
+        if (rc == -EPERM) {
+            VIR_WARN("Group devices ACL is not accessible, disabling whitelisting");
+            return 0;
+        }
+
+        virReportSystemError(-rc,
+                             _("Unable to deny all devices for %s"), vm->def->name);
+        goto cleanup;
+    }
+
+    for (i = 0; i < vm->def->ndisks ; i++) {
+        if (qemuSetupDiskCgroup(vm, vm->def->disks[i]) < 0)
+            goto cleanup;
+    }
+
+    rc = virCgroupAllowDeviceMajor(priv->cgroup, 'c', DEVICE_PTY_MAJOR,
+                                   VIR_CGROUP_DEVICE_RW);
+    virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_PTY_MAJOR,
+                              "pty", "rw", rc == 0);
+    if (rc != 0) {
+        virReportSystemError(-rc, "%s",
+                             _("unable to allow /dev/pts/ devices"));
+        goto cleanup;
+    }
+
+    cfg = virQEMUDriverGetConfig(driver);
+    deviceACL = cfg->cgroupDeviceACL ?
+                (const char *const *)cfg->cgroupDeviceACL :
+                defaultDeviceACL;
+
+    if (vm->def->nsounds &&
+        (!vm->def->ngraphics ||
+         ((vm->def->graphics[0]->type == VIR_DOMAIN_GRAPHICS_TYPE_VNC &&
+           cfg->vncAllowHostAudio) ||
+          (vm->def->graphics[0]->type == VIR_DOMAIN_GRAPHICS_TYPE_SDL)))) {
+        rc = virCgroupAllowDeviceMajor(priv->cgroup, 'c', DEVICE_SND_MAJOR,
+                                       VIR_CGROUP_DEVICE_RW);
+        virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_SND_MAJOR,
+                                  "sound", "rw", rc == 0);
+        if (rc != 0) {
+            virReportSystemError(-rc, "%s",
+                                 _("unable to allow /dev/snd/ devices"));
+            goto cleanup;
+        }
+    }
+
+    for (i = 0; deviceACL[i] != NULL ; i++) {
+        if (access(deviceACL[i], F_OK) < 0) {
+            VIR_DEBUG("Ignoring non-existant device %s",
+                      deviceACL[i]);
+            continue;
+        }
+
+        rc = virCgroupAllowDevicePath(priv->cgroup, deviceACL[i],
+                                      VIR_CGROUP_DEVICE_RW);
+        virDomainAuditCgroupPath(vm, priv->cgroup, "allow", deviceACL[i], "rw", rc);
+        if (rc < 0 &&
+            rc != -ENOENT) {
+            virReportSystemError(-rc,
+                                 _("unable to allow device %s"),
+                                 deviceACL[i]);
+            goto cleanup;
+        }
+    }
+
+    if (virDomainChrDefForeach(vm->def,
+                               true,
+                               qemuSetupChardevCgroup,
+                               vm) < 0)
+        goto cleanup;
+
+    if (vm->def->tpm &&
+        (qemuSetupTPMCgroup(vm->def,
+                            vm->def->tpm,
+                            vm) < 0))
+        goto cleanup;
+
+    for (i = 0; i < vm->def->nhostdevs; i++) {
+        if (qemuSetupHostdevCGroup(vm, vm->def->hostdevs[i]) < 0)
+            goto cleanup;
+    }
+
+    ret = 0;
+cleanup:
+    virObjectUnref(cfg);
+    return ret;
+}
+
 int qemuInitCgroup(virQEMUDriverPtr driver,
                    virDomainObjPtr vm,
                    bool startup)
@@ -593,13 +698,7 @@ int qemuSetupCgroup(virQEMUDriverPtr driver,
                     virBitmapPtr nodemask)
 {
     int rc = -1;
-    unsigned int i;
-    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);
     qemuDomainObjPrivatePtr priv = vm->privateData;
-    const char *const *deviceACL =
-        cfg->cgroupDeviceACL ?
-        (const char *const *)cfg->cgroupDeviceACL :
-        defaultDeviceACL;
 
     if (qemuInitCgroup(driver, vm, true) < 0)
         return -1;
@@ -607,87 +706,8 @@ int qemuSetupCgroup(virQEMUDriverPtr driver,
     if (!priv->cgroup)
         goto done;
 
-    if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES)) {
-        rc = virCgroupDenyAllDevices(priv->cgroup);
-        virDomainAuditCgroup(vm, priv->cgroup, "deny", "all", rc == 0);
-        if (rc != 0) {
-            if (rc == -EPERM) {
-                VIR_WARN("Group devices ACL is not accessible, disabling whitelisting");
-                goto done;
-            }
-
-            virReportSystemError(-rc,
-                                 _("Unable to deny all devices for %s"), vm->def->name);
-            goto cleanup;
-        }
-
-        for (i = 0; i < vm->def->ndisks ; i++) {
-            if (qemuSetupDiskCgroup(vm, vm->def->disks[i]) < 0)
-                goto cleanup;
-        }
-
-        rc = virCgroupAllowDeviceMajor(priv->cgroup, 'c', DEVICE_PTY_MAJOR,
-                                       VIR_CGROUP_DEVICE_RW);
-        virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_PTY_MAJOR,
-                                  "pty", "rw", rc == 0);
-        if (rc != 0) {
-            virReportSystemError(-rc, "%s",
-                                 _("unable to allow /dev/pts/ devices"));
-            goto cleanup;
-        }
-
-        if (vm->def->nsounds &&
-            (!vm->def->ngraphics ||
-             ((vm->def->graphics[0]->type == VIR_DOMAIN_GRAPHICS_TYPE_VNC &&
-               cfg->vncAllowHostAudio) ||
-              (vm->def->graphics[0]->type == VIR_DOMAIN_GRAPHICS_TYPE_SDL)))) {
-            rc = virCgroupAllowDeviceMajor(priv->cgroup, 'c', DEVICE_SND_MAJOR,
-                                           VIR_CGROUP_DEVICE_RW);
-            virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_SND_MAJOR,
-                                      "sound", "rw", rc == 0);
-            if (rc != 0) {
-                virReportSystemError(-rc, "%s",
-                                     _("unable to allow /dev/snd/ devices"));
-                goto cleanup;
-            }
-        }
-
-        for (i = 0; deviceACL[i] != NULL ; i++) {
-            if (access(deviceACL[i], F_OK) < 0) {
-                VIR_DEBUG("Ignoring non-existant device %s",
-                          deviceACL[i]);
-                continue;
-            }
-
-            rc = virCgroupAllowDevicePath(priv->cgroup, deviceACL[i],
-                                          VIR_CGROUP_DEVICE_RW);
-            virDomainAuditCgroupPath(vm, priv->cgroup, "allow", deviceACL[i], "rw", rc);
-            if (rc < 0 &&
-                rc != -ENOENT) {
-                virReportSystemError(-rc,
-                                     _("unable to allow device %s"),
-                                     deviceACL[i]);
-                goto cleanup;
-            }
-        }
-
-        if (virDomainChrDefForeach(vm->def,
-                                   true,
-                                   qemuSetupChardevCgroup,
-                                   vm) < 0)
-            goto cleanup;
-
-        if (vm->def->tpm &&
-            (qemuSetupTPMCgroup(vm->def,
-                                vm->def->tpm,
-                                vm) < 0))
-            goto cleanup;
-
-        for (i = 0; i < vm->def->nhostdevs; i++) {
-            if (qemuSetupHostdevCGroup(vm, vm->def->hostdevs[i]) < 0)
-                goto cleanup;
-        }
-    }
+    if (qemuSetupDevicesCgroup(driver, vm) < 0)
+        goto cleanup;
 
     if (qemuSetupBlkioCgroup(vm) < 0)
         goto cleanup;
@@ -740,7 +760,6 @@ int qemuSetupCgroup(virQEMUDriverPtr driver,
 done:
     rc = 0;
 cleanup:
-    virObjectUnref(cfg);
     return rc == 0 ? 0 : -1;
 }
--
1.8.1.4

---
 src/qemu/qemu_cgroup.c | 73 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 47 insertions(+), 26 deletions(-)

diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c
index 057ddaf..f384b98 100644
--- a/src/qemu/qemu_cgroup.c
+++ b/src/qemu/qemu_cgroup.c
@@ -584,6 +584,51 @@ cleanup:
     return ret;
 }
 
+static int
+qemuSetupCpusetCgroup(virDomainObjPtr vm,
+                      virBitmapPtr nodemask)
+{
+    qemuDomainObjPrivatePtr priv = vm->privateData;
+    char *mask = NULL;
+    int rc;
+    int ret = -1;
+
+    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
+        return 0;
+
+    if ((vm->def->numatune.memory.nodemask ||
+         (vm->def->numatune.memory.placement_mode ==
+          VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO)) &&
+        vm->def->numatune.memory.mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) {
+
+        if (vm->def->numatune.memory.placement_mode ==
+            VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO)
+            mask = virBitmapFormat(nodemask);
+        else
+            mask = virBitmapFormat(vm->def->numatune.memory.nodemask);
+
+        if (!mask) {
+            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                           _("failed to convert memory nodemask"));
+            goto cleanup;
+        }
+
+        rc = virCgroupSetCpusetMems(priv->cgroup, mask);
+
+        if (rc != 0) {
+            virReportSystemError(-rc,
+                                 _("Unable to set cpuset.mems for domain %s"),
+                                 vm->def->name);
+            goto cleanup;
+        }
+    }
+
+    ret = 0;
+cleanup:
+    VIR_FREE(mask);
+    return ret;
+}
+
 int qemuInitCgroup(virQEMUDriverPtr driver,
                    virDomainObjPtr vm,
                    bool startup)
@@ -730,32 +775,8 @@ int qemuSetupCgroup(virQEMUDriverPtr driver,
         }
     }
 
-    if ((vm->def->numatune.memory.nodemask ||
-         (vm->def->numatune.memory.placement_mode ==
-          VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO)) &&
-        vm->def->numatune.memory.mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT &&
-        virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) {
-        char *mask = NULL;
-        if (vm->def->numatune.memory.placement_mode ==
-            VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO)
-            mask = virBitmapFormat(nodemask);
-        else
-            mask = virBitmapFormat(vm->def->numatune.memory.nodemask);
-        if (!mask) {
-            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
-                           _("failed to convert memory nodemask"));
-            goto cleanup;
-        }
-
-        rc = virCgroupSetCpusetMems(priv->cgroup, mask);
-        VIR_FREE(mask);
-        if (rc != 0) {
-            virReportSystemError(-rc,
-                                 _("Unable to set cpuset.mems for domain %s"),
-                                 vm->def->name);
-            goto cleanup;
-        }
-    }
+    if (qemuSetupCpusetCgroup(vm, nodemask) < 0)
+        goto cleanup;
 
 done:
     rc = 0;
--
1.8.1.4

---
 src/qemu/qemu_cgroup.c | 63 ++++++++++++++++++++++++++++++--------------------
 1 file changed, 38 insertions(+), 25 deletions(-)

diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c
index f384b98..0e00b47 100644
--- a/src/qemu/qemu_cgroup.c
+++ b/src/qemu/qemu_cgroup.c
@@ -629,6 +629,35 @@ cleanup:
     return ret;
 }
 
+static int
+qemuSetupCpuCgroup(virDomainObjPtr vm)
+{
+    qemuDomainObjPrivatePtr priv = vm->privateData;
+    int rc = -1;
+
+    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
+        if (vm->def->cputune.shares) {
+            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                           _("CPU tuning is not available on this host"));
+            return -1;
+        } else {
+            return 0;
+        }
+    }
+
+    if (vm->def->cputune.shares) {
+        rc = virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares);
+        if (rc != 0) {
+            virReportSystemError(-rc,
+                                 _("Unable to set io cpu shares for domain %s"),
+                                 vm->def->name);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
 int qemuInitCgroup(virQEMUDriverPtr driver,
                    virDomainObjPtr vm,
                    bool startup)
@@ -742,46 +771,30 @@ int qemuSetupCgroup(virQEMUDriverPtr driver,
                     virDomainObjPtr vm,
                     virBitmapPtr nodemask)
 {
-    int rc = -1;
     qemuDomainObjPrivatePtr priv = vm->privateData;
 
     if (qemuInitCgroup(driver, vm, true) < 0)
         return -1;
 
     if (!priv->cgroup)
-        goto done;
+        return 0;
 
     if (qemuSetupDevicesCgroup(driver, vm) < 0)
-        goto cleanup;
+        return -1;
 
     if (qemuSetupBlkioCgroup(vm) < 0)
-        goto cleanup;
+        return -1;
 
     if (qemuSetupMemoryCgroup(vm) < 0)
-        goto cleanup;
-
-    if (vm->def->cputune.shares != 0) {
-        if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
-            rc = virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares);
-            if (rc != 0) {
-                virReportSystemError(-rc,
-                                     _("Unable to set io cpu shares for domain %s"),
-                                     vm->def->name);
-                goto cleanup;
-            }
-        } else {
-            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
-                           _("CPU tuning is not available on this host"));
-        }
-    }
+        return -1;
 
     if (qemuSetupCpusetCgroup(vm, nodemask) < 0)
-        goto cleanup;
+        return -1;
 
-done:
-    rc = 0;
-cleanup:
-    return rc == 0 ? 0 : -1;
+    if (qemuSetupCpuCgroup(vm) < 0)
+        return -1;
+
+    return 0;
 }
 
 int qemuSetupCgroupVcpuBW(virCgroupPtr cgroup, unsigned long long period,
--
1.8.1.4

When either "cpuset" of <vcpu> is specified, or the "placement" of <vcpu> is "auto", only setting the cpuset.mems might cause the guest starting to fail. E.g. ("placement" of both <vcpu> and <numatune> is "auto"): 1) Related XMLs <vcpu placement='auto'>4</vcpu> <numatune> <memory mode='strict' placement='auto'/> </numatune> 2) Host NUMA topology % numactl --hardware available: 8 nodes (0-7) node 0 cpus: 0 4 8 12 16 20 24 28 node 0 size: 16374 MB node 0 free: 11899 MB node 1 cpus: 32 36 40 44 48 52 56 60 node 1 size: 16384 MB node 1 free: 15318 MB node 2 cpus: 2 6 10 14 18 22 26 30 node 2 size: 16384 MB node 2 free: 15766 MB node 3 cpus: 34 38 42 46 50 54 58 62 node 3 size: 16384 MB node 3 free: 15347 MB node 4 cpus: 3 7 11 15 19 23 27 31 node 4 size: 16384 MB node 4 free: 15041 MB node 5 cpus: 35 39 43 47 51 55 59 63 node 5 size: 16384 MB node 5 free: 15202 MB node 6 cpus: 1 5 9 13 17 21 25 29 node 6 size: 16384 MB node 6 free: 15197 MB node 7 cpus: 33 37 41 45 49 53 57 61 node 7 size: 16368 MB node 7 free: 15669 MB 4) cpuset.cpus will be set as: (from debug log) 2013-05-09 16:50:17.296+0000: 417: debug : virCgroupSetValueStr:331 : Set value '/sys/fs/cgroup/cpuset/libvirt/qemu/toy/cpuset.cpus' to '0-63' 5) The advisory nodeset got from querying numad (from debug log) 2013-05-09 16:50:17.295+0000: 417: debug : qemuProcessStart:3614 : Nodeset returned from numad: 1 6) cpuset.mems will be set as: (from debug log) 2013-05-09 16:50:17.296+0000: 417: debug : virCgroupSetValueStr:331 : Set value '/sys/fs/cgroup/cpuset/libvirt/qemu/toy/cpuset.mems' to '0-7' I.E, the domain process's memory is restricted on the first NUMA node, however, it can use all of the CPUs, which will very likely cause the domain process to fail to start because of the kernel fails to allocate memory with the possible mismatching between CPU nodes and memory nodes. % tail -n 20 /var/log/libvirt/qemu/toy.log ... 2013-05-09 05:53:32.972+0000: 7318: debug : virCommandHandshakeChild:377 : Handshake with parent is done char device redirected to /dev/pts/2 (label charserial0) kvm_init_vcpu failed: Cannot allocate memory ... 
---
 src/qemu/qemu_cgroup.c | 39 +++++++++++++++++++++++++++++++++------
 1 file changed, 33 insertions(+), 6 deletions(-)

diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c
index 0e00b47..33eebd7 100644
--- a/src/qemu/qemu_cgroup.c
+++ b/src/qemu/qemu_cgroup.c
@@ -589,7 +589,8 @@ qemuSetupCpusetCgroup(virDomainObjPtr vm,
                       virBitmapPtr nodemask)
 {
     qemuDomainObjPrivatePtr priv = vm->privateData;
-    char *mask = NULL;
+    char *mem_mask = NULL;
+    char *cpu_mask = NULL;
     int rc;
     int ret = -1;
 
@@ -603,17 +604,17 @@ qemuSetupCpusetCgroup(virDomainObjPtr vm,
 
         if (vm->def->numatune.memory.placement_mode ==
             VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO)
-            mask = virBitmapFormat(nodemask);
+            mem_mask = virBitmapFormat(nodemask);
         else
-            mask = virBitmapFormat(vm->def->numatune.memory.nodemask);
+            mem_mask = virBitmapFormat(vm->def->numatune.memory.nodemask);
 
-        if (!mask) {
+        if (!mem_mask) {
             virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                            _("failed to convert memory nodemask"));
             goto cleanup;
         }
 
-        rc = virCgroupSetCpusetMems(priv->cgroup, mask);
+        rc = virCgroupSetCpusetMems(priv->cgroup, mem_mask);
 
         if (rc != 0) {
             virReportSystemError(-rc,
@@ -623,9 +624,35 @@ qemuSetupCpusetCgroup(virDomainObjPtr vm,
         }
     }
 
+    if (vm->def->cpumask ||
+        (vm->def->placement_mode ==
+         VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO)) {
+        if (vm->def->placement_mode ==
+            VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO)
+            cpu_mask = virBitmapFormat(nodemask);
+        else
+            cpu_mask = virBitmapFormat(vm->def->cpumask);
+
+        if (!cpu_mask) {
+            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                           _("failed to convert memory nodemask"));
+            goto cleanup;
+        }
+
+        rc = virCgroupSetCpusetCpus(priv->cgroup, cpu_mask);
+
+        if (rc != 0) {
+            virReportSystemError(-rc,
+                                 _("Unable to set cpuset.cpus for domain %s"),
+                                 vm->def->name);
+            goto cleanup;
+        }
+    }
+
     ret = 0;
 cleanup:
-    VIR_FREE(mask);
+    VIR_FREE(mem_mask);
+    VIR_FREE(cpu_mask);
     return ret;
 }
--
1.8.1.4
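(To see the CPU/memory mismatch concretely, here is a small standalone
illustration of mine, not part of the patch: it prints the CPUs belonging
to the node numad advised, i.e. the kind of set the domain should be
confined to instead of the inherited 0-63. On the topology above, node 1
gives 32 36 40 44 48 52 56 60. Build with -lnuma.)

    #include <stdio.h>
    #include <numa.h>

    int main(void)
    {
        int node = 1;               /* the nodeset numad returned */
        struct bitmask *cpus;
        int i;

        if (numa_available() < 0)
            return 1;

        cpus = numa_allocate_cpumask();
        if (numa_node_to_cpus(node, cpus) < 0)
            return 1;

        for (i = 0; i < numa_num_configured_cpus(); i++) {
            if (numa_bitmask_isbitset(cpus, i))
                printf("%d ", i);
        }
        printf("\n");

        numa_bitmask_free(cpus);
        return 0;
    }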

Hi, Doug

Do you think this fixes your problem?
https://www.redhat.com/archives/libvirt-users/2013-January/msg00071.html

Osier

On 09/05/13 18:22, Osier Yang wrote:
When either "cpuset" of <vcpu> is specified, or the "placement" of <vcpu> is "auto", only setting the cpuset.mems might cause the guest starting to fail. E.g. ("placement" of both <vcpu> and <numatune> is "auto"):
1) Related XMLs <vcpu placement='auto'>4</vcpu> <numatune> <memory mode='strict' placement='auto'/> </numatune>
2) Host NUMA topology % numactl --hardware available: 8 nodes (0-7) node 0 cpus: 0 4 8 12 16 20 24 28 node 0 size: 16374 MB node 0 free: 11899 MB node 1 cpus: 32 36 40 44 48 52 56 60 node 1 size: 16384 MB node 1 free: 15318 MB node 2 cpus: 2 6 10 14 18 22 26 30 node 2 size: 16384 MB node 2 free: 15766 MB node 3 cpus: 34 38 42 46 50 54 58 62 node 3 size: 16384 MB node 3 free: 15347 MB node 4 cpus: 3 7 11 15 19 23 27 31 node 4 size: 16384 MB node 4 free: 15041 MB node 5 cpus: 35 39 43 47 51 55 59 63 node 5 size: 16384 MB node 5 free: 15202 MB node 6 cpus: 1 5 9 13 17 21 25 29 node 6 size: 16384 MB node 6 free: 15197 MB node 7 cpus: 33 37 41 45 49 53 57 61 node 7 size: 16368 MB node 7 free: 15669 MB
4) cpuset.cpus will be set as: (from debug log)
2013-05-09 16:50:17.296+0000: 417: debug : virCgroupSetValueStr:331 : Set value '/sys/fs/cgroup/cpuset/libvirt/qemu/toy/cpuset.cpus' to '0-63'
5) The advisory nodeset got from querying numad (from debug log)
2013-05-09 16:50:17.295+0000: 417: debug : qemuProcessStart:3614 : Nodeset returned from numad: 1
6) cpuset.mems will be set as: (from debug log)
2013-05-09 16:50:17.296+0000: 417: debug : virCgroupSetValueStr:331 : Set value '/sys/fs/cgroup/cpuset/libvirt/qemu/toy/cpuset.mems' to '0-7'
I.E, the domain process's memory is restricted on the first NUMA node, however, it can use all of the CPUs, which will very likely cause the domain process to fail to start because of the kernel fails to allocate memory with the possible mismatching between CPU nodes and memory nodes.
% tail -n 20 /var/log/libvirt/qemu/toy.log ... 2013-05-09 05:53:32.972+0000: 7318: debug : virCommandHandshakeChild:377 : Handshake with parent is done char device redirected to /dev/pts/2 (label charserial0) kvm_init_vcpu failed: Cannot allocate memory ...
--- src/qemu/qemu_cgroup.c | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-)
diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index 0e00b47..33eebd7 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -589,7 +589,8 @@ qemuSetupCpusetCgroup(virDomainObjPtr vm, virBitmapPtr nodemask) { qemuDomainObjPrivatePtr priv = vm->privateData; - char *mask = NULL; + char *mem_mask = NULL; + char *cpu_mask = NULL; int rc; int ret = -1;
@@ -603,17 +604,17 @@ qemuSetupCpusetCgroup(virDomainObjPtr vm,
if (vm->def->numatune.memory.placement_mode == VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO) - mask = virBitmapFormat(nodemask); + mem_mask = virBitmapFormat(nodemask); else - mask = virBitmapFormat(vm->def->numatune.memory.nodemask); + mem_mask = virBitmapFormat(vm->def->numatune.memory.nodemask);
- if (!mask) { + if (!mem_mask) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("failed to convert memory nodemask")); goto cleanup; }
- rc = virCgroupSetCpusetMems(priv->cgroup, mask); + rc = virCgroupSetCpusetMems(priv->cgroup, mem_mask);
if (rc != 0) { virReportSystemError(-rc, @@ -623,9 +624,35 @@ qemuSetupCpusetCgroup(virDomainObjPtr vm, } }
+ if (vm->def->cpumask || + (vm->def->placement_mode == + VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO)) { + if (vm->def->placement_mode == + VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) + cpu_mask = virBitmapFormat(nodemask); + else + cpu_mask = virBitmapFormat(vm->def->cpumask); + + if (!cpu_mask) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("failed to convert memory nodemask")); + goto cleanup; + } + + rc = virCgroupSetCpusetCpus(priv->cgroup, cpu_mask); + + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to set cpuset.cpus for domain %s"), + vm->def->name); + goto cleanup; + } + } + ret = 0; cleanup: - VIR_FREE(mask); + VIR_FREE(mem_mask); + VIR_FREE(cpu_mask); return ret; }

I don't see any reason to allow getting the numa parameters when the
mode is not strict: setNumaParameters doesn't allow setting "nodeset"
unless the mode is strict, and cpuset.mems only understands "strict"
mode. Things could change if we support getting numa parameters with
libnuma one day, but let's prohibit it for now.
---
 src/qemu/qemu_driver.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
index 5005afa..e60f852 100644
--- a/src/qemu/qemu_driver.c
+++ b/src/qemu/qemu_driver.c
@@ -7828,8 +7828,15 @@ qemuDomainGetNumaParameters(virDomainPtr dom,
 
     if (flags & VIR_DOMAIN_AFFECT_LIVE) {
         if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_MEMORY)) {
-            virReportError(VIR_ERR_OPERATION_INVALID,
-                           "%s", _("cgroup memory controller is not mounted"));
+            virReportError(VIR_ERR_OPERATION_INVALID, "%s",
+                           _("cgroup memory controller is not mounted"));
+            goto cleanup;
+        }
+
+        if (vm->def->numatune.memory.mode != VIR_DOMAIN_NUMATUNE_MEM_STRICT) {
+            virReportError(VIR_ERR_OPERATION_INVALID, "%s",
+                           _("getting numa parameters of running domain is "
+                             "only valid for strict numa mode"));
             goto cleanup;
         }
     }
--
1.8.1.4
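(From the client side, the change looks like this; an illustrative sketch
of mine, not from the patch. "rhel6_local" is just an example domain whose
numatune mode is not strict. Build with -lvirt.)

    #include <stdio.h>
    #include <libvirt/libvirt.h>
    #include <libvirt/virterror.h>

    int main(void)
    {
        virConnectPtr conn = virConnectOpen("qemu:///system");
        virDomainPtr dom;
        int nparams = 0;

        if (!conn)
            return 1;

        dom = virDomainLookupByName(conn, "rhel6_local");
        if (dom &&
            virDomainGetNumaParameters(dom, NULL, &nparams,
                                       VIR_DOMAIN_AFFECT_LIVE) < 0) {
            /* With this patch, a running non-strict domain fails here with
             * "getting numa parameters of running domain is only valid
             * for strict numa mode". */
            virErrorPtr err = virGetLastError();
            fprintf(stderr, "%s\n", err ? err->message : "unknown error");
        }

        if (dom)
            virDomainFree(dom);
        virConnectClose(conn);
        return 0;
    }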

When the numatune memory mode is not "strict", cpuset.mems inherits
the parent's setting, which causes problems like:

  % virsh dumpxml rhel6_local | grep interleave -2
    <vcpu placement='static'>2</vcpu>
    <numatune>
      <memory mode='interleave' nodeset='1-2'/>
    </numatune>
    <os>

  % cat /proc/3713/status | grep Mems_allowed_list
  Mems_allowed_list:   0-3

  % virsh numatune rhel6_local
  numa_mode      : interleave
  numa_nodeset   : 0-3

Though the domain process's memory binding is set with libnuma after
the cgroup setting.

The reason for only allowing "strict" mode in the current code is
that cpuset.mems doesn't understand the memory policy modes
(interleave, preferred, strict); it actually always behaves like
"strict" mode ("strict" means the allocation will fail if the memory
cannot be allocated on the target node. Default operation is to fall
back to other nodes. From man numa(3)). However, writing cpuset.mems
even if the numatune memory mode is not strict should be better than
the blind inheritance anyway.
---

However, I'm not comfortable with the solution, since the modes other
than "strict" are not meaningful for cpuset.mems anyway.

Another problem I'm not sure about: will the cpuset.mems setting
affect the libnuma setting? Assume that without this patch the domain
process's cpuset.mems is set to '0-7' (8 NUMA nodes, each with 8
CPUs), the numatune memory mode is "interleave", and libnuma sets the
memory binding to "1-2". With this patch applied, cpuset.mems is set
to "1-2" instead -- any potential problem?

So this patch is mainly for raising the problem, to see if you guys
have any opinions. @hutao, since these codes are from you, any
opinions/ideas? Thanks.

---
 src/qemu/qemu_cgroup.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c
index 33eebd7..22fe25b 100644
--- a/src/qemu/qemu_cgroup.c
+++ b/src/qemu/qemu_cgroup.c
@@ -597,11 +597,9 @@ qemuSetupCpusetCgroup(virDomainObjPtr vm,
     if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
         return 0;
 
-    if ((vm->def->numatune.memory.nodemask ||
-         (vm->def->numatune.memory.placement_mode ==
-          VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO)) &&
-        vm->def->numatune.memory.mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) {
-
+    if (vm->def->numatune.memory.nodemask ||
+        (vm->def->numatune.memory.placement_mode ==
+         VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO)) {
         if (vm->def->numatune.memory.placement_mode ==
             VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO)
             mem_mask = virBitmapFormat(nodemask);
@@ -614,6 +612,16 @@ qemuSetupCpusetCgroup(virDomainObjPtr vm,
             goto cleanup;
         }
 
+        if (vm->def->numatune.memory.mode ==
+            VIR_DOMAIN_NUMATUNE_MEM_PREFERRED &&
+            strlen(mem_mask) != 1) {
+            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                           _("NUMA memory tuning in 'preferred' mode "
+                             "only supports single node"));
+            goto cleanup;
+
+        }
+
         rc = virCgroupSetCpusetMems(priv->cgroup, mem_mask);
 
         if (rc != 0) {
--
1.8.1.4

[CC Hu Tao]

On 09/05/13 18:22, Osier Yang wrote:
[...]

On 09/05/13 18:23, Osier Yang wrote:

Missed the address, :\
[CC Hu Tao]
On 09/05/13 18:22, Osier Yang wrote:
[...]

On Thu, May 09, 2013 at 06:22:17PM +0800, Osier Yang wrote:
> When the numatune memory mode is not "strict", cpuset.mems inherits
> the parent's setting, which causes problems like:
>
>   % virsh dumpxml rhel6_local | grep interleave -2
>     <vcpu placement='static'>2</vcpu>
>     <numatune>
>       <memory mode='interleave' nodeset='1-2'/>
>     </numatune>
>     <os>
>
>   % cat /proc/3713/status | grep Mems_allowed_list
>   Mems_allowed_list:   0-3
>
>   % virsh numatune rhel6_local
>   numa_mode      : interleave
>   numa_nodeset   : 0-3
Yes the information is misleading.
> Though the domain process's memory binding is set with libnuma after
> the cgroup setting.
>
> The reason for only allowing "strict" mode in the current code is
> that cpuset.mems doesn't understand the memory policy modes
> (interleave, preferred, strict); it actually always behaves like
> "strict" mode ("strict" means the allocation will fail if the memory
> cannot be allocated on the target node. Default operation is to fall
> back to other nodes.
Default is localalloc.
> From man numa(3)). However, writing cpuset.mems even if the numatune
> memory mode is not strict should be better than the blind inheritance
> anyway.
It's OK for interleave mode, combined with cpuset.memory_spread_xxx.
But what about preferred mode? Comparing:

strict:    Strict means the allocation will fail if the memory cannot
           be allocated on the target node.

preferred: The system will attempt to allocate memory from the
           preferred node, but will fall back to other nodes if no
           memory is available on the preferred node.
[...]

On 13/05/13 14:46, Hu Tao wrote:
> On Thu, May 09, 2013 at 06:22:17PM +0800, Osier Yang wrote:
>> When the numatune memory mode is not "strict", cpuset.mems inherits
>> the parent's setting, which causes problems like:
>>
>>   % virsh dumpxml rhel6_local | grep interleave -2
>>     <vcpu placement='static'>2</vcpu>
>>     <numatune>
>>       <memory mode='interleave' nodeset='1-2'/>
>>     </numatune>
>>     <os>
>>
>>   % cat /proc/3713/status | grep Mems_allowed_list
>>   Mems_allowed_list:   0-3
>>
>>   % virsh numatune rhel6_local
>>   numa_mode      : interleave
>>   numa_nodeset   : 0-3
>
> Yes the information is misleading.
>> Though the domain process's memory binding is set with libnuma after
>> the cgroup setting.
>>
>> The reason for only allowing "strict" mode in the current code is
>> that cpuset.mems doesn't understand the memory policy modes
>> (interleave, preferred, strict); it actually always behaves like
>> "strict" mode ("strict" means the allocation will fail if the memory
>> cannot be allocated on the target node. Default operation is to fall
>> back to other nodes.
>
> Default is localalloc.
>
>> From man numa(3)). However, writing cpuset.mems even if the numatune
>> memory mode is not strict should be better than the blind inheritance
>> anyway.
>
> It's OK for interleave mode, combined with cpuset.memory_spread_xxx.
  - cpuset.memory_spread_page flag: if set, spread page cache evenly
    on allowed nodes
  - cpuset.memory_spread_slab flag: if set, spread slab cache evenly
    on allowed nodes

Looks reasonable.
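(For the record, setting those flags boils down to writing '1' into the
cpuset group's files; a minimal sketch of mine, not part of any patch
here. The group path is taken from the earlier debug logs and is an
assumption.)

    #include <stdio.h>

    static int
    cpusetSpread(const char *group, const char *file)
    {
        char path[512];
        FILE *fp;

        snprintf(path, sizeof(path), "%s/%s", group, file);
        if (!(fp = fopen(path, "w")))
            return -1;
        fprintf(fp, "1\n");  /* 1 == spread allocations over cpuset.mems */
        return fclose(fp) == 0 ? 0 : -1;
    }

    int main(void)
    {
        const char *group = "/sys/fs/cgroup/cpuset/libvirt/qemu/toy";

        if (cpusetSpread(group, "cpuset.memory_spread_page") < 0 ||
            cpusetSpread(group, "cpuset.memory_spread_slab") < 0)
            return 1;
        return 0;
    }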
> But what about preferred mode? Comparing:
>
> strict:    Strict means the allocation will fail if the memory cannot
>            be allocated on the target node.
>
> preferred: The system will attempt to allocate memory from the
>            preferred node, but will fall back to other nodes if no
>            memory is available on the preferred node.
For "preferred" mode, I have no idea, there is no related cgroup file(s) like memory_spread_*. If we set cpuset.mems with the nodeset, it means the memory allocation will behave like 'strict', which is not expected.
[...]

On 09/05/13 18:22, Osier Yang wrote:
> 1/8 ~ 5/8 are refactorings; the real fixes are 6/8 and 7/8. 8/8 is an RFC.
Ping, except 8/8, for which there is no clear solution yet.
> Osier Yang (8):
>   qemu: Abstract the code for blkio controller setting into a helper
>   qemu: Abstract code for memory controller setting into a helper
>   qemu: Abstract code for devices controller setting into a helper
>   qemu: Abstract code for cpuset controller setting into a helper
>   qemu: Abstract code for the cpu controller setting into a helper
>   qemu: Set cpuset.cpus for domain process
>   qemu: Prohibit getting the numa parameters if mode is not strict
>   qemu: Set cpuset.mems even if the numatune mode is not strict
>  src/qemu/qemu_cgroup.c | 564 +++++++++++++++++++++++++++++--------------------
>  src/qemu/qemu_driver.c |  11 +-
>  2 files changed, 348 insertions(+), 227 deletions(-)