[libvirt] [PATCH 0/7 v2] Fixes for cgroup setting.

/8 ~ 5/8 are refactorings, real fixes are 6/8, 7/8. v1 - v2: * Only rebasing Osier Yang (7): qemu: Abstract the code for blkio controller setting into a helper qemu: Abstract code for memory controller setting into a helper qemu: Abstract code for devices controller setting into a helper qemu: Abstract code for cpuset controller setting into a helper qemu: Abstract code for the cpu controller setting into a helper qemu: Set cpuset.cpus for domain process qemu: Prohibit getting the numa parameters if mode is not strict src/qemu/qemu_cgroup.c | 556 +++++++++++++++++++++++++++++-------------------- src/qemu/qemu_driver.c | 11 +- 2 files changed, 340 insertions(+), 227 deletions(-) -- 1.8.1.4

--- src/qemu/qemu_cgroup.c | 90 ++++++++++++++++++++++++++++---------------------- 1 file changed, 50 insertions(+), 40 deletions(-) diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index 9784f31..0c4792e 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -407,6 +407,53 @@ cleanup: return ret; } +static int +qemuSetupBlkioCgroup(virDomainObjPtr vm) +{ + qemuDomainObjPrivatePtr priv = vm->privateData; + int rc = -1; + int i; + + if (!virCgroupHasController(priv->cgroup, + VIR_CGROUP_CONTROLLER_BLKIO)) { + if (vm->def->blkio.weight || vm->def->blkio.ndevices) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("Block I/O tuning is not available on this host")); + return -1; + } else { + return 0; + } + } + + if (vm->def->blkio.weight != 0) { + rc = virCgroupSetBlkioWeight(priv->cgroup, vm->def->blkio.weight); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to set io weight for domain %s"), + vm->def->name); + return -1; + } + } + + if (vm->def->blkio.ndevices) { + for (i = 0; i < vm->def->blkio.ndevices; i++) { + virBlkioDeviceWeightPtr dw = &vm->def->blkio.devices[i]; + if (!dw->weight) + continue; + rc = virCgroupSetBlkioDeviceWeight(priv->cgroup, dw->path, + dw->weight); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to set io device weight " + "for domain %s"), + vm->def->name); + return -1; + } + } + } + + return 0; +} int qemuInitCgroup(virQEMUDriverPtr driver, virDomainObjPtr vm, @@ -517,7 +564,6 @@ cleanup: return rc; } - int qemuSetupCgroup(virQEMUDriverPtr driver, virDomainObjPtr vm, virBitmapPtr nodemask) @@ -609,7 +655,7 @@ int qemuSetupCgroup(virQEMUDriverPtr driver, if (vm->def->tpm && (qemuSetupTPMCgroup(vm->def, - vm->def->tpm, + vm->def->tpm, vm) < 0)) goto cleanup; @@ -619,44 +665,8 @@ int qemuSetupCgroup(virQEMUDriverPtr driver, } } - if (vm->def->blkio.weight != 0) { - if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_BLKIO)) { - rc = virCgroupSetBlkioWeight(priv->cgroup, vm->def->blkio.weight); - if (rc != 0) { - virReportSystemError(-rc, - _("Unable to set io weight for domain %s"), - vm->def->name); - goto cleanup; - } - } else { - virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", - _("Block I/O tuning is not available on this host")); - goto cleanup; - } - } - - if (vm->def->blkio.ndevices) { - if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_BLKIO)) { - for (i = 0; i < vm->def->blkio.ndevices; i++) { - virBlkioDeviceWeightPtr dw = &vm->def->blkio.devices[i]; - if (!dw->weight) - continue; - rc = virCgroupSetBlkioDeviceWeight(priv->cgroup, dw->path, - dw->weight); - if (rc != 0) { - virReportSystemError(-rc, - _("Unable to set io device weight " - "for domain %s"), - vm->def->name); - goto cleanup; - } - } - } else { - virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", - _("Block I/O tuning is not available on this host")); - goto cleanup; - } - } + if (qemuSetupBlkioCgroup(vm) < 0) + goto cleanup; if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_MEMORY)) { unsigned long long hard_limit = vm->def->mem.hard_limit; -- 1.8.1.4

On Fri, May 17, 2013 at 07:59:31PM +0800, Osier Yang wrote:
--- src/qemu/qemu_cgroup.c | 90 ++++++++++++++++++++++++++++---------------------- 1 file changed, 50 insertions(+), 40 deletions(-)
diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index 9784f31..0c4792e 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -407,6 +407,53 @@ cleanup: return ret; }
+static int +qemuSetupBlkioCgroup(virDomainObjPtr vm) +{ + qemuDomainObjPrivatePtr priv = vm->privateData; + int rc = -1; + int i; + + if (!virCgroupHasController(priv->cgroup, + VIR_CGROUP_CONTROLLER_BLKIO)) { + if (vm->def->blkio.weight || vm->def->blkio.ndevices) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("Block I/O tuning is not available on this host")); + return -1; + } else { + return 0; + } + } + + if (vm->def->blkio.weight != 0) { + rc = virCgroupSetBlkioWeight(priv->cgroup, vm->def->blkio.weight); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to set io weight for domain %s"), + vm->def->name); + return -1; + } + } + + if (vm->def->blkio.ndevices) { + for (i = 0; i < vm->def->blkio.ndevices; i++) { + virBlkioDeviceWeightPtr dw = &vm->def->blkio.devices[i]; + if (!dw->weight) + continue; + rc = virCgroupSetBlkioDeviceWeight(priv->cgroup, dw->path, + dw->weight); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to set io device weight " + "for domain %s"), + vm->def->name); + return -1; + } + } + } + + return 0; +}
Use 2 blank lines here
int qemuInitCgroup(virQEMUDriverPtr driver, virDomainObjPtr vm, @@ -517,7 +564,6 @@ cleanup: return rc; }
-
Keep 2 blank lines between functions
int qemuSetupCgroup(virQEMUDriverPtr driver, virDomainObjPtr vm, virBitmapPtr nodemask) @@ -609,7 +655,7 @@ int qemuSetupCgroup(virQEMUDriverPtr driver,
if (vm->def->tpm && (qemuSetupTPMCgroup(vm->def, - vm->def->tpm, + vm->def->tpm, vm) < 0)) goto cleanup;
@@ -619,44 +665,8 @@ int qemuSetupCgroup(virQEMUDriverPtr driver, } }
- if (vm->def->blkio.weight != 0) { - if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_BLKIO)) { - rc = virCgroupSetBlkioWeight(priv->cgroup, vm->def->blkio.weight); - if (rc != 0) { - virReportSystemError(-rc, - _("Unable to set io weight for domain %s"), - vm->def->name); - goto cleanup; - } - } else { - virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", - _("Block I/O tuning is not available on this host")); - goto cleanup; - } - } - - if (vm->def->blkio.ndevices) { - if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_BLKIO)) { - for (i = 0; i < vm->def->blkio.ndevices; i++) { - virBlkioDeviceWeightPtr dw = &vm->def->blkio.devices[i]; - if (!dw->weight) - continue; - rc = virCgroupSetBlkioDeviceWeight(priv->cgroup, dw->path, - dw->weight); - if (rc != 0) { - virReportSystemError(-rc, - _("Unable to set io device weight " - "for domain %s"), - vm->def->name); - goto cleanup; - } - } - } else { - virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", - _("Block I/O tuning is not available on this host")); - goto cleanup; - } - } + if (qemuSetupBlkioCgroup(vm) < 0) + goto cleanup;
if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_MEMORY)) { unsigned long long hard_limit = vm->def->mem.hard_limit;
ACK with whitespace tweak Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 20/05/13 19:06, Daniel P. Berrange wrote:
On Fri, May 17, 2013 at 07:59:31PM +0800, Osier Yang wrote:
--- src/qemu/qemu_cgroup.c | 90 ++++++++++++++++++++++++++++---------------------- 1 file changed, 50 insertions(+), 40 deletions(-)
diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index 9784f31..0c4792e 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -407,6 +407,53 @@ cleanup: return ret; }
+static int +qemuSetupBlkioCgroup(virDomainObjPtr vm) +{ + qemuDomainObjPrivatePtr priv = vm->privateData; + int rc = -1; + int i; + + if (!virCgroupHasController(priv->cgroup, + VIR_CGROUP_CONTROLLER_BLKIO)) { + if (vm->def->blkio.weight || vm->def->blkio.ndevices) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("Block I/O tuning is not available on this host")); + return -1; + } else { + return 0; + } + } + + if (vm->def->blkio.weight != 0) { + rc = virCgroupSetBlkioWeight(priv->cgroup, vm->def->blkio.weight); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to set io weight for domain %s"), + vm->def->name); + return -1; + } + } + + if (vm->def->blkio.ndevices) { + for (i = 0; i < vm->def->blkio.ndevices; i++) { + virBlkioDeviceWeightPtr dw = &vm->def->blkio.devices[i]; + if (!dw->weight) + continue; + rc = virCgroupSetBlkioDeviceWeight(priv->cgroup, dw->path, + dw->weight); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to set io device weight " + "for domain %s"), + vm->def->name); + return -1; + } + } + } + + return 0; +}
Use 2 blank lines here
int qemuInitCgroup(virQEMUDriverPtr driver, virDomainObjPtr vm, @@ -517,7 +564,6 @@ cleanup: return rc; }
-
Keep 2 blank lines between functions
int qemuSetupCgroup(virQEMUDriverPtr driver, virDomainObjPtr vm, virBitmapPtr nodemask) @@ -609,7 +655,7 @@ int qemuSetupCgroup(virQEMUDriverPtr driver,
if (vm->def->tpm && (qemuSetupTPMCgroup(vm->def, - vm->def->tpm, + vm->def->tpm, vm) < 0)) goto cleanup;
@@ -619,44 +665,8 @@ int qemuSetupCgroup(virQEMUDriverPtr driver, } }
- if (vm->def->blkio.weight != 0) { - if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_BLKIO)) { - rc = virCgroupSetBlkioWeight(priv->cgroup, vm->def->blkio.weight); - if (rc != 0) { - virReportSystemError(-rc, - _("Unable to set io weight for domain %s"), - vm->def->name); - goto cleanup; - } - } else { - virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", - _("Block I/O tuning is not available on this host")); - goto cleanup; - } - } - - if (vm->def->blkio.ndevices) { - if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_BLKIO)) { - for (i = 0; i < vm->def->blkio.ndevices; i++) { - virBlkioDeviceWeightPtr dw = &vm->def->blkio.devices[i]; - if (!dw->weight) - continue; - rc = virCgroupSetBlkioDeviceWeight(priv->cgroup, dw->path, - dw->weight); - if (rc != 0) { - virReportSystemError(-rc, - _("Unable to set io device weight " - "for domain %s"), - vm->def->name); - goto cleanup; - } - } - } else { - virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", - _("Block I/O tuning is not available on this host")); - goto cleanup; - } - } + if (qemuSetupBlkioCgroup(vm) < 0) + goto cleanup;
if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_MEMORY)) { unsigned long long hard_limit = vm->def->mem.hard_limit; ACK with whitespace tweak Pushed with the tweaks.

--- src/qemu/qemu_cgroup.c | 120 ++++++++++++++++++++++++++++--------------------- 1 file changed, 68 insertions(+), 52 deletions(-) diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index 0c4792e..2751be0 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -455,6 +455,72 @@ qemuSetupBlkioCgroup(virDomainObjPtr vm) return 0; } +static int +qemuSetupMemoryCgroup(virDomainObjPtr vm) { + qemuDomainObjPrivatePtr priv = vm->privateData; + unsigned long long hard_limit; + int rc; + int i; + + if (!virCgroupHasController(priv->cgroup,VIR_CGROUP_CONTROLLER_MEMORY)) { + if (vm->def->mem.hard_limit != 0 || + vm->def->mem.soft_limit != 0 || + vm->def->mem.swap_hard_limit != 0) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("Memory cgroup is not available on this host")); + return -1; + } else { + VIR_WARN("Could not autoset a RSS limit for domain %s", vm->def->name); + return 0; + } + } + + hard_limit = vm->def->mem.hard_limit; + if (!hard_limit) { + /* If there is no hard_limit set, set a reasonable one to avoid + * system thrashing caused by exploited qemu. A 'reasonable + * limit' has been chosen: + * (1 + k) * (domain memory + total video memory) + (32MB for + * cache per each disk) + F + * where k = 0.5 and F = 200MB. The cache for disks is important as + * kernel cache on the host side counts into the RSS limit. */ + hard_limit = vm->def->mem.max_balloon; + for (i = 0; i < vm->def->nvideos; i++) + hard_limit += vm->def->videos[i]->vram; + hard_limit = hard_limit * 1.5 + 204800; + hard_limit += vm->def->ndisks * 32768; + } + + rc = virCgroupSetMemoryHardLimit(priv->cgroup, hard_limit); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to set memory hard limit for domain %s"), + vm->def->name); + return -1; + } + if (vm->def->mem.soft_limit != 0) { + rc = virCgroupSetMemorySoftLimit(priv->cgroup, vm->def->mem.soft_limit); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to set memory soft limit for domain %s"), + vm->def->name); + return -1; + } + } + + if (vm->def->mem.swap_hard_limit != 0) { + rc = virCgroupSetMemSwapHardLimit(priv->cgroup, vm->def->mem.swap_hard_limit); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to set swap hard limit for domain %s"), + vm->def->name); + return -1; + } + } + + return 0; +} + int qemuInitCgroup(virQEMUDriverPtr driver, virDomainObjPtr vm, bool startup) @@ -668,58 +734,8 @@ int qemuSetupCgroup(virQEMUDriverPtr driver, if (qemuSetupBlkioCgroup(vm) < 0) goto cleanup; - if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_MEMORY)) { - unsigned long long hard_limit = vm->def->mem.hard_limit; - - if (!hard_limit) { - /* If there is no hard_limit set, set a reasonable one to avoid - * system thrashing caused by exploited qemu. A 'reasonable - * limit' has been chosen: - * (1 + k) * (domain memory + total video memory) + (32MB for - * cache per each disk) + F - * where k = 0.5 and F = 200MB. The cache for disks is important as - * kernel cache on the host side counts into the RSS limit. */ - hard_limit = vm->def->mem.max_balloon; - for (i = 0; i < vm->def->nvideos; i++) - hard_limit += vm->def->videos[i]->vram; - hard_limit = hard_limit * 1.5 + 204800; - hard_limit += vm->def->ndisks * 32768; - } - - rc = virCgroupSetMemoryHardLimit(priv->cgroup, hard_limit); - if (rc != 0) { - virReportSystemError(-rc, - _("Unable to set memory hard limit for domain %s"), - vm->def->name); - goto cleanup; - } - if (vm->def->mem.soft_limit != 0) { - rc = virCgroupSetMemorySoftLimit(priv->cgroup, vm->def->mem.soft_limit); - if (rc != 0) { - virReportSystemError(-rc, - _("Unable to set memory soft limit for domain %s"), - vm->def->name); - goto cleanup; - } - } - - if (vm->def->mem.swap_hard_limit != 0) { - rc = virCgroupSetMemSwapHardLimit(priv->cgroup, vm->def->mem.swap_hard_limit); - if (rc != 0) { - virReportSystemError(-rc, - _("Unable to set swap hard limit for domain %s"), - vm->def->name); - goto cleanup; - } - } - } else if (vm->def->mem.hard_limit != 0 || - vm->def->mem.soft_limit != 0 || - vm->def->mem.swap_hard_limit != 0) { - virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", - _("Memory cgroup is not available on this host")); - } else { - VIR_WARN("Could not autoset a RSS limit for domain %s", vm->def->name); - } + if (qemuSetupMemoryCgroup(vm) < 0) + goto cleanup; if (vm->def->cputune.shares != 0) { if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) { -- 1.8.1.4

On Fri, May 17, 2013 at 07:59:32PM +0800, Osier Yang wrote:
--- src/qemu/qemu_cgroup.c | 120 ++++++++++++++++++++++++++++--------------------- 1 file changed, 68 insertions(+), 52 deletions(-)
diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index 0c4792e..2751be0 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -455,6 +455,72 @@ qemuSetupBlkioCgroup(virDomainObjPtr vm) return 0; }
My email client is showing trailing whitespace here. Not sure if that is genuine or not. Also 2 blank lines between functions
+static int +qemuSetupMemoryCgroup(virDomainObjPtr vm) {
Put the '{' on a new line as you did with previous patches
+ qemuDomainObjPrivatePtr priv = vm->privateData; + unsigned long long hard_limit; + int rc; + int i; + + if (!virCgroupHasController(priv->cgroup,VIR_CGROUP_CONTROLLER_MEMORY)) { + if (vm->def->mem.hard_limit != 0 || + vm->def->mem.soft_limit != 0 || + vm->def->mem.swap_hard_limit != 0) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("Memory cgroup is not available on this host")); + return -1; + } else { + VIR_WARN("Could not autoset a RSS limit for domain %s", vm->def->name); + return 0; + }
Not sure why we need this VIR_WARN at all. If no limits are set in the XML, then we should not warn about a missing feature that we don't actually need.
@@ -668,58 +734,8 @@ int qemuSetupCgroup(virQEMUDriverPtr driver, if (qemuSetupBlkioCgroup(vm) < 0) goto cleanup;
- if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_MEMORY)) { - unsigned long long hard_limit = vm->def->mem.hard_limit; - - if (!hard_limit) { - /* If there is no hard_limit set, set a reasonable one to avoid - * system thrashing caused by exploited qemu. A 'reasonable - * limit' has been chosen: - * (1 + k) * (domain memory + total video memory) + (32MB for - * cache per each disk) + F - * where k = 0.5 and F = 200MB. The cache for disks is important as - * kernel cache on the host side counts into the RSS limit. */ - hard_limit = vm->def->mem.max_balloon; - for (i = 0; i < vm->def->nvideos; i++) - hard_limit += vm->def->videos[i]->vram; - hard_limit = hard_limit * 1.5 + 204800; - hard_limit += vm->def->ndisks * 32768; - } - - rc = virCgroupSetMemoryHardLimit(priv->cgroup, hard_limit); - if (rc != 0) { - virReportSystemError(-rc, - _("Unable to set memory hard limit for domain %s"), - vm->def->name); - goto cleanup; - } - if (vm->def->mem.soft_limit != 0) { - rc = virCgroupSetMemorySoftLimit(priv->cgroup, vm->def->mem.soft_limit); - if (rc != 0) { - virReportSystemError(-rc, - _("Unable to set memory soft limit for domain %s"), - vm->def->name); - goto cleanup; - } - } - - if (vm->def->mem.swap_hard_limit != 0) { - rc = virCgroupSetMemSwapHardLimit(priv->cgroup, vm->def->mem.swap_hard_limit); - if (rc != 0) { - virReportSystemError(-rc, - _("Unable to set swap hard limit for domain %s"), - vm->def->name); - goto cleanup; - } - } - } else if (vm->def->mem.hard_limit != 0 || - vm->def->mem.soft_limit != 0 || - vm->def->mem.swap_hard_limit != 0) { - virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", - _("Memory cgroup is not available on this host")); - } else { - VIR_WARN("Could not autoset a RSS limit for domain %s", vm->def->name); - } + if (qemuSetupMemoryCgroup(vm) < 0) + goto cleanup;
ACK with warning removed Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 20/05/13 19:08, Daniel P. Berrange wrote:
On Fri, May 17, 2013 at 07:59:32PM +0800, Osier Yang wrote:
--- src/qemu/qemu_cgroup.c | 120 ++++++++++++++++++++++++++++--------------------- 1 file changed, 68 insertions(+), 52 deletions(-)
diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index 0c4792e..2751be0 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -455,6 +455,72 @@ qemuSetupBlkioCgroup(virDomainObjPtr vm) return 0; }
My email client is showing trailing whitespace here. Not sure if that is genuine or not. Also 2 blank lines between functions
I found no trailing spaces.
+static int +qemuSetupMemoryCgroup(virDomainObjPtr vm) { Put the '{' on a new line as you did with previous patches
Okay.
+ qemuDomainObjPrivatePtr priv = vm->privateData; + unsigned long long hard_limit; + int rc; + int i; + + if (!virCgroupHasController(priv->cgroup,VIR_CGROUP_CONTROLLER_MEMORY)) { + if (vm->def->mem.hard_limit != 0 || + vm->def->mem.soft_limit != 0 || + vm->def->mem.swap_hard_limit != 0) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("Memory cgroup is not available on this host")); + return -1; + } else { + VIR_WARN("Could not autoset a RSS limit for domain %s", vm->def->name); + return 0; + } Not sure why we need this VIR_WARN at all. If no limits are set in the XML, then we should not warn about a missing feature that we don't actually need.
Agreed. Having a warning for no XML config is confused. I removed it.
@@ -668,58 +734,8 @@ int qemuSetupCgroup(virQEMUDriverPtr driver, if (qemuSetupBlkioCgroup(vm) < 0) goto cleanup;
- if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_MEMORY)) { - unsigned long long hard_limit = vm->def->mem.hard_limit; - - if (!hard_limit) { - /* If there is no hard_limit set, set a reasonable one to avoid - * system thrashing caused by exploited qemu. A 'reasonable - * limit' has been chosen: - * (1 + k) * (domain memory + total video memory) + (32MB for - * cache per each disk) + F - * where k = 0.5 and F = 200MB. The cache for disks is important as - * kernel cache on the host side counts into the RSS limit. */ - hard_limit = vm->def->mem.max_balloon; - for (i = 0; i < vm->def->nvideos; i++) - hard_limit += vm->def->videos[i]->vram; - hard_limit = hard_limit * 1.5 + 204800; - hard_limit += vm->def->ndisks * 32768; - } - - rc = virCgroupSetMemoryHardLimit(priv->cgroup, hard_limit); - if (rc != 0) { - virReportSystemError(-rc, - _("Unable to set memory hard limit for domain %s"), - vm->def->name); - goto cleanup; - } - if (vm->def->mem.soft_limit != 0) { - rc = virCgroupSetMemorySoftLimit(priv->cgroup, vm->def->mem.soft_limit); - if (rc != 0) { - virReportSystemError(-rc, - _("Unable to set memory soft limit for domain %s"), - vm->def->name); - goto cleanup; - } - } - - if (vm->def->mem.swap_hard_limit != 0) { - rc = virCgroupSetMemSwapHardLimit(priv->cgroup, vm->def->mem.swap_hard_limit); - if (rc != 0) { - virReportSystemError(-rc, - _("Unable to set swap hard limit for domain %s"), - vm->def->name); - goto cleanup; - } - } - } else if (vm->def->mem.hard_limit != 0 || - vm->def->mem.soft_limit != 0 || - vm->def->mem.swap_hard_limit != 0) { - virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", - _("Memory cgroup is not available on this host")); - } else { - VIR_WARN("Could not autoset a RSS limit for domain %s", vm->def->name); - } + if (qemuSetupMemoryCgroup(vm) < 0) + goto cleanup; ACK with warning removed
Pushed. Thanks. Osier

On 05/20/2013 01:35 PM, Osier Yang wrote:
(!virCgroupHasController(priv->cgroup,VIR_CGROUP_CONTROLLER_MEMORY)) { + if (vm->def->mem.hard_limit != 0 || + vm->def->mem.soft_limit != 0 || + vm->def->mem.swap_hard_limit != 0) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("Memory cgroup is not available on this host")); + return -1; + } else { + VIR_WARN("Could not autoset a RSS limit for domain %s", vm->def->name); + return 0; + } Not sure why we need this VIR_WARN at all. If no limits are set in the XML, then we should not warn about a missing feature that we don't actually need.
Agreed. Having a warning for no XML config is confused. I removed it.
We may not need the warning, but the return 0 must stay. I can't start guests on my system with no memory controller after this commit. -- Mit freundlichen Grüßen/Kind Regards Viktor Mihajlovski IBM Deutschland Research & Development GmbH Vorsitzender des Aufsichtsrats: Martina Köderitz Geschäftsführung: Dirk Wittkopp Sitz der Gesellschaft: Böblingen Registergericht: Amtsgericht Stuttgart, HRB 243294

On Tue, May 21, 2013 at 04:56:37PM +0200, Viktor Mihajlovski wrote:
On 05/20/2013 01:35 PM, Osier Yang wrote:
(!virCgroupHasController(priv->cgroup,VIR_CGROUP_CONTROLLER_MEMORY)) { + if (vm->def->mem.hard_limit != 0 || + vm->def->mem.soft_limit != 0 || + vm->def->mem.swap_hard_limit != 0) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("Memory cgroup is not available on this host")); + return -1; + } else { + VIR_WARN("Could not autoset a RSS limit for domain %s", vm->def->name); + return 0; + } Not sure why we need this VIR_WARN at all. If no limits are set in the XML, then we should not warn about a missing feature that we don't actually need.
Agreed. Having a warning for no XML config is confused. I removed it.
We may not need the warning, but the return 0 must stay. I can't start guests on my system with no memory controller after this commit.
Yes, absolutely. I only suggested killing the warning, the 'return 0' must remain for sure. Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 21/05/13 22:56, Viktor Mihajlovski wrote:
On 05/20/2013 01:35 PM, Osier Yang wrote:
(!virCgroupHasController(priv->cgroup,VIR_CGROUP_CONTROLLER_MEMORY)) { + if (vm->def->mem.hard_limit != 0 || + vm->def->mem.soft_limit != 0 || + vm->def->mem.swap_hard_limit != 0) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("Memory cgroup is not available on this host")); + return -1; + } else { + VIR_WARN("Could not autoset a RSS limit for domain %s", vm->def->name); + return 0; + } Not sure why we need this VIR_WARN at all. If no limits are set in the XML, then we should not warn about a missing feature that we don't actually need.
Agreed. Having a warning for no XML config is confused. I removed it.
We may not need the warning, but the return 0 must stay. I can't start guests on my system with no memory controller after this commit.
oops, I blindly removed it. Will fix soon.

--- src/qemu/qemu_cgroup.c | 195 +++++++++++++++++++++++++++---------------------- 1 file changed, 107 insertions(+), 88 deletions(-) diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index 2751be0..1e8afb1 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -521,6 +521,111 @@ qemuSetupMemoryCgroup(virDomainObjPtr vm) { return 0; } +static int +qemuSetupDevicesCgroup(virQEMUDriverPtr driver, + virDomainObjPtr vm) +{ + qemuDomainObjPrivatePtr priv = vm->privateData; + virQEMUDriverConfigPtr cfg = NULL; + const char *const *deviceACL = NULL; + int rc = -1; + int ret = -1; + int i; + + if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES)) + return 0; + + rc = virCgroupDenyAllDevices(priv->cgroup); + virDomainAuditCgroup(vm, priv->cgroup, "deny", "all", rc == 0); + if (rc != 0) { + if (rc == -EPERM) { + VIR_WARN("Group devices ACL is not accessible, disabling whitelisting"); + return 0; + } + + virReportSystemError(-rc, + _("Unable to deny all devices for %s"), vm->def->name); + goto cleanup; + } + + for (i = 0; i < vm->def->ndisks ; i++) { + if (qemuSetupDiskCgroup(vm, vm->def->disks[i]) < 0) + goto cleanup; + } + + rc = virCgroupAllowDeviceMajor(priv->cgroup, 'c', DEVICE_PTY_MAJOR, + VIR_CGROUP_DEVICE_RW); + virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_PTY_MAJOR, + "pty", "rw", rc == 0); + if (rc != 0) { + virReportSystemError(-rc, "%s", + _("unable to allow /dev/pts/ devices")); + goto cleanup; + } + + cfg = virQEMUDriverGetConfig(driver); + deviceACL = cfg->cgroupDeviceACL ? + (const char *const *)cfg->cgroupDeviceACL : + defaultDeviceACL; + + if (vm->def->nsounds && + (!vm->def->ngraphics || + ((vm->def->graphics[0]->type == VIR_DOMAIN_GRAPHICS_TYPE_VNC && + cfg->vncAllowHostAudio) || + (vm->def->graphics[0]->type == VIR_DOMAIN_GRAPHICS_TYPE_SDL)))) { + rc = virCgroupAllowDeviceMajor(priv->cgroup, 'c', DEVICE_SND_MAJOR, + VIR_CGROUP_DEVICE_RW); + virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_SND_MAJOR, + "sound", "rw", rc == 0); + if (rc != 0) { + virReportSystemError(-rc, "%s", + _("unable to allow /dev/snd/ devices")); + goto cleanup; + } + } + + for (i = 0; deviceACL[i] != NULL ; i++) { + if (access(deviceACL[i], F_OK) < 0) { + VIR_DEBUG("Ignoring non-existant device %s", + deviceACL[i]); + continue; + } + + rc = virCgroupAllowDevicePath(priv->cgroup, deviceACL[i], + VIR_CGROUP_DEVICE_RW); + virDomainAuditCgroupPath(vm, priv->cgroup, "allow", deviceACL[i], "rw", rc); + if (rc < 0 && + rc != -ENOENT) { + virReportSystemError(-rc, + _("unable to allow device %s"), + deviceACL[i]); + goto cleanup; + } + } + + if (virDomainChrDefForeach(vm->def, + true, + qemuSetupChardevCgroup, + vm) < 0) + goto cleanup; + + if (vm->def->tpm && + (qemuSetupTPMCgroup(vm->def, + vm->def->tpm, + vm) < 0)) + goto cleanup; + + for (i = 0; i < vm->def->nhostdevs; i++) { + if (qemuSetupHostdevCGroup(vm, vm->def->hostdevs[i]) < 0) + goto cleanup; + } + + ret = 0; +cleanup: + virObjectUnref(cfg); + return ret; +} + int qemuInitCgroup(virQEMUDriverPtr driver, virDomainObjPtr vm, bool startup) @@ -635,13 +740,7 @@ int qemuSetupCgroup(virQEMUDriverPtr driver, virBitmapPtr nodemask) { int rc = -1; - unsigned int i; - virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver); qemuDomainObjPrivatePtr priv = vm->privateData; - const char *const *deviceACL = - cfg->cgroupDeviceACL ? - (const char *const *)cfg->cgroupDeviceACL : - defaultDeviceACL; if (qemuInitCgroup(driver, vm, true) < 0) return -1; @@ -649,87 +748,8 @@ int qemuSetupCgroup(virQEMUDriverPtr driver, if (!priv->cgroup) goto done; - if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES)) { - rc = virCgroupDenyAllDevices(priv->cgroup); - virDomainAuditCgroup(vm, priv->cgroup, "deny", "all", rc == 0); - if (rc != 0) { - if (rc == -EPERM) { - VIR_WARN("Group devices ACL is not accessible, disabling whitelisting"); - goto done; - } - - virReportSystemError(-rc, - _("Unable to deny all devices for %s"), vm->def->name); - goto cleanup; - } - - for (i = 0; i < vm->def->ndisks ; i++) { - if (qemuSetupDiskCgroup(vm, vm->def->disks[i]) < 0) - goto cleanup; - } - - rc = virCgroupAllowDeviceMajor(priv->cgroup, 'c', DEVICE_PTY_MAJOR, - VIR_CGROUP_DEVICE_RW); - virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_PTY_MAJOR, - "pty", "rw", rc == 0); - if (rc != 0) { - virReportSystemError(-rc, "%s", - _("unable to allow /dev/pts/ devices")); - goto cleanup; - } - - if (vm->def->nsounds && - (!vm->def->ngraphics || - ((vm->def->graphics[0]->type == VIR_DOMAIN_GRAPHICS_TYPE_VNC && - cfg->vncAllowHostAudio) || - (vm->def->graphics[0]->type == VIR_DOMAIN_GRAPHICS_TYPE_SDL)))) { - rc = virCgroupAllowDeviceMajor(priv->cgroup, 'c', DEVICE_SND_MAJOR, - VIR_CGROUP_DEVICE_RW); - virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_SND_MAJOR, - "sound", "rw", rc == 0); - if (rc != 0) { - virReportSystemError(-rc, "%s", - _("unable to allow /dev/snd/ devices")); - goto cleanup; - } - } - - for (i = 0; deviceACL[i] != NULL ; i++) { - if (access(deviceACL[i], F_OK) < 0) { - VIR_DEBUG("Ignoring non-existant device %s", - deviceACL[i]); - continue; - } - - rc = virCgroupAllowDevicePath(priv->cgroup, deviceACL[i], - VIR_CGROUP_DEVICE_RW); - virDomainAuditCgroupPath(vm, priv->cgroup, "allow", deviceACL[i], "rw", rc); - if (rc < 0 && - rc != -ENOENT) { - virReportSystemError(-rc, - _("unable to allow device %s"), - deviceACL[i]); - goto cleanup; - } - } - - if (virDomainChrDefForeach(vm->def, - true, - qemuSetupChardevCgroup, - vm) < 0) - goto cleanup; - - if (vm->def->tpm && - (qemuSetupTPMCgroup(vm->def, - vm->def->tpm, - vm) < 0)) - goto cleanup; - - for (i = 0; i < vm->def->nhostdevs; i++) { - if (qemuSetupHostdevCGroup(vm, vm->def->hostdevs[i]) < 0) - goto cleanup; - } - } + if (qemuSetupDevicesCgroup(driver, vm) < 0) + goto cleanup; if (qemuSetupBlkioCgroup(vm) < 0) goto cleanup; @@ -782,7 +802,6 @@ int qemuSetupCgroup(virQEMUDriverPtr driver, done: rc = 0; cleanup: - virObjectUnref(cfg); return rc == 0 ? 0 : -1; } -- 1.8.1.4

On Fri, May 17, 2013 at 07:59:33PM +0800, Osier Yang wrote:
--- src/qemu/qemu_cgroup.c | 195 +++++++++++++++++++++++++++---------------------- 1 file changed, 107 insertions(+), 88 deletions(-)
ACK Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 20/05/13 19:10, Daniel P. Berrange wrote:
On Fri, May 17, 2013 at 07:59:33PM +0800, Osier Yang wrote:
--- src/qemu/qemu_cgroup.c | 195 +++++++++++++++++++++++++++---------------------- 1 file changed, 107 insertions(+), 88 deletions(-) ACK
Pushed.

--- src/qemu/qemu_cgroup.c | 73 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 26 deletions(-) diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index 1e8afb1..8f84ef9 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -626,6 +626,51 @@ cleanup: return ret; } +static int +qemuSetupCpusetCgroup(virDomainObjPtr vm, + virBitmapPtr nodemask) +{ + qemuDomainObjPrivatePtr priv = vm->privateData; + char *mask = NULL; + int rc; + int ret = -1; + + if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) + return 0; + + if ((vm->def->numatune.memory.nodemask || + (vm->def->numatune.memory.placement_mode == + VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO)) && + vm->def->numatune.memory.mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) { + + if (vm->def->numatune.memory.placement_mode == + VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO) + mask = virBitmapFormat(nodemask); + else + mask = virBitmapFormat(vm->def->numatune.memory.nodemask); + + if (!mask) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("failed to convert memory nodemask")); + goto cleanup; + } + + rc = virCgroupSetCpusetMems(priv->cgroup, mask); + + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to set cpuset.mems for domain %s"), + vm->def->name); + goto cleanup; + } + } + + ret = 0; +cleanup: + VIR_FREE(mask); + return ret; +} + int qemuInitCgroup(virQEMUDriverPtr driver, virDomainObjPtr vm, bool startup) @@ -772,32 +817,8 @@ int qemuSetupCgroup(virQEMUDriverPtr driver, } } - if ((vm->def->numatune.memory.nodemask || - (vm->def->numatune.memory.placement_mode == - VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO)) && - vm->def->numatune.memory.mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT && - virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) { - char *mask = NULL; - if (vm->def->numatune.memory.placement_mode == - VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO) - mask = virBitmapFormat(nodemask); - else - mask = virBitmapFormat(vm->def->numatune.memory.nodemask); - if (!mask) { - virReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("failed to convert memory nodemask")); - goto cleanup; - } - - rc = virCgroupSetCpusetMems(priv->cgroup, mask); - VIR_FREE(mask); - if (rc != 0) { - virReportSystemError(-rc, - _("Unable to set cpuset.mems for domain %s"), - vm->def->name); - goto cleanup; - } - } + if (qemuSetupCpusetCgroup(vm, nodemask) < 0) + goto cleanup; done: rc = 0; -- 1.8.1.4

On Fri, May 17, 2013 at 07:59:34PM +0800, Osier Yang wrote:
--- src/qemu/qemu_cgroup.c | 73 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 26 deletions(-)
diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index 1e8afb1..8f84ef9 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -626,6 +626,51 @@ cleanup: return ret; }
+static int +qemuSetupCpusetCgroup(virDomainObjPtr vm, + virBitmapPtr nodemask) +{ + qemuDomainObjPrivatePtr priv = vm->privateData; + char *mask = NULL; + int rc; + int ret = -1; + + if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) + return 0; + + if ((vm->def->numatune.memory.nodemask || + (vm->def->numatune.memory.placement_mode == + VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO)) && + vm->def->numatune.memory.mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) { + + if (vm->def->numatune.memory.placement_mode == + VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO) + mask = virBitmapFormat(nodemask); + else + mask = virBitmapFormat(vm->def->numatune.memory.nodemask); + + if (!mask) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("failed to convert memory nodemask")); + goto cleanup; + } + + rc = virCgroupSetCpusetMems(priv->cgroup, mask); + + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to set cpuset.mems for domain %s"), + vm->def->name); + goto cleanup; + } + } + + ret = 0; +cleanup: + VIR_FREE(mask); + return ret; +} +
Add a 2nd blank line
int qemuInitCgroup(virQEMUDriverPtr driver, virDomainObjPtr vm, bool startup) @@ -772,32 +817,8 @@ int qemuSetupCgroup(virQEMUDriverPtr driver, } }
- if ((vm->def->numatune.memory.nodemask || - (vm->def->numatune.memory.placement_mode == - VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO)) && - vm->def->numatune.memory.mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT && - virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) { - char *mask = NULL; - if (vm->def->numatune.memory.placement_mode == - VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO) - mask = virBitmapFormat(nodemask); - else - mask = virBitmapFormat(vm->def->numatune.memory.nodemask); - if (!mask) { - virReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("failed to convert memory nodemask")); - goto cleanup; - } - - rc = virCgroupSetCpusetMems(priv->cgroup, mask); - VIR_FREE(mask); - if (rc != 0) { - virReportSystemError(-rc, - _("Unable to set cpuset.mems for domain %s"), - vm->def->name); - goto cleanup; - } - } + if (qemuSetupCpusetCgroup(vm, nodemask) < 0) + goto cleanup;
done: rc = 0;
ACK with whitespace tweak Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 20/05/13 19:10, Daniel P. Berrange wrote:
On Fri, May 17, 2013 at 07:59:34PM +0800, Osier Yang wrote:
--- src/qemu/qemu_cgroup.c | 73 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 26 deletions(-)
diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index 1e8afb1..8f84ef9 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -626,6 +626,51 @@ cleanup: return ret; }
+static int +qemuSetupCpusetCgroup(virDomainObjPtr vm, + virBitmapPtr nodemask) +{ + qemuDomainObjPrivatePtr priv = vm->privateData; + char *mask = NULL; + int rc; + int ret = -1; + + if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) + return 0; + + if ((vm->def->numatune.memory.nodemask || + (vm->def->numatune.memory.placement_mode == + VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO)) && + vm->def->numatune.memory.mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) { + + if (vm->def->numatune.memory.placement_mode == + VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO) + mask = virBitmapFormat(nodemask); + else + mask = virBitmapFormat(vm->def->numatune.memory.nodemask); + + if (!mask) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("failed to convert memory nodemask")); + goto cleanup; + } + + rc = virCgroupSetCpusetMems(priv->cgroup, mask); + + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to set cpuset.mems for domain %s"), + vm->def->name); + goto cleanup; + } + } + + ret = 0; +cleanup: + VIR_FREE(mask); + return ret; +} +
Add a 2nd blank line
int qemuInitCgroup(virQEMUDriverPtr driver, virDomainObjPtr vm, bool startup) @@ -772,32 +817,8 @@ int qemuSetupCgroup(virQEMUDriverPtr driver, } }
- if ((vm->def->numatune.memory.nodemask || - (vm->def->numatune.memory.placement_mode == - VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO)) && - vm->def->numatune.memory.mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT && - virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) { - char *mask = NULL; - if (vm->def->numatune.memory.placement_mode == - VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO) - mask = virBitmapFormat(nodemask); - else - mask = virBitmapFormat(vm->def->numatune.memory.nodemask); - if (!mask) { - virReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("failed to convert memory nodemask")); - goto cleanup; - } - - rc = virCgroupSetCpusetMems(priv->cgroup, mask); - VIR_FREE(mask); - if (rc != 0) { - virReportSystemError(-rc, - _("Unable to set cpuset.mems for domain %s"), - vm->def->name); - goto cleanup; - } - } + if (qemuSetupCpusetCgroup(vm, nodemask) < 0) + goto cleanup;
done: rc = 0; ACK with whitespace tweak
Pushed with the tweaks.

On 20/05/13 19:10, Daniel P. Berrange wrote:
On Fri, May 17, 2013 at 07:59:34PM +0800, Osier Yang wrote:
--- src/qemu/qemu_cgroup.c | 73 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 26 deletions(-)
diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index 1e8afb1..8f84ef9 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -626,6 +626,51 @@ cleanup: return ret; }
+static int +qemuSetupCpusetCgroup(virDomainObjPtr vm, + virBitmapPtr nodemask) +{ + qemuDomainObjPrivatePtr priv = vm->privateData; + char *mask = NULL; + int rc; + int ret = -1; + + if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) + return 0; + + if ((vm->def->numatune.memory.nodemask || + (vm->def->numatune.memory.placement_mode == + VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO)) && + vm->def->numatune.memory.mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) { + + if (vm->def->numatune.memory.placement_mode == + VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO) + mask = virBitmapFormat(nodemask); + else + mask = virBitmapFormat(vm->def->numatune.memory.nodemask); + + if (!mask) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("failed to convert memory nodemask")); + goto cleanup; + } + + rc = virCgroupSetCpusetMems(priv->cgroup, mask); + + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to set cpuset.mems for domain %s"), + vm->def->name); + goto cleanup; + } + } + + ret = 0; +cleanup: + VIR_FREE(mask); + return ret; +} +
Add a 2nd blank line
int qemuInitCgroup(virQEMUDriverPtr driver, virDomainObjPtr vm, bool startup) @@ -772,32 +817,8 @@ int qemuSetupCgroup(virQEMUDriverPtr driver, } }
- if ((vm->def->numatune.memory.nodemask || - (vm->def->numatune.memory.placement_mode == - VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO)) && - vm->def->numatune.memory.mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT && - virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) { - char *mask = NULL; - if (vm->def->numatune.memory.placement_mode == - VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO) - mask = virBitmapFormat(nodemask); - else - mask = virBitmapFormat(vm->def->numatune.memory.nodemask); - if (!mask) { - virReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("failed to convert memory nodemask")); - goto cleanup; - } - - rc = virCgroupSetCpusetMems(priv->cgroup, mask); - VIR_FREE(mask); - if (rc != 0) { - virReportSystemError(-rc, - _("Unable to set cpuset.mems for domain %s"), - vm->def->name); - goto cleanup; - } - } + if (qemuSetupCpusetCgroup(vm, nodemask) < 0) + goto cleanup;
done: rc = 0; ACK with whitespace tweak
Pushed with the tweak.

--- src/qemu/qemu_cgroup.c | 63 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 25 deletions(-) diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index 8f84ef9..8a2cc9d 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -671,6 +671,35 @@ cleanup: return ret; } +static int +qemuSetupCpuCgroup(virDomainObjPtr vm) +{ + qemuDomainObjPrivatePtr priv = vm->privateData; + int rc = -1; + + if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) { + if (vm->def->cputune.shares) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("CPU tuning is not available on this host")); + return -1; + } else { + return 0; + } + } + + if (vm->def->cputune.shares) { + rc = virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to set io cpu shares for domain %s"), + vm->def->name); + return -1; + } + } + + return 0; +} + int qemuInitCgroup(virQEMUDriverPtr driver, virDomainObjPtr vm, bool startup) @@ -784,46 +813,30 @@ int qemuSetupCgroup(virQEMUDriverPtr driver, virDomainObjPtr vm, virBitmapPtr nodemask) { - int rc = -1; qemuDomainObjPrivatePtr priv = vm->privateData; if (qemuInitCgroup(driver, vm, true) < 0) return -1; if (!priv->cgroup) - goto done; + return 0; if (qemuSetupDevicesCgroup(driver, vm) < 0) - goto cleanup; + return -1; if (qemuSetupBlkioCgroup(vm) < 0) - goto cleanup; + return -1; if (qemuSetupMemoryCgroup(vm) < 0) - goto cleanup; - - if (vm->def->cputune.shares != 0) { - if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) { - rc = virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares); - if (rc != 0) { - virReportSystemError(-rc, - _("Unable to set io cpu shares for domain %s"), - vm->def->name); - goto cleanup; - } - } else { - virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", - _("CPU tuning is not available on this host")); - } - } + return -1; if (qemuSetupCpusetCgroup(vm, nodemask) < 0) - goto cleanup; + return -1; -done: - rc = 0; -cleanup: - return rc == 0 ? 0 : -1; + if (qemuSetupCpuCgroup(vm) < 0) + return -1; + + return 0; } int qemuSetupCgroupVcpuBW(virCgroupPtr cgroup, unsigned long long period, -- 1.8.1.4

On Fri, May 17, 2013 at 07:59:35PM +0800, Osier Yang wrote:
--- src/qemu/qemu_cgroup.c | 63 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 25 deletions(-)
diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index 8f84ef9..8a2cc9d 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -671,6 +671,35 @@ cleanup: return ret; }
+static int +qemuSetupCpuCgroup(virDomainObjPtr vm) +{ + qemuDomainObjPrivatePtr priv = vm->privateData; + int rc = -1; + + if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) { + if (vm->def->cputune.shares) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("CPU tuning is not available on this host")); + return -1; + } else { + return 0; + } + } + + if (vm->def->cputune.shares) { + rc = virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to set io cpu shares for domain %s"), + vm->def->name); + return -1; + } + } + + return 0; +} +
+ blank line
int qemuInitCgroup(virQEMUDriverPtr driver, virDomainObjPtr vm, bool startup) @@ -784,46 +813,30 @@ int qemuSetupCgroup(virQEMUDriverPtr driver, virDomainObjPtr vm, virBitmapPtr nodemask) { - int rc = -1; qemuDomainObjPrivatePtr priv = vm->privateData;
if (qemuInitCgroup(driver, vm, true) < 0) return -1;
if (!priv->cgroup) - goto done; + return 0;
if (qemuSetupDevicesCgroup(driver, vm) < 0) - goto cleanup; + return -1;
if (qemuSetupBlkioCgroup(vm) < 0) - goto cleanup; + return -1;
if (qemuSetupMemoryCgroup(vm) < 0) - goto cleanup; - - if (vm->def->cputune.shares != 0) { - if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) { - rc = virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares); - if (rc != 0) { - virReportSystemError(-rc, - _("Unable to set io cpu shares for domain %s"), - vm->def->name); - goto cleanup; - } - } else { - virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", - _("CPU tuning is not available on this host")); - } - } + return -1;
if (qemuSetupCpusetCgroup(vm, nodemask) < 0) - goto cleanup; + return -1;
-done: - rc = 0; -cleanup: - return rc == 0 ? 0 : -1; + if (qemuSetupCpuCgroup(vm) < 0) + return -1; + + return 0; }
I don't think you should be replacing all the 'goto cleanup' lines with 'return -1' here. It is good practice to have a cleanup block, even if not currently used, since historically we've found that we need to add such blocks in more often than not. Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

--- v1 - v2: * Keep the "cleanup" label of qemuSetupCgroup, as it could be used in future. --- src/qemu/qemu_cgroup.c | 55 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index 78623c8..2b9c73e 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -675,6 +675,36 @@ cleanup: } +static int +qemuSetupCpuCgroup(virDomainObjPtr vm) +{ + qemuDomainObjPrivatePtr priv = vm->privateData; + int rc = -1; + + if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) { + if (vm->def->cputune.shares) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("CPU tuning is not available on this host")); + return -1; + } else { + return 0; + } + } + + if (vm->def->cputune.shares) { + rc = virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to set io cpu shares for domain %s"), + vm->def->name); + return -1; + } + } + + return 0; +} + + int qemuInitCgroup(virQEMUDriverPtr driver, virDomainObjPtr vm, bool startup) @@ -789,14 +819,14 @@ int qemuSetupCgroup(virQEMUDriverPtr driver, virDomainObjPtr vm, virBitmapPtr nodemask) { - int rc = -1; qemuDomainObjPrivatePtr priv = vm->privateData; + int ret = -1; if (qemuInitCgroup(driver, vm, true) < 0) return -1; if (!priv->cgroup) - goto done; + return 0; if (qemuSetupDevicesCgroup(driver, vm) < 0) goto cleanup; @@ -807,28 +837,15 @@ int qemuSetupCgroup(virQEMUDriverPtr driver, if (qemuSetupMemoryCgroup(vm) < 0) goto cleanup; - if (vm->def->cputune.shares != 0) { - if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) { - rc = virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares); - if (rc != 0) { - virReportSystemError(-rc, - _("Unable to set io cpu shares for domain %s"), - vm->def->name); - goto cleanup; - } - } else { - virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", - _("CPU tuning is not available on this host")); - } - } + if (qemuSetupCpuCgroup(vm) < 0) + goto cleanup; if (qemuSetupCpusetCgroup(vm, nodemask) < 0) goto cleanup; -done: - rc = 0; + ret = 0; cleanup: - return rc == 0 ? 0 : -1; + return ret; } int qemuSetupCgroupVcpuBW(virCgroupPtr cgroup, unsigned long long period, -- 1.8.1.4

When either "cpuset" of <vcpu> is specified, or the "placement" of <vcpu> is "auto", only setting the cpuset.mems might cause the guest starting to fail. E.g. ("placement" of both <vcpu> and <numatune> is "auto"): 1) Related XMLs <vcpu placement='auto'>4</vcpu> <numatune> <memory mode='strict' placement='auto'/> </numatune> 2) Host NUMA topology % numactl --hardware available: 8 nodes (0-7) node 0 cpus: 0 4 8 12 16 20 24 28 node 0 size: 16374 MB node 0 free: 11899 MB node 1 cpus: 32 36 40 44 48 52 56 60 node 1 size: 16384 MB node 1 free: 15318 MB node 2 cpus: 2 6 10 14 18 22 26 30 node 2 size: 16384 MB node 2 free: 15766 MB node 3 cpus: 34 38 42 46 50 54 58 62 node 3 size: 16384 MB node 3 free: 15347 MB node 4 cpus: 3 7 11 15 19 23 27 31 node 4 size: 16384 MB node 4 free: 15041 MB node 5 cpus: 35 39 43 47 51 55 59 63 node 5 size: 16384 MB node 5 free: 15202 MB node 6 cpus: 1 5 9 13 17 21 25 29 node 6 size: 16384 MB node 6 free: 15197 MB node 7 cpus: 33 37 41 45 49 53 57 61 node 7 size: 16368 MB node 7 free: 15669 MB 4) cpuset.cpus will be set as: (from debug log) 2013-05-09 16:50:17.296+0000: 417: debug : virCgroupSetValueStr:331 : Set value '/sys/fs/cgroup/cpuset/libvirt/qemu/toy/cpuset.cpus' to '0-63' 5) The advisory nodeset got from querying numad (from debug log) 2013-05-09 16:50:17.295+0000: 417: debug : qemuProcessStart:3614 : Nodeset returned from numad: 1 6) cpuset.mems will be set as: (from debug log) 2013-05-09 16:50:17.296+0000: 417: debug : virCgroupSetValueStr:331 : Set value '/sys/fs/cgroup/cpuset/libvirt/qemu/toy/cpuset.mems' to '0-7' I.E, the domain process's memory is restricted on the first NUMA node, however, it can use all of the CPUs, which will very likely cause the domain process to fail to start because of the kernel fails to allocate memory with the possible mismatching between CPU nodes and memory nodes. % tail -n 20 /var/log/libvirt/qemu/toy.log ... 2013-05-09 05:53:32.972+0000: 7318: debug : virCommandHandshakeChild:377 : Handshake with parent is done char device redirected to /dev/pts/2 (label charserial0) kvm_init_vcpu failed: Cannot allocate memory ... --- src/qemu/qemu_cgroup.c | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index 8a2cc9d..6217564 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -631,7 +631,8 @@ qemuSetupCpusetCgroup(virDomainObjPtr vm, virBitmapPtr nodemask) { qemuDomainObjPrivatePtr priv = vm->privateData; - char *mask = NULL; + char *mem_mask = NULL; + char *cpu_mask = NULL; int rc; int ret = -1; @@ -645,17 +646,17 @@ qemuSetupCpusetCgroup(virDomainObjPtr vm, if (vm->def->numatune.memory.placement_mode == VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO) - mask = virBitmapFormat(nodemask); + mem_mask = virBitmapFormat(nodemask); else - mask = virBitmapFormat(vm->def->numatune.memory.nodemask); + mem_mask = virBitmapFormat(vm->def->numatune.memory.nodemask); - if (!mask) { + if (!mem_mask) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("failed to convert memory nodemask")); goto cleanup; } - rc = virCgroupSetCpusetMems(priv->cgroup, mask); + rc = virCgroupSetCpusetMems(priv->cgroup, mem_mask); if (rc != 0) { virReportSystemError(-rc, @@ -665,9 +666,35 @@ qemuSetupCpusetCgroup(virDomainObjPtr vm, } } + if (vm->def->cpumask || + (vm->def->placement_mode == + VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO)) { + if (vm->def->placement_mode == + VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) + cpu_mask = virBitmapFormat(nodemask); + else + cpu_mask = virBitmapFormat(vm->def->cpumask); + + if (!cpu_mask) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("failed to convert memory nodemask")); + goto cleanup; + } + + rc = virCgroupSetCpusetCpus(priv->cgroup, cpu_mask); + + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to set cpuset.cpus for domain %s"), + vm->def->name); + goto cleanup; + } + } + ret = 0; cleanup: - VIR_FREE(mask); + VIR_FREE(mem_mask); + VIR_FREE(cpu_mask); return ret; } -- 1.8.1.4

On Fri, May 17, 2013 at 07:59:36PM +0800, Osier Yang wrote:
When either "cpuset" of <vcpu> is specified, or the "placement" of <vcpu> is "auto", only setting the cpuset.mems might cause the guest starting to fail. E.g. ("placement" of both <vcpu> and <numatune> is "auto"):
1) Related XMLs <vcpu placement='auto'>4</vcpu> <numatune> <memory mode='strict' placement='auto'/> </numatune>
2) Host NUMA topology % numactl --hardware available: 8 nodes (0-7) node 0 cpus: 0 4 8 12 16 20 24 28 node 0 size: 16374 MB node 0 free: 11899 MB node 1 cpus: 32 36 40 44 48 52 56 60 node 1 size: 16384 MB node 1 free: 15318 MB node 2 cpus: 2 6 10 14 18 22 26 30 node 2 size: 16384 MB node 2 free: 15766 MB node 3 cpus: 34 38 42 46 50 54 58 62 node 3 size: 16384 MB node 3 free: 15347 MB node 4 cpus: 3 7 11 15 19 23 27 31 node 4 size: 16384 MB node 4 free: 15041 MB node 5 cpus: 35 39 43 47 51 55 59 63 node 5 size: 16384 MB node 5 free: 15202 MB node 6 cpus: 1 5 9 13 17 21 25 29 node 6 size: 16384 MB node 6 free: 15197 MB node 7 cpus: 33 37 41 45 49 53 57 61 node 7 size: 16368 MB node 7 free: 15669 MB
4) cpuset.cpus will be set as: (from debug log)
2013-05-09 16:50:17.296+0000: 417: debug : virCgroupSetValueStr:331 : Set value '/sys/fs/cgroup/cpuset/libvirt/qemu/toy/cpuset.cpus' to '0-63'
5) The advisory nodeset got from querying numad (from debug log)
2013-05-09 16:50:17.295+0000: 417: debug : qemuProcessStart:3614 : Nodeset returned from numad: 1
6) cpuset.mems will be set as: (from debug log)
2013-05-09 16:50:17.296+0000: 417: debug : virCgroupSetValueStr:331 : Set value '/sys/fs/cgroup/cpuset/libvirt/qemu/toy/cpuset.mems' to '0-7'
I.E, the domain process's memory is restricted on the first NUMA node, however, it can use all of the CPUs, which will very likely cause the domain process to fail to start because of the kernel fails to allocate memory with the possible mismatching between CPU nodes and memory nodes.
This is only a problem if the kernel is forced to do allocation from a memory node which matches the CPU node. It is perfectly acceptable for the kernel to allocate memory from a node that is different from the CPU node in general. eg, it is the mode='strict' attribute in the XML above that causes the bug.
@@ -665,9 +666,35 @@ qemuSetupCpusetCgroup(virDomainObjPtr vm, } }
+ if (vm->def->cpumask || + (vm->def->placement_mode == + VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO)) {
I think you should only be doing this if placement==auto *and* mode=strict.
+ if (vm->def->placement_mode == + VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) + cpu_mask = virBitmapFormat(nodemask); + else + cpu_mask = virBitmapFormat(vm->def->cpumask); + + if (!cpu_mask) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("failed to convert memory nodemask")); + goto cleanup; + } + + rc = virCgroupSetCpusetCpus(priv->cgroup, cpu_mask); + + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to set cpuset.cpus for domain %s"), + vm->def->name); + goto cleanup; + } + } + ret = 0; cleanup: - VIR_FREE(mask); + VIR_FREE(mem_mask); + VIR_FREE(cpu_mask); return ret;
Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 20/05/13 19:18, Daniel P. Berrange wrote:
On Fri, May 17, 2013 at 07:59:36PM +0800, Osier Yang wrote:
When either "cpuset" of <vcpu> is specified, or the "placement" of <vcpu> is "auto", only setting the cpuset.mems might cause the guest starting to fail. E.g. ("placement" of both <vcpu> and <numatune> is "auto"):
1) Related XMLs <vcpu placement='auto'>4</vcpu> <numatune> <memory mode='strict' placement='auto'/> </numatune>
2) Host NUMA topology % numactl --hardware available: 8 nodes (0-7) node 0 cpus: 0 4 8 12 16 20 24 28 node 0 size: 16374 MB node 0 free: 11899 MB node 1 cpus: 32 36 40 44 48 52 56 60 node 1 size: 16384 MB node 1 free: 15318 MB node 2 cpus: 2 6 10 14 18 22 26 30 node 2 size: 16384 MB node 2 free: 15766 MB node 3 cpus: 34 38 42 46 50 54 58 62 node 3 size: 16384 MB node 3 free: 15347 MB node 4 cpus: 3 7 11 15 19 23 27 31 node 4 size: 16384 MB node 4 free: 15041 MB node 5 cpus: 35 39 43 47 51 55 59 63 node 5 size: 16384 MB node 5 free: 15202 MB node 6 cpus: 1 5 9 13 17 21 25 29 node 6 size: 16384 MB node 6 free: 15197 MB node 7 cpus: 33 37 41 45 49 53 57 61 node 7 size: 16368 MB node 7 free: 15669 MB
4) cpuset.cpus will be set as: (from debug log)
2013-05-09 16:50:17.296+0000: 417: debug : virCgroupSetValueStr:331 : Set value '/sys/fs/cgroup/cpuset/libvirt/qemu/toy/cpuset.cpus' to '0-63'
5) The advisory nodeset got from querying numad (from debug log)
2013-05-09 16:50:17.295+0000: 417: debug : qemuProcessStart:3614 : Nodeset returned from numad: 1
6) cpuset.mems will be set as: (from debug log)
2013-05-09 16:50:17.296+0000: 417: debug : virCgroupSetValueStr:331 : Set value '/sys/fs/cgroup/cpuset/libvirt/qemu/toy/cpuset.mems' to '0-7'
I.E, the domain process's memory is restricted on the first NUMA node, however, it can use all of the CPUs, which will very likely cause the domain process to fail to start because of the kernel fails to allocate memory with the possible mismatching between CPU nodes and memory nodes. This is only a problem if the kernel is forced to do allocation from a memory node which matches the CPU node.
It is perfectly acceptable for the kernel to allocate memory from a node that is different from the CPU node in general.
eg, it is the mode='strict' attribute in the XML above that causes the bug.
I can update the commit log to explain it more clearly.
@@ -665,9 +666,35 @@ qemuSetupCpusetCgroup(virDomainObjPtr vm, } }
+ if (vm->def->cpumask || + (vm->def->placement_mode == + VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO)) { I think you should only be doing this if placement==auto *and* mode=strict.
There is no "mode" for cpu. See: http://libvirt.org/formatdomain.html#elementsCPUAllocation
+ if (vm->def->placement_mode == + VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) + cpu_mask = virBitmapFormat(nodemask); + else + cpu_mask = virBitmapFormat(vm->def->cpumask); + + if (!cpu_mask) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("failed to convert memory nodemask")); + goto cleanup; + } + + rc = virCgroupSetCpusetCpus(priv->cgroup, cpu_mask); + + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to set cpuset.cpus for domain %s"), + vm->def->name); + goto cleanup; + } + } + ret = 0; cleanup: - VIR_FREE(mask); + VIR_FREE(mem_mask); + VIR_FREE(cpu_mask); return ret;
Daniel

I don't see any reason to getting the numa parameters if mode is not strict, as long as setNumaParameters doesn't allow to set "nodeset" if the mode is not strict, and cpuset.mems only understand "strict" mode. Things could be changed if we support to get numa parameters with libnuma one day, but let's prohibit it now. --- See [1] for further discussion on the possibility to use "interleave" and "preferred" modes for cgroup memory controller. But before we have any conclusion yet, let's prohibit getting the values from cpuset.mems as long as the numatune mode is not "strict" [1] https://www.redhat.com/archives/libvir-list/2013-May/msg00643.html --- src/qemu/qemu_driver.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 203cc6d..8290a44 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -7858,8 +7858,15 @@ qemuDomainGetNumaParameters(virDomainPtr dom, if (flags & VIR_DOMAIN_AFFECT_LIVE) { if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_MEMORY)) { - virReportError(VIR_ERR_OPERATION_INVALID, - "%s", _("cgroup memory controller is not mounted")); + virReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("cgroup memory controller is not mounted")); + goto cleanup; + } + + if (vm->def->numatune.memory.mode != VIR_DOMAIN_NUMATUNE_MEM_STRICT) { + virReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("getting numa parameters of running domain is " + "only valid for strict numa mode")); goto cleanup; } } -- 1.8.1.4
participants (3)
-
Daniel P. Berrange
-
Osier Yang
-
Viktor Mihajlovski