[libvirt] numad: Convert node list to cpumap before affinity setting

Previous patches on numad made mistake with thinking numad will return CPUs list (numad document says CPUs list one place, and node list another place. :( Please see the patches' commit message for the details. Osier Yang (3) numad: Convert node list to cpumap before setting affinity numad: Ignore cpuset if placement is auto qemu: Avoid the memory allocation and freeing docs/formatdomain.html.in | 10 ++++---- src/conf/domain_conf.c | 28 ++++++++++++++------------ src/qemu/qemu_process.c | 47 +++++++++++++++++++++++--------------------- 3 files changed, 45 insertions(+), 40 deletions(-) Regards, Osier

Instead of returning a CPUs list, numad returns NUMA node list instead, this patch is to convert the node list to cpumap before affinity setting. Otherwise, the domain processes will be pinned only to CPU[$numa_cell_num], which will cause significiant performance losing. Also because numad will balance the affinity dynamically, reflecting the cpuset from numad back doesn't make much sense then, and it may just could produce confusion for users' eyes. Thus the better way is not to reflect it back to XML. And in this case, it's better to ignore the cpuset when parsing XML. The codes to update the cpuset is removed in this patch incidentally, and there will be a follow up patch to ignore the manually specified "cpuset" if "placement" is "auto", and document will be updated too. --- The patch is tested on a NUMA box with 4 cells, 24 CPUs. --- src/qemu/qemu_process.c | 33 ++++++++++++++++++++------------- 1 files changed, 20 insertions(+), 13 deletions(-) diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 481b4f3..0bf743b 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -1819,36 +1819,43 @@ qemuProcessInitCpuAffinity(struct qemud_driver *driver, } if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) { - char *tmp_cpumask = NULL; char *nodeset = NULL; + char *nodemask = NULL; nodeset = qemuGetNumadAdvice(vm->def); if (!nodeset) return -1; - if (VIR_ALLOC_N(tmp_cpumask, VIR_DOMAIN_CPUMASK_LEN) < 0) { + if (VIR_ALLOC_N(nodemask, VIR_DOMAIN_CPUMASK_LEN) < 0) { virReportOOMError(); return -1; } - if (virDomainCpuSetParse(nodeset, 0, tmp_cpumask, + if (virDomainCpuSetParse(nodeset, 0, nodemask, VIR_DOMAIN_CPUMASK_LEN) < 0) { - VIR_FREE(tmp_cpumask); + VIR_FREE(nodemask); VIR_FREE(nodeset); return -1; } - for (i = 0; i < maxcpu && i < VIR_DOMAIN_CPUMASK_LEN; i++) { - if (tmp_cpumask[i]) - VIR_USE_CPU(cpumap, i); + /* numad returns the NUMA node list, convert it to cpumap */ + int prev_total_ncpus = 0; + for (i = 0; i < driver->caps->host.nnumaCell; i++) { + int j; + int cur_ncpus = driver->caps->host.numaCell[i]->ncpus; + if (nodemask[i]) { + for (j = prev_total_ncpus; + j < cur_ncpus + prev_total_ncpus && + j < maxcpu && + j < VIR_DOMAIN_CPUMASK_LEN; + j++) { + VIR_USE_CPU(cpumap, j); + } + } + prev_total_ncpus += cur_ncpus; } - VIR_FREE(vm->def->cpumask); - vm->def->cpumask = tmp_cpumask; - if (virDomainSaveStatus(driver->caps, driver->stateDir, vm) < 0) { - VIR_WARN("Unable to save status on vm %s after state change", - vm->def->name); - } + VIR_FREE(nodemask); VIR_FREE(nodeset); } else { if (vm->def->cpumask) { -- 1.7.1

On Wed, Apr 11, 2012 at 10:40:32PM +0800, Osier Yang wrote:
Instead of returning a CPUs list, numad returns NUMA node list instead, this patch is to convert the node list to cpumap before affinity setting. Otherwise, the domain processes will be pinned only to CPU[$numa_cell_num], which will cause significiant performance losing.
s/losing/losses/
Also because numad will balance the affinity dynamically, reflecting the cpuset from numad back doesn't make much sense then, and it may just could produce confusion for users' eyes. Thus the better way is not to reflect it back
confusion for the users.
to XML. And in this case, it's better to ignore the cpuset when parsing XML.
The codes to update the cpuset is removed in this patch incidentally, and there will be a follow up patch to ignore the manually specified "cpuset" if "placement" is "auto", and document will be updated too.
--- The patch is tested on a NUMA box with 4 cells, 24 CPUs. --- src/qemu/qemu_process.c | 33 ++++++++++++++++++++------------- 1 files changed, 20 insertions(+), 13 deletions(-)
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 481b4f3..0bf743b 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -1819,36 +1819,43 @@ qemuProcessInitCpuAffinity(struct qemud_driver *driver, }
if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) { - char *tmp_cpumask = NULL; char *nodeset = NULL; + char *nodemask = NULL;
nodeset = qemuGetNumadAdvice(vm->def); if (!nodeset) return -1;
- if (VIR_ALLOC_N(tmp_cpumask, VIR_DOMAIN_CPUMASK_LEN) < 0) { + if (VIR_ALLOC_N(nodemask, VIR_DOMAIN_CPUMASK_LEN) < 0) { virReportOOMError(); return -1; }
- if (virDomainCpuSetParse(nodeset, 0, tmp_cpumask, + if (virDomainCpuSetParse(nodeset, 0, nodemask, VIR_DOMAIN_CPUMASK_LEN) < 0) { - VIR_FREE(tmp_cpumask); + VIR_FREE(nodemask); VIR_FREE(nodeset); return -1; }
- for (i = 0; i < maxcpu && i < VIR_DOMAIN_CPUMASK_LEN; i++) { - if (tmp_cpumask[i]) - VIR_USE_CPU(cpumap, i); + /* numad returns the NUMA node list, convert it to cpumap */ + int prev_total_ncpus = 0; + for (i = 0; i < driver->caps->host.nnumaCell; i++) { + int j; + int cur_ncpus = driver->caps->host.numaCell[i]->ncpus; + if (nodemask[i]) { + for (j = prev_total_ncpus; + j < cur_ncpus + prev_total_ncpus && + j < maxcpu && + j < VIR_DOMAIN_CPUMASK_LEN; + j++) { + VIR_USE_CPU(cpumap, j); + } + } + prev_total_ncpus += cur_ncpus; }
- VIR_FREE(vm->def->cpumask); - vm->def->cpumask = tmp_cpumask; - if (virDomainSaveStatus(driver->caps, driver->stateDir, vm) < 0) { - VIR_WARN("Unable to save status on vm %s after state change", - vm->def->name); - } + VIR_FREE(nodemask); VIR_FREE(nodeset); } else { if (vm->def->cpumask) {
ACK, but fix the commit message, and wait to see the feedback on 2/3 and 3/3 as this should not be commited in isolation Daniel -- Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/ daniel@veillard.com | Rpmfind RPM search engine http://rpmfind.net/ http://veillard.com/ | virtualization library http://libvirt.org/

On 2012年04月16日 15:45, Daniel Veillard wrote:
On Wed, Apr 11, 2012 at 10:40:32PM +0800, Osier Yang wrote:
Instead of returning a CPUs list, numad returns NUMA node list instead, this patch is to convert the node list to cpumap before affinity setting. Otherwise, the domain processes will be pinned only to CPU[$numa_cell_num], which will cause significiant performance losing.
s/losing/losses/
Also because numad will balance the affinity dynamically, reflecting the cpuset from numad back doesn't make much sense then, and it may just could produce confusion for users' eyes. Thus the better way is not to reflect it back
confusion for the users.
to XML. And in this case, it's better to ignore the cpuset when parsing XML.
The codes to update the cpuset is removed in this patch incidentally, and there will be a follow up patch to ignore the manually specified "cpuset" if "placement" is "auto", and document will be updated too.
--- The patch is tested on a NUMA box with 4 cells, 24 CPUs. --- src/qemu/qemu_process.c | 33 ++++++++++++++++++++------------- 1 files changed, 20 insertions(+), 13 deletions(-)
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 481b4f3..0bf743b 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -1819,36 +1819,43 @@ qemuProcessInitCpuAffinity(struct qemud_driver *driver, }
if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) { - char *tmp_cpumask = NULL; char *nodeset = NULL; + char *nodemask = NULL;
nodeset = qemuGetNumadAdvice(vm->def); if (!nodeset) return -1;
- if (VIR_ALLOC_N(tmp_cpumask, VIR_DOMAIN_CPUMASK_LEN)< 0) { + if (VIR_ALLOC_N(nodemask, VIR_DOMAIN_CPUMASK_LEN)< 0) { virReportOOMError(); return -1; }
- if (virDomainCpuSetParse(nodeset, 0, tmp_cpumask, + if (virDomainCpuSetParse(nodeset, 0, nodemask, VIR_DOMAIN_CPUMASK_LEN)< 0) { - VIR_FREE(tmp_cpumask); + VIR_FREE(nodemask); VIR_FREE(nodeset); return -1; }
- for (i = 0; i< maxcpu&& i< VIR_DOMAIN_CPUMASK_LEN; i++) { - if (tmp_cpumask[i]) - VIR_USE_CPU(cpumap, i); + /* numad returns the NUMA node list, convert it to cpumap */ + int prev_total_ncpus = 0; + for (i = 0; i< driver->caps->host.nnumaCell; i++) { + int j; + int cur_ncpus = driver->caps->host.numaCell[i]->ncpus; + if (nodemask[i]) { + for (j = prev_total_ncpus; + j< cur_ncpus + prev_total_ncpus&& + j< maxcpu&& + j< VIR_DOMAIN_CPUMASK_LEN; + j++) { + VIR_USE_CPU(cpumap, j); + } + } + prev_total_ncpus += cur_ncpus; }
- VIR_FREE(vm->def->cpumask); - vm->def->cpumask = tmp_cpumask; - if (virDomainSaveStatus(driver->caps, driver->stateDir, vm)< 0) { - VIR_WARN("Unable to save status on vm %s after state change", - vm->def->name); - } + VIR_FREE(nodemask); VIR_FREE(nodeset); } else { if (vm->def->cpumask) {
ACK, but fix the commit message, and wait to see the feedback on 2/3 and 3/3 as this should not be commited in isolation
Daniel
Thanks, pushed series with nits fixed. Also with conflicts with commit 354e6d4ed. Regards, Osier

As explained in previous patch, numad will balance the affinity dynamically, so reflecting the cpuset from numad at the first time doesn't make much case, and may just could cause confusion. --- docs/formatdomain.html.in | 10 +++++----- src/conf/domain_conf.c | 28 +++++++++++++++------------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in index a382d30..bb67cd1 100644 --- a/docs/formatdomain.html.in +++ b/docs/formatdomain.html.in @@ -365,11 +365,11 @@ "auto", defaults to "static" if <code>cpuset</code> is specified, "auto" indicates the domain process will be pinned to the advisory nodeset from querying numad, and the value of attribute - <code>cpuset</code> will be overridden by the advisory nodeset - from numad if it's specified. If both <code>cpuset</code> and - <code>placement</code> are not specified, or if <code>placement</code> - is "static", but no <code>cpuset</code> is specified, the domain - process will be pinned to all the available physical CPUs. + <code>cpuset</code> will be ignored if it's specified. If both + <code>cpuset</code> and <code>placement</code> are not specified, + or if <code>placement</code> is "static", but no <code>cpuset</code> + is specified, the domain process will be pinned to all the + available physical CPUs. </dd> </dl> diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index c6b97e1..07dcc89 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -7884,19 +7884,6 @@ static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps, } } - tmp = virXPathString("string(./vcpu[1]/@cpuset)", ctxt); - if (tmp) { - char *set = tmp; - def->cpumasklen = VIR_DOMAIN_CPUMASK_LEN; - if (VIR_ALLOC_N(def->cpumask, def->cpumasklen) < 0) { - goto no_memory; - } - if (virDomainCpuSetParse(set, 0, def->cpumask, - def->cpumasklen) < 0) - goto error; - VIR_FREE(tmp); - } - tmp = virXPathString("string(./vcpu[1]/@placement)", ctxt); if (tmp) { if ((def->placement_mode = @@ -7913,6 +7900,21 @@ static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps, def->placement_mode = VIR_DOMAIN_CPU_PLACEMENT_MODE_STATIC; } + if (def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_STATIC) { + tmp = virXPathString("string(./vcpu[1]/@cpuset)", ctxt); + if (tmp) { + char *set = tmp; + def->cpumasklen = VIR_DOMAIN_CPUMASK_LEN; + if (VIR_ALLOC_N(def->cpumask, def->cpumasklen) < 0) { + goto no_memory; + } + if (virDomainCpuSetParse(set, 0, def->cpumask, + def->cpumasklen) < 0) + goto error; + VIR_FREE(tmp); + } + } + /* Extract cpu tunables. */ if (virXPathULong("string(./cputune/shares[1])", ctxt, &def->cputune.shares) < 0) -- 1.7.1

On Wed, Apr 11, 2012 at 10:40:33PM +0800, Osier Yang wrote:
As explained in previous patch, numad will balance the affinity dynamically, so reflecting the cpuset from numad at the first time doesn't make much case, and may just could cause confusion. --- docs/formatdomain.html.in | 10 +++++----- src/conf/domain_conf.c | 28 +++++++++++++++------------- 2 files changed, 20 insertions(+), 18 deletions(-)
diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in index a382d30..bb67cd1 100644 --- a/docs/formatdomain.html.in +++ b/docs/formatdomain.html.in @@ -365,11 +365,11 @@ "auto", defaults to "static" if <code>cpuset</code> is specified, "auto" indicates the domain process will be pinned to the advisory nodeset from querying numad, and the value of attribute - <code>cpuset</code> will be overridden by the advisory nodeset - from numad if it's specified. If both <code>cpuset</code> and - <code>placement</code> are not specified, or if <code>placement</code> - is "static", but no <code>cpuset</code> is specified, the domain - process will be pinned to all the available physical CPUs. + <code>cpuset</code> will be ignored if it's specified. If both + <code>cpuset</code> and <code>placement</code> are not specified, + or if <code>placement</code> is "static", but no <code>cpuset</code> + is specified, the domain process will be pinned to all the + available physical CPUs. </dd> </dl>
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index c6b97e1..07dcc89 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -7884,19 +7884,6 @@ static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps, } }
- tmp = virXPathString("string(./vcpu[1]/@cpuset)", ctxt); - if (tmp) { - char *set = tmp; - def->cpumasklen = VIR_DOMAIN_CPUMASK_LEN; - if (VIR_ALLOC_N(def->cpumask, def->cpumasklen) < 0) { - goto no_memory; - } - if (virDomainCpuSetParse(set, 0, def->cpumask, - def->cpumasklen) < 0) - goto error; - VIR_FREE(tmp); - } - tmp = virXPathString("string(./vcpu[1]/@placement)", ctxt); if (tmp) { if ((def->placement_mode = @@ -7913,6 +7900,21 @@ static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps, def->placement_mode = VIR_DOMAIN_CPU_PLACEMENT_MODE_STATIC; }
+ if (def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_STATIC) { + tmp = virXPathString("string(./vcpu[1]/@cpuset)", ctxt); + if (tmp) { + char *set = tmp; + def->cpumasklen = VIR_DOMAIN_CPUMASK_LEN; + if (VIR_ALLOC_N(def->cpumask, def->cpumasklen) < 0) { + goto no_memory; + } + if (virDomainCpuSetParse(set, 0, def->cpumask, + def->cpumasklen) < 0) + goto error; + VIR_FREE(tmp); + } + } + /* Extract cpu tunables. */ if (virXPathULong("string(./cputune/shares[1])", ctxt, &def->cputune.shares) < 0)
Okay since we default to static in case the user didn't explicitely expressed the placement method, I'm fine with this this should not break existing setups. Daniel -- Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/ daniel@veillard.com | Rpmfind RPM search engine http://rpmfind.net/ http://veillard.com/ | virtualization library http://libvirt.org/

--- src/qemu/qemu_process.c | 14 +++++--------- 1 files changed, 5 insertions(+), 9 deletions(-) diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 0bf743b..7a48c12 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -1759,23 +1759,19 @@ static char * qemuGetNumadAdvice(virDomainDefPtr def) { virCommandPtr cmd = NULL; - char *args = NULL; char *output = NULL; - if (virAsprintf(&args, "%d:%llu", def->vcpus, def->mem.cur_balloon) < 0) { - virReportOOMError(); - goto out; - } - cmd = virCommandNewArgList(NUMAD, "-w", args, NULL); + cmd = virCommandNewArgList(NUMAD, "-w", NULL); + virCommandAddArgFormat(cmd, "%d:%llu", def->vcpus, + def->mem.cur_balloon); virCommandSetOutputBuffer(cmd, &output); if (virCommandRun(cmd, NULL) < 0) qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("Failed to query numad for the advisory nodeset")); + _("Failed to query numad for the " + "advisory nodeset")); -out: - VIR_FREE(args); virCommandFree(cmd); return output; } -- 1.7.1

On Wed, Apr 11, 2012 at 10:40:34PM +0800, Osier Yang wrote:
--- src/qemu/qemu_process.c | 14 +++++--------- 1 files changed, 5 insertions(+), 9 deletions(-)
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 0bf743b..7a48c12 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -1759,23 +1759,19 @@ static char * qemuGetNumadAdvice(virDomainDefPtr def) { virCommandPtr cmd = NULL; - char *args = NULL; char *output = NULL;
- if (virAsprintf(&args, "%d:%llu", def->vcpus, def->mem.cur_balloon) < 0) { - virReportOOMError(); - goto out; - } - cmd = virCommandNewArgList(NUMAD, "-w", args, NULL); + cmd = virCommandNewArgList(NUMAD, "-w", NULL); + virCommandAddArgFormat(cmd, "%d:%llu", def->vcpus, + def->mem.cur_balloon);
virCommandSetOutputBuffer(cmd, &output);
if (virCommandRun(cmd, NULL) < 0) qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("Failed to query numad for the advisory nodeset")); + _("Failed to query numad for the " + "advisory nodeset"));
-out: - VIR_FREE(args); virCommandFree(cmd); return output; } -- 1.7.1
ACK, that's a pure cleanup, should not change any functionality, Daniel -- Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/ daniel@veillard.com | Rpmfind RPM search engine http://rpmfind.net/ http://veillard.com/ | virtualization library http://libvirt.org/
participants (2)
-
Daniel Veillard
-
Osier Yang