[libvirt] [PATCH RFC]: Support numad

numad is a user-level daemon that monitors NUMA topology and the resource
consumption of processes, to facilitate good NUMA resource alignment of
applications/virtual machines, improving performance and minimizing the
cost of remote memory latencies. It provides a pre-placement advisory
interface, so significant processes can be pre-bound to nodes with
sufficient available resources.

More details: http://fedoraproject.org/wiki/Features/numad

"numad -w ncpus:memory_amount" is the advisory interface numad provides
currently.

This patch adds the support by introducing new XML like:

  <numatune>
    <cpu required_cpus="4" required_memory="524288"/>
  </numatune>

And the corresponding numad command line will be:

  numad -w 4:512

The advisory nodeset returned from numad will then be used to set the
domain process's CPU affinity (e.g. in qemuProcessInitCpuAffinity).

If the user specifies both a CPU affinity policy (e.g.
<vcpu cpuset="1-10,^7,^8">4</vcpu>) and XML indicating to use numad for
the advisory nodeset, the specified CPU affinity will be overridden by
the nodeset returned from numad.

If no XML specifies the CPU affinity policy, but XML indicating to use
numad is present, the returned nodeset will be reflected as
<vcpu cpuset="$nodeset_from_numad">4</vcpu>.

Only the QEMU/KVM and LXC drivers support it now.
---
 configure.ac                  |    8 +++
 docs/formatdomain.html.in     |   18 ++++++-
 docs/schemas/domaincommon.rng |   12 ++++
 src/conf/domain_conf.c        |  125 +++++++++++++++++++++++++++++++----------
 src/conf/domain_conf.h        |    5 ++
 src/lxc/lxc_controller.c      |   98 ++++++++++++++++++++++++++++----
 src/qemu/qemu_process.c       |   99 +++++++++++++++++++++++++++++----
 7 files changed, 311 insertions(+), 54 deletions(-)

diff --git a/configure.ac b/configure.ac
index c9cdd7b..31f0835 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1445,6 +1445,14 @@ AM_CONDITIONAL([HAVE_NUMACTL], [test "$with_numactl" != "no"])
 AC_SUBST([NUMACTL_CFLAGS])
 AC_SUBST([NUMACTL_LIBS])
 
+dnl Do we have numad?
+if test "$with_qemu" = "yes"; then
+    AC_PATH_PROG([NUMAD], [numad], [], [/bin:/usr/bin:/usr/local/bin:$PATH])
+
+    if test -n "$NUMAD"; then
+        AC_DEFINE_UNQUOTED([NUMAD], ["$NUMAD"], [Location or name of the numad program])
+    fi
+fi
 
 dnl pcap lib
 LIBPCAP_CONFIG="pcap-config"

diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in
index 6fcca94..d8e70a6 100644
--- a/docs/formatdomain.html.in
+++ b/docs/formatdomain.html.in
@@ -505,6 +505,7 @@
   ...
   <numatune>
     <memory mode="strict" nodeset="1-4,^3"/>
+    <cpu required_cpus="3" required_memory="524288"/>
   </numatune>
   ...
 </domain>
@@ -519,7 +520,7 @@
       <span class='since'>Since 0.9.3</span>
     <dt><code>memory</code></dt>
     <dd>
-      The optional <code>memory</code> element specify how to allocate memory
+      The optional <code>memory</code> element specifies how to allocate memory
       for the domain process on a NUMA host. It contains two attributes,
       attribute <code>mode</code> is either 'interleave', 'strict',
       or 'preferred',
@@ -527,6 +528,21 @@
       syntax with attribute <code>cpuset</code> of element <code>vcpu</code>.
      <span class='since'>Since 0.9.3</span>
    </dd>
+    <dd>
+      The optional <code>cpu</code> element indicates pinning the virtual CPUs
+      to the nodeset returned by querying "numad" (a system daemon that monitors
+      NUMA topology and usage). It has two attributes: attribute
+      <code>required_cpus</code> specifies the number of physical CPUs the guest
+      process wants to use, and the optional attribute <code>required_memory</code>
+      specifies the amount of free memory the guest process wants to see on a node;
+      "numad" will pick the physical CPUs on the node which has enough free
+      memory of the amount specified by <code>required_memory</code>.
+
+      NB, when using this element, the physical CPUs specified by attribute
+      <code>cpuset</code> (of element <code>vcpu</code>) will be overridden by the
+      nodeset returned from "numad".
+      <span class='since'>Since 0.9.11 (QEMU/KVM and LXC only)</span>
+    </dd>
 </dl>

diff --git a/docs/schemas/domaincommon.rng b/docs/schemas/domaincommon.rng
index 3908733..d0f443d 100644
--- a/docs/schemas/domaincommon.rng
+++ b/docs/schemas/domaincommon.rng
@@ -549,6 +549,18 @@
           </attribute>
         </element>
       </optional>
+      <optional>
+        <element name="cpu">
+          <attribute name="required_cpus">
+            <ref name="countCPU"/>
+          </attribute>
+          <optional>
+            <attribute name="required_memory">
+              <ref name="memoryKB"/>
+            </attribute>
+          </optional>
+        </element>
+      </optional>
     </element>
   </optional>
 </interleave>

diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
index f9654f1..aa03c05 100644
--- a/src/conf/domain_conf.c
+++ b/src/conf/domain_conf.c
@@ -7125,7 +7125,6 @@ error:
     goto cleanup;
 }
 
-
 static int
 virDomainDefMaybeAddController(virDomainDefPtr def,
                                int type, int idx)
@@ -7185,6 +7184,7 @@ static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps,
     bool uuid_generated = false;
     virBitmapPtr bootMap = NULL;
     unsigned long bootMapSize = 0;
+    xmlNodePtr cur;
 
     if (VIR_ALLOC(def) < 0) {
         virReportOOMError();
@@ -7454,47 +7454,100 @@ static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps,
     VIR_FREE(nodes);
 
     /* Extract numatune if exists. */
-    if ((n = virXPathNodeSet("./numatune", ctxt, NULL)) < 0) {
+    if ((n = virXPathNodeSet("./numatune", ctxt, &nodes)) < 0) {
         virDomainReportError(VIR_ERR_INTERNAL_ERROR,
                              "%s", _("cannot extract numatune nodes"));
         goto error;
     }
 
+    if (n > 1) {
+        virDomainReportError(VIR_ERR_XML_ERROR, "%s",
+                             _("only one numatune is supported"));
+        VIR_FREE(nodes);
+        goto error;
+    }
+
     if (n) {
-        tmp = virXPathString("string(./numatune/memory/@nodeset)", ctxt);
-        if (tmp) {
-            char *set = tmp;
-            int nodemasklen = VIR_DOMAIN_CPUMASK_LEN;
+        cur = nodes[0]->children;
+        while (cur != NULL) {
+            if (cur->type == XML_ELEMENT_NODE) {
+                if ((xmlStrEqual(cur->name, BAD_CAST "memory"))) {
+                    tmp = virXMLPropString(cur, "nodeset");
 
-            if (VIR_ALLOC_N(def->numatune.memory.nodemask, nodemasklen) < 0) {
-                goto no_memory;
-            }
+                    if (tmp) {
+                        char *set = tmp;
+                        int nodemasklen = VIR_DOMAIN_CPUMASK_LEN;
 
-            /* "nodeset" leads same syntax with "cpuset". */
-            if (virDomainCpuSetParse(set, 0, def->numatune.memory.nodemask,
-                                     nodemasklen) < 0)
-                goto error;
-            VIR_FREE(tmp);
-        } else {
-            virDomainReportError(VIR_ERR_INTERNAL_ERROR,
-                                 "%s", _("nodeset for NUMA memory tuning must be set"));
-            goto error;
-        }
+                        if (VIR_ALLOC_N(def->numatune.memory.nodemask,
+                                        nodemasklen) < 0) {
+                            virReportOOMError();
+                            goto error;
+                        }
 
-        tmp = virXPathString("string(./numatune/memory/@mode)", ctxt);
-        if (tmp) {
-            if ((def->numatune.memory.mode =
-                virDomainNumatuneMemModeTypeFromString(tmp)) < 0) {
-                virDomainReportError(VIR_ERR_INTERNAL_ERROR,
-                                     _("Unsupported NUMA memory tuning mode '%s'"),
-                                     tmp);
-                goto error;
+                        /* "nodeset" leads same syntax with "cpuset". */
+                        if (virDomainCpuSetParse(set, 0,
+                                                 def->numatune.memory.nodemask,
+                                                 nodemasklen) < 0)
+                            goto error;
+                        VIR_FREE(tmp);
+                    } else {
+                        virDomainReportError(VIR_ERR_XML_ERROR, "%s",
+                                             _("nodeset for NUMA memory "
+                                               "tuning must be set"));
+                        goto error;
+                    }
+
+                    tmp = virXMLPropString(cur, "mode");
+                    if (tmp) {
+                        if ((def->numatune.memory.mode =
+                            virDomainNumatuneMemModeTypeFromString(tmp)) < 0) {
+                            virDomainReportError(VIR_ERR_XML_ERROR,
+                                                 _("Unsupported NUMA memory "
+                                                   "tuning mode '%s'"),
+                                                 tmp);
+                            goto error;
+                        }
+                        VIR_FREE(tmp);
+                    } else {
+                        def->numatune.memory.mode = VIR_DOMAIN_NUMATUNE_MEM_STRICT;
+                    }
+                } else if (xmlStrEqual(cur->name, BAD_CAST "cpu")) {
+                    char *req_cpus = NULL;
+                    char *req_memory = NULL;
+                    req_cpus = virXMLPropString(cur, "required_cpus");
+                    req_memory = virXMLPropString(cur, "required_memory");
+
+                    if (req_cpus &&
+                        virStrToLong_ui(req_cpus, NULL, 10,
+                                        &def->numatune.cpu.required_cpus) < 0) {
+                        virDomainReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                                             _("Cannot parse <cpu> 'required_cpus'"
+                                               " attribute"));
+                        goto error;
+                    }
+
+                    if (req_memory &&
+                        virStrToLong_ul(req_memory, NULL, 10,
+                                        &def->numatune.cpu.required_memory) < 0) {
+                        virDomainReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                                             _("Cannot parse <cpu> 'required_memory'"
+                                               " attribute"));
+                        goto error;
+                    }
+
+                    VIR_FREE(req_cpus);
+                    VIR_FREE(req_memory);
+                } else {
+                    virDomainReportError(VIR_ERR_XML_ERROR,
+                                         _("unsupported XML element %s"),
+                                         (const char *)cur->name);
+                    goto error;
+                }
             }
-        VIR_FREE(tmp);
-    } else {
-        def->numatune.memory.mode = VIR_DOMAIN_NUMATUNE_MEM_STRICT;
+            cur = cur->next;
         }
     }
+    VIR_FREE(nodes);
 
     n = virXPathNodeSet("./features/*", ctxt, &nodes);
     if (n < 0)
@@ -11761,7 +11814,8 @@ virDomainDefFormatInternal(virDomainDefPtr def,
         def->cputune.period || def->cputune.quota)
         virBufferAddLit(buf, "  </cputune>\n");
 
-    if (def->numatune.memory.nodemask) {
+    if (def->numatune.memory.nodemask ||
+        def->numatune.cpu.required_cpus) {
         const char *mode;
         char *nodemask = NULL;
 
@@ -11778,6 +11832,15 @@ virDomainDefFormatInternal(virDomainDefPtr def,
         virBufferAsprintf(buf, "    <memory mode='%s' nodeset='%s'/>\n",
                           mode, nodemask);
         VIR_FREE(nodemask);
+
+        if (def->numatune.cpu.required_cpus)
+            virBufferAsprintf(buf, "    <cpu required_cpus='%d' ",
+                              def->numatune.cpu.required_cpus);
+
+        if (def->numatune.cpu.required_memory)
+            virBufferAsprintf(buf, "required_memory='%lu'/>\n",
+                              def->numatune.cpu.required_memory);
+
         virBufferAddLit(buf, "  </numatune>\n");
     }

diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h
index 596be4d..1284599 100644
--- a/src/conf/domain_conf.h
+++ b/src/conf/domain_conf.h
@@ -1416,6 +1416,11 @@ struct _virDomainNumatuneDef {
         int mode;
     } memory;
 
+    struct {
+        unsigned int required_cpus;
+        unsigned long required_memory;
+    } cpu;
+
     /* Future NUMA tuning related stuff should go here. */
 };

diff --git a/src/lxc/lxc_controller.c b/src/lxc/lxc_controller.c
index 8f336f5..ec6434d 100644
--- a/src/lxc/lxc_controller.c
+++ b/src/lxc/lxc_controller.c
@@ -327,6 +327,47 @@ static int lxcSetContainerNUMAPolicy(virDomainDefPtr def)
 }
 #endif
 
+#if defined(NUMAD)
+static char *
+lxcGetNumadAdvice(unsigned int req_cpus,
+                  unsigned long req_memory) {
+    virCommandPtr cmd = NULL;
+    char *reqs = NULL;
+    char *ret = NULL;
+
+    /* numad uses "MB" for memory. */
+    if (req_memory) {
+        req_memory = req_memory / 1024;
+        if (virAsprintf(&reqs, "%d:%lu", req_cpus, req_memory) < 0) {
+            virReportOOMError();
+            goto out;
+        }
+        cmd = virCommandNewArgList(NUMAD, "-w", reqs, NULL);
+    } else {
+        cmd = virCommandNewArgList(NUMAD, "-w", "%d", req_cpus, NULL);
+    }
+
+    virCommandSetOutputBuffer(cmd, &ret);
+
+    if (virCommandRun(cmd, NULL) < 0) {
+        lxcError(VIR_ERR_INTERNAL_ERROR, "%s",
+                 _("Failed to query numad for the advisory nodeset"));
+    }
+
+out:
+    VIR_FREE(reqs);
+    virCommandFree(cmd);
+    return ret;
+}
+#else
+static char *
+lxcGetNumadAdvice(unsigned int req_cpus ATTRIBUTE_UNUSED,
+                  unsigned long req_memory ATTRIBUTE_UNUSED) {
+    lxcError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+             _("numad is not available on this host"));
+    return NULL;
+}
+#endif
 
 /*
  * To be run while still single threaded
@@ -355,19 +396,54 @@ static int lxcSetContainerCpuAffinity(virDomainDefPtr def)
         return -1;
     }
 
-    if (def->cpumask) {
-        /* XXX why don't we keep 'cpumask' in the libvirt cpumap
-         * format to start with ?!?! */
-        for (i = 0 ; i < maxcpu && i < def->cpumasklen ; i++)
-            if (def->cpumask[i])
+    /* def->cpumask will be overridden by the nodeset
+     * suggested by numad if it's specified.
+     */
+    if (def->numatune.cpu.required_cpus) {
+        char *tmp_cpumask = NULL;
+        char *nodeset = NULL;
+
+        nodeset = lxcGetNumadAdvice(def->numatune.cpu.required_cpus,
+                                    def->numatune.cpu.required_memory);
+        if (!nodeset)
+            return -1;
+
+        if (VIR_ALLOC_N(tmp_cpumask, VIR_DOMAIN_CPUMASK_LEN) < 0) {
+            virReportOOMError();
+            return -1;
+        }
+
+        if (virDomainCpuSetParse(nodeset, 0, tmp_cpumask,
+                                 VIR_DOMAIN_CPUMASK_LEN) < 0) {
+            VIR_FREE(tmp_cpumask);
+            VIR_FREE(nodeset);
+            return -1;
+        }
+
+        for (i = 0; i < maxcpu && i < VIR_DOMAIN_CPUMASK_LEN; i++) {
+            if (tmp_cpumask[i])
                 VIR_USE_CPU(cpumap, i);
+        }
+
+        /* Update def->cpumask */
+        VIR_FREE(def->cpumask);
+        def->cpumask = tmp_cpumask;
+        VIR_FREE(nodeset);
     } else {
-        /* You may think this is redundant, but we can't assume libvirtd
-         * itself is running on all pCPUs, so we need to explicitly set
-         * the spawned LXC instance to all pCPUs if no map is given in
-         * its config file */
-        for (i = 0 ; i < maxcpu ; i++)
-            VIR_USE_CPU(cpumap, i);
+        if (def->cpumask) {
+            /* XXX why don't we keep 'cpumask' in the libvirt cpumap
+             * format to start with ?!?! */
+            for (i = 0 ; i < maxcpu && i < def->cpumasklen ; i++)
+                if (def->cpumask[i])
+                    VIR_USE_CPU(cpumap, i);
+        } else {
+            /* You may think this is redundant, but we can't assume libvirtd
+             * itself is running on all pCPUs, so we need to explicitly set
+             * the spawned LXC instance to all pCPUs if no map is given in
+             * its config file */
+            for (i = 0 ; i < maxcpu ; i++)
+                VIR_USE_CPU(cpumap, i);
+        }
     }
 
     /* We are pressuming we are running between fork/exec of LXC

diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index 41218de..eb9f8f1 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -1633,6 +1633,48 @@ qemuProcessInitNumaMemoryPolicy(virDomainObjPtr vm)
 }
 #endif
 
+#if defined(NUMAD)
+static char *
+qemuGetNumadAdvice(unsigned int req_cpus,
+                   unsigned long req_memory) {
+    virCommandPtr cmd = NULL;
+    char *reqs = NULL;
+    char *output = NULL;
+
+    /* numad uses "MB" for memory. */
+    if (req_memory) {
+        req_memory = req_memory / 1024;
+        if (virAsprintf(&reqs, "%d:%lu", req_cpus, req_memory) < 0) {
+            virReportOOMError();
+            goto out;
+        }
+
+        cmd = virCommandNewArgList(NUMAD, "-w", reqs, NULL);
+    } else {
+        cmd = virCommandNewArgList(NUMAD, "-w", "%u", req_cpus, NULL);
+    }
+
+    virCommandSetOutputBuffer(cmd, &output);
+
+    if (virCommandRun(cmd, NULL) < 0)
+        qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                        _("Failed to query numad for the advisory nodeset"));
+
+out:
+    VIR_FREE(reqs);
+    virCommandFree(cmd);
+    return output;
+}
+#else
+static char *
+qemuGetNumadAdvice(unsigned int req_cpus ATTRIBUTE_UNUSED,
+                   unsigned long req_memory ATTRIBUTE_UNUSED) {
+    qemuReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                    _("numad is not available on this host"));
+    return NULL;
+}
+#endif
+
 /*
  * To be run between fork/exec of QEMU only
  */
@@ -1661,19 +1703,54 @@ qemuProcessInitCpuAffinity(virDomainObjPtr vm)
         return -1;
     }
 
-    if (vm->def->cpumask) {
-        /* XXX why don't we keep 'cpumask' in the libvirt cpumap
-         * format to start with ?!?! */
-        for (i = 0 ; i < maxcpu && i < vm->def->cpumasklen ; i++)
-            if (vm->def->cpumask[i])
+    /* vm->def->cpumask will be overridden by the nodeset
+     * suggested by numad if it's specified.
+     */
+    if (vm->def->numatune.cpu.required_cpus) {
+        char *tmp_cpumask = NULL;
+        char *nodeset = NULL;
+
+        nodeset = qemuGetNumadAdvice(vm->def->numatune.cpu.required_cpus,
+                                     vm->def->numatune.cpu.required_memory);
+        if (!nodeset)
+            return -1;
+
+        if (VIR_ALLOC_N(tmp_cpumask, VIR_DOMAIN_CPUMASK_LEN) < 0) {
+            virReportOOMError();
+            return -1;
+        }
+
+        if (virDomainCpuSetParse(nodeset, 0, tmp_cpumask,
+                                 VIR_DOMAIN_CPUMASK_LEN) < 0) {
+            VIR_FREE(tmp_cpumask);
+            VIR_FREE(nodeset);
+            return -1;
+        }
+
+        for (i = 0; i < maxcpu && i < VIR_DOMAIN_CPUMASK_LEN; i++) {
+            if (tmp_cpumask[i])
                 VIR_USE_CPU(cpumap, i);
+        }
+
+        /* Update vm->def->cpumask */
+        VIR_FREE(vm->def->cpumask);
+        vm->def->cpumask = tmp_cpumask;
+        VIR_FREE(nodeset);
     } else {
-        /* You may think this is redundant, but we can't assume libvirtd
-         * itself is running on all pCPUs, so we need to explicitly set
-         * the spawned QEMU instance to all pCPUs if no map is given in
-         * its config file */
-        for (i = 0 ; i < maxcpu ; i++)
-            VIR_USE_CPU(cpumap, i);
+        if (vm->def->cpumask) {
+            /* XXX why don't we keep 'cpumask' in the libvirt cpumap
+             * format to start with ?!?! */
+            for (i = 0 ; i < maxcpu && i < vm->def->cpumasklen ; i++)
+                if (vm->def->cpumask[i])
+                    VIR_USE_CPU(cpumap, i);
+        } else {
+            /* You may think this is redundant, but we can't assume libvirtd
+             * itself is running on all pCPUs, so we need to explicitly set
+             * the spawned QEMU instance to all pCPUs if no map is given in
+             * its config file */
+            for (i = 0 ; i < maxcpu ; i++)
+                VIR_USE_CPU(cpumap, i);
+        }
     }
 
     /* We are pressuming we are running between fork/exec of QEMU

--
1.7.7.3
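For reference, the advisory interface can be exercised directly from a
shell; the nodeset printed below is illustrative only, since the real
answer depends on the host topology and its current load:

    # Ask for a placement suitable for 4 CPUs and 512 MB of memory.
    # The output uses the same syntax as the 'cpuset'/'nodeset' attributes.
    $ numad -w 4:512
    0-1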

On Tue, Feb 28, 2012 at 10:10:50PM +0800, Osier Yang wrote:
> This patch adds the support by introducing new XML like:
>
>   <numatune>
>     <cpu required_cpus="4" required_memory="524288"/>
>   </numatune>

Isn't the usual case going to be the vcpus and memory in the guest?
IMO we should default to passing those numbers to numad if
required_cpus and required_memory are not provided explicitly.

Dave
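Roughly along these lines, as a sketch only (whether def->vcpus and
def->mem.max_balloon are the right virDomainDef fields to consult is an
assumption, not something in the posted patch):

    /* Sketch: fall back to the guest's configured resources when the
     * numatune <cpu> attributes are omitted. */
    unsigned int req_cpus = def->numatune.cpu.required_cpus;
    unsigned long req_memory = def->numatune.cpu.required_memory;

    if (req_cpus == 0)
        req_cpus = def->vcpus;             /* guest vCPU count */
    if (req_memory == 0)
        req_memory = def->mem.max_balloon; /* guest memory, in KiB */

    nodeset = qemuGetNumadAdvice(req_cpus, req_memory);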

On Tue, Feb 28, 2012 at 11:33:03AM -0500, Dave Allan wrote:
> Isn't the usual case going to be the vcpus and memory in the guest?
> IMO we should default to passing those numbers to numad if
> required_cpus and required_memory are not provided explicitly.
Indeed, why would you want to specify anything different? At first
glance my reaction was to just skip the XML and call numad internally,
automatically, with the guest's configured allocation.

Daniel

On Tue, Feb 28, 2012 at 04:40:06PM +0000, Daniel P. Berrange wrote:
> Indeed, why would you want to specify anything different? At first
> glance my reaction was to just skip the XML and call numad internally,
> automatically, with the guest's configured allocation.
That seems reasonable to me.

Dave

On 02/29/2012 12:40 AM, Daniel P. Berrange wrote:
> Indeed, why would you want to specify anything different? At first
> glance my reaction was to just skip the XML and call numad internally,
> automatically, with the guest's configured allocation.
Here the "required_cpus" stands for the physical CPUs number, which will be used numad to choose the proper nodeset. So from sementics point of view, it's different with <vcpus>4</vcpus>, I can imagine two problems if we reuse the vCPUs number for numad's use: 1) Suppose there are 16 pCPUs, but the specified vCPUs number is "64". I'm not sure if numad will work properly in this case, but isn't it a bad use case? :-) 2) Suppose there are 128 pCPUs, but the specified vCPUs number is "2". numad will work definitely, but is that the result the user wants to see? no good to performace. The basic thought is we provide the interface, and how to configure the provided XML for good performace is on the end-user then. If we mixed-use the two different sementics, and do things secrectly in the codes, then I could imagine there will be performance problems. The "required_memory" could be omitted though, we can reuse "<memory>524288</memory>", but I'm not sure if it's good to always pass a "memory amount" to numad command line, it may be not good in some case. @Bill(s), correct me if I'm not right. :-) Perhaps we could have a bool attribute then, such as: <cpu required_cpus="4" required_memory="yes|no"/> Regards, Osier

On 02/28/2012 11:34 PM, Osier Yang wrote:
The "required_memory" could be omitted though, we can reuse "<memory>524288</memory>", but I'm not sure if it's good to always pass a "memory amount" to numad command line, it may be not good in some case. @Bill(s), correct me if I'm not right. :-)
Perhaps we could have a bool attribute then, such as:
<cpu required_cpus="4" required_memory="yes|no"/>
Please keep Bill Gray on this thread. He is the author of numad and is
the best person to answer the above questions.

Bill

On Wed, Feb 29, 2012 at 06:29:55AM -0500, Bill Burns wrote:
> Please keep Bill Gray on this thread. He is the author of numad and
> is the best person to answer the above questions.
Bill (Gray), can you weigh in here?

Dave

On 03/01/2012 02:31 PM, Dave Allan wrote:
> Bill (Gray), can you weigh in here?

Am sure he will, but he is on PTO, back sometime over the weekend,
Monday at the latest ;-)

the other Bill

I think numad will probably work best with just the #vcpus and the #MBs
of memory in the guest as the requested job size parameters.

Sorry for the lack of clarity here... numad should work -- pending bugs
-- with any numbers passed. If the requested parameters are bigger than
the actual physical resources available, numad is supposed to just
return all the nodes in the system -- so the effective recommendation
in that case would be "use the entire system". If the requested
resources are a subset of the system, numad is supposed to return a
recommended subset of the system nodes to use for the process -- based
on the current amount of free memory and idle CPUs on the various
nodes.

On 03/01/2012 02:31 PM, Dave Allan wrote:
> Bill (Gray), can you weigh in here?
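Concretely, on a hypothetical host with 4 nodes, each with 8 CPUs and
16 GB of memory, the two cases described above would look roughly like
this (the outputs are illustrative, not real runs):

    # Request larger than the machine: the recommendation is the whole system.
    $ numad -w 64:131072
    0-3

    # Request that fits a subset: specific node(s) are recommended, based
    # on current free memory and idle CPUs.
    $ numad -w 4:512
    2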