[libvirt] [PATCH RFC v2] qemu: Support numad

numad is a user-level daemon that monitors NUMA topology and processes'
resource consumption to facilitate good NUMA resource alignment of
applications/virtual machines, to improve performance and minimize the cost
of remote memory latencies. It provides a pre-placement advisory interface,
so significant processes can be pre-bound to nodes with sufficient available
resources. More details: http://fedoraproject.org/wiki/Features/numad

"numad -w ncpus:memory_amount" is the advisory interface numad provides
currently.

This patch adds support for it by introducing a boolean XML element:

  <numatune>
    <autonuma/>
  </numatune>

If it is specified, the number of vcpus and the current memory amount
specified in the domain XML will be used for the numad command line
(numad uses MB for the memory amount):

  numad -w $num_of_vcpus:$current_memory_amount / 1024

The advisory nodeset returned by numad is then used to set the domain
process's CPU affinity (e.g. in qemuProcessInitCpuAffinity). If the user
specifies both a CPU affinity policy (e.g. <vcpu cpuset="1-10,^7,^8">4</vcpu>)
and XML indicating to use numad for the advisory nodeset, the specified CPU
affinity will be ignored.

Only the QEMU/KVM and LXC drivers support it now.

v1 - v2:
  * Since Bill Gray says it doesn't matter to use the number of vcpus and the
    current memory amount as the numad command line arguments, though from a
    semantics point of view what numad expects are physical CPU numbers, let's
    go this way. v2 drops the XML <cpu required_cpu='4' required_memory='512000'/>
    and just adds a new boolean XML element <autonuma/>. Code is refactored
    accordingly.
  * v1 overrides the cpuset specified by <vcpu cpuset='1-10,^7'>2</vcpu>;
    v2 doesn't do that, but silently ignores it instead.
  * xml2xml test is added
---
 configure.ac                                       |    8 ++
 docs/formatdomain.html.in                          |   11 ++-
 docs/schemas/domaincommon.rng                      |    5 +
 src/conf/domain_conf.c                             |  124 +++++++++++++-------
 src/conf/domain_conf.h                             |    1 +
 src/lxc/lxc_controller.c                           |   86 ++++++++++++--
 src/qemu/qemu_process.c                            |   86 ++++++++++++--
 .../qemuxml2argv-numatune-autonuma.xml             |   32 +++++
 tests/qemuxml2xmltest.c                            |    1 +
 9 files changed, 287 insertions(+), 67 deletions(-)
 create mode 100644 tests/qemuxml2argvdata/qemuxml2argv-numatune-autonuma.xml

diff --git a/configure.ac b/configure.ac
index c9cdd7b..31f0835 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1445,6 +1445,14 @@ AM_CONDITIONAL([HAVE_NUMACTL], [test "$with_numactl" != "no"])
 AC_SUBST([NUMACTL_CFLAGS])
 AC_SUBST([NUMACTL_LIBS])
 
+dnl Do we have numad?
+if test "$with_qemu" = "yes"; then
+    AC_PATH_PROG([NUMAD], [numad], [], [/bin:/usr/bin:/usr/local/bin:$PATH])
+
+    if test -n "$NUMAD"; then
+        AC_DEFINE_UNQUOTED([NUMAD],["$NUMAD"], [Location or name of the numad program])
+    fi
+fi
 
 dnl pcap lib
 LIBPCAP_CONFIG="pcap-config"

diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in
index 42f38d3..22872c8 100644
--- a/docs/formatdomain.html.in
+++ b/docs/formatdomain.html.in
@@ -505,6 +505,7 @@
   ...
   <numatune>
     <memory mode="strict" nodeset="1-4,^3"/>
+    <autonuma/>
   </numatune>
   ...
 </domain>
@@ -519,7 +520,7 @@
     <span class='since'>Since 0.9.3</span>
     <dt><code>memory</code></dt>
     <dd>
-      The optional <code>memory</code> element specify how to allocate memory
+      The optional <code>memory</code> element specifies how to allocate memory
       for the domain process on a NUMA host. It contains two attributes,
       attribute <code>mode</code> is either 'interleave', 'strict', or 'preferred',
@@ -527,6 +528,14 @@
       syntax with attribute <code>cpuset</code> of element <code>vcpu</code>.
      <span class='since'>Since 0.9.3</span>
     </dd>
+    <dd>
+      The optional <code>autonuma</code> element indicates pinning the virtual CPUs
+      to the advisory nodeset returned by querying "numad" (a system daemon that
+      monitors NUMA topology and usage). With using this element, the physical CPUs
+      specified by attribute <code>cpuset</code> (of element <code>vcpu</code>) will
+      be ignored.
+      <span class='since'>Since 0.9.11 (QEMU/KVM and LXC only)</span>
+    </dd>
   </dl>

diff --git a/docs/schemas/domaincommon.rng b/docs/schemas/domaincommon.rng
index a905457..9066b40 100644
--- a/docs/schemas/domaincommon.rng
+++ b/docs/schemas/domaincommon.rng
@@ -549,6 +549,11 @@
         </attribute>
       </element>
     </optional>
+    <optional>
+      <element name="autonuma">
+        <empty/>
+      </element>
+    </optional>
   </element>
 </optional>
 </interleave>

diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
index b994718..89d00ae 100644
--- a/src/conf/domain_conf.c
+++ b/src/conf/domain_conf.c
@@ -7447,7 +7447,6 @@ error:
     goto cleanup;
 }
 
-
 static int
 virDomainDefMaybeAddController(virDomainDefPtr def,
                                int type, int idx)
@@ -7507,6 +7506,7 @@ static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps,
     bool uuid_generated = false;
     virBitmapPtr bootMap = NULL;
     unsigned long bootMapSize = 0;
+    xmlNodePtr cur;
 
     if (VIR_ALLOC(def) < 0) {
         virReportOOMError();
@@ -7776,47 +7776,76 @@ static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps,
     VIR_FREE(nodes);
 
     /* Extract numatune if exists. */
-    if ((n = virXPathNodeSet("./numatune", ctxt, NULL)) < 0) {
+    if ((n = virXPathNodeSet("./numatune", ctxt, &nodes)) < 0) {
         virDomainReportError(VIR_ERR_INTERNAL_ERROR,
                              "%s", _("cannot extract numatune nodes"));
         goto error;
     }
 
+    if (n > 1) {
+        virDomainReportError(VIR_ERR_XML_ERROR, "%s",
+                             _("only one numatune is supported"));
+        VIR_FREE(nodes);
+        goto error;
+    }
+
     if (n) {
-        tmp = virXPathString("string(./numatune/memory/@nodeset)", ctxt);
-        if (tmp) {
-            char *set = tmp;
-            int nodemasklen = VIR_DOMAIN_CPUMASK_LEN;
+        cur = nodes[0]->children;
+        while (cur != NULL) {
+            if (cur->type == XML_ELEMENT_NODE) {
+                if ((xmlStrEqual(cur->name, BAD_CAST "memory"))) {
+                    tmp = virXMLPropString(cur, "nodeset");
 
-            if (VIR_ALLOC_N(def->numatune.memory.nodemask, nodemasklen) < 0) {
-                goto no_memory;
-            }
+                    if (tmp) {
+                        char *set = tmp;
+                        int nodemasklen = VIR_DOMAIN_CPUMASK_LEN;
 
-            /* "nodeset" leads same syntax with "cpuset". */
-            if (virDomainCpuSetParse(set, 0, def->numatune.memory.nodemask,
-                                     nodemasklen) < 0)
-                goto error;
-            VIR_FREE(tmp);
-        } else {
-            virDomainReportError(VIR_ERR_INTERNAL_ERROR,
-                                 "%s", _("nodeset for NUMA memory tuning must be set"));
-            goto error;
-        }
+                        if (VIR_ALLOC_N(def->numatune.memory.nodemask,
+                                        nodemasklen) < 0) {
+                            virReportOOMError();
+                            goto error;
+                        }
 
-        tmp = virXPathString("string(./numatune/memory/@mode)", ctxt);
-        if (tmp) {
-            if ((def->numatune.memory.mode =
-                virDomainNumatuneMemModeTypeFromString(tmp)) < 0) {
-                virDomainReportError(VIR_ERR_INTERNAL_ERROR,
-                                     _("Unsupported NUMA memory tuning mode '%s'"),
-                                     tmp);
-                goto error;
+                        /* "nodeset" leads same syntax with "cpuset". */
+                        if (virDomainCpuSetParse(set, 0,
+                                                 def->numatune.memory.nodemask,
+                                                 nodemasklen) < 0)
+                            goto error;
+                        VIR_FREE(tmp);
+                    } else {
+                        virDomainReportError(VIR_ERR_XML_ERROR, "%s",
+                                             _("nodeset for NUMA memory "
+                                               "tuning must be set"));
+                        goto error;
+                    }
+
+                    tmp = virXMLPropString(cur, "mode");
+                    if (tmp) {
+                        if ((def->numatune.memory.mode =
+                            virDomainNumatuneMemModeTypeFromString(tmp)) < 0) {
+                            virDomainReportError(VIR_ERR_XML_ERROR,
+                                                 _("Unsupported NUMA memory "
+                                                   "tuning mode '%s'"),
+                                                 tmp);
+                            goto error;
+                        }
+                        VIR_FREE(tmp);
+                    } else {
+                        def->numatune.memory.mode = VIR_DOMAIN_NUMATUNE_MEM_STRICT;
+                    }
+                } else if (xmlStrEqual(cur->name, BAD_CAST "autonuma")) {
+                    def->numatune.autonuma = true;
+                } else {
+                    virDomainReportError(VIR_ERR_XML_ERROR,
+                                         _("unsupported XML element %s"),
+                                         (const char *)cur->name);
+                    goto error;
+                }
             }
-            VIR_FREE(tmp);
-        } else {
-            def->numatune.memory.mode = VIR_DOMAIN_NUMATUNE_MEM_STRICT;
+            cur = cur->next;
         }
     }
+    VIR_FREE(nodes);
 
     n = virXPathNodeSet("./features/*", ctxt, &nodes);
     if (n < 0)
@@ -12149,23 +12178,30 @@ virDomainDefFormatInternal(virDomainDefPtr def,
         def->cputune.period || def->cputune.quota)
         virBufferAddLit(buf, "  </cputune>\n");
 
-    if (def->numatune.memory.nodemask) {
-        const char *mode;
-        char *nodemask = NULL;
-
+    if (def->numatune.memory.nodemask ||
+        def->numatune.autonuma) {
         virBufferAddLit(buf, "  <numatune>\n");
-        nodemask = virDomainCpuSetFormat(def->numatune.memory.nodemask,
-                                         VIR_DOMAIN_CPUMASK_LEN);
-        if (nodemask == NULL) {
-            virDomainReportError(VIR_ERR_INTERNAL_ERROR, "%s",
-                                 _("failed to format nodeset for NUMA memory tuning"));
-            goto cleanup;
+        if (def->numatune.memory.nodemask) {
+            const char *mode;
+            char *nodemask = NULL;
+
+            nodemask = virDomainCpuSetFormat(def->numatune.memory.nodemask,
+                                             VIR_DOMAIN_CPUMASK_LEN);
+            if (nodemask == NULL) {
+                virDomainReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                                     _("failed to format nodeset for "
+                                       "NUMA memory tuning"));
+                goto cleanup;
+            }
+
+            mode = virDomainNumatuneMemModeTypeToString(def->numatune.memory.mode);
+            virBufferAsprintf(buf, "    <memory mode='%s' nodeset='%s'/>\n",
+                              mode, nodemask);
+            VIR_FREE(nodemask);
         }
 
-        mode = virDomainNumatuneMemModeTypeToString(def->numatune.memory.mode);
-        virBufferAsprintf(buf, "    <memory mode='%s' nodeset='%s'/>\n",
-                          mode, nodemask);
-        VIR_FREE(nodemask);
+        if (def->numatune.autonuma)
+            virBufferAddLit(buf, "    <autonuma/>\n");
 
         virBufferAddLit(buf, "  </numatune>\n");
     }

diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h
index 87b4103..b15faf0 100644
--- a/src/conf/domain_conf.h
+++ b/src/conf/domain_conf.h
@@ -1458,6 +1458,7 @@ struct _virDomainNumatuneDef {
         int mode;
     } memory;
 
+    bool autonuma;
     /* Future NUMA tuning related stuff should go here. */
 };

diff --git a/src/lxc/lxc_controller.c b/src/lxc/lxc_controller.c
index 8f336f5..507b2c6 100644
--- a/src/lxc/lxc_controller.c
+++ b/src/lxc/lxc_controller.c
@@ -327,6 +327,41 @@ static int lxcSetContainerNUMAPolicy(virDomainDefPtr def)
 }
 #endif
 
+#if defined(NUMAD)
+static char *
+lxcGetNumadAdvice(virDomainDefPtr def)
+{
+    virCommandPtr cmd = NULL;
+    char *args = NULL;
+    char *ret = NULL;
+
+    if (virAsprintf(&args, "%d:%lu", def->vcpus, def->mem.cur_balloon) < 0) {
+        virReportOOMError();
+        goto out;
+    }
+    cmd = virCommandNewArgList(NUMAD, "-w", args, NULL);
+
+    virCommandSetOutputBuffer(cmd, &ret);
+
+    if (virCommandRun(cmd, NULL) < 0) {
+        lxcError(VIR_ERR_INTERNAL_ERROR, "%s",
+                 _("Failed to query numad for the advisory nodeset"));
+    }
+
+out:
+    VIR_FREE(args);
+    virCommandFree(cmd);
+    return ret;
+}
+#else
+static char *
+lxcGetNumadAdvice(virDomainDefPtr def ATTRIBUTE_UNUSED)
+{
+    lxcError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+             _("numad is not available on this host"));
+    return NULL;
+}
+#endif
 
 /*
  * To be run while still single threaded
@@ -355,19 +390,48 @@ static int lxcSetContainerCpuAffinity(virDomainDefPtr def)
         return -1;
     }
 
-    if (def->cpumask) {
-        /* XXX why don't we keep 'cpumask' in the libvirt cpumap
-         * format to start with ?!?! */
-        for (i = 0 ; i < maxcpu && i < def->cpumasklen ; i++)
-            if (def->cpumask[i])
+    if (def->numatune.autonuma) {
+        char *tmp_cpumask = NULL;
+        char *nodeset = NULL;
+
+        nodeset = lxcGetNumadAdvice(def);
+        if (!nodeset)
+            return -1;
+
+        if (VIR_ALLOC_N(tmp_cpumask, VIR_DOMAIN_CPUMASK_LEN) < 0) {
+            virReportOOMError();
+            return -1;
+        }
+
+        if (virDomainCpuSetParse(nodeset, 0, tmp_cpumask,
+                                 VIR_DOMAIN_CPUMASK_LEN) < 0) {
+            VIR_FREE(tmp_cpumask);
+            VIR_FREE(nodeset);
+            return -1;
+        }
+
+        for (i = 0; i < maxcpu && i < VIR_DOMAIN_CPUMASK_LEN; i++) {
+            if (tmp_cpumask[i])
                 VIR_USE_CPU(cpumap, i);
+        }
+
+        VIR_FREE(tmp_cpumask);
+        VIR_FREE(nodeset);
     } else {
-        /* You may think this is redundant, but we can't assume libvirtd
-         * itself is running on all pCPUs, so we need to explicitly set
-         * the spawned LXC instance to all pCPUs if no map is given in
-         * its config file */
-        for (i = 0 ; i < maxcpu ; i++)
-            VIR_USE_CPU(cpumap, i);
+        if (def->cpumask) {
+            /* XXX why don't we keep 'cpumask' in the libvirt cpumap
+             * format to start with ?!?! */
+            for (i = 0 ; i < maxcpu && i < def->cpumasklen ; i++)
+                if (def->cpumask[i])
+                    VIR_USE_CPU(cpumap, i);
+        } else {
+            /* You may think this is redundant, but we can't assume libvirtd
+             * itself is running on all pCPUs, so we need to explicitly set
+             * the spawned LXC instance to all pCPUs if no map is given in
+             * its config file */
+            for (i = 0 ; i < maxcpu ; i++)
+                VIR_USE_CPU(cpumap, i);
+        }
     }
 
     /* We are pressuming we are running between fork/exec of LXC

diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index 7b99814..209403f 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -1633,6 +1633,41 @@ qemuProcessInitNumaMemoryPolicy(virDomainObjPtr vm)
 }
 #endif
 
+#if defined(NUMAD)
+static char *
+qemuGetNumadAdvice(virDomainDefPtr def)
+{
+    virCommandPtr cmd = NULL;
+    char *args = NULL;
+    char *output = NULL;
+
+    if (virAsprintf(&args, "%d:%lu", def->vcpus, def->mem.cur_balloon) < 0) {
+        virReportOOMError();
+        goto out;
+    }
+    cmd = virCommandNewArgList(NUMAD, "-w", args, NULL);
+
+    virCommandSetOutputBuffer(cmd, &output);
+
+    if (virCommandRun(cmd, NULL) < 0)
+        qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                        _("Failed to query numad for the advisory nodeset"));
+
+out:
+    VIR_FREE(args);
+    virCommandFree(cmd);
+    return output;
+}
+#else
+static char *
+qemuGetNumadAdvice(virDomainDefPtr def ATTRIBUTE_UNUSED)
+{
+    qemuReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                    _("numad is not available on this host"));
+    return NULL;
+}
+#endif
+
 /*
  * To be run between fork/exec of QEMU only
 */
@@ -1661,19 +1696,48 @@ qemuProcessInitCpuAffinity(virDomainObjPtr vm)
         return -1;
     }
 
-    if (vm->def->cpumask) {
-        /* XXX why don't we keep 'cpumask' in the libvirt cpumap
-         * format to start with ?!?! */
-        for (i = 0 ; i < maxcpu && i < vm->def->cpumasklen ; i++)
-            if (vm->def->cpumask[i])
+    if (vm->def->numatune.autonuma) {
+        char *tmp_cpumask = NULL;
+        char *nodeset = NULL;
+
+        nodeset = qemuGetNumadAdvice(vm->def);
+        if (!nodeset)
+            return -1;
+
+        if (VIR_ALLOC_N(tmp_cpumask, VIR_DOMAIN_CPUMASK_LEN) < 0) {
+            virReportOOMError();
+            return -1;
+        }
+
+        if (virDomainCpuSetParse(nodeset, 0, tmp_cpumask,
+                                 VIR_DOMAIN_CPUMASK_LEN) < 0) {
+            VIR_FREE(tmp_cpumask);
+            VIR_FREE(nodeset);
+            return -1;
+        }
+
+        for (i = 0; i < maxcpu && i < VIR_DOMAIN_CPUMASK_LEN; i++) {
+            if (tmp_cpumask[i])
                 VIR_USE_CPU(cpumap, i);
+        }
+
+        VIR_FREE(tmp_cpumask);
+        VIR_FREE(nodeset);
     } else {
-        /* You may think this is redundant, but we can't assume libvirtd
-         * itself is running on all pCPUs, so we need to explicitly set
-         * the spawned QEMU instance to all pCPUs if no map is given in
-         * its config file */
-        for (i = 0 ; i < maxcpu ; i++)
-            VIR_USE_CPU(cpumap, i);
+        if (vm->def->cpumask) {
+            /* XXX why don't we keep 'cpumask' in the libvirt cpumap
+             * format to start with ?!?! */
+            for (i = 0 ; i < maxcpu && i < vm->def->cpumasklen ; i++)
+                if (vm->def->cpumask[i])
+                    VIR_USE_CPU(cpumap, i);
+        } else {
+            /* You may think this is redundant, but we can't assume libvirtd
+             * itself is running on all pCPUs, so we need to explicitly set
+             * the spawned QEMU instance to all pCPUs if no map is given in
+             * its config file */
+            for (i = 0 ; i < maxcpu ; i++)
+                VIR_USE_CPU(cpumap, i);
+        }
     }
 
     /* We are pressuming we are running between fork/exec of QEMU

diff --git a/tests/qemuxml2argvdata/qemuxml2argv-numatune-autonuma.xml b/tests/qemuxml2argvdata/qemuxml2argv-numatune-autonuma.xml
new file mode 100644
index 0000000..7e50bbd
--- /dev/null
+++ b/tests/qemuxml2argvdata/qemuxml2argv-numatune-autonuma.xml
@@ -0,0 +1,32 @@
+<domain type='qemu'>
+  <name>QEMUGuest1</name>
+  <uuid>c7a5fdbd-edaf-9455-926a-d65c16db1809</uuid>
+  <memory>219136</memory>
+  <currentMemory>219136</currentMemory>
+  <vcpu current='1'>2</vcpu>
+  <numatune>
+    <autonuma/>
+  </numatune>
+  <os>
+    <type arch='i686' machine='pc'>hvm</type>
+    <boot dev='hd'/>
+  </os>
+  <cpu>
+    <topology sockets='2' cores='1' threads='1'/>
+  </cpu>
+  <clock offset='utc'/>
+  <on_poweroff>destroy</on_poweroff>
+  <on_reboot>restart</on_reboot>
+  <on_crash>destroy</on_crash>
+  <devices>
+    <emulator>/usr/bin/qemu</emulator>
+    <disk type='block' device='disk'>
+      <source dev='/dev/HostVG/QEMUGuest1'/>
+      <target dev='hda' bus='ide'/>
+      <address type='drive' controller='0' bus='0' target='0' unit='0'/>
+    </disk>
+    <controller type='usb' index='0'/>
+    <controller type='ide' index='0'/>
+    <memballoon model='virtio'/>
+  </devices>
+</domain>

diff --git a/tests/qemuxml2xmltest.c b/tests/qemuxml2xmltest.c
index 03c75f8..3893ece 100644
--- a/tests/qemuxml2xmltest.c
+++ b/tests/qemuxml2xmltest.c
@@ -199,6 +199,7 @@ mymain(void)
     DO_TEST("blkiotune");
     DO_TEST("blkiotune-device");
     DO_TEST("cputune");
+    DO_TEST("numatune-autonuma");
     DO_TEST("smp");
     DO_TEST("lease");
-- 
1.7.1
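For a concrete sense of the interface (the guest size here is illustrative, not taken from the patch): a domain defined with 4 vcpus and 8 GiB of current memory, plus

  <numatune>
    <autonuma/>
  </numatune>

would, per the "numad -w ncpus:memory_amount" interface described above, have libvirt ask numad for placement advice along the lines of:

  numad -w 4:8192

numad then prints an advisory nodeset (e.g. "0-1"), which the QEMU and LXC drivers parse with virDomainCpuSetParse() and apply as the domain process's CPU affinity.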

On Wed, Mar 07, 2012 at 09:55:16PM +0800, Osier Yang wrote:
numad is a user-level daemon that monitors NUMA topology and processes' resource consumption to facilitate good NUMA resource alignment of applications/virtual machines, to improve performance and minimize the cost of remote memory latencies. It provides a pre-placement advisory interface, so significant processes can be pre-bound to nodes with sufficient available resources.
More details: http://fedoraproject.org/wiki/Features/numad
"numad -w ncpus:memory_amount" is the advisory interface numad provides currently.
This patch adds support for it by introducing a boolean XML element: <numatune> <autonuma/> </numatune>
If it is specified, the number of vcpus and the current memory amount specified in the domain XML will be used for the numad command line (numad uses MB for the memory amount): numad -w $num_of_vcpus:$current_memory_amount / 1024
The advisory nodeset returned by numad is then used to set the domain process's CPU affinity (e.g. in qemuProcessInitCpuAffinity).
If the user specifies both a CPU affinity policy (e.g. <vcpu cpuset="1-10,^7,^8">4</vcpu>) and XML indicating to use numad for the advisory nodeset, the specified CPU affinity will be ignored.
I'm not sure that's a good idea. When we do dynamic generation of parts of libvirt XML, we tend to report in the XML what was generated, and if 2 parts contradict each other we shouldn't silently ignore it. eg, with VNC with autoport=yes we then report the generated port number. Similarly with <cpu> mode=host, we then report what the host CPU features were.

So, if we want to auto-set placement for a guest we should likely do this via the <vcpu> element, eg.

Current mode where placement is completely static:

  - Input XML:
      <vcpu placement="static" cpuset="1-10" />
  - Output XML:
      <vcpu placement="static" cpuset="1-10" />

Or where we want to use numad:

  - Input XML:
      <vcpu placement="auto"/>
  - Output XML:
      <vcpu placement="auto" cpuset="1-10" />

The current numad functionality you propose only sets the initial guest placement. Are we likely to have a mode in the future where numad will be called to update the placement periodically for existing guests? If so, then "placement" would need to have more enum values.

Regards,
Daniel

--
|: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org -o- http://virt-manager.org :|
|: http://autobuild.org -o- http://search.cpan.org/~danberr/ :|
|: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|
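One way a driver could implement that "report what was generated" behaviour -- purely a sketch built from the helpers this patch already adds, not code from the patch or from libvirt -- is to feed the numad answer back into the domain definition's cpumask before the live XML is formatted, so the chosen cpuset becomes visible in the output XML:

    /* Hypothetical sketch: store the numad-derived nodeset in the domain
     * definition so the formatter reports it as cpuset="...".
     * Reuses qemuGetNumadAdvice() and virDomainCpuSetParse() from this patch. */
    char *nodeset = qemuGetNumadAdvice(vm->def);
    if (!nodeset)
        return -1;

    if (VIR_ALLOC_N(vm->def->cpumask, VIR_DOMAIN_CPUMASK_LEN) < 0) {
        virReportOOMError();
        VIR_FREE(nodeset);
        return -1;
    }
    vm->def->cpumasklen = VIR_DOMAIN_CPUMASK_LEN;

    /* Same parsing call the patch already uses for the advisory nodeset. */
    if (virDomainCpuSetParse(nodeset, 0, vm->def->cpumask,
                             VIR_DOMAIN_CPUMASK_LEN) < 0) {
        VIR_FREE(nodeset);
        return -1;
    }
    VIR_FREE(nodeset);

The existing affinity code could then keep using the cpumask unchanged, and the generated placement would show up when the domain XML is dumped.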

On 03/07/2012 21:48, Daniel P. Berrange wrote:
On Wed, Mar 07, 2012 at 09:55:16PM +0800, Osier Yang wrote:
numad is a user-level daemon that monitors NUMA topology and processes' resource consumption to facilitate good NUMA resource alignment of applications/virtual machines, to improve performance and minimize the cost of remote memory latencies. It provides a pre-placement advisory interface, so significant processes can be pre-bound to nodes with sufficient available resources.
More details: http://fedoraproject.org/wiki/Features/numad
"numad -w ncpus:memory_amount" is the advisory interface numad provides currently.
This patch adds support for it by introducing a boolean XML element: <numatune> <autonuma/> </numatune>
If it is specified, the number of vcpus and the current memory amount specified in the domain XML will be used for the numad command line (numad uses MB for the memory amount): numad -w $num_of_vcpus:$current_memory_amount / 1024
The advisory nodeset returned by numad is then used to set the domain process's CPU affinity (e.g. in qemuProcessInitCpuAffinity).
If the user specifies both a CPU affinity policy (e.g. <vcpu cpuset="1-10,^7,^8">4</vcpu>) and XML indicating to use numad for the advisory nodeset, the specified CPU affinity will be ignored.
I'm not sure that's a good idea. When we do dynamic generation of parts of libvirt XML, we tend to report in the XML what was generated, and if 2 parts contradict each other we shouldn't silently ignore it.
Agreed. v1 overrides the cpuset="1-10,^6", but I thought it could be confusing for users to see things differ after the domain is started.
eg, with VNC with autoport=yes we then report the generated port number.
Similarly with <cpu> mode=host, we then report what the host CPU features were.
So, if we want to auto-set placement for a guest we should likely do this via the <vcpu> element
eg, Current mode where placement is completely static
- Input XML:
<vcpu placement="static" cpuset="1-10" />
- Output XML:
<vcpu placement="static" cpuset="1-10" />
Or where we want to use numad:
- Input XML:
<vcpu placement="auto"/>
- Output XML:
<vcpu placement="auto" cpuset="1-10" />
I must admit this is much better. :-)
The current numad functionality you propose only sets the initial guest placement. Are we likely to have a mode in the future where numad will be called to update the placement periodically for existing guests ?
Very possible. For now numad just provides the advisory interface; in the future I guess it could manage the placement dynamically.
If so, then "placement" would need to have more enum values.
I will post a v3 tomorrow. Regards, Osier

Note numad will attempt to manage / balance processes after they're launched, but the ideal case is libvirt pre-places them in a good spot and they never move....

On 03/07/2012 10:38 AM, Osier Yang wrote:
On 03/07/2012 21:48, Daniel P. Berrange wrote:
On Wed, Mar 07, 2012 at 09:55:16PM +0800, Osier Yang wrote:
numad is a user-level daemon that monitors NUMA topology and processes' resource consumption to facilitate good NUMA resource alignment of applications/virtual machines, to improve performance and minimize the cost of remote memory latencies. It provides a pre-placement advisory interface, so significant processes can be pre-bound to nodes with sufficient available resources.
More details: http://fedoraproject.org/wiki/Features/numad
"numad -w ncpus:memory_amount" is the advisory interface numad provides currently.
This patch adds support for it by introducing a boolean XML element: <numatune> <autonuma/> </numatune>
If it is specified, the number of vcpus and the current memory amount specified in the domain XML will be used for the numad command line (numad uses MB for the memory amount): numad -w $num_of_vcpus:$current_memory_amount / 1024
The advisory nodeset returned by numad is then used to set the domain process's CPU affinity (e.g. in qemuProcessInitCpuAffinity).
If the user specifies both a CPU affinity policy (e.g. <vcpu cpuset="1-10,^7,^8">4</vcpu>) and XML indicating to use numad for the advisory nodeset, the specified CPU affinity will be ignored.
I'm not sure that's a good idea. When we do dynamic generation of parts of libvirt XML, we tend to report in the XML what was generated, and if 2 parts contradict each other we shouldn't silently ignore it.
Agreed. v1 overrides the cpuset="1-10,^6", but I thought it could be confusing for users to see things differ after the domain is started.
eg, with VNC with autoport=yes we then report the generated port number.
Similarly with <cpu> mode=host, we then report what the host CPU features were.
So, if we want to auto-set placement for a guest we should likely do this via the <vcpu> element
eg, Current mode where placement is completely static
- Input XML:
<vcpu placement="static" cpuset="1-10" />
- Output XML:
<vcpu placement="static" cpuset="1-10" />
Or where we want to use numad:
- Input XML:
<vcpu placement="auto"/>
- Output XML:
<vcpu placement="auto" cpuset="1-10" />
I must admit this is much better. :-)
The current numad functionality you propose only sets the initial guest placement. Are we likely to have a mode in the future where numad will be called to update the placement periodically for existing guests ?
Very possible. For now numad just provides the advisory interface; in the future I guess it could manage the placement dynamically.
If so, then "placement" would need to have more enum values.
I will post a v3 tomorrow.
Regards, Osier

On 03/07/2012 23:47, Bill Gray wrote:
Note numad will attempt to manage / balance processes after they're launched, but the ideal case is libvirt pre-places them in a good spot and they never move....
So we can add something like "dynamic" as a new placement mode, if an interface for querying the current placement of a process becomes available.
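If that ever materializes, the placement attribute Daniel suggests could presumably grow an extra value -- shown here only as a hypothetical illustration, not anything defined by the patch or the discussion so far:

  <vcpu placement="dynamic" cpuset="1-10">4</vcpu>

where the reported cpuset would reflect wherever numad currently has the domain bound.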
On 03/07/2012 10:38 AM, Osier Yang wrote:
On 03/07/2012 21:48, Daniel P. Berrange wrote:
On Wed, Mar 07, 2012 at 09:55:16PM +0800, Osier Yang wrote:
numad is a user-level daemon that monitors NUMA topology and processes' resource consumption to facilitate good NUMA resource alignment of applications/virtual machines, to improve performance and minimize the cost of remote memory latencies. It provides a pre-placement advisory interface, so significant processes can be pre-bound to nodes with sufficient available resources.
More details: http://fedoraproject.org/wiki/Features/numad
"numad -w ncpus:memory_amount" is the advisory interface numad provides currently.
This patch adds support for it by introducing a boolean XML element: <numatune> <autonuma/> </numatune>
If it is specified, the number of vcpus and the current memory amount specified in the domain XML will be used for the numad command line (numad uses MB for the memory amount): numad -w $num_of_vcpus:$current_memory_amount / 1024
The advisory nodeset returned by numad is then used to set the domain process's CPU affinity (e.g. in qemuProcessInitCpuAffinity).
If the user specifies both a CPU affinity policy (e.g. <vcpu cpuset="1-10,^7,^8">4</vcpu>) and XML indicating to use numad for the advisory nodeset, the specified CPU affinity will be ignored.
I'm not sure that's a good idea. When we do dynamic generation of parts of libvirt XML, we tend to report in the XML what was generated, and if 2 parts contradict each other we shouldn't silently ignore it.
Agreed. v1 overrides the cpuset="1-10,^6", but I thought it could be confusing for users to see things differ after the domain is started.
eg, with VNC with autoport=yes we then report the generated port number.
Similarly with <cpu> mode=host, we then report what the host CPU features were.
So, if we want to auto-set placement for a guest we should likely do this via the <vcpu> element
eg, Current mode where placement is completely static
- Input XML:
<vcpu placement="static" cpuset="1-10" />
- Output XML:
<vcpu placement="static" cpuset="1-10" />
Or where we want to use numad:
- Input XML:
<vcpu placement="auto"/>
- Output XML:
<vcpu placement="auto" cpuset="1-10" />
I must admit this is much better. :-)
The current numad functionality you propose only sets the initial guest placement. Are we likely to have a mode in the future where numad will be called to update the placement periodically for existing guests ?
Very possible. For now numad just provides the advisory interface; in the future I guess it could manage the placement dynamically.
If so, then "placement" would need to have more enum values.
I will post a v3 tomorrow.
Regards, Osier