Though numad will manage the memory allocation of task dynamically,
but it wants management application (libvirt) to pre-set the memory
policy according to the advisory nodeset returned from querying numad,
(just like pre-bind CPU nodeset for domain process), and thus the
performance could benifit much more from it.
This patch introduces new XML tag 'placement', value 'auto' indicates
whether to set the memory policy with the advisory nodeset from numad,
and its value defaults to 'static'. e.g.
<numatune>
<memory placement='auto' mode='interleave'/>
</numatune>
Just like what current "numatune" does, the 'auto' numa memory policy
setting uses libnuma's API too.
So, to full drive numad, one needs to specify placement='auto' for
both "<vcpu>" and "<numatune><memory
.../></numatune>". It's a bit
inconvenient, but makes sense from semantics' point of view.
---
An alternative way is to not introduce the new XML tag, and pre-set
the memory policy implicitly with "<vcpu
placement='auto'>4</vcpu>",
but IMHO it implies too much, and I'd not like go this way unless
the new XML tag is not accepted.
---
docs/formatdomain.html.in | 11 ++-
docs/schemas/domaincommon.rng | 39 +++++++---
libvirt.spec.in | 1 +
src/conf/domain_conf.c | 96 ++++++++++++++++--------
src/conf/domain_conf.h | 9 ++
src/libvirt_private.syms | 2 +
src/qemu/qemu_process.c | 79 +++++++++++--------
tests/qemuxml2argvdata/qemuxml2argv-numad.args | 4 +
tests/qemuxml2argvdata/qemuxml2argv-numad.xml | 31 ++++++++
tests/qemuxml2argvtest.c | 1 +
10 files changed, 194 insertions(+), 79 deletions(-)
create mode 100644 tests/qemuxml2argvdata/qemuxml2argv-numad.args
create mode 100644 tests/qemuxml2argvdata/qemuxml2argv-numad.xml
diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in
index e1fe0c4..01b3124 100644
--- a/docs/formatdomain.html.in
+++ b/docs/formatdomain.html.in
@@ -580,9 +580,14 @@
The optional <code>memory</code> element specifies how to allocate
memory
for the domain process on a NUMA host. It contains two attributes,
attribute <code>mode</code> is either 'interleave',
'strict',
- or 'preferred',
- attribute <code>nodeset</code> specifies the NUMA nodes, it leads
same
- syntax with attribute <code>cpuset</code> of element
<code>vcpu</code>.
+ or 'preferred', attribute <code>nodeset</code> specifies the
NUMA nodes,
+ it leads same syntax with attribute <code>cpuset</code> of element
+ <code>vcpu</code>, the optional attribute
<code>placement</code> can be
+ used to indicate the memory placement mode for domain process, its value
+ can be either "static" or "auto", defaults to
"static". "auto" indicates
+ the domain process will only allocate memory from the advisory nodeset
+ returned from querying numad, and the value of attribute
<code>nodeset</code>
+ will be ignored if it's specified.
<span class='since'>Since 0.9.3</span>
</dd>
</dl>
diff --git a/docs/schemas/domaincommon.rng b/docs/schemas/domaincommon.rng
index 8419ccc..9b509f1 100644
--- a/docs/schemas/domaincommon.rng
+++ b/docs/schemas/domaincommon.rng
@@ -562,16 +562,35 @@
<element name="numatune">
<optional>
<element name="memory">
- <attribute name="mode">
- <choice>
- <value>strict</value>
- <value>preferred</value>
- <value>interleave</value>
- </choice>
- </attribute>
- <attribute name="nodeset">
- <ref name="cpuset"/>
- </attribute>
+ <choice>
+ <group>
+ <attribute name="mode">
+ <choice>
+ <value>strict</value>
+ <value>preferred</value>
+ <value>interleave</value>
+ </choice>
+ </attribute>
+ <attribute name='placement'>
+ <choice>
+ <value>static</value>
+ <value>auto</value>
+ </choice>
+ </attribute>
+ </group>
+ <group>
+ <attribute name="mode">
+ <choice>
+ <value>strict</value>
+ <value>preferred</value>
+ <value>interleave</value>
+ </choice>
+ </attribute>
+ <attribute name="nodeset">
+ <ref name="cpuset"/>
+ </attribute>
+ </group>
+ </choice>
</element>
</optional>
</element>
diff --git a/libvirt.spec.in b/libvirt.spec.in
index e7e0a55..0b119c2 100644
--- a/libvirt.spec.in
+++ b/libvirt.spec.in
@@ -454,6 +454,7 @@ BuildRequires: scrub
%if %{with_numad}
BuildRequires: numad
+BuildRequires: numactl-devel
%endif
%description
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
index 3fce7e5..b728cb6 100644
--- a/src/conf/domain_conf.c
+++ b/src/conf/domain_conf.c
@@ -640,6 +640,11 @@ VIR_ENUM_IMPL(virDomainDiskTray, VIR_DOMAIN_DISK_TRAY_LAST,
"closed",
"open");
+VIR_ENUM_IMPL(virDomainNumatuneMemPlacementMode,
+ VIR_DOMAIN_NUMATUNE_MEM_PLACEMENT_MODE_LAST,
+ "static",
+ "auto");
+
#define virDomainReportError(code, ...) \
virReportErrorHelper(VIR_FROM_DOMAIN, code, __FILE__, \
__FUNCTION__, __LINE__, __VA_ARGS__)
@@ -8023,30 +8028,22 @@ static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps,
while (cur != NULL) {
if (cur->type == XML_ELEMENT_NODE) {
if (xmlStrEqual(cur->name, BAD_CAST "memory")) {
- tmp = virXMLPropString(cur, "nodeset");
-
+ tmp = virXMLPropString(cur, "placement");
+ int placement_mode;
if (tmp) {
- char *set = tmp;
- int nodemasklen = VIR_DOMAIN_CPUMASK_LEN;
-
- if (VIR_ALLOC_N(def->numatune.memory.nodemask,
- nodemasklen) < 0) {
- virReportOOMError();
+ if ((placement_mode =
+ virDomainNumatuneMemPlacementModeTypeFromString(tmp)) <
0) {
+ virDomainReportError(VIR_ERR_XML_ERROR,
+ _("Unsupported memory placement
"
+ "mode '%s'"), tmp);
+ VIR_FREE(tmp);
goto error;
}
-
- /* "nodeset" leads same syntax with "cpuset".
*/
- if (virDomainCpuSetParse(set, 0,
- def->numatune.memory.nodemask,
- nodemasklen) < 0)
- goto error;
VIR_FREE(tmp);
} else {
- virDomainReportError(VIR_ERR_XML_ERROR, "%s",
- _("nodeset for NUMA memory "
- "tuning must be set"));
- goto error;
+ placement_mode = VIR_DOMAIN_NUMATUNE_MEM_PLACEMENT_MODE_STATIC;
}
+ def->numatune.memory.placement_mode = placement_mode;
tmp = virXMLPropString(cur, "mode");
if (tmp) {
@@ -8055,13 +8052,40 @@ static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps,
virDomainReportError(VIR_ERR_XML_ERROR,
_("Unsupported NUMA memory "
"tuning mode
'%s'"),
- tmp);
+ tmp);
goto error;
}
VIR_FREE(tmp);
} else {
def->numatune.memory.mode = VIR_DOMAIN_NUMATUNE_MEM_STRICT;
}
+
+ if (placement_mode ==
+ VIR_DOMAIN_NUMATUNE_MEM_PLACEMENT_MODE_STATIC) {
+ tmp = virXMLPropString(cur, "nodeset");
+ if (tmp) {
+ char *set = tmp;
+ int nodemasklen = VIR_DOMAIN_CPUMASK_LEN;
+
+ if (VIR_ALLOC_N(def->numatune.memory.nodemask,
+ nodemasklen) < 0) {
+ virReportOOMError();
+ goto error;
+ }
+
+ /* "nodeset" leads same syntax with
"cpuset". */
+ if (virDomainCpuSetParse(set, 0,
+ def->numatune.memory.nodemask,
+ nodemasklen) < 0)
+ goto error;
+ VIR_FREE(tmp);
+ } else {
+ virDomainReportError(VIR_ERR_XML_ERROR, "%s",
+ _("nodeset for NUMA memory "
+ "tuning must be set"));
+ goto error;
+ }
+ }
} else {
virDomainReportError(VIR_ERR_XML_ERROR,
_("unsupported XML element %s"),
@@ -12491,25 +12515,33 @@ virDomainDefFormatInternal(virDomainDefPtr def,
def->cputune.period || def->cputune.quota)
virBufferAddLit(buf, " </cputune>\n");
- if (def->numatune.memory.nodemask) {
+ if (def->numatune.memory.nodemask ||
+ (def->numatune.memory.placement_mode ==
+ VIR_DOMAIN_NUMATUNE_MEM_PLACEMENT_MODE_AUTO)) {
virBufferAddLit(buf, " <numatune>\n");
const char *mode;
char *nodemask = NULL;
-
- nodemask = virDomainCpuSetFormat(def->numatune.memory.nodemask,
- VIR_DOMAIN_CPUMASK_LEN);
- if (nodemask == NULL) {
- virDomainReportError(VIR_ERR_INTERNAL_ERROR, "%s",
- _("failed to format nodeset for "
- "NUMA memory tuning"));
- goto cleanup;
- }
+ const char *placement;
mode = virDomainNumatuneMemModeTypeToString(def->numatune.memory.mode);
- virBufferAsprintf(buf, " <memory mode='%s'
nodeset='%s'/>\n",
- mode, nodemask);
- VIR_FREE(nodemask);
+ virBufferAsprintf(buf, " <memory mode='%s' ", mode);
+ if (def->numatune.memory.placement_mode ==
+ VIR_DOMAIN_NUMATUNE_MEM_PLACEMENT_MODE_STATIC) {
+ nodemask = virDomainCpuSetFormat(def->numatune.memory.nodemask,
+ VIR_DOMAIN_CPUMASK_LEN);
+ if (nodemask == NULL) {
+ virDomainReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+ _("failed to format nodeset for "
+ "NUMA memory tuning"));
+ goto cleanup;
+ }
+ virBufferAsprintf(buf, "nodeset='%s'/>\n", nodemask);
+ VIR_FREE(nodemask);
+ } else {
+ placement =
virDomainNumatuneMemPlacementModeTypeToString(def->numatune.memory.placement_mode);
+ virBufferAsprintf(buf, "placement='%s'/>\n",
placement);
+ }
virBufferAddLit(buf, " </numatune>\n");
}
diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h
index 5aa8fc1..9aade99 100644
--- a/src/conf/domain_conf.h
+++ b/src/conf/domain_conf.h
@@ -1416,6 +1416,13 @@ enum virDomainCpuPlacementMode {
VIR_DOMAIN_CPU_PLACEMENT_MODE_LAST,
};
+enum virDomainNumatuneMemPlacementMode {
+ VIR_DOMAIN_NUMATUNE_MEM_PLACEMENT_MODE_STATIC = 0,
+ VIR_DOMAIN_NUMATUNE_MEM_PLACEMENT_MODE_AUTO,
+
+ VIR_DOMAIN_NUMATUNE_MEM_PLACEMENT_MODE_LAST,
+};
+
typedef struct _virDomainTimerCatchupDef virDomainTimerCatchupDef;
typedef virDomainTimerCatchupDef *virDomainTimerCatchupDefPtr;
struct _virDomainTimerCatchupDef {
@@ -1504,6 +1511,7 @@ struct _virDomainNumatuneDef {
struct {
char *nodemask;
int mode;
+ int placement_mode;
} memory;
/* Future NUMA tuning related stuff should go here. */
@@ -2176,6 +2184,7 @@ VIR_ENUM_DECL(virDomainGraphicsSpiceStreamingMode)
VIR_ENUM_DECL(virDomainGraphicsSpiceClipboardCopypaste)
VIR_ENUM_DECL(virDomainGraphicsSpiceMouseMode)
VIR_ENUM_DECL(virDomainNumatuneMemMode)
+VIR_ENUM_DECL(virDomainNumatuneMemPlacementMode)
VIR_ENUM_DECL(virDomainSnapshotState)
/* from libvirt.h */
VIR_ENUM_DECL(virDomainState)
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
index 88f8a21..c9c1486 100644
--- a/src/libvirt_private.syms
+++ b/src/libvirt_private.syms
@@ -400,6 +400,8 @@ virDomainNostateReasonTypeFromString;
virDomainNostateReasonTypeToString;
virDomainNumatuneMemModeTypeFromString;
virDomainNumatuneMemModeTypeToString;
+virDomainNumatuneMemPlacementModeTypeFromString;
+virDomainNumatuneMemPlacementModeTypeToString;
virDomainObjAssignDef;
virDomainObjCopyPersistentDef;
virDomainObjGetPersistentDef;
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index f1401e1..72beb83 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -1657,7 +1657,8 @@ qemuProcessDetectVcpuPIDs(struct qemud_driver *driver,
*/
#if HAVE_NUMACTL
static int
-qemuProcessInitNumaMemoryPolicy(virDomainObjPtr vm)
+qemuProcessInitNumaMemoryPolicy(virDomainObjPtr vm,
+ const char *nodemask)
{
nodemask_t mask;
int mode = -1;
@@ -1666,9 +1667,18 @@ qemuProcessInitNumaMemoryPolicy(virDomainObjPtr vm)
int i = 0;
int maxnode = 0;
bool warned = false;
+ virDomainNumatuneDef numatune = vm->def->numatune;
+ const char *tmp_nodemask = NULL;
- if (!vm->def->numatune.memory.nodemask)
- return 0;
+ if (numatune.memory.placement_mode ==
+ VIR_DOMAIN_NUMATUNE_MEM_PLACEMENT_MODE_STATIC) {
+ if (!numatune.memory.nodemask)
+ return 0;
+ tmp_nodemask = numatune.memory.nodemask;
+ } else if (numatune.memory.placement_mode ==
+ VIR_DOMAIN_NUMATUNE_MEM_PLACEMENT_MODE_AUTO) {
+ tmp_nodemask = nodemask;
+ }
VIR_DEBUG("Setting NUMA memory policy");
@@ -1679,11 +1689,10 @@ qemuProcessInitNumaMemoryPolicy(virDomainObjPtr vm)
}
maxnode = numa_max_node() + 1;
-
/* Convert nodemask to NUMA bitmask. */
nodemask_zero(&mask);
for (i = 0; i < VIR_DOMAIN_CPUMASK_LEN; i++) {
- if (vm->def->numatune.memory.nodemask[i]) {
+ if (tmp_nodemask[i]) {
if (i > NUMA_NUM_NODES) {
qemuReportError(VIR_ERR_INTERNAL_ERROR,
_("Host cannot support NUMA node %d"), i);
@@ -1693,12 +1702,12 @@ qemuProcessInitNumaMemoryPolicy(virDomainObjPtr vm)
VIR_WARN("nodeset is out of range, there is only %d NUMA "
"nodes on host", maxnode);
warned = true;
- }
+ }
nodemask_set(&mask, i);
}
}
- mode = vm->def->numatune.memory.mode;
+ mode = numatune.memory.mode;
if (mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) {
numa_set_bind_policy(1);
@@ -1789,7 +1798,8 @@ qemuGetNumadAdvice(virDomainDefPtr def ATTRIBUTE_UNUSED)
*/
static int
qemuProcessInitCpuAffinity(struct qemud_driver *driver,
- virDomainObjPtr vm)
+ virDomainObjPtr vm,
+ const char *nodemask)
{
int ret = -1;
int i, hostcpus, maxcpu = QEMUD_CPUMASK_LEN;
@@ -1815,27 +1825,6 @@ qemuProcessInitCpuAffinity(struct qemud_driver *driver,
}
if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
- char *nodeset = NULL;
- char *nodemask = NULL;
-
- nodeset = qemuGetNumadAdvice(vm->def);
- if (!nodeset)
- goto cleanup;
-
- if (VIR_ALLOC_N(nodemask, VIR_DOMAIN_CPUMASK_LEN) < 0) {
- virReportOOMError();
- VIR_FREE(nodeset);
- goto cleanup;
- }
-
- if (virDomainCpuSetParse(nodeset, 0, nodemask,
- VIR_DOMAIN_CPUMASK_LEN) < 0) {
- VIR_FREE(nodemask);
- VIR_FREE(nodeset);
- goto cleanup;
- }
- VIR_FREE(nodeset);
-
/* numad returns the NUMA node list, convert it to cpumap */
int prev_total_ncpus = 0;
for (i = 0; i < driver->caps->host.nnumaCell; i++) {
@@ -1852,8 +1841,6 @@ qemuProcessInitCpuAffinity(struct qemud_driver *driver,
}
prev_total_ncpus += cur_ncpus;
}
-
- VIR_FREE(nodemask);
} else {
if (vm->def->cpumask) {
/* XXX why don't we keep 'cpumask' in the libvirt cpumap
@@ -2564,6 +2551,8 @@ static int qemuProcessHook(void *data)
struct qemuProcessHookData *h = data;
int ret = -1;
int fd;
+ char *nodeset = NULL;
+ char *nodemask = NULL;
/* Some later calls want pid present */
h->vm->pid = getpid();
@@ -2597,14 +2586,34 @@ static int qemuProcessHook(void *data)
if (qemuAddToCgroup(h->driver, h->vm->def) < 0)
goto cleanup;
+ if ((h->vm->def->placement_mode ==
+ VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) ||
+ (h->vm->def->numatune.memory.placement_mode ==
+ VIR_DOMAIN_NUMATUNE_MEM_PLACEMENT_MODE_AUTO)) {
+ nodeset = qemuGetNumadAdvice(h->vm->def);
+ if (!nodeset)
+ goto cleanup;
+
+ VIR_DEBUG("Nodeset returned from numad: %s", nodeset);
+
+ if (VIR_ALLOC_N(nodemask, VIR_DOMAIN_CPUMASK_LEN) < 0) {
+ virReportOOMError();
+ goto cleanup;
+ }
+
+ if (virDomainCpuSetParse(nodeset, 0, nodemask,
+ VIR_DOMAIN_CPUMASK_LEN) < 0)
+ goto cleanup;
+ }
+
/* This must be done after cgroup placement to avoid resetting CPU
* affinity */
VIR_DEBUG("Setup CPU affinity");
- if (qemuProcessInitCpuAffinity(h->driver, h->vm) < 0)
+ if (qemuProcessInitCpuAffinity(h->driver, h->vm, nodemask) < 0)
goto cleanup;
- if (qemuProcessInitNumaMemoryPolicy(h->vm) < 0)
- return -1;
+ if (qemuProcessInitNumaMemoryPolicy(h->vm, nodemask) < 0)
+ goto cleanup;
VIR_DEBUG("Setting up security labelling");
if (virSecurityManagerSetProcessLabel(h->driver->securityManager,
h->vm->def) < 0)
@@ -2614,6 +2623,8 @@ static int qemuProcessHook(void *data)
cleanup:
VIR_DEBUG("Hook complete ret=%d", ret);
+ VIR_FREE(nodeset);
+ VIR_FREE(nodemask);
return ret;
}
diff --git a/tests/qemuxml2argvdata/qemuxml2argv-numad.args
b/tests/qemuxml2argvdata/qemuxml2argv-numad.args
new file mode 100644
index 0000000..23bcb70
--- /dev/null
+++ b/tests/qemuxml2argvdata/qemuxml2argv-numad.args
@@ -0,0 +1,4 @@
+LC_ALL=C PATH=/bin HOME=/home/test USER=test LOGNAME=test /usr/bin/qemu -S -M \
+pc -m 214 -smp 2 -nographic -monitor \
+unix:/tmp/test-monitor,server,nowait -no-acpi -boot c -hda \
+/dev/HostVG/QEMUGuest1 -net none -serial none -parallel none -usb
diff --git a/tests/qemuxml2argvdata/qemuxml2argv-numad.xml
b/tests/qemuxml2argvdata/qemuxml2argv-numad.xml
new file mode 100644
index 0000000..c87ec49
--- /dev/null
+++ b/tests/qemuxml2argvdata/qemuxml2argv-numad.xml
@@ -0,0 +1,31 @@
+<domain type='qemu'>
+ <name>QEMUGuest1</name>
+ <uuid>c7a5fdbd-edaf-9455-926a-d65c16db1809</uuid>
+ <memory unit='KiB'>219136</memory>
+ <currentMemory unit='KiB'>219136</currentMemory>
+ <vcpu placement='auto'>2</vcpu>
+ <numatune>
+ <memory mode="interleave" placement='auto'/>
+ </numatune>
+ <os>
+ <type arch='i686' machine='pc'>hvm</type>
+ <boot dev='hd'/>
+ </os>
+ <cpu>
+ <topology sockets='2' cores='1' threads='1'/>
+ </cpu>
+ <clock offset='utc'/>
+ <on_poweroff>destroy</on_poweroff>
+ <on_reboot>restart</on_reboot>
+ <on_crash>destroy</on_crash>
+ <devices>
+ <emulator>/usr/bin/qemu</emulator>
+ <disk type='block' device='disk'>
+ <source dev='/dev/HostVG/QEMUGuest1'/>
+ <target dev='hda' bus='ide'/>
+ <address type='drive' controller='0' bus='0'
target='0' unit='0'/>
+ </disk>
+ <controller type='ide' index='0'/>
+ <memballoon model='virtio'/>
+ </devices>
+</domain>
diff --git a/tests/qemuxml2argvtest.c b/tests/qemuxml2argvtest.c
index e47a385..3e529e2 100644
--- a/tests/qemuxml2argvtest.c
+++ b/tests/qemuxml2argvtest.c
@@ -732,6 +732,7 @@ mymain(void)
DO_TEST("blkiotune-device", false, QEMU_CAPS_NAME);
DO_TEST("cputune", false, QEMU_CAPS_NAME);
DO_TEST("numatune-memory", false, NONE);
+ DO_TEST("numad", false, NONE);
DO_TEST("blkdeviotune", false, QEMU_CAPS_NAME, QEMU_CAPS_DEVICE,
QEMU_CAPS_DRIVE, QEMU_CAPS_DRIVE_IOTUNE);
--
1.7.7.3