On Tue, Oct 06, 2020 at 13:47:25 +0800, Luyao Zhong wrote:
<numatune>
<memory mode="strict" nodeset="1-4,^3"
migratable="yes/no" />
<memnode cellid="0" mode="strict" nodeset="1"/>
<memnode cellid="2" mode="preferred"
nodeset="2"/>
</numatune>
Attribute ``migratable`` will be 'no' by default, and 'yes' indicates
that it allows operating system or hypervisor migrating the memory
pages between different memory nodes, that also means we will not
rely on hypervisor to set the memory policy or memory affinity, we only
use cgroups to restrict the memory nodes, so if ``migratable`` is 'yes',
the ``mode`` which indicates memory policy will be ignored.
Note that I'm not reviewing whether this is justified and makes sense.
I'm commenting purely on the code.
---
We require that patches are split into logical blocks. You
need to split
this commit at least into following patches:
docs/formatdomain.rst | 8 +++-
docs/schemas/domaincommon.rng | 5 +++
src/conf/numa_conf.c | 45 +++++++++++++++++++
src/conf/numa_conf.h | 3 ++
src/libvirt_private.syms | 1 +
Docs, schema and parser changes should be separated
src/qemu/qemu_command.c | 6 ++-
src/qemu/qemu_process.c | 21 +++++++++
.../numatune-memory-migratable.args | 34 ++++++++++++++
.../numatune-memory-migratable.err | 1 +
.../numatune-memory-migratable.xml | 33 ++++++++++++++
tests/qemuxml2argvtest.c | 5 +++
Implementation in qemu can be all together with tests. Please also add a
qemuxml2xmltest case.
I've provided some more feedback in a recent review:
https://www.redhat.com/archives/libvir-list/2020-October/msg00231.html
specifically some guidance on tests:
https://www.redhat.com/archives/libvir-list/2020-October/msg00395.html
11 files changed, 160 insertions(+), 2 deletions(-)
create mode 100644 tests/qemuxml2argvdata/numatune-memory-migratable.args
create mode 100644 tests/qemuxml2argvdata/numatune-memory-migratable.err
create mode 100644 tests/qemuxml2argvdata/numatune-memory-migratable.xml
diff --git a/docs/formatdomain.rst b/docs/formatdomain.rst
index cc4f91d4ea..4e386a0c3d 100644
--- a/docs/formatdomain.rst
+++ b/docs/formatdomain.rst
[...]
@@ -1097,6 +1097,12 @@ NUMA Node Tuning
will be ignored if it's specified. If ``placement`` of ``vcpu`` is
'auto',
and ``numatune`` is not specified, a default ``numatune`` with ``placement``
'auto' and ``mode`` 'strict' will be added implicitly. :since:`Since
0.9.3`
+ Attribute ``migratable`` is 'no' by default, and 'yes' indicates that
it
+ allows operating system or hypervisor migrating the memory pages between
+ different memory nodes, that also means we will not rely on hypervisor to
+ set the memory policy or memory affinity, we only use cgroups to restrict
+ the memory nodes, so if ``migratable`` is 'yes', the ``mode`` which
indicates
+ memory policy will be ignored.
So ... does this make us ignore the per NUMA-node set 'mode'? If yes,
the code should actually reject any configs where we ignore some
settings rather than just relying on the docs.
``memnode``
Optional ``memnode`` elements can specify memory allocation policies per each
guest NUMA node. For those nodes having no corresponding ``memnode`` element,
[...]
diff --git a/src/conf/numa_conf.c b/src/conf/numa_conf.c
index 6653ba05a6..c14ba1295c 100644
--- a/src/conf/numa_conf.c
+++ b/src/conf/numa_conf.c
[...]
@@ -292,6 +294,15 @@ virDomainNumatuneParseXML(virDomainNumaPtr
numa,
placement = VIR_DOMAIN_NUMATUNE_PLACEMENT_AUTO;
if (node) {
+ if ((tmp = virXMLPropString(node, "migratable")) &&
+ (migratable = virTristateBoolTypeFromString(tmp)) <= 0) {
+ virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
VIR_ERR_XML_ERROR
+ _("Invalid 'migratable'
attribute value '%s'"), tmp);
+ goto cleanup;
+ }
+ numa->memory.migratable = migratable;
+ VIR_FREE(tmp);
+
if ((tmp = virXMLPropString(node, "mode")) &&
(mode = virDomainNumatuneMemModeTypeFromString(tmp)) < 0) {
virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
@@ -369,6 +380,11 @@ virDomainNumatuneFormatXML(virBufferPtr buf,
tmp = virDomainNumatuneMemModeTypeToString(numatune->memory.mode);
virBufferAsprintf(buf, "<memory mode='%s' ", tmp);
+ if (numatune->memory.migratable) {
Use an explicit condition here:
if (numatune->memory.migratable != VIR_TRISTATE_BOOL_ABSENT)
+ tmp =
virTristateBoolTypeToString(numatune->memory.migratable);
+ virBufferAsprintf(buf, "migratable='%s' ", tmp);
+ }
+
if (numatune->memory.placement == VIR_DOMAIN_NUMATUNE_PLACEMENT_STATIC) {
if (!(nodeset = virBitmapFormat(numatune->memory.nodeset)))
return -1;
@@ -427,6 +443,35 @@ virDomainNumaFree(virDomainNumaPtr numa)
VIR_FREE(numa);
}
+/**
+ * virDomainNumatuneGetMigratable:
+ * @numatune: pointer to numatune definition
+ * @migratable: where to store the result
This argument is not used by the only place calling it. Please remove
it.
+ *
+ * Get the migratable attribute for domain's memory. It's safe to
+ * pass NULL to @migratable if the return value is the only info needed.
+ *
+ * Returns: migratable value
+ */
+int virDomainNumatuneGetMigratable(virDomainNumaPtr numatune,
+ virTristateBool *migratable)
+{
+ virTristateBool tmp_migratable = 0;
+
+ if (!numatune)
+ return tmp_migratable;
Just directly return VIR_TRISTATE_BOOL_ABSENT here ...
+
+ if (numatune->memory.specified)
+ tmp_migratable = numatune->memory.migratable;
+ else
+ return tmp_migratable;
+
+ if (migratable != NULL)
+ *migratable = tmp_migratable;
Also none of this is really needed, just directly return the value.
+
+ return tmp_migratable;
+}
+
/**
* virDomainNumatuneGetMode:
* @numatune: pointer to numatune definition
[...]
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
index 476cf6972e..882e7e6ba2 100644
--- a/src/qemu/qemu_command.c
+++ b/src/qemu/qemu_command.c
@@ -3189,7 +3189,11 @@ qemuBuildMemoryBackendProps(virJSONValuePtr *backendProps,
return -1;
}
- if (nodemask) {
+ // If migratable attribute is yes, we should only use cgroups setting
+ // memory affinity, and skip passing the host-nodes and policy parameters
+ // to QEMU command line.
We don't use line comments ('//') in our code base. Please use block
comments.
+ if (nodemask &&
+ virDomainNumatuneGetMigratable(def->numa, NULL) != VIR_TRISTATE_BOOL_YES) {
if (!virNumaNodesetIsAvailable(nodemask))
return -1;
if (virJSONValueObjectAdd(props,
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index 6b5de29fdb..9c1116064b 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
[...]
@@ -2694,6 +2695,26 @@ qemuProcessSetupPid(virDomainObjPtr vm,
&mem_mask, -1) < 0)
goto cleanup;
+ // For vCPU threads, mem_mask is different among cells
+ if (nameval == VIR_CGROUP_THREAD_VCPU) {
+ virDomainNumaPtr numatune = vm->def->numa;
+ virBitmapPtr numanode_cpumask = NULL;
+ for (i = 0; i < virDomainNumaGetNodeCount(numatune); i++) {
+ numanode_cpumask = virDomainNumaGetNodeCpumask(numatune, i);
+ // 'i' indicates the cell id, if the vCPU id is in this cell,
+ // we need get the corresonding nodeset
+ if (virBitmapIsBitSet(numanode_cpumask, id)) {
+ if (virDomainNumatuneMaybeFormatNodeset(numatune,
+ priv->autoNodeset,
+ &mem_mask, i) < 0) {
+ goto cleanup;
+ } else {
+ break;
+ }
+ }
+ }
+ }
This hunk doesn't seem to belong to this patch. The comments (apart from
using wrong syntax) don't seem to justify why or how this relates to
memory migrability.
+
if (virCgroupNewThread(priv->cgroup, nameval, id, true, &cgroup) < 0)
goto cleanup;
[...]
diff --git a/tests/qemuxml2argvtest.c b/tests/qemuxml2argvtest.c
index 49567326d4..e6554518d7 100644
--- a/tests/qemuxml2argvtest.c
+++ b/tests/qemuxml2argvtest.c
@@ -1950,6 +1950,11 @@ mymain(void)
DO_TEST("numatune-memory", NONE);
DO_TEST_PARSE_ERROR("numatune-memory-invalid-nodeset", NONE);
+ DO_TEST("numatune-memory-migratable",
+ QEMU_CAPS_NUMA,
+ QEMU_CAPS_OBJECT_MEMORY_RAM);
+ DO_TEST_PARSE_ERROR("numatune-memory-migratable", NONE);
Pleased don't use DO_TEST any more. Use DO_TEST_CAPS_* (such as
DO_TEST_CAPS_LATEST) instead for new tests.
The negative test doesn't seem to make sense here since it's not testing
a negative case in your patch. This is visible from the error message it
records:
+++ b/tests/qemuxml2argvdata/numatune-memory-migratable.err
@@ -0,0 +1 @@
+unsupported configuration: Per-node memory binding is not supported with this QEMU