[libvirt][PATCH v1 0/1] introduce an attribute "migratable" to numatune memory element

RFC discussion record link: https://www.redhat.com/archives/libvir-list/2020-August/msg00960.html Luyao Zhong (1): introduce an attribute "migratable" to numatune memory element docs/formatdomain.rst | 8 +++- docs/schemas/domaincommon.rng | 5 +++ src/conf/numa_conf.c | 45 +++++++++++++++++++ src/conf/numa_conf.h | 3 ++ src/libvirt_private.syms | 1 + src/qemu/qemu_command.c | 6 ++- src/qemu/qemu_process.c | 21 +++++++++ .../numatune-memory-migratable.args | 34 ++++++++++++++ .../numatune-memory-migratable.err | 1 + .../numatune-memory-migratable.xml | 33 ++++++++++++++ tests/qemuxml2argvtest.c | 5 +++ 11 files changed, 160 insertions(+), 2 deletions(-) create mode 100644 tests/qemuxml2argvdata/numatune-memory-migratable.args create mode 100644 tests/qemuxml2argvdata/numatune-memory-migratable.err create mode 100644 tests/qemuxml2argvdata/numatune-memory-migratable.xml -- 2.25.4

<numatune> <memory mode="strict" nodeset="1-4,^3" migratable="yes/no" /> <memnode cellid="0" mode="strict" nodeset="1"/> <memnode cellid="2" mode="preferred" nodeset="2"/> </numatune> Attribute ``migratable`` will be 'no' by default, and 'yes' indicates that it allows operating system or hypervisor migrating the memory pages between different memory nodes, that also means we will not rely on hypervisor to set the memory policy or memory affinity, we only use cgroups to restrict the memory nodes, so if ``migratable`` is 'yes', the ``mode`` which indicates memory policy will be ignored. --- docs/formatdomain.rst | 8 +++- docs/schemas/domaincommon.rng | 5 +++ src/conf/numa_conf.c | 45 +++++++++++++++++++ src/conf/numa_conf.h | 3 ++ src/libvirt_private.syms | 1 + src/qemu/qemu_command.c | 6 ++- src/qemu/qemu_process.c | 21 +++++++++ .../numatune-memory-migratable.args | 34 ++++++++++++++ .../numatune-memory-migratable.err | 1 + .../numatune-memory-migratable.xml | 33 ++++++++++++++ tests/qemuxml2argvtest.c | 5 +++ 11 files changed, 160 insertions(+), 2 deletions(-) create mode 100644 tests/qemuxml2argvdata/numatune-memory-migratable.args create mode 100644 tests/qemuxml2argvdata/numatune-memory-migratable.err create mode 100644 tests/qemuxml2argvdata/numatune-memory-migratable.xml diff --git a/docs/formatdomain.rst b/docs/formatdomain.rst index cc4f91d4ea..4e386a0c3d 100644 --- a/docs/formatdomain.rst +++ b/docs/formatdomain.rst @@ -1072,7 +1072,7 @@ NUMA Node Tuning <domain> ... <numatune> - <memory mode="strict" nodeset="1-4,^3"/> + <memory mode="strict" nodeset="1-4,^3" migratable="yes"/> <memnode cellid="0" mode="strict" nodeset="1"/> <memnode cellid="2" mode="preferred" nodeset="2"/> </numatune> @@ -1097,6 +1097,12 @@ NUMA Node Tuning will be ignored if it's specified. If ``placement`` of ``vcpu`` is 'auto', and ``numatune`` is not specified, a default ``numatune`` with ``placement`` 'auto' and ``mode`` 'strict' will be added implicitly. :since:`Since 0.9.3` + Attribute ``migratable`` is 'no' by default, and 'yes' indicates that it + allows operating system or hypervisor migrating the memory pages between + different memory nodes, that also means we will not rely on hypervisor to + set the memory policy or memory affinity, we only use cgroups to restrict + the memory nodes, so if ``migratable`` is 'yes', the ``mode`` which indicates + memory policy will be ignored. ``memnode`` Optional ``memnode`` elements can specify memory allocation policies per each guest NUMA node. For those nodes having no corresponding ``memnode`` element, diff --git a/docs/schemas/domaincommon.rng b/docs/schemas/domaincommon.rng index 316d93fb69..a0283e8449 100644 --- a/docs/schemas/domaincommon.rng +++ b/docs/schemas/domaincommon.rng @@ -1078,6 +1078,11 @@ <interleave> <optional> <element name="memory"> + <optional> + <attribute name="migratable"> + <ref name="virYesNo"/> + </attribute> + </optional> <optional> <attribute name="mode"> <choice> diff --git a/src/conf/numa_conf.c b/src/conf/numa_conf.c index 6653ba05a6..c14ba1295c 100644 --- a/src/conf/numa_conf.c +++ b/src/conf/numa_conf.c @@ -99,6 +99,7 @@ struct _virDomainNuma { virBitmapPtr nodeset; virDomainNumatuneMemMode mode; virDomainNumatunePlacement placement; + virTristateBool migratable; } memory; /* pinning for all the memory */ struct _virDomainNumaNode { @@ -275,6 +276,7 @@ virDomainNumatuneParseXML(virDomainNumaPtr numa, int ret = -1; virBitmapPtr nodeset = NULL; xmlNodePtr node = NULL; + int migratable = 0; if (virXPathInt("count(./numatune)", ctxt, &n) < 0) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", @@ -292,6 +294,15 @@ virDomainNumatuneParseXML(virDomainNumaPtr numa, placement = VIR_DOMAIN_NUMATUNE_PLACEMENT_AUTO; if (node) { + if ((tmp = virXMLPropString(node, "migratable")) && + (migratable = virTristateBoolTypeFromString(tmp)) <= 0) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, + _("Invalid 'migratable' attribute value '%s'"), tmp); + goto cleanup; + } + numa->memory.migratable = migratable; + VIR_FREE(tmp); + if ((tmp = virXMLPropString(node, "mode")) && (mode = virDomainNumatuneMemModeTypeFromString(tmp)) < 0) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, @@ -369,6 +380,11 @@ virDomainNumatuneFormatXML(virBufferPtr buf, tmp = virDomainNumatuneMemModeTypeToString(numatune->memory.mode); virBufferAsprintf(buf, "<memory mode='%s' ", tmp); + if (numatune->memory.migratable) { + tmp = virTristateBoolTypeToString(numatune->memory.migratable); + virBufferAsprintf(buf, "migratable='%s' ", tmp); + } + if (numatune->memory.placement == VIR_DOMAIN_NUMATUNE_PLACEMENT_STATIC) { if (!(nodeset = virBitmapFormat(numatune->memory.nodeset))) return -1; @@ -427,6 +443,35 @@ virDomainNumaFree(virDomainNumaPtr numa) VIR_FREE(numa); } +/** + * virDomainNumatuneGetMigratable: + * @numatune: pointer to numatune definition + * @migratable: where to store the result + * + * Get the migratable attribute for domain's memory. It's safe to + * pass NULL to @migratable if the return value is the only info needed. + * + * Returns: migratable value + */ +int virDomainNumatuneGetMigratable(virDomainNumaPtr numatune, + virTristateBool *migratable) +{ + virTristateBool tmp_migratable = 0; + + if (!numatune) + return tmp_migratable; + + if (numatune->memory.specified) + tmp_migratable = numatune->memory.migratable; + else + return tmp_migratable; + + if (migratable != NULL) + *migratable = tmp_migratable; + + return tmp_migratable; +} + /** * virDomainNumatuneGetMode: * @numatune: pointer to numatune definition diff --git a/src/conf/numa_conf.h b/src/conf/numa_conf.h index db5d79e62a..fe3a8a69c1 100644 --- a/src/conf/numa_conf.h +++ b/src/conf/numa_conf.h @@ -102,6 +102,9 @@ int virDomainNumatuneFormatXML(virBufferPtr buf, virDomainNumaPtr numatune) /* * Getters */ +int virDomainNumatuneGetMigratable(virDomainNumaPtr numatune, + virTristateBool *migratable); + int virDomainNumatuneGetMode(virDomainNumaPtr numatune, int cellid, virDomainNumatuneMemMode *mode); diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 7cf8bea962..de9483065f 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -875,6 +875,7 @@ virDomainNumaSetNodeDistanceCount; virDomainNumaSetNodeMemorySize; virDomainNumatuneFormatNodeset; virDomainNumatuneFormatXML; +virDomainNumatuneGetMigratable; virDomainNumatuneGetMode; virDomainNumatuneGetNodeset; virDomainNumatuneHasPerNodeBinding; diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index 476cf6972e..882e7e6ba2 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -3189,7 +3189,11 @@ qemuBuildMemoryBackendProps(virJSONValuePtr *backendProps, return -1; } - if (nodemask) { + // If migratable attribute is yes, we should only use cgroups setting + // memory affinity, and skip passing the host-nodes and policy parameters + // to QEMU command line. + if (nodemask && + virDomainNumatuneGetMigratable(def->numa, NULL) != VIR_TRISTATE_BOOL_YES) { if (!virNumaNodesetIsAvailable(nodemask)) return -1; if (virJSONValueObjectAdd(props, diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 6b5de29fdb..9c1116064b 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -2654,6 +2654,7 @@ qemuProcessSetupPid(virDomainObjPtr vm, g_autoptr(virBitmap) hostcpumap = NULL; g_autofree char *mem_mask = NULL; int ret = -1; + size_t i; if ((period || quota) && !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) { @@ -2694,6 +2695,26 @@ qemuProcessSetupPid(virDomainObjPtr vm, &mem_mask, -1) < 0) goto cleanup; + // For vCPU threads, mem_mask is different among cells + if (nameval == VIR_CGROUP_THREAD_VCPU) { + virDomainNumaPtr numatune = vm->def->numa; + virBitmapPtr numanode_cpumask = NULL; + for (i = 0; i < virDomainNumaGetNodeCount(numatune); i++) { + numanode_cpumask = virDomainNumaGetNodeCpumask(numatune, i); + // 'i' indicates the cell id, if the vCPU id is in this cell, + // we need get the corresonding nodeset + if (virBitmapIsBitSet(numanode_cpumask, id)) { + if (virDomainNumatuneMaybeFormatNodeset(numatune, + priv->autoNodeset, + &mem_mask, i) < 0) { + goto cleanup; + } else { + break; + } + } + } + } + if (virCgroupNewThread(priv->cgroup, nameval, id, true, &cgroup) < 0) goto cleanup; diff --git a/tests/qemuxml2argvdata/numatune-memory-migratable.args b/tests/qemuxml2argvdata/numatune-memory-migratable.args new file mode 100644 index 0000000000..62300d72a2 --- /dev/null +++ b/tests/qemuxml2argvdata/numatune-memory-migratable.args @@ -0,0 +1,34 @@ +LC_ALL=C \ +PATH=/bin \ +HOME=/tmp/lib/domain--1-QEMUGuest \ +USER=test \ +LOGNAME=test \ +XDG_DATA_HOME=/tmp/lib/domain--1-QEMUGuest/.local/share \ +XDG_CACHE_HOME=/tmp/lib/domain--1-QEMUGuest/.cache \ +XDG_CONFIG_HOME=/tmp/lib/domain--1-QEMUGuest/.config \ +QEMU_AUDIO_DRV=none \ +/usr/bin/qemu-system-x86_64 \ +-name QEMUGuest \ +-S \ +-machine pc,accel=tcg,usb=off,dump-guest-core=off \ +-m 24105 \ +-realtime mlock=off \ +-smp 32,sockets=32,cores=1,threads=1 \ +-object memory-backend-ram,id=ram-node0,size=20971520 \ +-numa node,nodeid=0,cpus=0,memdev=ram-node0 \ +-object memory-backend-ram,id=ram-node1,size=676331520 \ +-numa node,nodeid=1,cpus=1-27,cpus=29,memdev=ram-node1 \ +-object memory-backend-ram,id=ram-node2,size=24578621440 \ +-numa node,nodeid=2,cpus=28,cpus=30-31,memdev=ram-node2 \ +-uuid 9f4b6512-e73a-4a25-93e8-5307802821ce \ +-display none \ +-no-user-config \ +-nodefaults \ +-chardev socket,id=charmonitor,path=/tmp/lib/domain--1-QEMUGuest/monitor.sock,\ +server,nowait \ +-mon chardev=charmonitor,id=monitor,mode=control \ +-rtc base=utc \ +-no-shutdown \ +-no-acpi \ +-usb \ +-device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x3 diff --git a/tests/qemuxml2argvdata/numatune-memory-migratable.err b/tests/qemuxml2argvdata/numatune-memory-migratable.err new file mode 100644 index 0000000000..1d5e57f996 --- /dev/null +++ b/tests/qemuxml2argvdata/numatune-memory-migratable.err @@ -0,0 +1 @@ +unsupported configuration: Per-node memory binding is not supported with this QEMU diff --git a/tests/qemuxml2argvdata/numatune-memory-migratable.xml b/tests/qemuxml2argvdata/numatune-memory-migratable.xml new file mode 100644 index 0000000000..5665358e24 --- /dev/null +++ b/tests/qemuxml2argvdata/numatune-memory-migratable.xml @@ -0,0 +1,33 @@ +<domain type='qemu'> + <name>QEMUGuest</name> + <uuid>9f4b6512-e73a-4a25-93e8-5307802821ce</uuid> + <memory unit='KiB'>24682468</memory> + <currentMemory unit='KiB'>24682468</currentMemory> + <vcpu placement='static'>32</vcpu> + <numatune> + <memory mode='strict' nodeset='0-7' migratable="yes"/> + <memnode cellid='0' mode='preferred' nodeset='3'/> + <memnode cellid='2' mode='strict' nodeset='1-2,5-7,^6'/> + </numatune> + <os> + <type arch='x86_64' machine='pc'>hvm</type> + <boot dev='hd'/> + </os> + <cpu> + <numa> + <cell id='0' cpus='0' memory='20002' unit='KiB'/> + <cell id='1' cpus='1-27,29' memory='660066' unit='KiB'/> + <cell id='2' cpus='28,30-31' memory='24002400' unit='KiB'/> + </numa> + </cpu> + <clock offset='utc'/> + <on_poweroff>destroy</on_poweroff> + <on_reboot>restart</on_reboot> + <on_crash>destroy</on_crash> + <devices> + <emulator>/usr/bin/qemu-system-x86_64</emulator> + <controller type='usb' index='0'/> + <controller type='pci' index='0' model='pci-root'/> + <memballoon model='virtio'/> + </devices> +</domain> diff --git a/tests/qemuxml2argvtest.c b/tests/qemuxml2argvtest.c index 49567326d4..e6554518d7 100644 --- a/tests/qemuxml2argvtest.c +++ b/tests/qemuxml2argvtest.c @@ -1950,6 +1950,11 @@ mymain(void) DO_TEST("numatune-memory", NONE); DO_TEST_PARSE_ERROR("numatune-memory-invalid-nodeset", NONE); + DO_TEST("numatune-memory-migratable", + QEMU_CAPS_NUMA, + QEMU_CAPS_OBJECT_MEMORY_RAM); + DO_TEST_PARSE_ERROR("numatune-memory-migratable", NONE); + DO_TEST("numatune-memnode", QEMU_CAPS_NUMA, QEMU_CAPS_OBJECT_MEMORY_RAM); -- 2.25.4

On Tue, Oct 06, 2020 at 13:47:25 +0800, Luyao Zhong wrote:
<numatune> <memory mode="strict" nodeset="1-4,^3" migratable="yes/no" /> <memnode cellid="0" mode="strict" nodeset="1"/> <memnode cellid="2" mode="preferred" nodeset="2"/> </numatune>
Attribute ``migratable`` will be 'no' by default, and 'yes' indicates that it allows operating system or hypervisor migrating the memory pages between different memory nodes, that also means we will not rely on hypervisor to set the memory policy or memory affinity, we only use cgroups to restrict the memory nodes, so if ``migratable`` is 'yes', the ``mode`` which indicates memory policy will be ignored.
--- We require that patches are split into logical blocks. You need to split
Note that I'm not reviewing whether this is justified and makes sense. I'm commenting purely on the code. this commit at least into following patches:
docs/formatdomain.rst | 8 +++- docs/schemas/domaincommon.rng | 5 +++ src/conf/numa_conf.c | 45 +++++++++++++++++++ src/conf/numa_conf.h | 3 ++ src/libvirt_private.syms | 1 +
Docs, schema and parser changes should be separated
src/qemu/qemu_command.c | 6 ++- src/qemu/qemu_process.c | 21 +++++++++ .../numatune-memory-migratable.args | 34 ++++++++++++++ .../numatune-memory-migratable.err | 1 + .../numatune-memory-migratable.xml | 33 ++++++++++++++ tests/qemuxml2argvtest.c | 5 +++
Implementation in qemu can be all together with tests. Please also add a qemuxml2xmltest case. I've provided some more feedback in a recent review: https://www.redhat.com/archives/libvir-list/2020-October/msg00231.html specifically some guidance on tests: https://www.redhat.com/archives/libvir-list/2020-October/msg00395.html
11 files changed, 160 insertions(+), 2 deletions(-) create mode 100644 tests/qemuxml2argvdata/numatune-memory-migratable.args create mode 100644 tests/qemuxml2argvdata/numatune-memory-migratable.err create mode 100644 tests/qemuxml2argvdata/numatune-memory-migratable.xml
diff --git a/docs/formatdomain.rst b/docs/formatdomain.rst index cc4f91d4ea..4e386a0c3d 100644 --- a/docs/formatdomain.rst +++ b/docs/formatdomain.rst
[...]
@@ -1097,6 +1097,12 @@ NUMA Node Tuning will be ignored if it's specified. If ``placement`` of ``vcpu`` is 'auto', and ``numatune`` is not specified, a default ``numatune`` with ``placement`` 'auto' and ``mode`` 'strict' will be added implicitly. :since:`Since 0.9.3` + Attribute ``migratable`` is 'no' by default, and 'yes' indicates that it + allows operating system or hypervisor migrating the memory pages between + different memory nodes, that also means we will not rely on hypervisor to + set the memory policy or memory affinity, we only use cgroups to restrict + the memory nodes, so if ``migratable`` is 'yes', the ``mode`` which indicates + memory policy will be ignored.
So ... does this make us ignore the per NUMA-node set 'mode'? If yes, the code should actually reject any configs where we ignore some settings rather than just relying on the docs.
``memnode`` Optional ``memnode`` elements can specify memory allocation policies per each guest NUMA node. For those nodes having no corresponding ``memnode`` element,
[...]
diff --git a/src/conf/numa_conf.c b/src/conf/numa_conf.c index 6653ba05a6..c14ba1295c 100644 --- a/src/conf/numa_conf.c +++ b/src/conf/numa_conf.c
[...]
@@ -292,6 +294,15 @@ virDomainNumatuneParseXML(virDomainNumaPtr numa, placement = VIR_DOMAIN_NUMATUNE_PLACEMENT_AUTO;
if (node) { + if ((tmp = virXMLPropString(node, "migratable")) && + (migratable = virTristateBoolTypeFromString(tmp)) <= 0) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
VIR_ERR_XML_ERROR
+ _("Invalid 'migratable' attribute value '%s'"), tmp); + goto cleanup; + } + numa->memory.migratable = migratable; + VIR_FREE(tmp); + if ((tmp = virXMLPropString(node, "mode")) && (mode = virDomainNumatuneMemModeTypeFromString(tmp)) < 0) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, @@ -369,6 +380,11 @@ virDomainNumatuneFormatXML(virBufferPtr buf, tmp = virDomainNumatuneMemModeTypeToString(numatune->memory.mode); virBufferAsprintf(buf, "<memory mode='%s' ", tmp);
+ if (numatune->memory.migratable) {
Use an explicit condition here: if (numatune->memory.migratable != VIR_TRISTATE_BOOL_ABSENT)
+ tmp = virTristateBoolTypeToString(numatune->memory.migratable); + virBufferAsprintf(buf, "migratable='%s' ", tmp); + } + if (numatune->memory.placement == VIR_DOMAIN_NUMATUNE_PLACEMENT_STATIC) { if (!(nodeset = virBitmapFormat(numatune->memory.nodeset))) return -1; @@ -427,6 +443,35 @@ virDomainNumaFree(virDomainNumaPtr numa) VIR_FREE(numa); }
+/** + * virDomainNumatuneGetMigratable: + * @numatune: pointer to numatune definition + * @migratable: where to store the result
This argument is not used by the only place calling it. Please remove it.
+ * + * Get the migratable attribute for domain's memory. It's safe to + * pass NULL to @migratable if the return value is the only info needed.
+ * + * Returns: migratable value + */ +int virDomainNumatuneGetMigratable(virDomainNumaPtr numatune, + virTristateBool *migratable) +{ + virTristateBool tmp_migratable = 0; + + if (!numatune) + return tmp_migratable;
Just directly return VIR_TRISTATE_BOOL_ABSENT here ...
+ + if (numatune->memory.specified) + tmp_migratable = numatune->memory.migratable; + else + return tmp_migratable; + + if (migratable != NULL) + *migratable = tmp_migratable;
Also none of this is really needed, just directly return the value.
+ + return tmp_migratable; +} + /** * virDomainNumatuneGetMode: * @numatune: pointer to numatune definition
[...]
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index 476cf6972e..882e7e6ba2 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -3189,7 +3189,11 @@ qemuBuildMemoryBackendProps(virJSONValuePtr *backendProps, return -1; }
- if (nodemask) { + // If migratable attribute is yes, we should only use cgroups setting + // memory affinity, and skip passing the host-nodes and policy parameters + // to QEMU command line.
We don't use line comments ('//') in our code base. Please use block comments.
+ if (nodemask && + virDomainNumatuneGetMigratable(def->numa, NULL) != VIR_TRISTATE_BOOL_YES) { if (!virNumaNodesetIsAvailable(nodemask)) return -1; if (virJSONValueObjectAdd(props, diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 6b5de29fdb..9c1116064b 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c
[...]
@@ -2694,6 +2695,26 @@ qemuProcessSetupPid(virDomainObjPtr vm, &mem_mask, -1) < 0) goto cleanup;
+ // For vCPU threads, mem_mask is different among cells + if (nameval == VIR_CGROUP_THREAD_VCPU) { + virDomainNumaPtr numatune = vm->def->numa; + virBitmapPtr numanode_cpumask = NULL; + for (i = 0; i < virDomainNumaGetNodeCount(numatune); i++) { + numanode_cpumask = virDomainNumaGetNodeCpumask(numatune, i); + // 'i' indicates the cell id, if the vCPU id is in this cell, + // we need get the corresonding nodeset + if (virBitmapIsBitSet(numanode_cpumask, id)) { + if (virDomainNumatuneMaybeFormatNodeset(numatune, + priv->autoNodeset, + &mem_mask, i) < 0) { + goto cleanup; + } else { + break; + } + } + } + }
This hunk doesn't seem to belong to this patch. The comments (apart from using wrong syntax) don't seem to justify why or how this relates to memory migrability.
+ if (virCgroupNewThread(priv->cgroup, nameval, id, true, &cgroup) < 0) goto cleanup;
[...]
diff --git a/tests/qemuxml2argvtest.c b/tests/qemuxml2argvtest.c index 49567326d4..e6554518d7 100644 --- a/tests/qemuxml2argvtest.c +++ b/tests/qemuxml2argvtest.c @@ -1950,6 +1950,11 @@ mymain(void)
DO_TEST("numatune-memory", NONE); DO_TEST_PARSE_ERROR("numatune-memory-invalid-nodeset", NONE); + DO_TEST("numatune-memory-migratable", + QEMU_CAPS_NUMA, + QEMU_CAPS_OBJECT_MEMORY_RAM); + DO_TEST_PARSE_ERROR("numatune-memory-migratable", NONE);
Pleased don't use DO_TEST any more. Use DO_TEST_CAPS_* (such as DO_TEST_CAPS_LATEST) instead for new tests. The negative test doesn't seem to make sense here since it's not testing a negative case in your patch. This is visible from the error message it records:
+++ b/tests/qemuxml2argvdata/numatune-memory-migratable.err @@ -0,0 +1 @@ +unsupported configuration: Per-node memory binding is not supported with this QEMU

On 10/6/2020 3:15 PM, Peter Krempa wrote:
On Tue, Oct 06, 2020 at 13:47:25 +0800, Luyao Zhong wrote:
<numatune> <memory mode="strict" nodeset="1-4,^3" migratable="yes/no" /> <memnode cellid="0" mode="strict" nodeset="1"/> <memnode cellid="2" mode="preferred" nodeset="2"/> </numatune>
Attribute ``migratable`` will be 'no' by default, and 'yes' indicates that it allows operating system or hypervisor migrating the memory pages between different memory nodes, that also means we will not rely on hypervisor to set the memory policy or memory affinity, we only use cgroups to restrict the memory nodes, so if ``migratable`` is 'yes', the ``mode`` which indicates memory policy will be ignored.
Note that I'm not reviewing whether this is justified and makes sense. I'm commenting purely on the code.
--- We require that patches are split into logical blocks. You need to split this commit at least into following patches: docs/formatdomain.rst | 8 +++- docs/schemas/domaincommon.rng | 5 +++ src/conf/numa_conf.c | 45 +++++++++++++++++++ src/conf/numa_conf.h | 3 ++ src/libvirt_private.syms | 1 +
Docs, schema and parser changes should be separated Got it.
src/qemu/qemu_command.c | 6 ++- src/qemu/qemu_process.c | 21 +++++++++ .../numatune-memory-migratable.args | 34 ++++++++++++++ .../numatune-memory-migratable.err | 1 + .../numatune-memory-migratable.xml | 33 ++++++++++++++ tests/qemuxml2argvtest.c | 5 +++
Implementation in qemu can be all together with tests. Please also add a qemuxml2xmltest case.
I've provided some more feedback in a recent review:
https://www.redhat.com/archives/libvir-list/2020-October/msg00231.html
specifically some guidance on tests:
https://www.redhat.com/archives/libvir-list/2020-October/msg00395.html
Got it. Thanks.
11 files changed, 160 insertions(+), 2 deletions(-) create mode 100644 tests/qemuxml2argvdata/numatune-memory-migratable.args create mode 100644 tests/qemuxml2argvdata/numatune-memory-migratable.err create mode 100644 tests/qemuxml2argvdata/numatune-memory-migratable.xml
diff --git a/docs/formatdomain.rst b/docs/formatdomain.rst index cc4f91d4ea..4e386a0c3d 100644 --- a/docs/formatdomain.rst +++ b/docs/formatdomain.rst
[...]
@@ -1097,6 +1097,12 @@ NUMA Node Tuning will be ignored if it's specified. If ``placement`` of ``vcpu`` is 'auto', and ``numatune`` is not specified, a default ``numatune`` with ``placement`` 'auto' and ``mode`` 'strict' will be added implicitly. :since:`Since 0.9.3` + Attribute ``migratable`` is 'no' by default, and 'yes' indicates that it + allows operating system or hypervisor migrating the memory pages between + different memory nodes, that also means we will not rely on hypervisor to + set the memory policy or memory affinity, we only use cgroups to restrict + the memory nodes, so if ``migratable`` is 'yes', the ``mode`` which indicates + memory policy will be ignored.
So ... does this make us ignore the per NUMA-node set 'mode'? If yes, the code should actually reject any configs where we ignore some settings rather than just relying on the docs.
Yes, it will ignore the per NUMA-node set 'mode'. So let me saying in this way, if 'migratable' is set, per NUMA-node mode should not be set, otherwise the config will be rejected, so the mode will have 'None' as its default value but not 'strict'.
``memnode`` Optional ``memnode`` elements can specify memory allocation policies per each guest NUMA node. For those nodes having no corresponding ``memnode`` element,
[...]
diff --git a/src/conf/numa_conf.c b/src/conf/numa_conf.c index 6653ba05a6..c14ba1295c 100644 --- a/src/conf/numa_conf.c +++ b/src/conf/numa_conf.c
[...]
@@ -292,6 +294,15 @@ virDomainNumatuneParseXML(virDomainNumaPtr numa, placement = VIR_DOMAIN_NUMATUNE_PLACEMENT_AUTO;
if (node) { + if ((tmp = virXMLPropString(node, "migratable")) && + (migratable = virTristateBoolTypeFromString(tmp)) <= 0) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
VIR_ERR_XML_ERROR
Got it.
+ _("Invalid 'migratable' attribute value '%s'"), tmp); + goto cleanup; + } + numa->memory.migratable = migratable; + VIR_FREE(tmp); + if ((tmp = virXMLPropString(node, "mode")) && (mode = virDomainNumatuneMemModeTypeFromString(tmp)) < 0) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, @@ -369,6 +380,11 @@ virDomainNumatuneFormatXML(virBufferPtr buf, tmp = virDomainNumatuneMemModeTypeToString(numatune->memory.mode); virBufferAsprintf(buf, "<memory mode='%s' ", tmp);
+ if (numatune->memory.migratable) {
Use an explicit condition here:
if (numatune->memory.migratable != VIR_TRISTATE_BOOL_ABSENT)
Got it.
+ tmp = virTristateBoolTypeToString(numatune->memory.migratable); + virBufferAsprintf(buf, "migratable='%s' ", tmp); + } + if (numatune->memory.placement == VIR_DOMAIN_NUMATUNE_PLACEMENT_STATIC) { if (!(nodeset = virBitmapFormat(numatune->memory.nodeset))) return -1; @@ -427,6 +443,35 @@ virDomainNumaFree(virDomainNumaPtr numa) VIR_FREE(numa); }
+/** + * virDomainNumatuneGetMigratable: + * @numatune: pointer to numatune definition + * @migratable: where to store the result
This argument is not used by the only place calling it. Please remove it.
Got it.
+ * + * Get the migratable attribute for domain's memory. It's safe to + * pass NULL to @migratable if the return value is the only info needed.
+ * + * Returns: migratable value + */ +int virDomainNumatuneGetMigratable(virDomainNumaPtr numatune, + virTristateBool *migratable) +{ + virTristateBool tmp_migratable = 0; + + if (!numatune) + return tmp_migratable;
Just directly return VIR_TRISTATE_BOOL_ABSENT here ...
Got it.
+ + if (numatune->memory.specified) + tmp_migratable = numatune->memory.migratable; + else + return tmp_migratable; + + if (migratable != NULL) + *migratable = tmp_migratable;
Also none of this is really needed, just directly return the value.
Got it.
+ + return tmp_migratable; +} + /** * virDomainNumatuneGetMode: * @numatune: pointer to numatune definition
[...]
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index 476cf6972e..882e7e6ba2 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -3189,7 +3189,11 @@ qemuBuildMemoryBackendProps(virJSONValuePtr *backendProps, return -1; }
- if (nodemask) { + // If migratable attribute is yes, we should only use cgroups setting + // memory affinity, and skip passing the host-nodes and policy parameters + // to QEMU command line.
We don't use line comments ('//') in our code base. Please use block comments.
Got it.
+ if (nodemask && + virDomainNumatuneGetMigratable(def->numa, NULL) != VIR_TRISTATE_BOOL_YES) { if (!virNumaNodesetIsAvailable(nodemask)) return -1; if (virJSONValueObjectAdd(props, diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 6b5de29fdb..9c1116064b 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c
[...]
@@ -2694,6 +2695,26 @@ qemuProcessSetupPid(virDomainObjPtr vm, &mem_mask, -1) < 0) goto cleanup;
+ // For vCPU threads, mem_mask is different among cells + if (nameval == VIR_CGROUP_THREAD_VCPU) { + virDomainNumaPtr numatune = vm->def->numa; + virBitmapPtr numanode_cpumask = NULL; + for (i = 0; i < virDomainNumaGetNodeCount(numatune); i++) { + numanode_cpumask = virDomainNumaGetNodeCpumask(numatune, i); + // 'i' indicates the cell id, if the vCPU id is in this cell, + // we need get the corresonding nodeset + if (virBitmapIsBitSet(numanode_cpumask, id)) { + if (virDomainNumatuneMaybeFormatNodeset(numatune, + priv->autoNodeset, + &mem_mask, i) < 0) { + goto cleanup; + } else { + break; + } + } + } + }
This hunk doesn't seem to belong to this patch. The comments (apart from using wrong syntax) don't seem to justify why or how this relates to memory migrability.
It relates to migrability. <numatune> <memory mode="strict" nodeset="1-2", migratable='yes'/> <memnode cellid="0" nodeset="1"/> <memnode cellid="2" nodeset="2"/> </numatune> originally, it will set "1-2" to cgroups cpuset.mems for all vcpus, but acctually, we should set '1' for vcpus in cell 0, set '2' for vcpus in cell 2, and set '1-2' for vcpus in other cells. cgroups can ristrict the memory allocation of the thread at the beginning, and allow the memory page migrability of the operate system or hypervisor. I'll make it clear in comment.
+ if (virCgroupNewThread(priv->cgroup, nameval, id, true, &cgroup) < 0) goto cleanup;
[...]
diff --git a/tests/qemuxml2argvtest.c b/tests/qemuxml2argvtest.c index 49567326d4..e6554518d7 100644 --- a/tests/qemuxml2argvtest.c +++ b/tests/qemuxml2argvtest.c @@ -1950,6 +1950,11 @@ mymain(void)
DO_TEST("numatune-memory", NONE); DO_TEST_PARSE_ERROR("numatune-memory-invalid-nodeset", NONE); + DO_TEST("numatune-memory-migratable", + QEMU_CAPS_NUMA, + QEMU_CAPS_OBJECT_MEMORY_RAM); + DO_TEST_PARSE_ERROR("numatune-memory-migratable", NONE);
Pleased don't use DO_TEST any more. Use DO_TEST_CAPS_* (such as DO_TEST_CAPS_LATEST) instead for new tests.
The negative test doesn't seem to make sense here since it's not testing a negative case in your patch. This is visible from the error message it records:
Got it.
+++ b/tests/qemuxml2argvdata/numatune-memory-migratable.err @@ -0,0 +1 @@ +unsupported configuration: Per-node memory binding is not supported with this QEMU
Thanks for your review, Peter. Regards, Luyao
participants (3)
-
Luyao Zhong
-
Peter Krempa
-
Zhong, Luyao