[PATCH 0/5] qemu: Prefer -numa cpu over -numa node,cpus=

Another round of patches to catch up with the latest command line building
recommendations. This time, the winner is: -numa cpu

Thing is, the way vCPUs are assigned to NUMA nodes is not very deterministic
right now. QEMU relies on this weird mapping where firstly threads are
iterated over, then cores, then (possibly) dies and then sockets. So they
came up with a better scheme:

  -numa cpu,node-id=N,socket-id=S,die-id=D,core-id=C,thread-id=T

where everything is specified on the command line. Long story short, some
QEMU code is copied over to keep the status quo (or whatever you want to
call it); it was buried deep in include/hw/i386/topology.h.

I wanted to allow users to configure the vCPU topology too, but then decided
not to, because neither libvirt nor QEMU can really guarantee that the
topology will look the same in the guest (at least in my testing with
cpu mode='host-passthrough' it didn't). Or, what would you propose? I can
post those patches as a follow up if needed.

Michal Prívozník (5):
  qemuBuildNumaArgStr: Move vars into loops
  qemuBuildNumaArgStr: Use g_autofree on @nodeBackends
  qemuBuildNumaArgStr: Separate out old style of building CPU list
  qemu: Prefer -numa cpu over -numa node,cpus=
  virCPUDefParseXML: Parse uint using virXPathUInt

 src/conf/cpu_conf.c                           |  14 +-
 src/qemu/qemu_command.c                       | 168 +++++++++++++++---
 .../hugepages-nvdimm.x86_64-latest.args       |   4 +-
 ...memory-default-hugepage.x86_64-latest.args |  10 +-
 .../memfd-memory-numa.x86_64-latest.args      |  10 +-
 ...y-hotplug-nvdimm-access.x86_64-latest.args |   4 +-
 ...ry-hotplug-nvdimm-align.x86_64-latest.args |   4 +-
 ...ry-hotplug-nvdimm-label.x86_64-latest.args |   4 +-
 ...ory-hotplug-nvdimm-pmem.x86_64-latest.args |   4 +-
 ...ory-hotplug-nvdimm-ppc64.ppc64-latest.args |   4 +-
 ...hotplug-nvdimm-readonly.x86_64-latest.args |   4 +-
 .../memory-hotplug-nvdimm.x86_64-latest.args  |   4 +-
 ...vhost-user-fs-fd-memory.x86_64-latest.args |   4 +-
 ...vhost-user-fs-hugepages.x86_64-latest.args |   4 +-
 ...host-user-gpu-secondary.x86_64-latest.args |   3 +-
 .../vhost-user-vga.x86_64-latest.args         |   3 +-
 16 files changed, 195 insertions(+), 53 deletions(-)

-- 
2.26.2
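[Editor's note: to illustrate the vCPU-to-topology mapping described in the
cover letter, here is a minimal standalone sketch, not libvirt code; the
topology values (2 sockets, 1 die, 2 cores, 2 threads) are made-up example
inputs, and the arithmetic mirrors the helper added in patch 4 / QEMU's
x86_topo_ids_from_idx().]

  #include <stdio.h>

  int main(void)
  {
      /* Assumed example topology, not taken from any real domain. */
      const unsigned int sockets = 2, dies = 1, cores = 2, threads = 2;
      const unsigned int maxvcpus = sockets * dies * cores * threads;
      unsigned int id;

      for (id = 0; id < maxvcpus; id++) {
          /* Threads vary fastest, then cores, then dies, then sockets. */
          unsigned int thread = id % threads;
          unsigned int core = id / threads % cores;
          unsigned int die = id / (cores * threads) % dies;
          unsigned int socket = id / (dies * cores * threads);

          printf("vCPU %u -> socket=%u die=%u core=%u thread=%u\n",
                 id, socket, die, core, thread);
      }
      return 0;
  }

With these example numbers, vCPUs 0-3 land on socket 0 and vCPUs 4-7 on
socket 1, which is exactly the ordering the old -numa node,cpus= interface
left implicit.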

There are couple of variables that are used only in a single loop. Move their definitions into their respective loops and use g_autofree on @cpumask. Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/qemu/qemu_command.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index 419eca5675..5af22e9359 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -7066,12 +7066,8 @@ qemuBuildNumaArgStr(virQEMUDriverConfigPtr cfg, size_t i, j; virQEMUCapsPtr qemuCaps = priv->qemuCaps; g_auto(virBuffer) buf = VIR_BUFFER_INITIALIZER; - char *cpumask = NULL; - char *tmpmask = NULL; - char *next = NULL; virBufferPtr nodeBackends = NULL; bool needBackend = false; - int rc; int ret = -1; size_t ncells = virDomainNumaGetNodeCount(def->numa); @@ -7091,6 +7087,7 @@ qemuBuildNumaArgStr(virQEMUDriverConfigPtr cfg, if (virQEMUCapsGet(qemuCaps, QEMU_CAPS_OBJECT_MEMORY_RAM) || virQEMUCapsGet(qemuCaps, QEMU_CAPS_OBJECT_MEMORY_FILE) || virQEMUCapsGet(qemuCaps, QEMU_CAPS_OBJECT_MEMORY_MEMFD)) { + int rc; for (i = 0; i < ncells; i++) { if ((rc = qemuBuildMemoryCellBackendStr(def, cfg, i, priv, @@ -7112,7 +7109,10 @@ qemuBuildNumaArgStr(virQEMUDriverConfigPtr cfg, goto cleanup; for (i = 0; i < ncells; i++) { - VIR_FREE(cpumask); + g_autofree char *cpumask = NULL; + char *tmpmask = NULL; + char *next = NULL; + if (!(cpumask = virBitmapFormat(virDomainNumaGetNodeCpumask(def->numa, i)))) goto cleanup; @@ -7159,8 +7159,6 @@ qemuBuildNumaArgStr(virQEMUDriverConfigPtr cfg, ret = 0; cleanup: - VIR_FREE(cpumask); - if (nodeBackends) { for (i = 0; i < ncells; i++) virBufferFreeAndReset(&nodeBackends[i]); -- 2.26.2

Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/qemu/qemu_command.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index 5af22e9359..5a438d07c3 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -7066,13 +7066,13 @@ qemuBuildNumaArgStr(virQEMUDriverConfigPtr cfg, size_t i, j; virQEMUCapsPtr qemuCaps = priv->qemuCaps; g_auto(virBuffer) buf = VIR_BUFFER_INITIALIZER; - virBufferPtr nodeBackends = NULL; + g_autofree virBufferPtr nodeBackends = NULL; bool needBackend = false; int ret = -1; size_t ncells = virDomainNumaGetNodeCount(def->numa); if (!virDomainNumatuneNodesetIsAvailable(def->numa, priv->autoNodeset)) - goto cleanup; + return -1; if (!virQEMUCapsGetMachineNumaMemSupported(qemuCaps, def->virtType, @@ -7080,7 +7080,7 @@ qemuBuildNumaArgStr(virQEMUDriverConfigPtr cfg, needBackend = true; if (VIR_ALLOC_N(nodeBackends, ncells) < 0) - goto cleanup; + return -1; /* using of -numa memdev= cannot be combined with -numa mem=, thus we * need to check which approach to use */ @@ -7159,12 +7159,8 @@ qemuBuildNumaArgStr(virQEMUDriverConfigPtr cfg, ret = 0; cleanup: - if (nodeBackends) { - for (i = 0; i < ncells; i++) - virBufferFreeAndReset(&nodeBackends[i]); - - VIR_FREE(nodeBackends); - } + for (i = 0; i < ncells; i++) + virBufferFreeAndReset(&nodeBackends[i]); return ret; } -- 2.26.2

Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/qemu/qemu_command.c | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index 5a438d07c3..7d84fd8b5e 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -7057,6 +7057,28 @@ qemuBuildIOThreadCommandLine(virCommandPtr cmd, } +static int +qemuBuildNumaOldCPUs(virBufferPtr buf, + virBitmapPtr cpu) +{ + g_autofree char *cpumask = NULL; + char *tmpmask = NULL; + char *next = NULL; + + if (!(cpumask = virBitmapFormat(cpu))) + return -1; + + for (tmpmask = cpumask; tmpmask; tmpmask = next) { + if ((next = strchr(tmpmask, ','))) + *(next++) = '\0'; + virBufferAddLit(buf, ",cpus="); + virBufferAdd(buf, tmpmask, -1); + } + + return 0; +} + + static int qemuBuildNumaArgStr(virQEMUDriverConfigPtr cfg, virDomainDefPtr def, @@ -7109,13 +7131,6 @@ qemuBuildNumaArgStr(virQEMUDriverConfigPtr cfg, goto cleanup; for (i = 0; i < ncells; i++) { - g_autofree char *cpumask = NULL; - char *tmpmask = NULL; - char *next = NULL; - - if (!(cpumask = virBitmapFormat(virDomainNumaGetNodeCpumask(def->numa, i)))) - goto cleanup; - if (needBackend) { virCommandAddArg(cmd, "-object"); virCommandAddArgBuffer(cmd, &nodeBackends[i]); @@ -7124,12 +7139,9 @@ qemuBuildNumaArgStr(virQEMUDriverConfigPtr cfg, virCommandAddArg(cmd, "-numa"); virBufferAsprintf(&buf, "node,nodeid=%zu", i); - for (tmpmask = cpumask; tmpmask; tmpmask = next) { - if ((next = strchr(tmpmask, ','))) - *(next++) = '\0'; - virBufferAddLit(&buf, ",cpus="); - virBufferAdd(&buf, tmpmask, -1); - } + if (qemuBuildNumaOldCPUs(&buf, + virDomainNumaGetNodeCpumask(def->numa, i)) < 0) + goto cleanup; if (needBackend) virBufferAsprintf(&buf, ",memdev=ram-node%zu", i); -- 2.26.2
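[Editor's note: a tiny self-contained sketch of what the split-out
qemuBuildNumaOldCPUs() does, written in plain C without libvirt types; the
input string "0-3,7" is an assumed example of what virBitmapFormat() might
produce.]

  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
      /* Assumed example bitmap string in virBitmapFormat() range syntax. */
      char cpumask[] = "0-3,7";
      char *tmpmask, *next;

      printf("node,nodeid=0");
      for (tmpmask = cpumask; tmpmask; tmpmask = next) {
          if ((next = strchr(tmpmask, ',')))
              *(next++) = '\0';        /* split on commas, one range at a time */
          printf(",cpus=%s", tmpmask);
      }
      printf("\n");                    /* prints: node,nodeid=0,cpus=0-3,cpus=7 */
      return 0;
  }

Each comma-separated range of the formatted bitmap ends up as its own
legacy ",cpus=" attribute appended to the "-numa node,..." argument.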

QEMU is trying to obsolete -numa node,cpus= because that uses ambiguous vCPU id to [socket, die, core, thread] mapping. The new form is: -numa cpu,node-id=N,socket-id=S,die-id=D,core-id=C,thread-id=T which is repeated for every vCPU and places it at [S, D, C, T] into guest NUMA node N. While in general this is magic mapping, we can deal with it. Firstly, with QEMU 2.7 or newer, libvirt ensures that if topology is given then maxvcpus must be sockets * dies * cores * threads (i.e. there are no 'holes'). Secondly, if no topology is given then libvirt itself places each vCPU into a different socket (basically, it fakes topology of: [maxvcpus, 1, 1, 1]) Thirdly, we can copy whatever QEMU is doing when mapping vCPUs onto topology, to make sure vCPUs don't start to move around. Note, migration from old to new cmd line works and therefore doesn't need any special handling. Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1678085 Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/qemu/qemu_command.c | 108 +++++++++++++++++- .../hugepages-nvdimm.x86_64-latest.args | 4 +- ...memory-default-hugepage.x86_64-latest.args | 10 +- .../memfd-memory-numa.x86_64-latest.args | 10 +- ...y-hotplug-nvdimm-access.x86_64-latest.args | 4 +- ...ry-hotplug-nvdimm-align.x86_64-latest.args | 4 +- ...ry-hotplug-nvdimm-label.x86_64-latest.args | 4 +- ...ory-hotplug-nvdimm-pmem.x86_64-latest.args | 4 +- ...ory-hotplug-nvdimm-ppc64.ppc64-latest.args | 4 +- ...hotplug-nvdimm-readonly.x86_64-latest.args | 4 +- .../memory-hotplug-nvdimm.x86_64-latest.args | 4 +- ...vhost-user-fs-fd-memory.x86_64-latest.args | 4 +- ...vhost-user-fs-hugepages.x86_64-latest.args | 4 +- ...host-user-gpu-secondary.x86_64-latest.args | 3 +- .../vhost-user-vga.x86_64-latest.args | 3 +- 15 files changed, 158 insertions(+), 16 deletions(-) diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index 7d84fd8b5e..0de4fe4905 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -7079,6 +7079,91 @@ qemuBuildNumaOldCPUs(virBufferPtr buf, } +/** + * qemuTranlsatevCPUID: + * + * For given vCPU @id and vCPU topology (@cpu) compute corresponding + * @socket, @die, @core and @thread). This assumes linear topology, + * that is every [socket, die, core, thread] combination is valid vCPU + * ID and there are no 'holes'. This is ensured by + * qemuValidateDomainDef() if QEMU_CAPS_QUERY_HOTPLUGGABLE_CPUS is + * set. + * + * Moreover, if @diesSupported is false (QEMU lacks + * QEMU_CAPS_SMP_DIES) then @die is set to zero and @socket is + * computed without taking numbed of dies into account. + * + * The algorithm is shamelessly copied over from QEMU's + * x86_topo_ids_from_idx() and its history (before introducing dies). + */ +static void +qemuTranlsatevCPUID(unsigned int id, + bool diesSupported, + virCPUDefPtr cpu, + unsigned int *socket, + unsigned int *die, + unsigned int *core, + unsigned int *thread) +{ + if (cpu && cpu->sockets) { + *thread = id % cpu->threads; + *core = id / cpu->threads % cpu->cores; + if (diesSupported) { + *die = id / (cpu->cores * cpu->threads) % cpu->dies; + *socket = id / (cpu->dies * cpu->cores * cpu->threads); + } else { + *die = 0; + *socket = id / (cpu->cores * cpu->threads) % cpu->sockets; + } + } else { + /* If no topology was provided, then qemuBuildSmpCommandLine() + * puts all vCPUs into a separate socket. 
*/ + *thread = 0; + *core = 0; + *die = 0; + *socket = id; + } +} + + +static void +qemuBuildNumaNewCPUs(virCommandPtr cmd, + virCPUDefPtr cpu, + virBitmapPtr cpumask, + size_t nodeid, + virQEMUCapsPtr qemuCaps) +{ + const bool diesSupported = virQEMUCapsGet(qemuCaps, QEMU_CAPS_SMP_DIES); + ssize_t vcpuid = -1; + + while ((vcpuid = virBitmapNextSetBit(cpumask, vcpuid)) >= 0) { + unsigned int socket; + unsigned int die; + unsigned int core; + unsigned int thread; + + qemuTranlsatevCPUID(vcpuid, diesSupported, cpu, + &socket, &die, &core, &thread); + + virCommandAddArg(cmd, "-numa"); + + /* The simple fact that dies are supported by QEMU doesn't mean we can + * put it onto command line. QEMU will accept die-id only if -smp dies + * was set to a value greater than 1. On the other hand, this allows us + * to generate shorter command line. */ + if (diesSupported && cpu && cpu->dies > 1) { + virCommandAddArgFormat(cmd, + "cpu,node-id=%zu,socket-id=%u,die-id=%u,core-id=%u,thread-id=%u", + nodeid, socket, die, core, thread); + } else { + virCommandAddArgFormat(cmd, + "cpu,node-id=%zu,socket-id=%u,core-id=%u,thread-id=%u", + nodeid, socket, core, thread); + } + } +} + + static int qemuBuildNumaArgStr(virQEMUDriverConfigPtr cfg, virDomainDefPtr def, @@ -7090,6 +7175,7 @@ qemuBuildNumaArgStr(virQEMUDriverConfigPtr cfg, g_auto(virBuffer) buf = VIR_BUFFER_INITIALIZER; g_autofree virBufferPtr nodeBackends = NULL; bool needBackend = false; + bool newCpus = false; int ret = -1; size_t ncells = virDomainNumaGetNodeCount(def->numa); @@ -7130,7 +7216,21 @@ qemuBuildNumaArgStr(virQEMUDriverConfigPtr cfg, qemuBuildMemPathStr(def, cmd, priv) < 0) goto cleanup; + /* Use modern style of specifying vCPU topology only if: + * -numa cpu is available, introduced in the same time as -numa + * dist, hence slightly misleading capability test, and + * query-hotpluggable-cpus is avialable, because then + * qemuValidateDomainDef() ensures that if + * topology is specified it matches max vCPU + * count and we can make some shortcuts in + * qemuTranlsatevCPUID(). 
+ */ + newCpus = virQEMUCapsGet(qemuCaps, QEMU_CAPS_NUMA_DIST) && + virQEMUCapsGet(qemuCaps, QEMU_CAPS_QUERY_HOTPLUGGABLE_CPUS); + for (i = 0; i < ncells; i++) { + virBitmapPtr cpu = virDomainNumaGetNodeCpumask(def->numa, i); + if (needBackend) { virCommandAddArg(cmd, "-object"); virCommandAddArgBuffer(cmd, &nodeBackends[i]); @@ -7139,8 +7239,9 @@ qemuBuildNumaArgStr(virQEMUDriverConfigPtr cfg, virCommandAddArg(cmd, "-numa"); virBufferAsprintf(&buf, "node,nodeid=%zu", i); - if (qemuBuildNumaOldCPUs(&buf, - virDomainNumaGetNodeCpumask(def->numa, i)) < 0) + /* -numa cpu is supported from the same release as -numa dist */ + if (!newCpus && + qemuBuildNumaOldCPUs(&buf, cpu) < 0) goto cleanup; if (needBackend) @@ -7150,6 +7251,9 @@ qemuBuildNumaArgStr(virQEMUDriverConfigPtr cfg, virDomainNumaGetNodeMemorySize(def->numa, i) / 1024); virCommandAddArgBuffer(cmd, &buf); + + if (newCpus) + qemuBuildNumaNewCPUs(cmd, def->cpu, cpu, i, qemuCaps); } /* If NUMA node distance is specified for at least one pair diff --git a/tests/qemuxml2argvdata/hugepages-nvdimm.x86_64-latest.args b/tests/qemuxml2argvdata/hugepages-nvdimm.x86_64-latest.args index e80a95c84b..804fc59d74 100644 --- a/tests/qemuxml2argvdata/hugepages-nvdimm.x86_64-latest.args +++ b/tests/qemuxml2argvdata/hugepages-nvdimm.x86_64-latest.args @@ -19,7 +19,9 @@ file=/tmp/lib/domain--1-QEMUGuest1/master-key.aes \ -smp 2,sockets=2,dies=1,cores=1,threads=1 \ -object memory-backend-file,id=ram-node0,prealloc=yes,\ mem-path=/dev/hugepages2M/libvirt/qemu/-1-QEMUGuest1,share=yes,size=1073741824 \ --numa node,nodeid=0,cpus=0-1,memdev=ram-node0 \ +-numa node,nodeid=0,memdev=ram-node0 \ +-numa cpu,node-id=0,socket-id=0,core-id=0,thread-id=0 \ +-numa cpu,node-id=0,socket-id=1,core-id=0,thread-id=0 \ -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm,\ share=yes,size=536870912 \ -device nvdimm,node=0,memdev=memnvdimm0,id=nvdimm0,slot=0 \ diff --git a/tests/qemuxml2argvdata/memfd-memory-default-hugepage.x86_64-latest.args b/tests/qemuxml2argvdata/memfd-memory-default-hugepage.x86_64-latest.args index 5d256c42bc..2e8d933fc2 100644 --- a/tests/qemuxml2argvdata/memfd-memory-default-hugepage.x86_64-latest.args +++ b/tests/qemuxml2argvdata/memfd-memory-default-hugepage.x86_64-latest.args @@ -20,7 +20,15 @@ file=/tmp/lib/domain--1-instance-00000092/master-key.aes \ -smp 8,sockets=1,dies=1,cores=8,threads=1 \ -object memory-backend-memfd,id=ram-node0,hugetlb=yes,hugetlbsize=2097152,\ share=yes,size=15032385536,host-nodes=3,policy=preferred \ --numa node,nodeid=0,cpus=0-7,memdev=ram-node0 \ +-numa node,nodeid=0,memdev=ram-node0 \ +-numa cpu,node-id=0,socket-id=0,core-id=0,thread-id=0 \ +-numa cpu,node-id=0,socket-id=0,core-id=1,thread-id=0 \ +-numa cpu,node-id=0,socket-id=0,core-id=2,thread-id=0 \ +-numa cpu,node-id=0,socket-id=0,core-id=3,thread-id=0 \ +-numa cpu,node-id=0,socket-id=0,core-id=4,thread-id=0 \ +-numa cpu,node-id=0,socket-id=0,core-id=5,thread-id=0 \ +-numa cpu,node-id=0,socket-id=0,core-id=6,thread-id=0 \ +-numa cpu,node-id=0,socket-id=0,core-id=7,thread-id=0 \ -uuid 126f2720-6f8e-45ab-a886-ec9277079a67 \ -display none \ -no-user-config \ diff --git a/tests/qemuxml2argvdata/memfd-memory-numa.x86_64-latest.args b/tests/qemuxml2argvdata/memfd-memory-numa.x86_64-latest.args index 5d256c42bc..2e8d933fc2 100644 --- a/tests/qemuxml2argvdata/memfd-memory-numa.x86_64-latest.args +++ b/tests/qemuxml2argvdata/memfd-memory-numa.x86_64-latest.args @@ -20,7 +20,15 @@ file=/tmp/lib/domain--1-instance-00000092/master-key.aes \ -smp 
8,sockets=1,dies=1,cores=8,threads=1 \ -object memory-backend-memfd,id=ram-node0,hugetlb=yes,hugetlbsize=2097152,\ share=yes,size=15032385536,host-nodes=3,policy=preferred \ --numa node,nodeid=0,cpus=0-7,memdev=ram-node0 \ +-numa node,nodeid=0,memdev=ram-node0 \ +-numa cpu,node-id=0,socket-id=0,core-id=0,thread-id=0 \ +-numa cpu,node-id=0,socket-id=0,core-id=1,thread-id=0 \ +-numa cpu,node-id=0,socket-id=0,core-id=2,thread-id=0 \ +-numa cpu,node-id=0,socket-id=0,core-id=3,thread-id=0 \ +-numa cpu,node-id=0,socket-id=0,core-id=4,thread-id=0 \ +-numa cpu,node-id=0,socket-id=0,core-id=5,thread-id=0 \ +-numa cpu,node-id=0,socket-id=0,core-id=6,thread-id=0 \ +-numa cpu,node-id=0,socket-id=0,core-id=7,thread-id=0 \ -uuid 126f2720-6f8e-45ab-a886-ec9277079a67 \ -display none \ -no-user-config \ diff --git a/tests/qemuxml2argvdata/memory-hotplug-nvdimm-access.x86_64-latest.args b/tests/qemuxml2argvdata/memory-hotplug-nvdimm-access.x86_64-latest.args index 89138f46c4..6bf575d486 100644 --- a/tests/qemuxml2argvdata/memory-hotplug-nvdimm-access.x86_64-latest.args +++ b/tests/qemuxml2argvdata/memory-hotplug-nvdimm-access.x86_64-latest.args @@ -17,7 +17,9 @@ file=/tmp/lib/domain--1-QEMUGuest1/master-key.aes \ -m size=219136k,slots=16,maxmem=1099511627776k \ -overcommit mem-lock=off \ -smp 2,sockets=2,dies=1,cores=1,threads=1 \ --numa node,nodeid=0,cpus=0-1,mem=214 \ +-numa node,nodeid=0,mem=214 \ +-numa cpu,node-id=0,socket-id=0,core-id=0,thread-id=0 \ +-numa cpu,node-id=0,socket-id=1,core-id=0,thread-id=0 \ -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm,\ share=no,size=536870912 \ -device nvdimm,node=0,memdev=memnvdimm0,id=nvdimm0,slot=0 \ diff --git a/tests/qemuxml2argvdata/memory-hotplug-nvdimm-align.x86_64-latest.args b/tests/qemuxml2argvdata/memory-hotplug-nvdimm-align.x86_64-latest.args index 1a8e7932dc..84571a056a 100644 --- a/tests/qemuxml2argvdata/memory-hotplug-nvdimm-align.x86_64-latest.args +++ b/tests/qemuxml2argvdata/memory-hotplug-nvdimm-align.x86_64-latest.args @@ -17,7 +17,9 @@ file=/tmp/lib/domain--1-QEMUGuest1/master-key.aes \ -m size=219136k,slots=16,maxmem=1099511627776k \ -overcommit mem-lock=off \ -smp 2,sockets=2,dies=1,cores=1,threads=1 \ --numa node,nodeid=0,cpus=0-1,mem=214 \ +-numa node,nodeid=0,mem=214 \ +-numa cpu,node-id=0,socket-id=0,core-id=0,thread-id=0 \ +-numa cpu,node-id=0,socket-id=1,core-id=0,thread-id=0 \ -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm,\ share=no,size=536870912,align=2097152 \ -device nvdimm,node=0,memdev=memnvdimm0,id=nvdimm0,slot=0 \ diff --git a/tests/qemuxml2argvdata/memory-hotplug-nvdimm-label.x86_64-latest.args b/tests/qemuxml2argvdata/memory-hotplug-nvdimm-label.x86_64-latest.args index ef32c663de..d684d15423 100644 --- a/tests/qemuxml2argvdata/memory-hotplug-nvdimm-label.x86_64-latest.args +++ b/tests/qemuxml2argvdata/memory-hotplug-nvdimm-label.x86_64-latest.args @@ -17,7 +17,9 @@ file=/tmp/lib/domain--1-QEMUGuest1/master-key.aes \ -m size=219136k,slots=16,maxmem=1099511627776k \ -overcommit mem-lock=off \ -smp 2,sockets=2,dies=1,cores=1,threads=1 \ --numa node,nodeid=0,cpus=0-1,mem=214 \ +-numa node,nodeid=0,mem=214 \ +-numa cpu,node-id=0,socket-id=0,core-id=0,thread-id=0 \ +-numa cpu,node-id=0,socket-id=1,core-id=0,thread-id=0 \ -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm,\ share=no,size=536870912 \ -device nvdimm,node=0,label-size=131072,memdev=memnvdimm0,id=nvdimm0,slot=0 \ diff --git 
a/tests/qemuxml2argvdata/memory-hotplug-nvdimm-pmem.x86_64-latest.args b/tests/qemuxml2argvdata/memory-hotplug-nvdimm-pmem.x86_64-latest.args index 5dfba9b50a..d1374772da 100644 --- a/tests/qemuxml2argvdata/memory-hotplug-nvdimm-pmem.x86_64-latest.args +++ b/tests/qemuxml2argvdata/memory-hotplug-nvdimm-pmem.x86_64-latest.args @@ -17,7 +17,9 @@ file=/tmp/lib/domain--1-QEMUGuest1/master-key.aes \ -m size=219136k,slots=16,maxmem=1099511627776k \ -overcommit mem-lock=off \ -smp 2,sockets=2,dies=1,cores=1,threads=1 \ --numa node,nodeid=0,cpus=0-1,mem=214 \ +-numa node,nodeid=0,mem=214 \ +-numa cpu,node-id=0,socket-id=0,core-id=0,thread-id=0 \ +-numa cpu,node-id=0,socket-id=1,core-id=0,thread-id=0 \ -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm,\ share=no,size=536870912,pmem=on \ -device nvdimm,node=0,memdev=memnvdimm0,id=nvdimm0,slot=0 \ diff --git a/tests/qemuxml2argvdata/memory-hotplug-nvdimm-ppc64.ppc64-latest.args b/tests/qemuxml2argvdata/memory-hotplug-nvdimm-ppc64.ppc64-latest.args index eff80dcf80..02d2faa054 100644 --- a/tests/qemuxml2argvdata/memory-hotplug-nvdimm-ppc64.ppc64-latest.args +++ b/tests/qemuxml2argvdata/memory-hotplug-nvdimm-ppc64.ppc64-latest.args @@ -17,7 +17,9 @@ file=/tmp/lib/domain--1-QEMUGuest1/master-key.aes \ -m size=1048576k,slots=16,maxmem=1099511627776k \ -overcommit mem-lock=off \ -smp 2,sockets=2,dies=1,cores=1,threads=1 \ --numa node,nodeid=0,cpus=0-1,mem=1024 \ +-numa node,nodeid=0,mem=1024 \ +-numa cpu,node-id=0,socket-id=0,core-id=0,thread-id=0 \ +-numa cpu,node-id=0,socket-id=1,core-id=0,thread-id=0 \ -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm,\ size=537001984 \ -device nvdimm,node=0,label-size=131072,\ diff --git a/tests/qemuxml2argvdata/memory-hotplug-nvdimm-readonly.x86_64-latest.args b/tests/qemuxml2argvdata/memory-hotplug-nvdimm-readonly.x86_64-latest.args index 7088a4f054..a757007690 100644 --- a/tests/qemuxml2argvdata/memory-hotplug-nvdimm-readonly.x86_64-latest.args +++ b/tests/qemuxml2argvdata/memory-hotplug-nvdimm-readonly.x86_64-latest.args @@ -17,7 +17,9 @@ file=/tmp/lib/domain--1-QEMUGuest1/master-key.aes \ -m size=219136k,slots=16,maxmem=1099511627776k \ -overcommit mem-lock=off \ -smp 2,sockets=2,dies=1,cores=1,threads=1 \ --numa node,nodeid=0,cpus=0-1,mem=214 \ +-numa node,nodeid=0,mem=214 \ +-numa cpu,node-id=0,socket-id=0,core-id=0,thread-id=0 \ +-numa cpu,node-id=0,socket-id=1,core-id=0,thread-id=0 \ -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm,\ share=no,size=536870912 \ -device nvdimm,node=0,unarmed=on,memdev=memnvdimm0,id=nvdimm0,slot=0 \ diff --git a/tests/qemuxml2argvdata/memory-hotplug-nvdimm.x86_64-latest.args b/tests/qemuxml2argvdata/memory-hotplug-nvdimm.x86_64-latest.args index 60d6d207c5..e673a4acad 100644 --- a/tests/qemuxml2argvdata/memory-hotplug-nvdimm.x86_64-latest.args +++ b/tests/qemuxml2argvdata/memory-hotplug-nvdimm.x86_64-latest.args @@ -17,7 +17,9 @@ file=/tmp/lib/domain--1-QEMUGuest1/master-key.aes \ -m size=1048576k,slots=16,maxmem=1099511627776k \ -overcommit mem-lock=off \ -smp 2,sockets=2,dies=1,cores=1,threads=1 \ --numa node,nodeid=0,cpus=0-1,mem=1024 \ +-numa node,nodeid=0,mem=1024 \ +-numa cpu,node-id=0,socket-id=0,core-id=0,thread-id=0 \ +-numa cpu,node-id=0,socket-id=1,core-id=0,thread-id=0 \ -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm,\ size=536870912 \ -device nvdimm,node=0,memdev=memnvdimm0,id=nvdimm0,slot=0 \ diff --git 
a/tests/qemuxml2argvdata/vhost-user-fs-fd-memory.x86_64-latest.args b/tests/qemuxml2argvdata/vhost-user-fs-fd-memory.x86_64-latest.args index dd5f68abc5..bce4e07a5d 100644 --- a/tests/qemuxml2argvdata/vhost-user-fs-fd-memory.x86_64-latest.args +++ b/tests/qemuxml2argvdata/vhost-user-fs-fd-memory.x86_64-latest.args @@ -20,7 +20,9 @@ file=/tmp/lib/domain--1-guest/master-key.aes \ -object memory-backend-file,id=ram-node0,\ mem-path=/var/lib/libvirt/qemu/ram/-1-guest/ram-node0,share=yes,\ size=15032385536 \ --numa node,nodeid=0,cpus=0-1,memdev=ram-node0 \ +-numa node,nodeid=0,memdev=ram-node0 \ +-numa cpu,node-id=0,socket-id=0,core-id=0,thread-id=0 \ +-numa cpu,node-id=0,socket-id=1,core-id=0,thread-id=0 \ -uuid 126f2720-6f8e-45ab-a886-ec9277079a67 \ -display none \ -no-user-config \ diff --git a/tests/qemuxml2argvdata/vhost-user-fs-hugepages.x86_64-latest.args b/tests/qemuxml2argvdata/vhost-user-fs-hugepages.x86_64-latest.args index 258fa7813f..f21c08037d 100644 --- a/tests/qemuxml2argvdata/vhost-user-fs-hugepages.x86_64-latest.args +++ b/tests/qemuxml2argvdata/vhost-user-fs-hugepages.x86_64-latest.args @@ -19,7 +19,9 @@ file=/tmp/lib/domain--1-guest/master-key.aes \ -smp 2,sockets=2,cores=1,threads=1 \ -object memory-backend-file,id=ram-node0,prealloc=yes,\ mem-path=/dev/hugepages2M/libvirt/qemu/-1-guest,share=yes,size=2147483648 \ --numa node,nodeid=0,cpus=0-1,memdev=ram-node0 \ +-numa node,nodeid=0,memdev=ram-node0 \ +-numa cpu,node-id=0,socket-id=0,core-id=0,thread-id=0 \ +-numa cpu,node-id=0,socket-id=1,core-id=0,thread-id=0 \ -uuid 1ccfd97d-5eb4-478a-bbe6-88d254c16db7 \ -display none \ -no-user-config \ diff --git a/tests/qemuxml2argvdata/vhost-user-gpu-secondary.x86_64-latest.args b/tests/qemuxml2argvdata/vhost-user-gpu-secondary.x86_64-latest.args index e99a5342dc..c935a8942c 100644 --- a/tests/qemuxml2argvdata/vhost-user-gpu-secondary.x86_64-latest.args +++ b/tests/qemuxml2argvdata/vhost-user-gpu-secondary.x86_64-latest.args @@ -18,7 +18,8 @@ file=/tmp/lib/domain--1-QEMUGuest1/master-key.aes \ -overcommit mem-lock=off \ -smp 1,sockets=1,cores=1,threads=1 \ -object memory-backend-memfd,id=ram-node0,share=yes,size=224395264 \ --numa node,nodeid=0,cpus=0,memdev=ram-node0 \ +-numa node,nodeid=0,memdev=ram-node0 \ +-numa cpu,node-id=0,socket-id=0,core-id=0,thread-id=0 \ -uuid c7a5fdbd-edaf-9455-926a-d65c16db1809 \ -display none \ -no-user-config \ diff --git a/tests/qemuxml2argvdata/vhost-user-vga.x86_64-latest.args b/tests/qemuxml2argvdata/vhost-user-vga.x86_64-latest.args index 277bf8c646..7f90f0dfa3 100644 --- a/tests/qemuxml2argvdata/vhost-user-vga.x86_64-latest.args +++ b/tests/qemuxml2argvdata/vhost-user-vga.x86_64-latest.args @@ -18,7 +18,8 @@ file=/tmp/lib/domain--1-QEMUGuest1/master-key.aes \ -overcommit mem-lock=off \ -smp 1,sockets=1,cores=1,threads=1 \ -object memory-backend-memfd,id=ram-node0,share=yes,size=224395264 \ --numa node,nodeid=0,cpus=0,memdev=ram-node0 \ +-numa node,nodeid=0,memdev=ram-node0 \ +-numa cpu,node-id=0,socket-id=0,core-id=0,thread-id=0 \ -uuid c7a5fdbd-edaf-9455-926a-d65c16db1809 \ -display none \ -no-user-config \ -- 2.26.2
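[Editor's note: a small standalone sketch, not the patch itself, that
reproduces the two "-numa cpu" arguments visible in the updated test files
above for -smp 2,sockets=2,dies=1,cores=1,threads=1 with both vCPUs in guest
node 0; the node assignment is assumed, and die-id is omitted to mirror the
patch's branch for dies not greater than 1.]

  #include <stdio.h>

  int main(void)
  {
      /* Topology from the x86_64-latest test cases; node id assumed to be 0. */
      const unsigned int sockets = 2, cores = 1, threads = 1;
      const unsigned int nvcpus = 2, nodeid = 0;
      unsigned int id;

      for (id = 0; id < nvcpus; id++) {
          unsigned int thread = id % threads;
          unsigned int core = id / threads % cores;
          unsigned int socket = id / (cores * threads) % sockets;

          /* die-id left out, matching the dies == 1 case in the patch */
          printf("-numa cpu,node-id=%u,socket-id=%u,core-id=%u,thread-id=%u\n",
                 nodeid, socket, core, thread);
      }
      return 0;
  }

The two lines it prints match the hunks in e.g.
hugepages-nvdimm.x86_64-latest.args above.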

On Fri, 22 May 2020 16:14:14 +0200 Michal Privoznik <mprivozn@redhat.com> wrote:
QEMU is trying to obsolete -numa node,cpus= because that uses ambiguous vCPU id to [socket, die, core, thread] mapping. The new form is:
-numa cpu,node-id=N,socket-id=S,die-id=D,core-id=C,thread-id=T
which is repeated for every vCPU and places it at [S, D, C, T] into guest NUMA node N.
While in general this is magic mapping, we can deal with it. Firstly, with QEMU 2.7 or newer, libvirt ensures that if topology is given then maxvcpus must be sockets * dies * cores * threads (i.e. there are no 'holes'). Secondly, if no topology is given then libvirt itself places each vCPU into a different socket (basically, it fakes topology of: [maxvcpus, 1, 1, 1]) Thirdly, we can copy whatever QEMU is doing when mapping vCPUs onto topology, to make sure vCPUs don't start to move around.
Note, migration from old to new cmd line works and therefore doesn't need any special handling.
Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1678085
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/qemu/qemu_command.c | 108 +++++++++++++++++- .../hugepages-nvdimm.x86_64-latest.args | 4 +- ...memory-default-hugepage.x86_64-latest.args | 10 +- .../memfd-memory-numa.x86_64-latest.args | 10 +- ...y-hotplug-nvdimm-access.x86_64-latest.args | 4 +- ...ry-hotplug-nvdimm-align.x86_64-latest.args | 4 +- ...ry-hotplug-nvdimm-label.x86_64-latest.args | 4 +- ...ory-hotplug-nvdimm-pmem.x86_64-latest.args | 4 +- ...ory-hotplug-nvdimm-ppc64.ppc64-latest.args | 4 +- ...hotplug-nvdimm-readonly.x86_64-latest.args | 4 +- .../memory-hotplug-nvdimm.x86_64-latest.args | 4 +- ...vhost-user-fs-fd-memory.x86_64-latest.args | 4 +- ...vhost-user-fs-hugepages.x86_64-latest.args | 4 +- ...host-user-gpu-secondary.x86_64-latest.args | 3 +- .../vhost-user-vga.x86_64-latest.args | 3 +- 15 files changed, 158 insertions(+), 16 deletions(-)
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index 7d84fd8b5e..0de4fe4905 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -7079,6 +7079,91 @@ qemuBuildNumaOldCPUs(virBufferPtr buf, }
+/** + * qemuTranlsatevCPUID: + * + * For given vCPU @id and vCPU topology (@cpu) compute corresponding + * @socket, @die, @core and @thread). This assumes linear topology, + * that is every [socket, die, core, thread] combination is valid vCPU + * ID and there are no 'holes'. This is ensured by + * qemuValidateDomainDef() if QEMU_CAPS_QUERY_HOTPLUGGABLE_CPUS is + * set.

I wouldn't make this assumption; each machine can have (and has) its own layout, and now it's not hard to change that per machine version if necessary.
I'd suppose one could pull the list of possible CPUs from QEMU started in preconfig mode with the desired -smp x,y,z using QUERY_HOTPLUGGABLE_CPUS, and then continue to configure NUMA with QMP commands using the provided CPU layout. How to present it to the libvirt user I'm not sure (give them that list perhaps and let them select from it???) But that's irrelevant to the patch: magical IDs for socket/core/...whatever should not be generated by libvirt anymore, but rather taken from QEMU for a given machine + -smp combination. CCing Peter, as I vaguely recall him working on this issue (preconfig + numa over QMP).
+ * Moreover, if @diesSupported is false (QEMU lacks + * QEMU_CAPS_SMP_DIES) then @die is set to zero and @socket is + * computed without taking numbed of dies into account. + * + * The algorithm is shamelessly copied over from QEMU's + * x86_topo_ids_from_idx() and its history (before introducing dies). + */ +static void +qemuTranlsatevCPUID(unsigned int id, + bool diesSupported, + virCPUDefPtr cpu, + unsigned int *socket, + unsigned int *die, + unsigned int *core, + unsigned int *thread) +{ + if (cpu && cpu->sockets) { + *thread = id % cpu->threads; + *core = id / cpu->threads % cpu->cores; + if (diesSupported) { + *die = id / (cpu->cores * cpu->threads) % cpu->dies; + *socket = id / (cpu->dies * cpu->cores * cpu->threads); + } else { + *die = 0; + *socket = id / (cpu->cores * cpu->threads) % cpu->sockets; + } + } else { + /* If no topology was provided, then qemuBuildSmpCommandLine() + * puts all vCPUs into a separate socket. */ + *thread = 0; + *core = 0; + *die = 0; + *socket = id; + } +} + + +static void +qemuBuildNumaNewCPUs(virCommandPtr cmd, + virCPUDefPtr cpu, + virBitmapPtr cpumask, + size_t nodeid, + virQEMUCapsPtr qemuCaps) +{ + const bool diesSupported = virQEMUCapsGet(qemuCaps, QEMU_CAPS_SMP_DIES); + ssize_t vcpuid = -1; + + while ((vcpuid = virBitmapNextSetBit(cpumask, vcpuid)) >= 0) { + unsigned int socket; + unsigned int die; + unsigned int core; + unsigned int thread; + + qemuTranlsatevCPUID(vcpuid, diesSupported, cpu, + &socket, &die, &core, &thread); + + virCommandAddArg(cmd, "-numa"); + + /* The simple fact that dies are supported by QEMU doesn't mean we can + * put it onto command line. QEMU will accept die-id only if -smp dies + * was set to a value greater than 1. On the other hand, this allows us + * to generate shorter command line. */ + if (diesSupported && cpu && cpu->dies > 1) { + virCommandAddArgFormat(cmd, + "cpu,node-id=%zu,socket-id=%u,die-id=%u,core-id=%u,thread-id=%u", + nodeid, socket, die, core, thread); + } else { + virCommandAddArgFormat(cmd, + "cpu,node-id=%zu,socket-id=%u,core-id=%u,thread-id=%u", + nodeid, socket, core, thread); + } + } +} + + static int qemuBuildNumaArgStr(virQEMUDriverConfigPtr cfg, virDomainDefPtr def, @@ -7090,6 +7175,7 @@ qemuBuildNumaArgStr(virQEMUDriverConfigPtr cfg, g_auto(virBuffer) buf = VIR_BUFFER_INITIALIZER; g_autofree virBufferPtr nodeBackends = NULL; bool needBackend = false; + bool newCpus = false; int ret = -1; size_t ncells = virDomainNumaGetNodeCount(def->numa);
@@ -7130,7 +7216,21 @@ qemuBuildNumaArgStr(virQEMUDriverConfigPtr cfg, qemuBuildMemPathStr(def, cmd, priv) < 0) goto cleanup;
+ /* Use modern style of specifying vCPU topology only if: + * -numa cpu is available, introduced in the same time as -numa + * dist, hence slightly misleading capability test, and + * query-hotpluggable-cpus is avialable, because then + * qemuValidateDomainDef() ensures that if + * topology is specified it matches max vCPU + * count and we can make some shortcuts in + * qemuTranlsatevCPUID(). + */ + newCpus = virQEMUCapsGet(qemuCaps, QEMU_CAPS_NUMA_DIST) && + virQEMUCapsGet(qemuCaps, QEMU_CAPS_QUERY_HOTPLUGGABLE_CPUS); + for (i = 0; i < ncells; i++) { + virBitmapPtr cpu = virDomainNumaGetNodeCpumask(def->numa, i); + if (needBackend) { virCommandAddArg(cmd, "-object"); virCommandAddArgBuffer(cmd, &nodeBackends[i]); @@ -7139,8 +7239,9 @@ qemuBuildNumaArgStr(virQEMUDriverConfigPtr cfg, virCommandAddArg(cmd, "-numa"); virBufferAsprintf(&buf, "node,nodeid=%zu", i);
- if (qemuBuildNumaOldCPUs(&buf, - virDomainNumaGetNodeCpumask(def->numa, i)) < 0) + /* -numa cpu is supported from the same release as -numa dist */ + if (!newCpus && + qemuBuildNumaOldCPUs(&buf, cpu) < 0) goto cleanup;
if (needBackend) @@ -7150,6 +7251,9 @@ qemuBuildNumaArgStr(virQEMUDriverConfigPtr cfg, virDomainNumaGetNodeMemorySize(def->numa, i) / 1024);
virCommandAddArgBuffer(cmd, &buf); + + if (newCpus) + qemuBuildNumaNewCPUs(cmd, def->cpu, cpu, i, qemuCaps); }
/* If NUMA node distance is specified for at least one pair diff --git a/tests/qemuxml2argvdata/hugepages-nvdimm.x86_64-latest.args b/tests/qemuxml2argvdata/hugepages-nvdimm.x86_64-latest.args index e80a95c84b..804fc59d74 100644 --- a/tests/qemuxml2argvdata/hugepages-nvdimm.x86_64-latest.args +++ b/tests/qemuxml2argvdata/hugepages-nvdimm.x86_64-latest.args @@ -19,7 +19,9 @@ file=/tmp/lib/domain--1-QEMUGuest1/master-key.aes \ -smp 2,sockets=2,dies=1,cores=1,threads=1 \ -object memory-backend-file,id=ram-node0,prealloc=yes,\ mem-path=/dev/hugepages2M/libvirt/qemu/-1-QEMUGuest1,share=yes,size=1073741824 \ --numa node,nodeid=0,cpus=0-1,memdev=ram-node0 \ +-numa node,nodeid=0,memdev=ram-node0 \ +-numa cpu,node-id=0,socket-id=0,core-id=0,thread-id=0 \ +-numa cpu,node-id=0,socket-id=1,core-id=0,thread-id=0 \ -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm,\ share=yes,size=536870912 \ -device nvdimm,node=0,memdev=memnvdimm0,id=nvdimm0,slot=0 \ diff --git a/tests/qemuxml2argvdata/memfd-memory-default-hugepage.x86_64-latest.args b/tests/qemuxml2argvdata/memfd-memory-default-hugepage.x86_64-latest.args index 5d256c42bc..2e8d933fc2 100644 --- a/tests/qemuxml2argvdata/memfd-memory-default-hugepage.x86_64-latest.args +++ b/tests/qemuxml2argvdata/memfd-memory-default-hugepage.x86_64-latest.args @@ -20,7 +20,15 @@ file=/tmp/lib/domain--1-instance-00000092/master-key.aes \ -smp 8,sockets=1,dies=1,cores=8,threads=1 \ -object memory-backend-memfd,id=ram-node0,hugetlb=yes,hugetlbsize=2097152,\ share=yes,size=15032385536,host-nodes=3,policy=preferred \ --numa node,nodeid=0,cpus=0-7,memdev=ram-node0 \ +-numa node,nodeid=0,memdev=ram-node0 \ +-numa cpu,node-id=0,socket-id=0,core-id=0,thread-id=0 \ +-numa cpu,node-id=0,socket-id=0,core-id=1,thread-id=0 \ +-numa cpu,node-id=0,socket-id=0,core-id=2,thread-id=0 \ +-numa cpu,node-id=0,socket-id=0,core-id=3,thread-id=0 \ +-numa cpu,node-id=0,socket-id=0,core-id=4,thread-id=0 \ +-numa cpu,node-id=0,socket-id=0,core-id=5,thread-id=0 \ +-numa cpu,node-id=0,socket-id=0,core-id=6,thread-id=0 \ +-numa cpu,node-id=0,socket-id=0,core-id=7,thread-id=0 \ -uuid 126f2720-6f8e-45ab-a886-ec9277079a67 \ -display none \ -no-user-config \ diff --git a/tests/qemuxml2argvdata/memfd-memory-numa.x86_64-latest.args b/tests/qemuxml2argvdata/memfd-memory-numa.x86_64-latest.args index 5d256c42bc..2e8d933fc2 100644 --- a/tests/qemuxml2argvdata/memfd-memory-numa.x86_64-latest.args +++ b/tests/qemuxml2argvdata/memfd-memory-numa.x86_64-latest.args @@ -20,7 +20,15 @@ file=/tmp/lib/domain--1-instance-00000092/master-key.aes \ -smp 8,sockets=1,dies=1,cores=8,threads=1 \ -object memory-backend-memfd,id=ram-node0,hugetlb=yes,hugetlbsize=2097152,\ share=yes,size=15032385536,host-nodes=3,policy=preferred \ --numa node,nodeid=0,cpus=0-7,memdev=ram-node0 \ +-numa node,nodeid=0,memdev=ram-node0 \ +-numa cpu,node-id=0,socket-id=0,core-id=0,thread-id=0 \ +-numa cpu,node-id=0,socket-id=0,core-id=1,thread-id=0 \ +-numa cpu,node-id=0,socket-id=0,core-id=2,thread-id=0 \ +-numa cpu,node-id=0,socket-id=0,core-id=3,thread-id=0 \ +-numa cpu,node-id=0,socket-id=0,core-id=4,thread-id=0 \ +-numa cpu,node-id=0,socket-id=0,core-id=5,thread-id=0 \ +-numa cpu,node-id=0,socket-id=0,core-id=6,thread-id=0 \ +-numa cpu,node-id=0,socket-id=0,core-id=7,thread-id=0 \ -uuid 126f2720-6f8e-45ab-a886-ec9277079a67 \ -display none \ -no-user-config \ diff --git a/tests/qemuxml2argvdata/memory-hotplug-nvdimm-access.x86_64-latest.args b/tests/qemuxml2argvdata/memory-hotplug-nvdimm-access.x86_64-latest.args 
index 89138f46c4..6bf575d486 100644 --- a/tests/qemuxml2argvdata/memory-hotplug-nvdimm-access.x86_64-latest.args +++ b/tests/qemuxml2argvdata/memory-hotplug-nvdimm-access.x86_64-latest.args @@ -17,7 +17,9 @@ file=/tmp/lib/domain--1-QEMUGuest1/master-key.aes \ -m size=219136k,slots=16,maxmem=1099511627776k \ -overcommit mem-lock=off \ -smp 2,sockets=2,dies=1,cores=1,threads=1 \ --numa node,nodeid=0,cpus=0-1,mem=214 \ +-numa node,nodeid=0,mem=214 \ +-numa cpu,node-id=0,socket-id=0,core-id=0,thread-id=0 \ +-numa cpu,node-id=0,socket-id=1,core-id=0,thread-id=0 \ -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm,\ share=no,size=536870912 \ -device nvdimm,node=0,memdev=memnvdimm0,id=nvdimm0,slot=0 \ diff --git a/tests/qemuxml2argvdata/memory-hotplug-nvdimm-align.x86_64-latest.args b/tests/qemuxml2argvdata/memory-hotplug-nvdimm-align.x86_64-latest.args index 1a8e7932dc..84571a056a 100644 --- a/tests/qemuxml2argvdata/memory-hotplug-nvdimm-align.x86_64-latest.args +++ b/tests/qemuxml2argvdata/memory-hotplug-nvdimm-align.x86_64-latest.args @@ -17,7 +17,9 @@ file=/tmp/lib/domain--1-QEMUGuest1/master-key.aes \ -m size=219136k,slots=16,maxmem=1099511627776k \ -overcommit mem-lock=off \ -smp 2,sockets=2,dies=1,cores=1,threads=1 \ --numa node,nodeid=0,cpus=0-1,mem=214 \ +-numa node,nodeid=0,mem=214 \ +-numa cpu,node-id=0,socket-id=0,core-id=0,thread-id=0 \ +-numa cpu,node-id=0,socket-id=1,core-id=0,thread-id=0 \ -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm,\ share=no,size=536870912,align=2097152 \ -device nvdimm,node=0,memdev=memnvdimm0,id=nvdimm0,slot=0 \ diff --git a/tests/qemuxml2argvdata/memory-hotplug-nvdimm-label.x86_64-latest.args b/tests/qemuxml2argvdata/memory-hotplug-nvdimm-label.x86_64-latest.args index ef32c663de..d684d15423 100644 --- a/tests/qemuxml2argvdata/memory-hotplug-nvdimm-label.x86_64-latest.args +++ b/tests/qemuxml2argvdata/memory-hotplug-nvdimm-label.x86_64-latest.args @@ -17,7 +17,9 @@ file=/tmp/lib/domain--1-QEMUGuest1/master-key.aes \ -m size=219136k,slots=16,maxmem=1099511627776k \ -overcommit mem-lock=off \ -smp 2,sockets=2,dies=1,cores=1,threads=1 \ --numa node,nodeid=0,cpus=0-1,mem=214 \ +-numa node,nodeid=0,mem=214 \ +-numa cpu,node-id=0,socket-id=0,core-id=0,thread-id=0 \ +-numa cpu,node-id=0,socket-id=1,core-id=0,thread-id=0 \ -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm,\ share=no,size=536870912 \ -device nvdimm,node=0,label-size=131072,memdev=memnvdimm0,id=nvdimm0,slot=0 \ diff --git a/tests/qemuxml2argvdata/memory-hotplug-nvdimm-pmem.x86_64-latest.args b/tests/qemuxml2argvdata/memory-hotplug-nvdimm-pmem.x86_64-latest.args index 5dfba9b50a..d1374772da 100644 --- a/tests/qemuxml2argvdata/memory-hotplug-nvdimm-pmem.x86_64-latest.args +++ b/tests/qemuxml2argvdata/memory-hotplug-nvdimm-pmem.x86_64-latest.args @@ -17,7 +17,9 @@ file=/tmp/lib/domain--1-QEMUGuest1/master-key.aes \ -m size=219136k,slots=16,maxmem=1099511627776k \ -overcommit mem-lock=off \ -smp 2,sockets=2,dies=1,cores=1,threads=1 \ --numa node,nodeid=0,cpus=0-1,mem=214 \ +-numa node,nodeid=0,mem=214 \ +-numa cpu,node-id=0,socket-id=0,core-id=0,thread-id=0 \ +-numa cpu,node-id=0,socket-id=1,core-id=0,thread-id=0 \ -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm,\ share=no,size=536870912,pmem=on \ -device nvdimm,node=0,memdev=memnvdimm0,id=nvdimm0,slot=0 \ diff --git a/tests/qemuxml2argvdata/memory-hotplug-nvdimm-ppc64.ppc64-latest.args 
b/tests/qemuxml2argvdata/memory-hotplug-nvdimm-ppc64.ppc64-latest.args index eff80dcf80..02d2faa054 100644 --- a/tests/qemuxml2argvdata/memory-hotplug-nvdimm-ppc64.ppc64-latest.args +++ b/tests/qemuxml2argvdata/memory-hotplug-nvdimm-ppc64.ppc64-latest.args @@ -17,7 +17,9 @@ file=/tmp/lib/domain--1-QEMUGuest1/master-key.aes \ -m size=1048576k,slots=16,maxmem=1099511627776k \ -overcommit mem-lock=off \ -smp 2,sockets=2,dies=1,cores=1,threads=1 \ --numa node,nodeid=0,cpus=0-1,mem=1024 \ +-numa node,nodeid=0,mem=1024 \ +-numa cpu,node-id=0,socket-id=0,core-id=0,thread-id=0 \ +-numa cpu,node-id=0,socket-id=1,core-id=0,thread-id=0 \ -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm,\ size=537001984 \ -device nvdimm,node=0,label-size=131072,\ diff --git a/tests/qemuxml2argvdata/memory-hotplug-nvdimm-readonly.x86_64-latest.args b/tests/qemuxml2argvdata/memory-hotplug-nvdimm-readonly.x86_64-latest.args index 7088a4f054..a757007690 100644 --- a/tests/qemuxml2argvdata/memory-hotplug-nvdimm-readonly.x86_64-latest.args +++ b/tests/qemuxml2argvdata/memory-hotplug-nvdimm-readonly.x86_64-latest.args @@ -17,7 +17,9 @@ file=/tmp/lib/domain--1-QEMUGuest1/master-key.aes \ -m size=219136k,slots=16,maxmem=1099511627776k \ -overcommit mem-lock=off \ -smp 2,sockets=2,dies=1,cores=1,threads=1 \ --numa node,nodeid=0,cpus=0-1,mem=214 \ +-numa node,nodeid=0,mem=214 \ +-numa cpu,node-id=0,socket-id=0,core-id=0,thread-id=0 \ +-numa cpu,node-id=0,socket-id=1,core-id=0,thread-id=0 \ -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm,\ share=no,size=536870912 \ -device nvdimm,node=0,unarmed=on,memdev=memnvdimm0,id=nvdimm0,slot=0 \ diff --git a/tests/qemuxml2argvdata/memory-hotplug-nvdimm.x86_64-latest.args b/tests/qemuxml2argvdata/memory-hotplug-nvdimm.x86_64-latest.args index 60d6d207c5..e673a4acad 100644 --- a/tests/qemuxml2argvdata/memory-hotplug-nvdimm.x86_64-latest.args +++ b/tests/qemuxml2argvdata/memory-hotplug-nvdimm.x86_64-latest.args @@ -17,7 +17,9 @@ file=/tmp/lib/domain--1-QEMUGuest1/master-key.aes \ -m size=1048576k,slots=16,maxmem=1099511627776k \ -overcommit mem-lock=off \ -smp 2,sockets=2,dies=1,cores=1,threads=1 \ --numa node,nodeid=0,cpus=0-1,mem=1024 \ +-numa node,nodeid=0,mem=1024 \ +-numa cpu,node-id=0,socket-id=0,core-id=0,thread-id=0 \ +-numa cpu,node-id=0,socket-id=1,core-id=0,thread-id=0 \ -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm,\ size=536870912 \ -device nvdimm,node=0,memdev=memnvdimm0,id=nvdimm0,slot=0 \ diff --git a/tests/qemuxml2argvdata/vhost-user-fs-fd-memory.x86_64-latest.args b/tests/qemuxml2argvdata/vhost-user-fs-fd-memory.x86_64-latest.args index dd5f68abc5..bce4e07a5d 100644 --- a/tests/qemuxml2argvdata/vhost-user-fs-fd-memory.x86_64-latest.args +++ b/tests/qemuxml2argvdata/vhost-user-fs-fd-memory.x86_64-latest.args @@ -20,7 +20,9 @@ file=/tmp/lib/domain--1-guest/master-key.aes \ -object memory-backend-file,id=ram-node0,\ mem-path=/var/lib/libvirt/qemu/ram/-1-guest/ram-node0,share=yes,\ size=15032385536 \ --numa node,nodeid=0,cpus=0-1,memdev=ram-node0 \ +-numa node,nodeid=0,memdev=ram-node0 \ +-numa cpu,node-id=0,socket-id=0,core-id=0,thread-id=0 \ +-numa cpu,node-id=0,socket-id=1,core-id=0,thread-id=0 \ -uuid 126f2720-6f8e-45ab-a886-ec9277079a67 \ -display none \ -no-user-config \ diff --git a/tests/qemuxml2argvdata/vhost-user-fs-hugepages.x86_64-latest.args b/tests/qemuxml2argvdata/vhost-user-fs-hugepages.x86_64-latest.args index 258fa7813f..f21c08037d 100644 --- 
a/tests/qemuxml2argvdata/vhost-user-fs-hugepages.x86_64-latest.args +++ b/tests/qemuxml2argvdata/vhost-user-fs-hugepages.x86_64-latest.args @@ -19,7 +19,9 @@ file=/tmp/lib/domain--1-guest/master-key.aes \ -smp 2,sockets=2,cores=1,threads=1 \ -object memory-backend-file,id=ram-node0,prealloc=yes,\ mem-path=/dev/hugepages2M/libvirt/qemu/-1-guest,share=yes,size=2147483648 \ --numa node,nodeid=0,cpus=0-1,memdev=ram-node0 \ +-numa node,nodeid=0,memdev=ram-node0 \ +-numa cpu,node-id=0,socket-id=0,core-id=0,thread-id=0 \ +-numa cpu,node-id=0,socket-id=1,core-id=0,thread-id=0 \ -uuid 1ccfd97d-5eb4-478a-bbe6-88d254c16db7 \ -display none \ -no-user-config \ diff --git a/tests/qemuxml2argvdata/vhost-user-gpu-secondary.x86_64-latest.args b/tests/qemuxml2argvdata/vhost-user-gpu-secondary.x86_64-latest.args index e99a5342dc..c935a8942c 100644 --- a/tests/qemuxml2argvdata/vhost-user-gpu-secondary.x86_64-latest.args +++ b/tests/qemuxml2argvdata/vhost-user-gpu-secondary.x86_64-latest.args @@ -18,7 +18,8 @@ file=/tmp/lib/domain--1-QEMUGuest1/master-key.aes \ -overcommit mem-lock=off \ -smp 1,sockets=1,cores=1,threads=1 \ -object memory-backend-memfd,id=ram-node0,share=yes,size=224395264 \ --numa node,nodeid=0,cpus=0,memdev=ram-node0 \ +-numa node,nodeid=0,memdev=ram-node0 \ +-numa cpu,node-id=0,socket-id=0,core-id=0,thread-id=0 \ -uuid c7a5fdbd-edaf-9455-926a-d65c16db1809 \ -display none \ -no-user-config \ diff --git a/tests/qemuxml2argvdata/vhost-user-vga.x86_64-latest.args b/tests/qemuxml2argvdata/vhost-user-vga.x86_64-latest.args index 277bf8c646..7f90f0dfa3 100644 --- a/tests/qemuxml2argvdata/vhost-user-vga.x86_64-latest.args +++ b/tests/qemuxml2argvdata/vhost-user-vga.x86_64-latest.args @@ -18,7 +18,8 @@ file=/tmp/lib/domain--1-QEMUGuest1/master-key.aes \ -overcommit mem-lock=off \ -smp 1,sockets=1,cores=1,threads=1 \ -object memory-backend-memfd,id=ram-node0,share=yes,size=224395264 \ --numa node,nodeid=0,cpus=0,memdev=ram-node0 \ +-numa node,nodeid=0,memdev=ram-node0 \ +-numa cpu,node-id=0,socket-id=0,core-id=0,thread-id=0 \ -uuid c7a5fdbd-edaf-9455-926a-d65c16db1809 \ -display none \ -no-user-config \

On 5/22/20 6:07 PM, Igor Mammedov wrote:
On Fri, 22 May 2020 16:14:14 +0200 Michal Privoznik <mprivozn@redhat.com> wrote:
QEMU is trying to obsolete -numa node,cpus= because that uses ambiguous vCPU id to [socket, die, core, thread] mapping. The new form is:
-numa cpu,node-id=N,socket-id=S,die-id=D,core-id=C,thread-id=T
which is repeated for every vCPU and places it at [S, D, C, T] into guest NUMA node N.
While in general this is magic mapping, we can deal with it. Firstly, with QEMU 2.7 or newer, libvirt ensures that if topology is given then maxvcpus must be sockets * dies * cores * threads (i.e. there are no 'holes'). Secondly, if no topology is given then libvirt itself places each vCPU into a different socket (basically, it fakes topology of: [maxvcpus, 1, 1, 1]) Thirdly, we can copy whatever QEMU is doing when mapping vCPUs onto topology, to make sure vCPUs don't start to move around.
Note, migration from old to new cmd line works and therefore doesn't need any special handling.
Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1678085
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/qemu/qemu_command.c | 108 +++++++++++++++++- .../hugepages-nvdimm.x86_64-latest.args | 4 +- ...memory-default-hugepage.x86_64-latest.args | 10 +- .../memfd-memory-numa.x86_64-latest.args | 10 +- ...y-hotplug-nvdimm-access.x86_64-latest.args | 4 +- ...ry-hotplug-nvdimm-align.x86_64-latest.args | 4 +- ...ry-hotplug-nvdimm-label.x86_64-latest.args | 4 +- ...ory-hotplug-nvdimm-pmem.x86_64-latest.args | 4 +- ...ory-hotplug-nvdimm-ppc64.ppc64-latest.args | 4 +- ...hotplug-nvdimm-readonly.x86_64-latest.args | 4 +- .../memory-hotplug-nvdimm.x86_64-latest.args | 4 +- ...vhost-user-fs-fd-memory.x86_64-latest.args | 4 +- ...vhost-user-fs-hugepages.x86_64-latest.args | 4 +- ...host-user-gpu-secondary.x86_64-latest.args | 3 +- .../vhost-user-vga.x86_64-latest.args | 3 +- 15 files changed, 158 insertions(+), 16 deletions(-)
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index 7d84fd8b5e..0de4fe4905 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -7079,6 +7079,91 @@ qemuBuildNumaOldCPUs(virBufferPtr buf, }
+/** + * qemuTranlsatevCPUID: + * + * For given vCPU @id and vCPU topology (@cpu) compute corresponding + * @socket, @die, @core and @thread). This assumes linear topology, + * that is every [socket, die, core, thread] combination is valid vCPU + * ID and there are no 'holes'. This is ensured by + * qemuValidateDomainDef() if QEMU_CAPS_QUERY_HOTPLUGGABLE_CPUS is + * set. I wouldn't make this assumption, each machine can have (and has) it's own layout, and now it's not hard to change that per machine version if necessary.
I'd suppose one could pull the list of possible CPUs from QEMU started in preconfig mode with desired -smp x,y,z using QUERY_HOTPLUGGABLE_CPUS and then continue to configure numa with QMP commands using provided CPUs layout.
Continue where? In 'preconfig mode' the guest is already started, isn't it? Are you suggesting that libvirt starts a dummy QEMU process, fetches the CPU topology from it and then starts it for real? Libvirt tries to avoid that as much as it can.
How to present it to libvirt user I'm not sure (give them that list perhaps and let select from it???)
This is what I am trying to figure out in the cover letter. Maybe we need to let users configure the topology (well, the vCPU ID to [socket, die, core, thread] mapping), but then again, in my testing the guest ignored that and displayed a different topology (true, I was testing with -cpu host, so maybe that's why).
But it's irrelevant, to the patch, magical IDs for socket/core/...whatever should not be generated by libvirt anymore, but rather taken from QEMU for given machine + -smp combination.
Taken when? We can do this for running machines, but not for freshly started ones, can we? Michal

On Fri, 22 May 2020 18:28:31 +0200 Michal Privoznik <mprivozn@redhat.com> wrote:
On 5/22/20 6:07 PM, Igor Mammedov wrote:
On Fri, 22 May 2020 16:14:14 +0200 Michal Privoznik <mprivozn@redhat.com> wrote:
QEMU is trying to obsolete -numa node,cpus= because that uses ambiguous vCPU id to [socket, die, core, thread] mapping. The new form is:
-numa cpu,node-id=N,socket-id=S,die-id=D,core-id=C,thread-id=T
which is repeated for every vCPU and places it at [S, D, C, T] into guest NUMA node N.
While in general this is magic mapping, we can deal with it. Firstly, with QEMU 2.7 or newer, libvirt ensures that if topology is given then maxvcpus must be sockets * dies * cores * threads (i.e. there are no 'holes'). Secondly, if no topology is given then libvirt itself places each vCPU into a different socket (basically, it fakes topology of: [maxvcpus, 1, 1, 1]) Thirdly, we can copy whatever QEMU is doing when mapping vCPUs onto topology, to make sure vCPUs don't start to move around.
Note, migration from old to new cmd line works and therefore doesn't need any special handling.
Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1678085
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/qemu/qemu_command.c | 108 +++++++++++++++++- .../hugepages-nvdimm.x86_64-latest.args | 4 +- ...memory-default-hugepage.x86_64-latest.args | 10 +- .../memfd-memory-numa.x86_64-latest.args | 10 +- ...y-hotplug-nvdimm-access.x86_64-latest.args | 4 +- ...ry-hotplug-nvdimm-align.x86_64-latest.args | 4 +- ...ry-hotplug-nvdimm-label.x86_64-latest.args | 4 +- ...ory-hotplug-nvdimm-pmem.x86_64-latest.args | 4 +- ...ory-hotplug-nvdimm-ppc64.ppc64-latest.args | 4 +- ...hotplug-nvdimm-readonly.x86_64-latest.args | 4 +- .../memory-hotplug-nvdimm.x86_64-latest.args | 4 +- ...vhost-user-fs-fd-memory.x86_64-latest.args | 4 +- ...vhost-user-fs-hugepages.x86_64-latest.args | 4 +- ...host-user-gpu-secondary.x86_64-latest.args | 3 +- .../vhost-user-vga.x86_64-latest.args | 3 +- 15 files changed, 158 insertions(+), 16 deletions(-)
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index 7d84fd8b5e..0de4fe4905 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -7079,6 +7079,91 @@ qemuBuildNumaOldCPUs(virBufferPtr buf, }
+/** + * qemuTranlsatevCPUID: + * + * For given vCPU @id and vCPU topology (@cpu) compute corresponding + * @socket, @die, @core and @thread). This assumes linear topology, + * that is every [socket, die, core, thread] combination is valid vCPU + * ID and there are no 'holes'. This is ensured by + * qemuValidateDomainDef() if QEMU_CAPS_QUERY_HOTPLUGGABLE_CPUS is + * set. I wouldn't make this assumption, each machine can have (and has) it's own layout, and now it's not hard to change that per machine version if necessary.
I'd suppose one could pull the list of possible CPUs from QEMU started in preconfig mode with desired -smp x,y,z using QUERY_HOTPLUGGABLE_CPUS and then continue to configure numa with QMP commands using provided CPUs layout.
Continue where? In 'preconfig mode' the guest is already started, isn't it? Are you suggesting that libvirt starts a dummy QEMU process, fetches the CPU topology from it and then starts it for real? Libvirt tries to avoid that as much as it can.

QEMU is started, but it's very far from starting the guest; at that time it's possible to configure the NUMA mapping at runtime and continue to -S or the running state without restarting QEMU. For the follow-up starts, the used topology and numa options can be cached and reused at CLI time as long as the machine/-smp combination stays the same.
How to present it to libvirt user I'm not sure (give them that list perhaps and let select from it???)
This is what I am trying to figure out in the cover letter. Maybe we need to let users configure the topology (well, vCPU id to [socket, die, core, thread] mapping), but then again, in my testing the guest ignored that and displayed different topology (true, I was testing with -cpu host, so maybe that's why).
There is an ongoing issue with EPYC vCPU topology, but otherwise it should work. Just report a bug to qemu-devel if it's broken.
But it's irrelevant, to the patch, magical IDs for socket/core/...whatever should not be generated by libvirt anymore, but rather taken from QEMU for given machine + -smp combination.
Taken when? We can do this for running machines, but not for freshly started ones, can we?
It can be used for freshly started ones as well:

  QEMU -S -preconfig -M pc -smp ...
  (QMP) query-hotpluggable-cpus
  (QMP) set-numa-node ...
  ...
  (QMP) exit-preconfig
  (QMP) other stuff libvirt does (like hot-plugging CPUs, ...)
  (QMP) cont

PS: I didn't notice that -preconfig was moved to experimental state, since it took 2 years before support for '-numa cpu' (the very thing it was designed for, configuring NUMA mappings) started to be implemented. So it's not advertised in the QAPI schema anymore; see QEMU commit 1f214ee1b83afd. CCing Markus, so we can think about where to go from here.

On 5/22/20 7:18 PM, Igor Mammedov wrote:
On Fri, 22 May 2020 18:28:31 +0200 Michal Privoznik <mprivozn@redhat.com> wrote:
On 5/22/20 6:07 PM, Igor Mammedov wrote:
On Fri, 22 May 2020 16:14:14 +0200 Michal Privoznik <mprivozn@redhat.com> wrote:
QEMU is trying to obsolete -numa node,cpus= because that uses ambiguous vCPU id to [socket, die, core, thread] mapping. The new form is:
-numa cpu,node-id=N,socket-id=S,die-id=D,core-id=C,thread-id=T
which is repeated for every vCPU and places it at [S, D, C, T] into guest NUMA node N.
While in general this is magic mapping, we can deal with it. Firstly, with QEMU 2.7 or newer, libvirt ensures that if topology is given then maxvcpus must be sockets * dies * cores * threads (i.e. there are no 'holes'). Secondly, if no topology is given then libvirt itself places each vCPU into a different socket (basically, it fakes topology of: [maxvcpus, 1, 1, 1]) Thirdly, we can copy whatever QEMU is doing when mapping vCPUs onto topology, to make sure vCPUs don't start to move around.
Note, migration from old to new cmd line works and therefore doesn't need any special handling.
Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1678085
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/qemu/qemu_command.c | 108 +++++++++++++++++- .../hugepages-nvdimm.x86_64-latest.args | 4 +- ...memory-default-hugepage.x86_64-latest.args | 10 +- .../memfd-memory-numa.x86_64-latest.args | 10 +- ...y-hotplug-nvdimm-access.x86_64-latest.args | 4 +- ...ry-hotplug-nvdimm-align.x86_64-latest.args | 4 +- ...ry-hotplug-nvdimm-label.x86_64-latest.args | 4 +- ...ory-hotplug-nvdimm-pmem.x86_64-latest.args | 4 +- ...ory-hotplug-nvdimm-ppc64.ppc64-latest.args | 4 +- ...hotplug-nvdimm-readonly.x86_64-latest.args | 4 +- .../memory-hotplug-nvdimm.x86_64-latest.args | 4 +- ...vhost-user-fs-fd-memory.x86_64-latest.args | 4 +- ...vhost-user-fs-hugepages.x86_64-latest.args | 4 +- ...host-user-gpu-secondary.x86_64-latest.args | 3 +- .../vhost-user-vga.x86_64-latest.args | 3 +- 15 files changed, 158 insertions(+), 16 deletions(-)
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index 7d84fd8b5e..0de4fe4905 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -7079,6 +7079,91 @@ qemuBuildNumaOldCPUs(virBufferPtr buf, }
+/** + * qemuTranlsatevCPUID: + * + * For given vCPU @id and vCPU topology (@cpu) compute corresponding + * @socket, @die, @core and @thread). This assumes linear topology, + * that is every [socket, die, core, thread] combination is valid vCPU + * ID and there are no 'holes'. This is ensured by + * qemuValidateDomainDef() if QEMU_CAPS_QUERY_HOTPLUGGABLE_CPUS is + * set. I wouldn't make this assumption, each machine can have (and has) it's own layout, and now it's not hard to change that per machine version if necessary.
I'd suppose one could pull the list of possible CPUs from QEMU started in preconfig mode with desired -smp x,y,z using QUERY_HOTPLUGGABLE_CPUS and then continue to configure numa with QMP commands using provided CPUs layout.
Continue where? At the 'preconfig mode' the guest is already started, isn't it? Are you suggesting that libvirt starts a dummy QEMU process, fetches the CPU topology from it and then starts it for real? QEMU is started but it's very far from starting the guest; at that time it's possible to configure the NUMA mapping at runtime and continue to -S or running state without restarting QEMU. For follow-up starts, the used topology and numa options can be cached and reused at CLI time as long as the machine/-smp combination stays the same.
This is a problem. The domain XML that is provided can't be changed, mostly because mgmt apps construct it on the fly and then just pass it as a RO string to libvirt. While libvirt could create a separate cache, there has to be a better way. I mean, I can add some more code that once the guest is running preserves the mapping during migration. But that assumes a running QEMU. When starting a domain from scratch, is it acceptable it vCPU topology changes? I suspect it is not.
tries to avoid that as much as it can.
How to present it to libvirt user I'm not sure (give them that list perhaps and let select from it???)
This is what I am trying to figure out in the cover letter. Maybe we need to let users configure the topology (well, vCPU id to [socket, die, core, thread] mapping), but then again, in my testing the guest ignored that and displayed different topology (true, I was testing with -cpu host, so maybe that's why). there is ongiong issue with EPYC VCPUs topology, but I otherwise it should work. Just report bug to qemu-devel, if it's broken.
But it's irrelevant, to the patch, magical IDs for socket/core/...whatever should not be generated by libvirt anymore, but rather taken from QEMU for given machine + -smp combination.
Taken when? We can do this for running machines, but not for freshly started ones, can we?
it can be used for freshly started as well, QEMU -S -preconfig -M pc -smp ... (QMP) query-hotpluggable-cpus (QMP) set-numa-node ... ... (QMP) exit-preconfig (QMP) other stuff libvirt does (like hot-plugging CPUs , ...) (QMP) cont
I'm not sure this works. query-hotpluggable-cpus does not map vCPU ID <-> socket/core/thread, For '-smp 2,sockets=2,cores=1,threads=1' the 'query-hotpluggable-cpus' returns: {"return": [{"props": {"core-id": 0, "thread-id": 0, "socket-id": 1}, "vcpus-count": 1, "type": "qemu64-x86_64-cpu"}, {"props": {"core-id": 0, "thread-id": 0, "socket-id": 0}, "vcpus-count": 1, "type": "qemu64-x86_64-cpu"}]} And 'query-cpus' or 'query-cpus-fast' which map vCPU ID onto socket/core/thread are not allowed in preconfig state. But if I take a step back, the whole point of deprecating -numa node,cpus= is that QEMU no longer wants to do vCPU ID <-> socket/core/thread mapping because it's ambiguous. So it feels a bit weird to design a solution where libvirt would ask QEMU to provide the mapping only so that it can be configured back. Not only because of the extra step, but also because QEMU can't then remove the mapping anyway. I might be misunderstanding the issue though. Michal

On Mon, 25 May 2020 10:05:08 +0200 Michal Privoznik <mprivozn@redhat.com> wrote:
On 5/22/20 7:18 PM, Igor Mammedov wrote:
On Fri, 22 May 2020 18:28:31 +0200 Michal Privoznik <mprivozn@redhat.com> wrote:
On 5/22/20 6:07 PM, Igor Mammedov wrote:
On Fri, 22 May 2020 16:14:14 +0200 Michal Privoznik <mprivozn@redhat.com> wrote:
QEMU is trying to obsolete -numa node,cpus= because that uses ambiguous vCPU id to [socket, die, core, thread] mapping. The new form is:
-numa cpu,node-id=N,socket-id=S,die-id=D,core-id=C,thread-id=T
which is repeated for every vCPU and places it at [S, D, C, T] into guest NUMA node N.
While in general this is magic mapping, we can deal with it. Firstly, with QEMU 2.7 or newer, libvirt ensures that if topology is given then maxvcpus must be sockets * dies * cores * threads (i.e. there are no 'holes'). Secondly, if no topology is given then libvirt itself places each vCPU into a different socket (basically, it fakes topology of: [maxvcpus, 1, 1, 1]) Thirdly, we can copy whatever QEMU is doing when mapping vCPUs onto topology, to make sure vCPUs don't start to move around.
Note, migration from old to new cmd line works and therefore doesn't need any special handling.
Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1678085
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/qemu/qemu_command.c | 108 +++++++++++++++++- .../hugepages-nvdimm.x86_64-latest.args | 4 +- ...memory-default-hugepage.x86_64-latest.args | 10 +- .../memfd-memory-numa.x86_64-latest.args | 10 +- ...y-hotplug-nvdimm-access.x86_64-latest.args | 4 +- ...ry-hotplug-nvdimm-align.x86_64-latest.args | 4 +- ...ry-hotplug-nvdimm-label.x86_64-latest.args | 4 +- ...ory-hotplug-nvdimm-pmem.x86_64-latest.args | 4 +- ...ory-hotplug-nvdimm-ppc64.ppc64-latest.args | 4 +- ...hotplug-nvdimm-readonly.x86_64-latest.args | 4 +- .../memory-hotplug-nvdimm.x86_64-latest.args | 4 +- ...vhost-user-fs-fd-memory.x86_64-latest.args | 4 +- ...vhost-user-fs-hugepages.x86_64-latest.args | 4 +- ...host-user-gpu-secondary.x86_64-latest.args | 3 +- .../vhost-user-vga.x86_64-latest.args | 3 +- 15 files changed, 158 insertions(+), 16 deletions(-)
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index 7d84fd8b5e..0de4fe4905 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -7079,6 +7079,91 @@ qemuBuildNumaOldCPUs(virBufferPtr buf, }
+/** + * qemuTranlsatevCPUID: + * + * For given vCPU @id and vCPU topology (@cpu) compute corresponding + * @socket, @die, @core and @thread). This assumes linear topology, + * that is every [socket, die, core, thread] combination is valid vCPU + * ID and there are no 'holes'. This is ensured by + * qemuValidateDomainDef() if QEMU_CAPS_QUERY_HOTPLUGGABLE_CPUS is + * set. I wouldn't make this assumption, each machine can have (and has) it's own layout, and now it's not hard to change that per machine version if necessary.
I'd suppose one could pull the list of possible CPUs from QEMU started in preconfig mode with desired -smp x,y,z using QUERY_HOTPLUGGABLE_CPUS and then continue to configure numa with QMP commands using provided CPUs layout.
Continue where? At the 'preconfig mode' the guest is already started, isn't it? Are you suggesting that libvirt starts a dummy QEMU process, fetches the CPU topology from it an then starts if for real? Libvirt QEMU started but it's very far from starting guest, at that time it's possible configure numa mapping at runtime and continue to -S or running state without restarting QEMU. For the follow up starts, used topology and numa options can be cached and reused at CLI time as long as machine/-smp combination stays the same.
This is a problem. The domain XML that is provided can't be changed, mostly because mgmt apps construct it on the fly and then just pass it as a RO string to libvirt. While libvirt could create a separate cache, there has to be a better way.
I mean, I can add some more code that once the guest is running preserves the mapping during migration. But that assumes a running QEMU. When starting a domain from scratch, is it acceptable if the vCPU topology changes? I suspect it is not. I'm not sure I got you, but the vCPU topology isn't changing; when starting QEMU, the user has to map 'concrete vCPUs' to specific NUMA nodes. The issue here is that to specify concrete vCPUs the user needs to get the layout from QEMU first, as it's a function of target/machine/-smp and possibly cpu type.
That applies not only to '-numa cpu' but also to -device cpufoo; that's why query-hotpluggable-cpus was introduced, to let the user get the list of possible CPUs (including the topology properties needed to create them) for a given set of CLI options. If I recall right, libvirt uses the topology properties during cpu hotplug but treats them mainly as opaque info so it can feed them back to QEMU.
tries to avoid that as much as it can.
How to present it to libvirt user I'm not sure (give them that list perhaps and let select from it???)
This is what I am trying to figure out in the cover letter. Maybe we need to let users configure the topology (well, vCPU id to [socket, die, core, thread] mapping), but then again, in my testing the guest ignored that and displayed different topology (true, I was testing with -cpu host, so maybe that's why). there is ongiong issue with EPYC VCPUs topology, but I otherwise it should work. Just report bug to qemu-devel, if it's broken.
But it's irrelevant, to the patch, magical IDs for socket/core/...whatever should not be generated by libvirt anymore, but rather taken from QEMU for given machine + -smp combination.
Taken when? We can do this for running machines, but not for freshly started ones, can we?
it can be used for freshly started as well, QEMU -S -preconfig -M pc -smp ... (QMP) query-hotpluggable-cpus (QMP) set-numa-node ... ... (QMP) exit-preconfig (QMP) other stuff libvirt does (like hot-plugging CPUs , ...) (QMP) cont
I'm not sure this works. query-hotpluggable-cpus does not map vCPU ID <-> socket/core/thread, For '-smp 2,sockets=2,cores=1,threads=1' the 'query-hotpluggable-cpus' returns:
{"return": [{"props": {"core-id": 0, "thread-id": 0, "socket-id": 1}, "vcpus-count": 1, "type": "qemu64-x86_64-cpu"}, {"props": {"core-id": 0, "thread-id": 0, "socket-id": 0}, "vcpus-count": 1, "type": "qemu64-x86_64-cpu"}]}
And 'query-cpus' or 'query-cpus-fast' which map vCPU ID onto socket/core/thread are not allowed in preconfig state.
That's the list I was talking about, which is implicitly ordered by cpu_index. These 2 commands apply to present CPUs only, if I'm not mistaken; query-hotpluggable-cpus shows not only present CPUs but also CPUs that could be hotplugged with device_add or used with -device.
But if I take a step back, the whole point of deprecating -numa node,cpus= is that QEMU no longer wants to do vCPU ID <-> socket/core/thread mapping because it's ambiguous. So it feels a bit weird to design a solution where libvirt would ask QEMU to provide the mapping only so that it can be configured back. Not only because of the extra step, but also because QEMU can't then remove the mapping anyway. I might be misunderstanding the issue though. If '-numa node,cpus' is removed, we will no longer be using cpu_index as the configuration interface with the user; that would allow QEMU to start pruning it from HMP/QMP interfaces and then probably remove it internally. (I haven't explored yet if we could get rid of it completely, but I'd expect the migration stream would be the only reason to keep it internally.)
I'm quite reluctant to add cpu_index to the modern query-hotpluggable-cpus output, since the whole goal is to get rid of the index, which doesn't actually work with SPAPR, where the CPU entity is a core and threads are an internal impl. detail (while cpu_index has a 1:1 mapping with threads). However, if it lets QEMU drop '-numa node,cpus=', we can discuss adding an optional 'x-cpu-index' to query-hotpluggable-cpus that would be available for old machine types for the sole purpose of helping libvirt map the old CLI to the new one. New machines shouldn't care about the index though, since they should be using '-numa cpu'.

PS: Since libvirt would like to avoid restarting QEMU, after discussing between libvirt and QEMU folks how to approach the chicken/egg problem (started qemu vs cpu descriptors being available only after start), -preconfig mode (however ugly) was introduced as a compromise. It was discussed on qemu-devel under the subject "enable numa configuration before machine_init() from HMP/QMP", which started with an RFC and eventually was merged as v7. I hope the above clarifies the questions you (would) have.
Michal

On 5/26/20 4:51 PM, Igor Mammedov wrote:
On Mon, 25 May 2020 10:05:08 +0200 Michal Privoznik <mprivozn@redhat.com> wrote:
This is a problem. The domain XML that is provided can't be changed, mostly because mgmt apps construct it on the fly and then just pass it as a RO string to libvirt. While libvirt could create a separate cache, there has to be a better way.
I mean, I can add some more code that once the guest is running preserves the mapping during migration. But that assumes a running QEMU. When starting a domain from scratch, is it acceptable it vCPU topology changes? I suspect it is not.
I'm not sure I got you but vCPU topology isn't changnig but when starting QEMU, user has to map 'concrete vCPUs' to spencific numa nodes. The issue here is that to specify concrete vCPUs user needs to get layout from QEMU first as it's a function of target/machine/-smp and possibly cpu type.
Assume the following config: 4 vCPUs (2 sockets, 2 cores, 1 thread topology) and 2 NUMA nodes and the following assignment to NUMA:

node 0: cpus=0-1
node 1: cpus=2-3

With old libvirt & qemu (and assuming x86_64 - not EPYC), I assume the following topology is going to be used:

node 0: socket=0,core=0,thread=0 (vCPU0) socket=0,core=1,thread=0 (vCPU1)
node 1: socket=1,core=0,thread=0 (vCPU2) socket=1,core=1,thread=0 (vCPU3)

Now, user upgrades libvirt & qemu but doesn't change the config. And on a fresh new start (no migration), they might get a different topology:

node 0: socket=0,core=0,thread=0 (vCPU0) socket=1,core=0,thread=0 (vCPU1)
node 1: socket=0,core=1,thread=0 (vCPU2) socket=1,core=1,thread=0 (vCPU3)

(This is a very trivial example that I am intentionally making look bad, but the thing is, there are some CPUs with very weird vCPU -> socket/core/thread mappings).

The problem here is that with this new version it is libvirt who configured the vCPU -> NUMA mapping (using -numa cpu). Why so wrong? Well it had no way to ask qemu how it used to be. Okay, so we add an interface to QEMU (say -preconfig + query-hotpluggable-cpus) which will do the mapping and keep it there indefinitely. But if the interface is already there (and "always" will be), I don't see need for the extra step (libvirt asking QEMU for the old mapping). The problem here is not how to assign vCPUs to NUMA nodes, the problem is how to translate vCPU IDs to socket=,core=,thread=.
that applies not only '-numa cpu' but also to -device cpufoo, that's why query-hotpluggable-cpus was introduced to let user get the list of possible CPUs (including topo properties needed to create them) for a given set of CLI options.
If I recall right libvirt uses topo properies during cpu hotplug but treats it mainly as opaqueue info so it could feed it back to QEMU.
tries to avoid that as much as it can.
How to present it to libvirt user I'm not sure (give them that list perhaps and let select from it???)
This is what I am trying to figure out in the cover letter. Maybe we need to let users configure the topology (well, vCPU id to [socket, die, core, thread] mapping), but then again, in my testing the guest ignored that and displayed different topology (true, I was testing with -cpu host, so maybe that's why). there is ongiong issue with EPYC VCPUs topology, but I otherwise it should work. Just report bug to qemu-devel, if it's broken.
But it's irrelevant, to the patch, magical IDs for socket/core/...whatever should not be generated by libvirt anymore, but rather taken from QEMU for given machine + -smp combination.
Taken when? We can do this for running machines, but not for freshly started ones, can we?
it can be used for freshly started as well, QEMU -S -preconfig -M pc -smp ... (QMP) query-hotpluggable-cpus (QMP) set-numa-node ... ... (QMP) exit-preconfig (QMP) other stuff libvirt does (like hot-plugging CPUs , ...) (QMP) cont
I'm not sure this works. query-hotpluggable-cpus does not map vCPU ID <-> socket/core/thread, For '-smp 2,sockets=2,cores=1,threads=1' the 'query-hotpluggable-cpus' returns:
{"return": [{"props": {"core-id": 0, "thread-id": 0, "socket-id": 1}, "vcpus-count": 1, "type": "qemu64-x86_64-cpu"}, {"props": {"core-id": 0, "thread-id": 0, "socket-id": 0}, "vcpus-count": 1, "type": "qemu64-x86_64-cpu"}]}
that's the list I was taling about, which is implicitly ordered by cpu_index
Aha! So in this case it would be:

vCPU0 -> socket=1,core=0,thread=0
vCPU1 -> socket=0,core=0,thread=0

But that doesn't feel right. Is the cpu_index increasing or decreasing as I go through the array? Also, how is this able to express holes? E.g. there might be some CPUs that don't have linear topology, and for instance while socket=0,core=0,thread=0 and socket=0,core=0,thread=2 exist, socket=0,core=0,thread=1 does not. How am I supposed to know that by just looking at the array?
And 'query-cpus' or 'query-cpus-fast' which map vCPU ID onto socket/core/thread are not allowed in preconfig state. these 2 commands apply to present cpu only, if I'm not mistaken. query-hotpluggable-cpus shows not only present but also CPUs that could be hotplugged with device_add or used with -device.
Fair enough. I haven't looked into the code that much.
But if I take a step back, the whole point of deprecating -numa node,cpus= is that QEMU no longer wants to do vCPU ID <-> socket/core/thread mapping because it's ambiguous. So it feels a bit weird to design a solution where libvirt would ask QEMU to provide the mapping only so that it can be configured back. Not only because of the extra step, but also because QEMU can't then remove the mapping anyway. I might be misunderstanding the issue though. if '-numa node,cpus' is removed, we no longer will be using cpu_index as configuration interface with user, that would allow QEMU start pruning it from HMP/QMP interfaces and then probably remove it internally. (I haven't explored yet if we could get rid of it completely but I'd expect migration stream would be the only reason to keep it intrenally).
I'm quite reluctant to add cpu_index to modern query-hotpluggable-cpus output, since the whole goal is to get rid of the index, which don't actually work with SPAPR where CPU entity is a core and threads are internal impl. detail (while cpu_index has 1:1 mapping with threads).
However if it will let QEMU to drop '-numa node,cpus=', we can discuss adding optional 'x-cpu-index' to query-hotpluggable-cpus, that will be available for old machine types for the sole purpose to help libvirt map old CLI to new one. New machines shouldn't care about index though, since they should be using '-numa cpu'.
The problem here is that so far, all that libvirt users see are vCPU IDs. They use them to assign vCPUs to NUMA nodes. And in order to make libvirt switch to the new command line it needs a way to map IDs to socket=,core=,thread=. I will play more with the preconfig and let you know. Michal

On Tue, 26 May 2020 17:31:09 +0200 Michal Privoznik <mprivozn@redhat.com> wrote:
On 5/26/20 4:51 PM, Igor Mammedov wrote:
On Mon, 25 May 2020 10:05:08 +0200 Michal Privoznik <mprivozn@redhat.com> wrote:
This is a problem. The domain XML that is provided can't be changed, mostly because mgmt apps construct it on the fly and then just pass it as a RO string to libvirt. While libvirt could create a separate cache, there has to be a better way.
I mean, I can add some more code that once the guest is running preserves the mapping during migration. But that assumes a running QEMU. When starting a domain from scratch, is it acceptable it vCPU topology changes? I suspect it is not.
I'm not sure I got you but vCPU topology isn't changnig but when starting QEMU, user has to map 'concrete vCPUs' to spencific numa nodes. The issue here is that to specify concrete vCPUs user needs to get layout from QEMU first as it's a function of target/machine/-smp and possibly cpu type.
Assume the following config: 4 vCPUs (2 sockets, 2 cores, 1 thread topology) and 2 NUMA nodes and the following assignment to NUMA:
node 0: cpus=0-1 node 1: cpus=2-3
With old libvirt & qemu (and assuming x86_64 - not EPYC), I assume the following topology is going to be used:
node 0: socket=0,core=0,thread=0 (vCPU0) socket=0,core=1,thread=0 (vCPU1) node 1: socket=1,core=0,thread=0 (vCPU2) socket=1,core=1,thread=0 (vCPU3)
Now, user upgrades libvirt & qemu but doesn't change the config. And on a fresh new start (no migration), they might get a different topology:
node 0: socket=0,core=0,thread=0 (vCPU0) socket=1,core=0,thread=0 (vCPU1) node 1: socket=0,core=1,thread=0 (vCPU2) socket=1,core=1,thread=0 (vCPU3)
(This is a very trivial example that I am intentionally making look bad, but the thing is, there are some CPUs with very weird vCPU -> socket/core/thread mappings).
The problem here is that with this new version it is libvirt who configured the vCPU -> NUMA mapping (using -numa cpu). Why so wrong? Well it had no way to ask qemu how it used to be. Okay, so we add an interface to QEMU (say -preconfig + query-hotpluggable-cpus) which will do the mapping and keep it there indefinitely. But if the interface is already there (and "always" will be), I don't see need for the extra step (libvirt asking QEMU for the old mapping).
As for getting a different topology on a fresh start: that shouldn't happen, at least for as long as the machine version stays the same. With cpu_index users don't know what CPUs they assign where, and in some cases (spapr) it doesn't really map onto the board's supported CPU model well. We can add and keep cpu_index in query-hotpluggable-cpus to help with migration from the old CLI to the new one for old machine types, but otherwise cpu_index would disappear from the user-visible interface. I'd like to drop the duplicate code supporting the ambiguous '-numa node,cpus' (a not always properly working interface) and keep only a single variant, '-numa cpu', to do the NUMA mapping, which uses the CPUs' topology properties to describe CPUs and unifies it with the way it's done with cpu hotplug.
The problem here is not how to assign vCPUs to NUMA nodes, the problem is how to translate vCPU IDs to socket=,core=,thread=. If you are talking about libvirt's vCPU IDs, then it's a separate issue as it's a user-facing API; I think it should not rely on cpu_index. Instead it should map vCPU IDs to a ([socket,]core[,thread]) tuple, or maybe drop the notion of vCPU IDs and expose ([socket,]core[,thread]) to users if they ask for a NUMA-aware config.
that applies not only '-numa cpu' but also to -device cpufoo, that's why query-hotpluggable-cpus was introduced to let user get the list of possible CPUs (including topo properties needed to create them) for a given set of CLI options.
If I recall right libvirt uses topo properies during cpu hotplug but treats it mainly as opaqueue info so it could feed it back to QEMU.
tries to avoid that as much as it can.
How to present it to libvirt user I'm not sure (give them that list perhaps and let select from it???)
This is what I am trying to figure out in the cover letter. Maybe we need to let users configure the topology (well, vCPU id to [socket, die, core, thread] mapping), but then again, in my testing the guest ignored that and displayed different topology (true, I was testing with -cpu host, so maybe that's why). there is ongiong issue with EPYC VCPUs topology, but I otherwise it should work. Just report bug to qemu-devel, if it's broken.
But it's irrelevant, to the patch, magical IDs for socket/core/...whatever should not be generated by libvirt anymore, but rather taken from QEMU for given machine + -smp combination.
Taken when? We can do this for running machines, but not for freshly started ones, can we?
it can be used for freshly started as well, QEMU -S -preconfig -M pc -smp ... (QMP) query-hotpluggable-cpus (QMP) set-numa-node ... ... (QMP) exit-preconfig (QMP) other stuff libvirt does (like hot-plugging CPUs , ...) (QMP) cont
I'm not sure this works. query-hotpluggable-cpus does not map vCPU ID <-> socket/core/thread, For '-smp 2,sockets=2,cores=1,threads=1' the 'query-hotpluggable-cpus' returns:
{"return": [{"props": {"core-id": 0, "thread-id": 0, "socket-id": 1}, "vcpus-count": 1, "type": "qemu64-x86_64-cpu"}, {"props": {"core-id": 0, "thread-id": 0, "socket-id": 0}, "vcpus-count": 1, "type": "qemu64-x86_64-cpu"}]}
that's the list I was taling about, which is implicitly ordered by cpu_index
Aha! So in this case it would be:
vCPU0 -> socket=1,core=0,thread=0 vCPU1 -> socket=0,core=0,thread=0
But that doesn't feel right. Is the cpu_index increasing or decreasing as I go through the array? It's an array in decreasing order, and the index in it currently == cpu_index for present and possible CPUs. The content of the array is immutable for a given -M/-smp combination, to keep migration working. We can try to add x-cpu-index to the cpu entries, so you won't have to rely on order, to help with migrating from the old CLI (but only for old machine types where the old CLI actually worked).
PS: I'm curious how libvirt currently implements NUMA mapping and how it's correlated with pinning to host nodes? Does it have any sort of code to calculate topology based on cpu_index so it could properly assign vCPUs to nodes, or is all the pain of assigning vCPU IDs to nodes on the user's shoulders?
Also, how is this able to express holes? E.g. there might be some CPUs that don't have linear topology, and for instance while socket=0,core=0,thread=0 and socket=0,core=0,thread=2 exist, socket=0,core=0,thread=1 does not. How am I supposed to know that by just looking at the array? Speaking of x86, QEMU currently does not implement topologies with holes in the [socket/core/thread] tuple, but even if it did it shouldn't matter, as all CPUs and their relations with each other are described within that array.
And 'query-cpus' or 'query-cpus-fast' which map vCPU ID onto socket/core/thread are not allowed in preconfig state. these 2 commands apply to present cpu only, if I'm not mistaken. query-hotpluggable-cpus shows not only present but also CPUs that could be hotplugged with device_add or used with -device.
Fair enough. I haven't looked into the code that much.
But if I take a step back, the whole point of deprecating -numa node,cpus= is that QEMU no longer wants to do vCPU ID <-> socket/core/thread mapping because it's ambiguous. So it feels a bit weird to design a solution where libvirt would ask QEMU to provide the mapping only so that it can be configured back. Not only because of the extra step, but also because QEMU can't then remove the mapping anyway. I might be misunderstanding the issue though. if '-numa node,cpus' is removed, we no longer will be using cpu_index as configuration interface with user, that would allow QEMU start pruning it from HMP/QMP interfaces and then probably remove it internally. (I haven't explored yet if we could get rid of it completely but I'd expect migration stream would be the only reason to keep it intrenally).
I'm quite reluctant to add cpu_index to modern query-hotpluggable-cpus output, since the whole goal is to get rid of the index, which don't actually work with SPAPR where CPU entity is a core and threads are internal impl. detail (while cpu_index has 1:1 mapping with threads).
However if it will let QEMU to drop '-numa node,cpus=', we can discuss adding optional 'x-cpu-index' to query-hotpluggable-cpus, that will be available for old machine types for the sole purpose to help libvirt map old CLI to new one. New machines shouldn't care about index though, since they should be using '-numa cpu'.
The problem here is that so far, all that libvirt users see are vCPU IDs. They use them to assign vCPUs to NUMA nodes. And in order to make libvirt switch to the new command line it needs a way to map IDs to socket=,core=,thread=. I will play more with the preconfig and let you know.
If libvirt's vCPU IDs are mirroring cpu_index, I'd say it shouldn't be doing so, see Daniel's response https://lists.gnu.org/archive/html/qemu-devel/2017-10/msg04369.html

FYI: I didn't read through all the history of the -preconfig patches, but the QEMU options for topology-aware (sane) NUMA configuration on the table were:

1. -numa node,cpus=[cpu_index]: libvirt needs to duplicate the internal QEMU algorithms that map cpu_index values to topology info and use it to map vCPUs to NUMA nodes (and keep it in sync with QEMU, as it's a machine-versioned moving target).

2. The -numa cpu CLI option: libvirt needs to duplicate the internal QEMU algorithms that calculate target-dependent values for socket/core/thread ids (basically it's the same as #1); the only difference is that the CLI user interface is expressed in topology properties.

3. When we discussed it in the past, #2 wasn't going to fly as it still had the same burden as #1 (duplicating code and keeping it in sync), so we ended up with runtime configuration (-preconfig) to avoid a QEMU restart just for querying, where libvirt could get the list of possible CPUs from the QEMU instance and complete the NUMA configuration on the fly (at least for the first time; results could be cached and re-used with -numa cpu).

On 5/27/20 3:58 PM, Igor Mammedov wrote:
On Tue, 26 May 2020 17:31:09 +0200 Michal Privoznik <mprivozn@redhat.com> wrote:
On 5/26/20 4:51 PM, Igor Mammedov wrote:
On Mon, 25 May 2020 10:05:08 +0200 Michal Privoznik <mprivozn@redhat.com> wrote:
This is a problem. The domain XML that is provided can't be changed, mostly because mgmt apps construct it on the fly and then just pass it as a RO string to libvirt. While libvirt could create a separate cache, there has to be a better way.
I mean, I can add some more code that once the guest is running preserves the mapping during migration. But that assumes a running QEMU. When starting a domain from scratch, is it acceptable it vCPU topology changes? I suspect it is not.
I'm not sure I got you but vCPU topology isn't changnig but when starting QEMU, user has to map 'concrete vCPUs' to spencific numa nodes. The issue here is that to specify concrete vCPUs user needs to get layout from QEMU first as it's a function of target/machine/-smp and possibly cpu type.
Assume the following config: 4 vCPUs (2 sockets, 2 cores, 1 thread topology) and 2 NUMA nodes and the following assignment to NUMA:
node 0: cpus=0-1 node 1: cpus=2-3
With old libvirt & qemu (and assuming x86_64 - not EPYC), I assume the following topology is going to be used:
node 0: socket=0,core=0,thread=0 (vCPU0) socket=0,core=1,thread=0 (vCPU1) node 1: socket=1,core=0,thread=0 (vCPU2) socket=1,core=1,thread=0 (vCPU3)
Now, user upgrades libvirt & qemu but doesn't change the config. And on a fresh new start (no migration), they might get a different topology:
node 0: socket=0,core=0,thread=0 (vCPU0) socket=1,core=0,thread=0 (vCPU1) node 1: socket=0,core=1,thread=0 (vCPU2) socket=1,core=1,thread=0 (vCPU3)
that shouldn't happen at least for as long as machine version stays the same
Shouldn't as in it's bad if it happens or as in QEMU won't change topology for released machine types? Well, we are talking about libvirt generating the topology.
The problem here is not how to assign vCPUs to NUMA nodes, the problem is how to translate vCPU IDs to socket=,core=,thread=. if you are talking about libvirt's vCPU IDs, then it's separate issue as it's user facing API, I think it should not rely on cpu_index. Instead it should map vCPU IDs to ([socket,]core[,thread]) tuple or maybe drop notion of vCPU IDs and expose ([socket,]core[,thread]) to users if they ask for numa aware config.
And this is the thing I am asking. How to map vCPU IDs to socket,core,thread and how to do it reliably.
PS: I'm curious how libvirt currently implements numa mapping and how it's correlated with pinnig to host nodes? Does it have any sort of code to calculate topology based on cpu_index so it could properly assign vCPUs to nodes or all the pain of assigning vCPU IDs to nodes is on the user shoulders?
It's on users. In the domain XML they specify the number of vCPUs, and then they can assign individual IDs to NUMA nodes. For instance:

<vcpu>8</vcpu>
<cpu>
  <numa>
    <cell id='0' cpus='0-3' memory='2097152' unit='KiB'/>
    <cell id='1' cpus='4-7' memory='2097152' unit='KiB'/>
  </numa>
</cpu>

translates to:

-smp 8,sockets=8,cores=1,threads=1
-numa node,nodeid=0,cpus=0-3,mem=...
-numa node,nodeid=1,cpus=4-7,mem=...

The sockets=,cores=,threads= is formatted every time, even if no topology was specified in the domain XML. If no topology was specified then every vCPU is in its own socket and has 1 core and 1 thread. If topology is specified then the -smp looks accordingly. But all that libvirt uses to assign vCPUs to NUMA nodes is the vCPU ID. If it has to use sockets,cores,threads then so be it, but that means libvirt needs to learn the mapping of vCPU IDs to sockets=,cores=,threads=; because if it doesn't and generates the mapping differently to QEMU, then for the above snippet vCPUs might move between NUMA nodes. I mean, if there is a domain with the above config it has some topology (that QEMU came up with). Now, after we change it and the user updates libvirt & QEMU, libvirt might (in general) come up with a different topology and if the VM is booted again it will see, say, CPU1 move to NUMA#1 (for example).

This happened because libvirt came up with the vCPU ID -> socket,core,thread mapping itself. I mean, in this patch the algorithm is copied from x86_topo_ids_from_idx(), but I bet there are different mappings (I can see x86_topo_ids_from_idx_epyc(), and other architectures might have completely different mappings - power9 perhaps?). Maybe I'm misunderstanding cpu_index and vCPU ID? I thought it is the same thing.
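For concreteness, here is a rough sketch of how the per-node vCPU ID ranges from the XML above could be turned into '-numa cpu' arguments under the dense linear layout assumption. It reuses the decomposition from the earlier sketch and is roughly the kind of libvirt-side mapping being debated here; the function name and hard-coded ranges are purely illustrative, not the patch's code:

#include <stdio.h>

/* Sketch: print "-numa cpu" arguments for vCPUs firstVcpu..lastVcpu of
 * guest NUMA node 'node', assuming the dense linear decomposition
 * (threads first, then cores, then dies, then sockets). */
static void
examplePrintNumaCpuArgs(unsigned int node,
                        unsigned int firstVcpu,
                        unsigned int lastVcpu,
                        unsigned int dies,
                        unsigned int cores,
                        unsigned int threads)
{
    unsigned int id;

    for (id = firstVcpu; id <= lastVcpu; id++) {
        unsigned int thread = id % threads;
        unsigned int core = (id / threads) % cores;
        unsigned int die = (id / (threads * cores)) % dies;
        unsigned int socket = id / (threads * cores * dies);

        printf("-numa cpu,node-id=%u,socket-id=%u,die-id=%u,core-id=%u,thread-id=%u\n",
               node, socket, die, core, thread);
    }
}

int
main(void)
{
    /* The XML above: 8 vCPUs, no <topology/>, so sockets=8,cores=1,threads=1 */
    examplePrintNumaCpuArgs(0, 0, 3, 1, 1, 1);  /* cell id='0' cpus='0-3' */
    examplePrintNumaCpuArgs(1, 4, 7, 1, 1, 1);  /* cell id='1' cpus='4-7' */
    return 0;
}

With the faked [8, 1, 1, 1] topology this simply puts vCPU N into socket N; the open question in this thread is whether libvirt can safely own that decomposition or whether it has to come from QEMU.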
that applies not only '-numa cpu' but also to -device cpufoo, that's why query-hotpluggable-cpus was introduced to let user get the list of possible CPUs (including topo properties needed to create them) for a given set of CLI options.
If I recall right libvirt uses topo properies during cpu hotplug but treats it mainly as opaqueue info so it could feed it back to QEMU.
tries to avoid that as much as it can.
How to present it to libvirt user I'm not sure (give them that list perhaps and let select from it???)
This is what I am trying to figure out in the cover letter. Maybe we need to let users configure the topology (well, vCPU id to [socket, die, core, thread] mapping), but then again, in my testing the guest ignored that and displayed different topology (true, I was testing with -cpu host, so maybe that's why). there is ongiong issue with EPYC VCPUs topology, but I otherwise it should work. Just report bug to qemu-devel, if it's broken.
But it's irrelevant, to the patch, magical IDs for socket/core/...whatever should not be generated by libvirt anymore, but rather taken from QEMU for given machine + -smp combination.
Taken when? We can do this for running machines, but not for freshly started ones, can we?
it can be used for freshly started as well, QEMU -S -preconfig -M pc -smp ... (QMP) query-hotpluggable-cpus (QMP) set-numa-node ... ... (QMP) exit-preconfig (QMP) other stuff libvirt does (like hot-plugging CPUs , ...) (QMP) cont
I'm not sure this works. query-hotpluggable-cpus does not map vCPU ID <-> socket/core/thread, For '-smp 2,sockets=2,cores=1,threads=1' the 'query-hotpluggable-cpus' returns:
{"return": [{"props": {"core-id": 0, "thread-id": 0, "socket-id": 1}, "vcpus-count": 1, "type": "qemu64-x86_64-cpu"}, {"props": {"core-id": 0, "thread-id": 0, "socket-id": 0}, "vcpus-count": 1, "type": "qemu64-x86_64-cpu"}]}
that's the list I was taling about, which is implicitly ordered by cpu_index
Aha! So in this case it would be:
vCPU0 -> socket=1,core=0,thread=0 vCPU1 -> socket=0,core=0,thread=0
But that doesn't feel right. Is the cpu_index increasing or decreasing as I go through the array? it's array with decreasing order and index in it currently == cpu_index for present and possible CPUs. Content of array is immutable for given -M/-smp combination, to keep migration working. We can try to add x-cpu-index to cpu entries, so you won't have to rely on order to help with migrating from old CLI (but only for old machine types where old CLI actually worked worked).
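If the array really is ordered by decreasing cpu_index, as stated above, then consuming it boils down to indexing it from the back. A minimal sketch, with made-up struct and function names (not actual libvirt or QEMU code), assuming the props objects have already been parsed out of the query-hotpluggable-cpus reply in the order QEMU returned them:

#include <stddef.h>

/* One parsed "props" entry from query-hotpluggable-cpus; fields that a
 * given machine type does not report could be left as -1. */
typedef struct {
    int socketId;
    int dieId;
    int coreId;
    int threadId;
} exampleCPUProps;

/* Map a libvirt-style vCPU ID onto the reported topology tuple, assuming
 * the array is ordered by decreasing cpu_index (first element == highest
 * cpu_index, last element == cpu_index 0). */
static const exampleCPUProps *
exampleVcpuIDToProps(const exampleCPUProps *props,
                     size_t nprops,
                     size_t vcpuid)
{
    if (vcpuid >= nprops)
        return NULL;

    return &props[nprops - 1 - vcpuid];
}

With the '-smp 2,sockets=2,cores=1,threads=1' output quoted earlier, this would resolve vCPU0 to socket-id=0 and vCPU1 to socket-id=1, i.e. the opposite of reading the array front to back.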
That might help. So we won't hardcode any mapping in libvirt but rather ask QEMU what it thinks the topology is. Cool. So it would work like this:

1) libvirt starts: qemu -preconfig -S -smp 8,sockets=2,cores=2,threads=2
2) libvirt uses "query-hotpluggable-cpus" to learn what topology it came up with, IOW what is the vCPU ID <-> socket,core,thread mapping
3) libvirt configures NUMA nodes, assigns vCPUs to them using [socket,core,thread] based on the mapping it learned in step 2)
4) preconfig is exited, machine resumed

Very well. What I don't understand is why we need to have steps 2 and 3. Because in step 2, QEMU needs to report the mapping. Therefore it has to have some internal code that handles the mapping. Having said that, we can have a new set of steps:

1) libvirt starts: qemu -preconfig -S -smp 8,sockets=2,cores=2,threads=2
2) libvirt configures NUMA nodes, assigns vCPUs to them using vCPU IDs, QEMU will use the internal code to map IDs to [socket,core,thread]
3) preconfig is exited, machine resumed

And since there is no need for preconfig anymore, we can have one step actually:

1) libvirt starts: qemu -S -smp 8,sockets=2,cores=2,threads=2 -numa node -numa cpu,cpus= -numa node -numa cpu,cpus=

Or, we can move the mapping into libvirt (that's what I tried to do in this patch). I'm not against it, but we will need to do it exactly like QEMU is doing now. Then we can do plain:

1) qemu -S -smp 8,sockets=2,cores=2,threads=2 -numa node -numa cpu,socket=,core=,thread= -numa node -numa cpu,socket=,core=,thread=
Also, how is this able to express holes? E.g. there might be some CPUs that don't have linear topology, and for instance while socket=0,core=0,thread=0 and socket=0,core=0,thread=2 exist, socket=0,core=0,thread=1 does not. How am I supposed to know that by just looking at the array? speaking of x86, QEMU curently does not implement topologies with holes in [socket/core/thread] tuple but if it were it shouldn't matter as all CPUs and their realations with each other are described within that array.
And 'query-cpus' or 'query-cpus-fast' which map vCPU ID onto socket/core/thread are not allowed in preconfig state. these 2 commands apply to present cpu only, if I'm not mistaken. query-hotpluggable-cpus shows not only present but also CPUs that could be hotplugged with device_add or used with -device.
Fair enough. I haven't looked into the code that much.
But if I take a step back, the whole point of deprecating -numa node,cpus= is that QEMU no longer wants to do vCPU ID <-> socket/core/thread mapping because it's ambiguous. So it feels a bit weird to design a solution where libvirt would ask QEMU to provide the mapping only so that it can be configured back. Not only because of the extra step, but also because QEMU can't then remove the mapping anyway. I might be misunderstanding the issue though. if '-numa node,cpus' is removed, we no longer will be using cpu_index as configuration interface with user, that would allow QEMU start pruning it from HMP/QMP interfaces and then probably remove it internally. (I haven't explored yet if we could get rid of it completely but I'd expect migration stream would be the only reason to keep it intrenally).
I'm quite reluctant to add cpu_index to modern query-hotpluggable-cpus output, since the whole goal is to get rid of the index, which don't actually work with SPAPR where CPU entity is a core and threads are internal impl. detail (while cpu_index has 1:1 mapping with threads).
However if it will let QEMU to drop '-numa node,cpus=', we can discuss adding optional 'x-cpu-index' to query-hotpluggable-cpus, that will be available for old machine types for the sole purpose to help libvirt map old CLI to new one. New machines shouldn't care about index though, since they should be using '-numa cpu'.
The problem here is that so far, all that libvirt users see are vCPU IDs. They use them to assign vCPUs to NUMA nodes. And in order to make libvirt switch to the new command line it needs a way to map IDs to socket=,core=,thread=. I will play more with the preconfig and let you know.
If libvirt's vCPU IDs are mirroring cpu_index, I'd say it shouldn't be doing so, see Daniel's response https://lists.gnu.org/archive/html/qemu-devel/2017-10/msg04369.html
No. Libvirt is not mirroring anything. Daniel's right, and libvirt doesn't need to know actual qemu IDs. But it needs to ensure the topology. This is exactly what you objected to in the very first reply to the 4/5 patch. And I agree.
FYI: I didn't read through all the history of -preconfig patches but QEMU options for topology aware (sane) numa configuration on the table were: 1. -numa node,cpus[cpu_index] libvirt needs to duplicate internal QEMU algorithms that map cpu_index values to topology info and use it to map vCPUs to numa nodes (and keep in sync with QEMU as it's machine versioned moving target)
I'm not sure I follow. So libvirt would continue to use -numa node,cpus=. How does topology step into that?
2. -numa cpu CLI option, libvirt needs to duplicate internal QEMU algorithms that calculate target depended values for socket/core/thread ids. (basically it's the same as #1), the only difference is that CLI user interface is expressed in topology properties.
Sure, this is what I tried to do. But you suggested using preconfig + query-hotpluggable-cpus.
3. when we discussed it in the past #2 wasn't going to fly as it still had tha same burden as #1 (duplicating code and keeping it in sync). so we ended up with runtime configuration (-preconfig) to avoid QEMU restart just for querying, where libvirt could get list of possible CPUs from QEMU instance and complete numa configuration on the fly (at least for the first time, results could be cached and re-used with -numa cpu).
Yeah, this is the burden I am talking about. I feel like we are not talking about the same thing. Maybe I'm misunderstanding something. Michal

On Thu, 4 Jun 2020 10:58:01 +0200 Michal Privoznik <mprivozn@redhat.com> wrote:
On 5/27/20 3:58 PM, Igor Mammedov wrote:
On Tue, 26 May 2020 17:31:09 +0200 Michal Privoznik <mprivozn@redhat.com> wrote:
On 5/26/20 4:51 PM, Igor Mammedov wrote:
On Mon, 25 May 2020 10:05:08 +0200 Michal Privoznik <mprivozn@redhat.com> wrote:
This is a problem. The domain XML that is provided can't be changed, mostly because mgmt apps construct it on the fly and then just pass it as a RO string to libvirt. While libvirt could create a separate cache, there has to be a better way.
I mean, I can add some more code that once the guest is running preserves the mapping during migration. But that assumes a running QEMU. When starting a domain from scratch, is it acceptable it vCPU topology changes? I suspect it is not.
I'm not sure I got you but vCPU topology isn't changnig but when starting QEMU, user has to map 'concrete vCPUs' to spencific numa nodes. The issue here is that to specify concrete vCPUs user needs to get layout from QEMU first as it's a function of target/machine/-smp and possibly cpu type.
Assume the following config: 4 vCPUs (2 sockets, 2 cores, 1 thread topology) and 2 NUMA nodes and the following assignment to NUMA:
node 0: cpus=0-1 node 1: cpus=2-3
With old libvirt & qemu (and assuming x86_64 - not EPYC), I assume the following topology is going to be used:
node 0: socket=0,core=0,thread=0 (vCPU0) socket=0,core=1,thread=0 (vCPU1) node 1: socket=1,core=0,thread=0 (vCPU2) socket=1,core=1,thread=0 (vCPU3)
Now, user upgrades libvirt & qemu but doesn't change the config. And on a fresh new start (no migration), they might get a different topology:
node 0: socket=0,core=0,thread=0 (vCPU0) socket=1,core=0,thread=0 (vCPU1) node 1: socket=0,core=1,thread=0 (vCPU2) socket=1,core=1,thread=0 (vCPU3)
that shouldn't happen at least for as long as machine version stays the same
Shouldn't as in it's bad if it happens or as in QEMU won't change topology for released machine types? it's the second
Well, we are talking about libvirt generating the topology.
The problem here is not how to assign vCPUs to NUMA nodes, the problem is how to translate vCPU IDs to socket=,core=,thread=. if you are talking about libvirt's vCPU IDs, then it's separate issue as it's user facing API, I think it should not rely on cpu_index. Instead it should map vCPU IDs to ([socket,]core[,thread]) tuple or maybe drop notion of vCPU IDs and expose ([socket,]core[,thread]) to users if they ask for numa aware config.
And this is the thing I am asking. How to map vCPU IDs to socket,core,thread and how to do it reliably. A vCPU ID has the same drawbacks as cpu_index in QEMU: it provides zero information about topology. Which is fine in the non-NUMA case, since the user doesn't care about topology at all (I'm assuming it's libvirt who does the pinning, and it would use topology info to pin vCPUs correctly). But for the NUMA case, as a user I'd like to see/use topology instead of vCPU IDs, especially if the user is in charge of assigning vCPUs to nodes.
I'd drop the vCPU IDs concept altogether and use a ([socket,]core[,thread]) tuple to describe vCPUs instead. It should work fine for both use cases and you wouldn't have to do any mapping to vCPU IDs. (I'm talking here about new configs that use new machine types and ignore compatibility. More on the latter below.)
PS: I'm curious how libvirt currently implements numa mapping and how it's correlated with pinnig to host nodes? Does it have any sort of code to calculate topology based on cpu_index so it could properly assign vCPUs to nodes or all the pain of assigning vCPU IDs to nodes is on the user shoulders?
It's on users. In the domain XML they specify number of vCPUs, and then they can assign individual IDs to NUMA nodes. For instance:
<vcpu>8</vcpu>
<cpu> <numa> <cell id='0' cpus='0-3' memory='2097152' unit='KiB'/> <cell id='1' cpus='4-7' memory='2097152' unit='KiB'/> </numa> </cpu>
translates to:
-smp 8,sockets=8,cores=1,threads=1 -numa node,nodeid=0,cpus=0-3,mem=... -numa node,nodeid=1,cpus=4-7,mem=...
The sockets=,cores=,threads= is formatted every time, even if no topology was specified in the domain XML. If no topology was specified then every vCPU is in its own socket and has 1 core and 1 thread.
If topology is specified then the -smp looks accordingly. But all that libvirt uses to assing vCPUs to NUMA nodes is vCPU ID. If it has to use sockets,cores,threads then so be it, but that means libvirt needs to learn the mapping of vCPU IDs to sockets=,cores=,threads=; because if it doesn't and generates the mapping differently to QEMU then for the above snippet vCPUs might move between NUMA nodes. I mean, if there is a domain with the above config it has some topology (that QEMU came up with). Now, after we change it and user updates libvirt & QEMU, libvirt might (in general) come with a different topology and if the VM is booted again it will see say CPU1 move to NUMA#1 (for example).
This happened because libvirt came up with vCPU ID -> socket,core,thread mapping itself. I mean, in this patch the algorithm is copied from x86_topo_ids_from_idx(), but I bet there are different mappings (I can see x86_topo_ids_from_idx_epyc() and other architectures might have completely different mapping - powe9 perhaps?).
* In case of x86, if I recall correctly there are 3 mappings now:
  - x86_topo_ids_from_idx(): the original, which can produce 2 different topologies depending on compat_apic_id_mode
  - x86_topo_ids_from_idx_epyc(): newish and probably not final yet, so there is a chance that we will have several variants if we decide to keep bugs on a per-machine-version basis
* Power (spapr) advertises (core_id) in query_hotpluggable_cpus, but internally it might have more than 1 thread per core, so the mapping to cpu_index is not 1:1 over there
* arm/virt should advertise (thread_id) in query_hotpluggable_cpus, and the mapping to cpu_index happens to be 1:1, but that is subject to change in the future
* s390 advertises (core_id) and the mapping to cpu_index is 1:1 so far

Like I said before, for compatibility's sake QEMU can introduce cpu_index in the output of query_hotpluggable_cpus for released machine types, to help libvirt map old configs to '-numa cpu'. For new configs it would be better to move away from vCPU-ID-based configs.
Maybe I'm misunderstanding cpu_index and vCPU ID? I thought it is the same thing. From your examples it looks like it is, and they should be decoupled from each other for new configs (or dropped altogether).
that applies not only '-numa cpu' but also to -device cpufoo, that's why query-hotpluggable-cpus was introduced to let user get the list of possible CPUs (including topo properties needed to create them) for a given set of CLI options.
If I recall right libvirt uses topo properies during cpu hotplug but treats it mainly as opaqueue info so it could feed it back to QEMU.
tries to avoid that as much as it can. How to present it to libvirt user I'm not sure (give them that list perhaps and let select from it???) This is what I am trying to figure out in the cover letter. Maybe we need to let users configure the topology (well, vCPU id to [socket, die, core, thread] mapping), but then again, in my testing the guest ignored that and displayed different topology (true, I was testing with -cpu host, so maybe that's why). there is ongiong issue with EPYC VCPUs topology, but I otherwise it should work. Just report bug to qemu-devel, if it's broken.
But it's irrelevant, to the patch, magical IDs for socket/core/...whatever should not be generated by libvirt anymore, but rather taken from QEMU for given machine + -smp combination. Taken when? We can do this for running machines, but not for freshly started ones, can we?
it can be used for freshly started as well, QEMU -S -preconfig -M pc -smp ... (QMP) query-hotpluggable-cpus (QMP) set-numa-node ... ... (QMP) exit-preconfig (QMP) other stuff libvirt does (like hot-plugging CPUs , ...) (QMP) cont
I'm not sure this works. query-hotpluggable-cpus does not map vCPU ID <-> socket/core/thread, For '-smp 2,sockets=2,cores=1,threads=1' the 'query-hotpluggable-cpus' returns:
{"return": [{"props": {"core-id": 0, "thread-id": 0, "socket-id": 1}, "vcpus-count": 1, "type": "qemu64-x86_64-cpu"}, {"props": {"core-id": 0, "thread-id": 0, "socket-id": 0}, "vcpus-count": 1, "type": "qemu64-x86_64-cpu"}]}
that's the list I was taling about, which is implicitly ordered by cpu_index
Aha! So in this case it would be:
vCPU0 -> socket=1,core=0,thread=0 vCPU1 -> socket=0,core=0,thread=0
But that doesn't feel right. Is the cpu_index increasing or decreasing as I go through the array? it's array with decreasing order and index in it currently == cpu_index for present and possible CPUs. Content of array is immutable for given -M/-smp combination, to keep migration working. We can try to add x-cpu-index to cpu entries, so you won't have to rely on order to help with migrating from old CLI (but only for old machine types where old CLI actually worked worked).
That might help. So we won't hardcode any mapping in libvirt rather than ask QEMU what it thinks the topology is. Cool.
So it would work like this:
1) libvirt starts: qemu -preconfig -S -smp 8,sockets=2,cores=2,threads=2
2) libvirt uses "query-hotpluggable-cpus" to learn what topology it came up with, IOW what is the vCPU ID <-> socket,core,thread mapping
For power/arm/s390 provided tuples will be different (described above)
3) libvirt configures NUMA nodes, assigns vCPUs to them using [socket,core,thread] based on the mapping it learned in step 2)
4) preconfig is exited, machine resumed
Very well. What I don't understand is why we need to have steps 2 and 3. Because in step 2, QEMU needs to report the mapping. Therefore it has to have some internal code that handles the mapping. Having said that, we can have new set of steps:
In step 2, the cpu_index <-> socket,core,thread mapping could be provided only for old machine types, to help with mapping old vCPU-ID-based configs to the 'new' [socket,core,thread] scheme. New machines should not provide cpu_index as part of "query-hotpluggable-cpus"; the whole point of the exercise is getting rid of the old ambiguous cpu_index interface (-numa node,cpus=), which duplicates the topology-aware '-numa cpu' one, and dropping the NUMA code that supports it (that's quite a bit). That libvirt happens to have a 1:1 mapping between cpu_index and vCPU IDs is unfortunate and should be decoupled. It can still keep using vCPU IDs as the interface to upper layers but remap them internally to socket,core,thread using its own arch-independent deterministic algorithm for new configs/machine types. However, if I were a user, I'd like to describe CPUs (in the config) using topology attributes (doubly so if I'm the one who later assigns them to NUMA nodes).
1) libvirt starts: qemu -preconfig -S -smp 8,sockets=2,cores=2,threads=2
2) libvirt configures NUMA nodes, assigns vCPUs to them using vCPU IDs, QEMU will use the internal code to map IDs to [socket,core,thread]
I really can't parse this sentence (in preconfig context).
3) preconfig is exited, machine resumed
And since there is no need to preconfig anymore, we can have one step actually:
1) libvirt starts: qemu -S -smp 8,sockets=2,cores=2,threads=2 -numa node -numa cpu,cpus= -numa node -numa cpu,cpus= you've probably meant here the old way /-numa node,cpus=/ which uses cpu_index
Or, we can move the mapping into libvirt (that's what I tried to do in this patch). I'm not against it, but we will need to do it exactly like QEMU is doing now. As a shortcut it will work until something changes on the QEMU side, and then it becomes silently broken [for new machine types]. That's the reason for introducing preconfig, so that the topology info can be introspected (per -M/-smp combination) and libvirt doesn't have to duplicate and maintain complex architecture-dependent topology code from QEMU.
Then we can do plain
1) qemu -S -smp 8,sockets=2,cores=2,threads=2 -numa node -numa cpu,socket=,core=,thread= -numa node -numa cpu,socket=,core=,thread=
Also, how is this able to express holes? E.g. there might be some CPUs that don't have linear topology, and for instance while socket=0,core=0,thread=0 and socket=0,core=0,thread=2 exist, socket=0,core=0,thread=1 does not. How am I supposed to know that by just looking at the array? speaking of x86, QEMU curently does not implement topologies with holes in [socket/core/thread] tuple but if it were it shouldn't matter as all CPUs and their realations with each other are described within that array.
And 'query-cpus' or 'query-cpus-fast' which map vCPU ID onto socket/core/thread are not allowed in preconfig state. these 2 commands apply to present cpu only, if I'm not mistaken. query-hotpluggable-cpus shows not only present but also CPUs that could be hotplugged with device_add or used with -device.
Fair enough. I haven't looked into the code that much.
But if I take a step back, the whole point of deprecating -numa node,cpus= is that QEMU no longer wants to do vCPU ID <-> socket/core/thread mapping because it's ambiguous. So it feels a bit weird to design a solution where libvirt would ask QEMU to provide the mapping only so that it can be configured back. Not only because of the extra step, but also because QEMU can't then remove the mapping anyway. I might be misunderstanding the issue though. if '-numa node,cpus' is removed, we no longer will be using cpu_index as configuration interface with user, that would allow QEMU start pruning it from HMP/QMP interfaces and then probably remove it internally. (I haven't explored yet if we could get rid of it completely but I'd expect migration stream would be the only reason to keep it intrenally).
I'm quite reluctant to add cpu_index to modern query-hotpluggable-cpus output, since the whole goal is to get rid of the index, which don't actually work with SPAPR where CPU entity is a core and threads are internal impl. detail (while cpu_index has 1:1 mapping with threads).
However if it will let QEMU to drop '-numa node,cpus=', we can discuss adding optional 'x-cpu-index' to query-hotpluggable-cpus, that will be available for old machine types for the sole purpose to help libvirt map old CLI to new one. New machines shouldn't care about index though, since they should be using '-numa cpu'.
The problem here is that so far, all that libvirt users see are vCPU IDs. They use them to assign vCPUs to NUMA nodes. And in order to make libvirt switch to the new command line it needs a way to map IDs to socket=,core=,thread=. I will play more with the preconfig and let you know.
If libvirt's vCPU IDs are mirroring cpu_index, I'd say it shouldn't be doing so, see Daniel's response https://lists.gnu.org/archive/html/qemu-devel/2017-10/msg04369.html
No. Libvirt is not mirroring anything. Daniel's right, and libvirt doesn't need to know actual qemu IDs. But it needs to ensure the topology. This is exactly what you objected to in the very first reply to the 4/5 patch. And I agree.
FYI: I didn't read through all the history of the -preconfig patches, but the QEMU options on the table for topology-aware (sane) NUMA configuration were:
1. -numa node,cpus= [cpu_index]: libvirt needs to duplicate the internal QEMU algorithms that map cpu_index values to topology info and use that to map vCPUs to NUMA nodes (and keep it in sync with QEMU, as it's a machine-versioned moving target).
I'm not sure I follow. So libvirt would continue to use -numa node,cpus=. How does topology step into that?
The keyword was 'were'; it was one of the options that were considered back then (and it wasn't accepted).
2. -numa cpu CLI option: libvirt needs to duplicate the internal QEMU algorithms that calculate the target-dependent values for socket/core/thread IDs. (Basically it's the same as #1; the only difference is that the CLI user interface is expressed in topology properties.)
Sure, this is what I tried to do. But you suggested using preconfig + query-hotpluggable-cpus.
It wasn't acceptable because #2 was fragile. It involves duplicating complex architecture-specific topology code from QEMU (including machine-version compat hacks) and it has to be kept in sync (as it's sometimes extended to support new layouts).
3. When we discussed it in the past, #2 wasn't going to fly as it still had the same burden as #1 (duplicating code and keeping it in sync). So we ended up with runtime configuration (-preconfig) to avoid a QEMU restart just for querying, where libvirt could get the list of possible CPUs from the QEMU instance and complete the NUMA configuration on the fly (at least for the first time; the results could be cached and re-used with -numa cpu).
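To make point 3 concrete, the monitor side could look roughly like the sketch below. The command names (query-hotpluggable-cpus, set-numa-node, x-exit-preconfig) are from QEMU's QMP schema as I remember it; the exact arguments and values here are assumptions:

qemu-system-x86_64 --preconfig -smp 8,sockets=2,cores=2,threads=2 -numa node,nodeid=0 -numa node,nodeid=1 ...

{ "execute": "query-hotpluggable-cpus" }
{ "execute": "set-numa-node",
  "arguments": { "type": "cpu", "node-id": 0,
                 "socket-id": 0, "core-id": 0, "thread-id": 0 } }
(one set-numa-node per vCPU, built from the props returned above)
{ "execute": "x-exit-preconfig" }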
Yeah, this is the burden I am talking about. I feel like we are not talking about the same thing. Maybe I'm misunderstanding something.
Perhaps we should 'meet' and discuss it (it might be easier to resolve the misunderstanding).
Michal

Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
---
 src/conf/cpu_conf.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/src/conf/cpu_conf.c b/src/conf/cpu_conf.c
index 07404c6fb0..2991ab8204 100644
--- a/src/conf/cpu_conf.c
+++ b/src/conf/cpu_conf.c
@@ -527,39 +527,33 @@ virCPUDefParseXML(xmlXPathContextPtr ctxt,
     }
 
     if (virXPathNode("./topology[1]", ctxt)) {
-        unsigned long ul;
-
-        if (virXPathULong("string(./topology[1]/@sockets)", ctxt, &ul) < 0) {
+        if (virXPathUInt("string(./topology[1]/@sockets)", ctxt, &def->sockets) < 0) {
             virReportError(VIR_ERR_XML_ERROR, "%s",
                            _("Missing 'sockets' attribute in CPU topology"));
             goto cleanup;
         }
-        def->sockets = (unsigned int) ul;
 
         if (virXPathNode("./topology[1]/@dies", ctxt)) {
-            if (virXPathULong("string(./topology[1]/@dies)", ctxt, &ul) < 0) {
+            if (virXPathUInt("string(./topology[1]/@dies)", ctxt, &def->dies) < 0) {
                 virReportError(VIR_ERR_XML_ERROR, "%s",
                                _("Malformed 'dies' attribute in CPU topology"));
                 goto cleanup;
             }
-            def->dies = (unsigned int) ul;
         } else {
             def->dies = 1;
         }
 
-        if (virXPathULong("string(./topology[1]/@cores)", ctxt, &ul) < 0) {
+        if (virXPathUInt("string(./topology[1]/@cores)", ctxt, &def->cores) < 0) {
             virReportError(VIR_ERR_XML_ERROR, "%s",
                            _("Missing 'cores' attribute in CPU topology"));
             goto cleanup;
         }
-        def->cores = (unsigned int) ul;
 
-        if (virXPathULong("string(./topology[1]/@threads)", ctxt, &ul) < 0) {
+        if (virXPathUInt("string(./topology[1]/@threads)", ctxt, &def->threads) < 0) {
             virReportError(VIR_ERR_XML_ERROR, "%s",
                            _("Missing 'threads' attribute in CPU topology"));
             goto cleanup;
         }
-        def->threads = (unsigned int) ul;
 
         if (!def->sockets || !def->cores || !def->threads || !def->dies) {
             virReportError(VIR_ERR_XML_ERROR, "%s",
--
2.26.2