From: Wim ten Have <wim.ten.have(a)oracle.com>
This patch adds XML definitions to describe a guest vNUMA layout, together
with the routines to parse them. The guest vNUMA specification looks like:
    <vnuma mode='host|node'
           distribution='contiguous|siblings|round-robin|interleave'>
      <memory unit='#unitsize'>size</memory>
      <partition nodeset='#nodes' cells='#cells'/>
    </vnuma>
With mode='host' the guest XML is rendered to match the host's NUMA
topology.
With mode='node' the guest XML is rendered according to the "nodes"
and "cells" attributes of the <partition> element.
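For example, a guest to be partitioned into two cells backed by host
NUMA nodes 0-1, with the vCPUs distributed along the host SMT siblings,
could specify (illustrative values):

    <vnuma mode='node' distribution='siblings'>
      <memory unit='GiB'>4</memory>
      <partition nodeset='0-1' cells='2'/>
    </vnuma>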
Signed-off-by: Wim ten Have <wim.ten.have(a)oracle.com>
---
docs/formatdomain.html.in | 94 +++++++
docs/schemas/domaincommon.rng | 65 +++++
src/conf/domain_conf.c | 482 +++++++++++++++++++++++++++++++++-
src/conf/domain_conf.h | 2 +
src/conf/numa_conf.c | 241 ++++++++++++++++-
src/conf/numa_conf.h | 58 +++-
src/libvirt_private.syms | 8 +
7 files changed, 932 insertions(+), 18 deletions(-)
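For illustration (not part of the patch): with mode='host' on a two-node
host, a guest configured with 8 vCPUs and 524288 KiB of memory would be
rendered into a <numa> topology roughly like the following (node
distances omitted):

  <cpu>
    ...
    <numa>
      <cell id='0' cpus='0-3' memory='262144' unit='KiB'/>
      <cell id='1' cpus='4-7' memory='262144' unit='KiB'/>
    </numa>
  </cpu>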
diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in
index 962766b792d3..80165f9bd896 100644
--- a/docs/formatdomain.html.in
+++ b/docs/formatdomain.html.in
@@ -1294,6 +1294,98 @@
</dl>
+    <h3><a id="elementsvNUMAPartitioning">NUMA Host or Node Partitioning</a></h3>
+
+ <p>
+ With the help of the <code>vnuma</code> element, libvirt can
+ dynamically partition a guest domain for vNUMA by rendering its XML
+      into a 'host' or 'node'
+      <a href="#elementsNUMAtopology"><code>NUMA topology</code></a>
+      matching model.
+ </p>
+
+<pre>
+<domain>
+ ...
+  <vnuma mode='host|node'
+         distribution='contiguous|siblings|round-robin|interleave'>
+ <memory unit='KiB'>524288</memory>
+ <partition nodeset="1-4,^3" cells="8"/>
+ </vnuma>
+ ...
+</domain>
+</pre>
+
+ <dl>
+ <dt><code>vnuma</code></dt>
+ <dd>
+ The attribute <code>mode</code> selects a specific rendering
+        method. Its value is either "host" or "node". If <code>mode</code>
+        is set to "host", the guest domain is automatically partitioned
+        to match the host NUMA topology. If <code>mode</code>
+        is set to "node", the guest domain is partitioned according to the
+        <code>nodeset</code> and <code>cells</code> attributes of the
+        <code>vnuma</code> <code>partition</code> subelement.
+ <span class="since">Since 5.9</span>
+
+        The optional attribute <code>distribution</code> selects the
+        guest <a href="#elementsNUMAtopology"><code>numa</code></a>
+        <code>cell</code> <code>cpus</code> distribution method, as
+        illustrated in the example below the list.
+        <span class="since">Since 5.9</span> It allows for:
+ <dl>
+ <dt><code>contiguous</code></dt>
+          <dd> The cpus are enumerated sequentially over the
+          <a href="#elementsNUMAtopology"><code>numa</code></a>
+          defined cells.
+ </dd>
+ <dt><code>siblings</code></dt>
+ <dd> The cpus are distributed over the
+          <a href="#elementsNUMAtopology"><code>numa</code></a>
+ cells matching the host CPU SMT model.
+ </dd>
+ <dt><code>round-robin</code></dt>
+ <dd> The cpus are distributed over the
+          <a href="#elementsNUMAtopology"><code>numa</code></a>
+ cells matching the host CPU topology.
+ </dd>
+ <dt><code>interleave</code></dt>
+ <dd> The cpus are interleaved one at a time over the
+          <a href="#elementsNUMAtopology"><code>numa</code></a> cells.
+ </dd>
+ </dl>
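+
+        For example, an eight vCPU guest rendered over two cells places
+        vCPUs 0-3 in cell 0 and vCPUs 4-7 in cell 1 under
+        <code>contiguous</code>, whereas <code>interleave</code> places
+        vCPUs 0,2,4,6 in cell 0 and vCPUs 1,3,5,7 in cell 1.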
+ </dd>
+
+ <dt><code>memory</code></dt>
+ <dd>
+        The optional subelement <code>memory</code> specifies the
+        memory size reserved for the guest assigned
+        <a href="#elementsNUMAtopology"><code>numa</code></a> cells.
+        <span class="since">Since 1.2.11</span>, one can use an
+        additional <a href="#elementsMemoryAllocation"><code>unit</code></a>
+        attribute to define the units in which this <code>memory</code>
+        size is quantified. If no <code>memory</code> is specified, the
+        domain <a href="#elementsMemoryAllocation">memory</a> setting is
+        used as the value for this
+        <a href="#elementsvNUMAPartitioning"><code>vnuma</code></a>
+        subelement.
+ <span class="since">Since 5.9</span>
+ </dd>
+
+ <dt><code>partition</code></dt>
+ <dd>
+        The optional subelement <code>partition</code> is only effective
+        when <a href="#elementsvNUMAPartitioning"><code>vnuma</code></a>
+        <code>mode</code> "node" is selected, and defines the
+        "nodeset" and "cells" to target for the guest domain.
+        For example, the specified <code>nodeset</code> can limit the
+        <a href="#elementsNUMATuning"><code>numatune</code></a> assigned
+        host NUMA nodes in effect under the guest. Alternatively,
+        the provided <code>cells</code> attribute can define the number
+        of <a href="#elementsNUMAtopology"><code>numa</code></a> cells
+        to render.
+
+ <span class="since">Since 5.9</span>
+ </dd>
+ </dl>
+
+
    <h3><a id="elementsNUMATuning">NUMA Node Tuning</a></h3>
<pre>
@@ -1755,6 +1847,8 @@
</dd>
</dl>
+    <h3><a id="elementsNUMAtopology">NUMA topology</a></h3>
+
<p>
      Guest NUMA topology can be specified using the <code>numa</code> element.
<span class="since">Since 0.9.8</span>
diff --git a/docs/schemas/domaincommon.rng b/docs/schemas/domaincommon.rng
index e06f892da393..227c856a362c 100644
--- a/docs/schemas/domaincommon.rng
+++ b/docs/schemas/domaincommon.rng
@@ -786,6 +786,10 @@
<ref name="cputune"/>
</optional>
+ <optional>
+ <ref name="vnuma"/>
+ </optional>
+
<optional>
<ref name="numatune"/>
</optional>
@@ -1062,6 +1066,67 @@
</choice>
</define>
+ <!-- All the "host vnuma" related tunables would go in the vnuma -->
+ <define name="vnuma">
+ <element name="vnuma">
+ <optional>
+ <ref name="vnumaMode"/>
+ </optional>
+ <optional>
+ <ref name="vnumaDistribution"/>
+ </optional>
+ <interleave>
+ <optional>
+ <element name="memory">
+ <ref name="scaledInteger"/>
+ </element>
+ </optional>
+ <optional>
+ <element name="partition">
+ <optional>
+ <ref name="vnumaNodeset"/>
+ </optional>
+ <optional>
+ <ref name="vnumaCells"/>
+ </optional>
+ </element>
+ </optional>
+ </interleave>
+ </element>
+ </define>
+
+ <define name="vnumaMode">
+ <attribute name="mode">
+ <choice>
+ <value>host</value>
+ <value>node</value>
+ </choice>
+ </attribute>
+ </define>
+
+ <define name="vnumaDistribution">
+ <attribute name="distribution">
+ <choice>
+ <value>contiguous</value>
+ <value>siblings</value>
+ <value>round-robin</value>
+ <value>interleave</value>
+ </choice>
+ </attribute>
+ </define>
+
+ <define name="vnumaNodeset">
+ <attribute name='nodeset'>
+ <ref name='cpuset'/>
+ </attribute>
+ </define>
+
+ <define name="vnumaCells">
+ <attribute name='cells'>
+ <ref name="positiveInteger"/>
+ </attribute>
+ </define>
+
<!-- All the NUMA related tunables would go in the numatune -->
<define name="numatune">
<element name="numatune">
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
index 317e7846ceb0..32b29740bffd 100644
--- a/src/conf/domain_conf.c
+++ b/src/conf/domain_conf.c
@@ -1824,6 +1824,18 @@ virDomainDefSetVcpusMax(virDomainDefPtr def,
if (def->maxvcpus == maxvcpus)
return 0;
+ if (virDomainVnumaIsEnabled(def->numa)) {
+ size_t nnumaCell = virDomainNumaGetNodeCount(def->numa);
+
+ if (maxvcpus % nnumaCell) {
+ virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                           _("vNUMA: the maximum vCPU count %u is not a "
+                             "multiple of the configured vNUMA node count %zu"),
+                           maxvcpus, nnumaCell);
+ return -1;
+ }
+ }
+
if (def->maxvcpus < maxvcpus) {
        if (VIR_EXPAND_N(def->vcpus, def->maxvcpus, maxvcpus - def->maxvcpus) < 0)
return -1;
@@ -2067,6 +2079,394 @@ virDomainDefGetVcpusTopology(const virDomainDef *def,
}
+void
+virDomainDefSetVcpusVnuma(virDomainDefPtr def,
+ size_t nvcpus)
+{
+ int vcpuscnt = nvcpus;
+ size_t cell, i;
+ size_t vcpu_node;
+ size_t nnumaCell = virDomainNumaGetNodeCount(def->numa);
+
+ if (!nnumaCell)
+ return;
+
+ /* vcpu_node represents the maximum vcpus per vNUMA
+ * node that theoretically could be within a set.
+ */
+    vcpu_node = (def->maxvcpus / nnumaCell) +
+                ((def->maxvcpus % nnumaCell) ? 1 : 0);
+
+ for (i = 0; i < vcpu_node; i++) {
+ for (cell = 0; cell < nnumaCell; cell++) {
+ virDomainVcpuDefPtr vcpu;
+ size_t cid = cell * vcpu_node + i;
+
+ if (cid >= def->maxvcpus)
+ break;
+
+ vcpu = def->vcpus[cid];
+
+ if (vcpuscnt-- > 0)
+ vcpu->online = true;
+ else
+ vcpu->online = false;
+
+ /* vCPU0 cannot be hotplugged */
+ if (cid)
+ vcpu->hotpluggable = true;
+ }
+ }
+ def->individualvcpus = true;
+
+ return;
+}
+
+
+/**
+ * virDomainNumaAutoconfig: vNUMA automatic host partition processing
+ * @def: domain definition
+ * @caps: host capabilities
+ *
+ * vNUMA automatic host partitioning is requested by adding the <vnuma
+ * mode=...> element to the guest XML. See virDomainVnumaParseXML() for
+ * parsing the related XML and filling the virDomainAutoPartition structure.
+ *
+ * If the virDomainAutoPartition structure is valid, libvirt takes into
+ * account the host hardware configuration (including maxvcpus, online
+ * vcpus, and memory) and creates the guest such that vcpus and memory
+ * are spread evenly across the host.
+ *
+ * Returns 0 on success and -1 on error.
+ */
+static int
+virDomainNumaAutoconfig(virDomainDefPtr def,
+ virCapsPtr caps)
+{
+ int ret = -1;
+ virBitmapPtr nodeset = NULL;
+ virDomainNumaPtr numa = def->numa;
+ virDomainAutoPartitionPtr avnuma;
+
+ if (!numa)
+ goto error;
+
+ if (caps &&
+ (avnuma = virDomainVnumaParseXML(numa, NULL))) {
+
+ size_t i, j, cell;
+ size_t nvcpus = 0;
+ size_t nnumaCell = 0;
+ size_t vcpu_node;
+ unsigned long long memsizeCell = 0;
+ virCapsHostPtr host = &caps->host;
+ unsigned int threads = host->cpu->threads;
+
+ if (!def->cpu) {
+ virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                           _("vNUMA: unable to render <vnuma> partitioning for "
+                             "domain %s because of undefined <cpu ... /> topology."),
+ def->name);
+ goto error;
+ }
+
+ if (!avnuma->nodeset) {
+ if (!(avnuma->nodeset = virBitmapNew(host->nnumaCell)))
+                goto error;
+
+ for (i = 0; i < host->nnumaCell; i++)
+ if (virBitmapSetBit(avnuma->nodeset, i) < 0)
+                    goto error;
+ }
+
+ /* Set the vNUMA cell count */
+        nnumaCell = avnuma->vcell ? avnuma->vcell :
+                                    virBitmapCountBits(avnuma->nodeset);
+
+ if (!nnumaCell)
+ goto cleanup;
+
+ /* Compute the online vcpus */
+ for (i = 0; i < def->maxvcpus; i++)
+ if (def->vcpus[i]->online)
+ nvcpus++;
+
+ /* vcpu_node represents the maximum vcpus per numanode that
+ * theoretically could be within a set.
+ */
+        vcpu_node = (def->maxvcpus / nnumaCell) +
+                    ((def->maxvcpus % nnumaCell) ? 1 : 0);
+
+ /* Do the host provided "CPU topology" threads fit */
+ threads = (nnumaCell % threads) ? 1 : threads;
+
+ /* Is it possible to render the guest for vNUMA auto partition? */
+ if ((def->maxvcpus % nnumaCell) ||
+ (def->maxvcpus < (nnumaCell * threads))) {
+ virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                           _("vNUMA: %zu vcpus are insufficient to "
+                             "arrange a vNUMA topology for %zu nodes."),
+                           def->maxvcpus, nnumaCell);
+ goto error;
+ }
+
+ /* Compute the memory size (memsizeCell) per arranged nnumaCell.
+ * If no memory for vNUMA auto partitioning was specified then
+ * compute its value from the total_memory configuration.
+ */
+ if ((memsizeCell = avnuma->mem / nnumaCell) == 0) {
+ unsigned long long hotplugMemory = 0;
+
+ /* Calculate the size of hotplug memory */
+ for (i = 0; i < def->nmems; i++)
+ hotplugMemory += def->mems[i]->size;
+
+ memsizeCell = (def->mem.total_memory - hotplugMemory) / nnumaCell;
+ }
+
+ /* Under vNUMA automatic host partitioning the 'memballoon' controlled
+ * cur_balloon value should reflect the guest's total_memory setting.
+ */
+ def->mem.cur_balloon = def->mem.total_memory;
+
+ /* Correct vNUMA can only be accomplished if the number of maxvcpus
+ * is a multiple of the number of physical nodes. If this is not
+ * possible we set sockets, cores and threads to 0 so libvirt creates
+ * a default topology where all vcpus appear as sockets and cores and
+ * threads are set to 1.
+ */
+ if (def->maxvcpus % (nnumaCell * threads)) {
+            VIR_WARN("Disabling guest %s auto vNUMA topology because the "
+                     "configured %zu vCPUs do not match the host's %zu NUMA "
+                     "nodes to produce an evenly balanced CPU topology.",
+                     def->name, def->maxvcpus, nnumaCell);
+            def->cpu->sockets = def->cpu->cores = def->cpu->threads = 0;
+ } else {
+ /* Below computed topology aims to align the guest's sockets,
+ * cores and threads with the host's topology.
+ */
+ def->cpu->cores = def->maxvcpus / (nnumaCell * threads);
+ def->cpu->threads = threads;
+ def->cpu->sockets = nnumaCell;
+ }
+
+ /* Build the vNUMA topology. The previous configuration may
+ * have changed entirely, so free the current NUMA allocation
+ * and start over from scratch.
+ */
+ virDomainNumaFree(numa);
+ if (!(numa = virDomainNumaNew()))
+ goto error;
+
+ /* We're clean and good to rebuild the entire guest domain
+     * respecting the requested vNUMA topology provided by <vnuma>
+ * avnuma stored objects.
+ */
+ avnuma->mem = memsizeCell * nnumaCell;
+
+ if (!virDomainNumaSetNodeCount(numa, nnumaCell))
+ goto error;
+
+ if (!(nodeset = virBitmapNewCopy(avnuma->nodeset)))
+ goto error;
+
+ for (cell = 0; cell < nnumaCell; cell++) {
+ size_t ndistances;
+ size_t vcell = cell % host->nnumaCell;
+ size_t vcpu_strt, vcpu_last, vcpu_left;
+ ssize_t node = 0;
+ unsigned int cores = def->cpu->cores;
+ virBitmapPtr cpumask = NULL;
+ virBitmapPtr vnumask = NULL;
+ virCapsHostNUMACell *numaCell = NULL;
+
+ /* per NUMA cell memory size */
+ virDomainNumaSetNodeMemorySize(numa, cell, memsizeCell);
+
+ /* per NUMA cell bind memory (mode='strict') */
+ if ((node = virBitmapNextSetBit(nodeset, (vcell-1))) < 0)
+ node = vcell - 1;
+
+ if (node >= host->nnumaCell) {
+ virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                               _("vNUMA: domain %s defined nodeset node %zd "
+                                 "is out of range. Valid range is 0-%zu"),
+                               def->name, node, (host->nnumaCell - 1));
+ goto error;
+ }
+
+ if (virDomainNumatuneSetmemset(numa, cell, node,
+ VIR_DOMAIN_NUMATUNE_MEM_STRICT) < 0)
+ goto error;
+
+ /* per NUMA cell vcpu range to mask */
+ if (!(cpumask = virBitmapNew(def->maxvcpus)))
+ goto error;
+
+ switch (avnuma->distribution) {
+ case VIR_DOMAIN_VNUMA_DISTRIBUTION_CONTIGUOUS:
+ /* vcpus are equally balanced from 0 to highest vcpu id
+ * available, keeping ranges contiguous where the maximum vcpu
+ * sets run from lowest vNUMA cells to highest available.
+ */
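+            /* e.g. 8 vCPUs over 2 cells: cell 0 gets vCPUs 0-3,
+             * cell 1 gets vCPUs 4-7. */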
+ vcpu_strt = cell * vcpu_node;
+ vcpu_last = MIN(vcpu_strt + vcpu_node, def->maxvcpus);
+
+ for (i = vcpu_strt; i < vcpu_last; i++) {
+ if (virBitmapSetBitExpand(cpumask, i) < 0) {
+ virBitmapFree(cpumask);
+ goto error;
+ }
+ }
+ break;
+
+ case VIR_DOMAIN_VNUMA_DISTRIBUTION_SIBLINGS:
+ /* Create vNUMA node vcpu ranges that represent a clean
+ * processor sockets/core/threads model, placing one
+ * socket per NUMA node.
+ */
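+            /* e.g. host threads=2, 16 vCPUs over 2 cells (cores=4):
+             * cell 0 gets vCPUs 0-3 and 8-11, cell 1 gets 4-7 and 12-15. */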
+ vcpu_strt = cell * cores;
+ vcpu_last = def->maxvcpus;
+ vcpu_left = def->maxvcpus / threads;
+
+ for (i = vcpu_strt; i < vcpu_last; i += vcpu_left) {
+ for (j = 0; j < cores; j++) {
+ unsigned int id = i + j;
+
+ if (id < def->maxvcpus &&
+ virBitmapSetBitExpand(cpumask, id) < 0) {
+ virBitmapFree(cpumask);
+ goto error;
+ }
+ }
+ }
+ break;
+
+ case VIR_DOMAIN_VNUMA_DISTRIBUTION_ROUNDROBIN:
+ /* Create vNUMA node vcpu ranges that round-robin
+ * interleave one core per node over the available nodes.
+ */
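+            /* e.g. host threads=2, 16 vCPUs over 2 cells: cell 0 gets
+             * vCPUs 0,1,4,5,8,9,12,13; cell 1 gets 2,3,6,7,10,11,14,15. */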
+ vcpu_strt = cell * threads;
+ vcpu_last = def->maxvcpus;
+ vcpu_left = threads * nnumaCell;
+
+ for (i = vcpu_strt; i < vcpu_last; i += vcpu_left) {
+ for (j = 0; j < threads; j++) {
+ unsigned int id = i + j;
+
+ if (id < def->maxvcpus &&
+ virBitmapSetBitExpand(cpumask, id) < 0) {
+ virBitmapFree(cpumask);
+ goto error;
+ }
+ }
+ }
+ break;
+
+ case VIR_DOMAIN_VNUMA_DISTRIBUTION_INTERLEAVE:
+ /* Distribute vCPUs over the NUMA nodes in a round-robin,
+ * interleaved fashion, with one vCPU (thread) per node.
+ */
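+            /* e.g. 8 vCPUs over 2 cells: cell 0 gets vCPUs 0,2,4,6,
+             * cell 1 gets vCPUs 1,3,5,7. */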
+            def->cpu->sockets = def->cpu->cores = def->cpu->threads = 0;
+ for (i = cell; i < def->maxvcpus; i += nnumaCell) {
+ if (virBitmapSetBitExpand(cpumask, i) < 0) {
+ virBitmapFree(cpumask);
+ goto error;
+ }
+ }
+ break;
+
+ default:
+            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                           _("vNUMA: domain %s requested a non-existent "
+                             "vCPU distribution."),
+                           def->name);
+ goto error;
+ break;
+ }
+
+ if (virDomainNumaSetNodeCpumask(numa, cell, cpumask) == NULL)
+ goto error;
+
+ /* per NUMA cpus sibling vNUMA pinning */
+ numaCell = host->numaCell[node];
+ if (!(vnumask = virBitmapNew(nnumaCell * numaCell->ncpus)))
+ goto error;
+
+ for (i = 0; i < numaCell->ncpus; i++) {
+ unsigned int id = numaCell->cpus[i].id;
+
+ if (virBitmapSetBitExpand(vnumask, id) < 0) {
+ virBitmapFree(vnumask);
+ goto error;
+ }
+ }
+
+ for (i = 0; i < def->maxvcpus; i++) {
+ if (virBitmapIsBitSet(cpumask, i)) {
+ if (!(def->vcpus[i]->cpumask = virBitmapNewCopy(vnumask)))
+ goto error;
+ }
+ }
+ virBitmapFree(vnumask);
+
+ /* per NUMA cell sibling distances */
+ numaCell = host->numaCell[node];
+ switch (avnuma->mode) {
+ case VIR_DOMAIN_VNUMA_MODE_HOST:
+ ndistances = numaCell->nsiblings;
+ break;
+
+ case VIR_DOMAIN_VNUMA_MODE_NODE:
+ ndistances = 1;
+ if (avnuma->vcell)
+ vcell = cell;
+ else
+ if (virBitmapClearBit(nodeset, node) < 0)
+ goto error;
+
+ break;
+
+ default:
+ goto error;
+ }
+
+ /* Set vNUMA distances */
+ if (ndistances > 1) {
+ if (virDomainNumaSetNodeDistanceCount(numa,
+ vcell,
+ ndistances) < 0) {
+ virReportError(VIR_ERR_INTERNAL_ERROR,
+                               _("vNUMA: domain %s failed to render a "
+                                 "matching vNUMA node distances set, defined "
+                                 "vNUMA nodes %zu built on %zu host nodes."),
+                               def->name, nnumaCell, ndistances);
+ goto error;
+ }
+
+ for (i = 0; i < ndistances; i++) {
+ unsigned int distance = numaCell->siblings[i].distance;
+
+                if (virDomainNumaSetNodeDistance(numa, cell, i, distance) !=
+                    distance)
+ goto error;
+ }
+ }
+ }
+
+ /* We're done - enable the vNUMA marker */
+ virDomainVnumaSetEnabled(numa, avnuma);
+
+    /* Adopt the newly created vNUMA description */
+ def->numa = numa;
+
+    /* Set the per vNUMA cell vCPU online and hotpluggable state */
+ virDomainDefSetVcpusVnuma(def, virDomainDefGetVcpus(def));
+ }
+ cleanup:
+
+ ret = 0;
+
+ error:
+ virBitmapFree(nodeset);
+ return ret;
+}
+
+
virDomainDiskDefPtr
virDomainDiskDefNew(virDomainXMLOptionPtr xmlopt)
{
@@ -10510,6 +10910,38 @@ virDomainDefSetMemoryTotal(virDomainDefPtr def,
}
+/**
+ * virDomainDefSetNUMAMemoryTotal:
+ * @def: domain definition
+ * @size: size to set
+ * @caps: host capabilities
+ *
+ * A frontend to set the total memory size in @def. If the guest's
+ * configured "total_memory" setting and the requested "size" differ,
+ * call virDomainNumaAutoconfig() to evenly distribute the additional
+ * memory across all vNUMA nodes.
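+ *
+ * Returns 0 on success, -1 on error.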
+ */
+int
+virDomainDefSetNUMAMemoryTotal(virDomainDefPtr def,
+ unsigned long long size,
+ virCapsPtr caps)
+{
+ bool DoNumaAutoConfig = (def->mem.total_memory != size);
+
+ if (DoNumaAutoConfig) {
+ if (virDomainVnumaSetMemory(def->numa, size) < 0)
+ return -1;
+
+ if (virDomainNumaAutoconfig(def, caps))
+ return -1;
+
+ if (virDomainDefPostParseMemory(def, VIR_DOMAIN_DEF_PARSE_ABI_UPDATE) < 0)
+ return -1;
+ }
+ return 0;
+}
+
+
/**
* virDomainDefGetMemoryTotal:
* @def: domain definition
@@ -18809,7 +19241,8 @@ virDomainIOThreadSchedParse(xmlNodePtr node,
static int
virDomainVcpuParse(virDomainDefPtr def,
xmlXPathContextPtr ctxt,
- virDomainXMLOptionPtr xmlopt)
+ virDomainXMLOptionPtr xmlopt,
+ bool IsAvNUMA)
{
int n;
xmlNodePtr vcpuNode;
@@ -18876,6 +19309,15 @@ virDomainVcpuParse(virDomainDefPtr def,
if (virDomainDefSetVcpusMax(def, maxvcpus, xmlopt) < 0)
return -1;
+    /* If vNUMA applies, def->numa is reinitialized later */
+ if (IsAvNUMA) {
+
+ if (virDomainDefSetVcpus(def, vcpus) < 0)
+ return -1;
+
+ return 0;
+ }
+
if ((n = virXPathNodeSet("./vcpus/vcpu", ctxt, &nodes)) < 0)
return -1;
@@ -19746,6 +20188,7 @@ virDomainDefParseXML(xmlDocPtr xml,
char *netprefix = NULL;
g_autofree xmlNodePtr *nodes = NULL;
g_autofree char *tmp = NULL;
+ bool IsAvNUMA;
if (flags & VIR_DOMAIN_DEF_PARSE_VALIDATE_SCHEMA) {
g_autofree char *schema = NULL;
@@ -19871,6 +20314,8 @@ virDomainDefParseXML(xmlDocPtr xml,
}
VIR_FREE(tmp);
+ IsAvNUMA = virDomainVnumaParseXML(def->numa, ctxt) ? true : false;
+
tmp = virXPathString("string(./memoryBacking/source/@type)", ctxt);
if (tmp) {
if ((def->mem.source = virDomainMemorySourceTypeFromString(tmp)) <= 0) {
@@ -19986,7 +20431,7 @@ virDomainDefParseXML(xmlDocPtr xml,
&def->mem.swap_hard_limit) < 0)
goto error;
- if (virDomainVcpuParse(def, ctxt, xmlopt) < 0)
+ if (virDomainVcpuParse(def, ctxt, xmlopt, IsAvNUMA) < 0)
goto error;
if (virDomainDefParseIOThreads(def, ctxt) < 0)
@@ -20059,14 +20504,16 @@ virDomainDefParseXML(xmlDocPtr xml,
goto error;
}
- if ((n = virXPathNodeSet("./cputune/vcpupin", ctxt, &nodes)) < 0)
- goto error;
-
- for (i = 0; i < n; i++) {
- if (virDomainVcpuPinDefParseXML(def, nodes[i]))
+ if (!IsAvNUMA) {
+        if ((n = virXPathNodeSet("./cputune/vcpupin", ctxt, &nodes)) < 0)
goto error;
+
+ for (i = 0; i < n; i++) {
+ if (virDomainVcpuPinDefParseXML(def, nodes[i]))
+ goto error;
+ }
+ VIR_FREE(nodes);
}
- VIR_FREE(nodes);
    if ((n = virXPathNodeSet("./cputune/emulatorpin", ctxt, &nodes)) < 0) {
virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
@@ -20173,6 +20620,10 @@ virDomainDefParseXML(xmlDocPtr xml,
if (virDomainNumaDefCPUParseXML(def->numa, ctxt) < 0)
goto error;
+ /* Check and update the guest's XML vNUMA topology if needed */
+ if (virDomainNumaAutoconfig(def, caps))
+ goto error;
+
if (virDomainNumaGetCPUCountTotal(def->numa) > virDomainDefGetVcpusMax(def)) {
virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
_("Number of CPUs in <numa> exceeds the"
@@ -20186,10 +20637,11 @@ virDomainDefParseXML(xmlDocPtr xml,
goto error;
}
- if (virDomainNumatuneParseXML(def->numa,
- def->placement_mode ==
- VIR_DOMAIN_CPU_PLACEMENT_MODE_STATIC,
- ctxt) < 0)
+ if (!virDomainVnumaIsEnabled(def->numa) &&
+ (virDomainNumatuneParseXML(def->numa,
+ def->placement_mode ==
+ VIR_DOMAIN_CPU_PLACEMENT_MODE_STATIC,
+ ctxt) < 0))
goto error;
if (virDomainNumatuneHasPlacementAuto(def->numa) &&
@@ -28496,6 +28948,9 @@ virDomainDefFormatInternalSetRootName(virDomainDefPtr def,
if (virDomainMemtuneFormat(buf, &def->mem) < 0)
goto error;
+ if (virDomainVnumaFormatXML(buf, def->numa) < 0)
+ goto error;
+
if (virDomainCpuDefFormat(buf, def) < 0)
goto error;
@@ -29148,6 +29603,9 @@ virDomainSaveConfig(const char *configDir,
{
g_autofree char *xml = NULL;
+ if (virDomainNumaAutoconfig(def, caps) < 0)
+ return -1;
+
if (!(xml = virDomainDefFormat(def, caps, VIR_DOMAIN_DEF_FORMAT_SECURE)))
return -1;
diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h
index 5a17acedf299..0db77d9247a1 100644
--- a/src/conf/domain_conf.h
+++ b/src/conf/domain_conf.h
@@ -2535,6 +2535,7 @@ struct _virDomainDef {
unsigned long long virDomainDefGetMemoryInitial(const virDomainDef *def);
void virDomainDefSetMemoryTotal(virDomainDefPtr def, unsigned long long size);
+int virDomainDefSetNUMAMemoryTotal(virDomainDefPtr def, unsigned long long size,
+                                   virCapsPtr caps);
unsigned long long virDomainDefGetMemoryTotal(const virDomainDef *def);
bool virDomainDefHasMemoryHotplug(const virDomainDef *def);
@@ -2816,6 +2817,7 @@ int virDomainDefSetVcpusMax(virDomainDefPtr def,
bool virDomainDefHasVcpusOffline(const virDomainDef *def);
unsigned int virDomainDefGetVcpusMax(const virDomainDef *def);
int virDomainDefSetVcpus(virDomainDefPtr def, unsigned int vcpus);
+void virDomainDefSetVcpusVnuma(virDomainDefPtr def, size_t vcpus);
unsigned int virDomainDefGetVcpus(const virDomainDef *def);
virBitmapPtr virDomainDefGetOnlineVcpumap(const virDomainDef *def);
virDomainVcpuDefPtr virDomainDefGetVcpu(virDomainDefPtr def, unsigned int vcpu)
diff --git a/src/conf/numa_conf.c b/src/conf/numa_conf.c
index 6720d5620d1d..8e6ef4008b8d 100644
--- a/src/conf/numa_conf.c
+++ b/src/conf/numa_conf.c
@@ -45,6 +45,20 @@ VIR_ENUM_IMPL(virDomainNumatuneMemMode,
"interleave",
);
+VIR_ENUM_IMPL(virDomainVnumaMode,
+ VIR_DOMAIN_VNUMA_MODE_LAST,
+ "host",
+ "node",
+);
+
+VIR_ENUM_IMPL(virDomainVnumaDistribution,
+ VIR_DOMAIN_VNUMA_DISTRIBUTION_LAST,
+ "contiguous",
+ "siblings",
+ "round-robin",
+ "interleave",
+);
+
VIR_ENUM_IMPL(virDomainNumatunePlacement,
VIR_DOMAIN_NUMATUNE_PLACEMENT_LAST,
"default",
@@ -90,6 +104,7 @@ struct _virDomainNuma {
size_t nmem_nodes;
/* Future NUMA tuning related stuff should go here. */
+ virDomainAutoPartitionPtr avnuma;
};
@@ -353,6 +368,156 @@ virDomainNumatuneFormatXML(virBufferPtr buf,
return 0;
}
+int
+virDomainVnumaFormatXML(virBufferPtr buf,
+ virDomainNumaPtr numa)
+{
+ char *nodeset = NULL;
+ if (numa && virDomainVnumaIsEnabled(numa)) {
+
+ virBufferAddLit(buf, "<vnuma");
+ virBufferAsprintf(buf, " mode='%s'",
+ virDomainVnumaModeTypeToString(numa->avnuma->mode));
+        virBufferAsprintf(buf, " distribution='%s'",
+                          virDomainVnumaDistributionTypeToString(numa->avnuma->distribution));
+ virBufferAddLit(buf, ">\n");
+
+ virBufferAdjustIndent(buf, 2);
+        virBufferAsprintf(buf, "<memory unit='KiB'>%llu</memory>\n",
+ numa->avnuma->mem);
+
+
+ if (numa->avnuma->mode == VIR_DOMAIN_VNUMA_MODE_NODE) {
+ if ((nodeset = virBitmapFormat(numa->avnuma->nodeset))) {
+                virBufferAsprintf(buf, "<partition nodeset='%s'", nodeset);
+ VIR_FREE(nodeset);
+ }
+
+ if (numa->avnuma->vcell)
+                virBufferAsprintf(buf, " cells='%u'", numa->avnuma->vcell);
+ virBufferAddLit(buf, "/>\n");
+ }
+ virBufferAdjustIndent(buf, -2);
+
+ virBufferAddLit(buf, "</vnuma>\n");
+ }
+
+ return 0;
+}
+
+virDomainAutoPartitionPtr
+virDomainVnumaParseXML(virDomainNumaPtr numa,
+ xmlXPathContextPtr ctxt)
+{
+ int ret = -1;
+ char *tmp = NULL;
+ xmlNodePtr node, oldnode;
+ virDomainAutoPartitionPtr avnuma = NULL;
+
+ if (!numa)
+ return NULL;
+
+ if (!ctxt)
+        return numa->avnuma;
+
+ oldnode = ctxt->node;
+ node = virXPathNode("./vnuma[1]", ctxt);
+ if (node) {
+ int mode = -1;
+ int distribution = VIR_DOMAIN_VNUMA_DISTRIBUTION_CONTIGUOUS;
+ unsigned int maxvcell = 0;
+ unsigned long long mem = 0L;
+ virBitmapPtr nodeset = NULL;
+
+ if (!virXMLNodeNameEqual(node, "vnuma")) {
+ virReportError(VIR_ERR_XML_ERROR, "%s",
+                           _("domain definition does not contain expected "
+                             "'vnuma' element"));
+ goto cleanup;
+ }
+
+ if (VIR_ALLOC(avnuma) < 0)
+ goto cleanup;
+
+ /* There has to be a valid vnuma mode setting */
+ if (!(tmp = virXMLPropString(node, "mode"))) {
+ virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                           _("No vNUMA 'mode' specified for automatic "
+                             "host partitioning"));
+ goto cleanup;
+ }
+
+ if ((mode = virDomainVnumaModeTypeFromString(tmp)) < 0) {
+ virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                           _("Unsupported automatic vNUMA partitioning "
+                             "mode '%s'"), tmp);
+ goto cleanup;
+ }
+ VIR_FREE(tmp);
+
+ /* If specified get the vcpu 'distribution' type */
+ if ((tmp = virXMLPropString(node, "distribution")) &&
+ (distribution = virDomainVnumaDistributionTypeFromString(tmp)) < 0) {
+ virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                           _("Unsupported automatic vNUMA partitioning "
+                             "distribution '%s'"), tmp);
+ goto cleanup;
+ }
+ VIR_FREE(tmp);
+
+ /* Obtain the designated <vnuma mode='node' attributes */
+ ctxt->node = node;
+ switch (mode) {
+ case VIR_DOMAIN_VNUMA_MODE_NODE:
+ if ((node = virXPathNode("./partition[1]", ctxt))) {
+
+                /* Get the host <partition> nodeset='#nodeset' for <numatune> */
+ if ((tmp = virXMLPropString(node, "nodeset"))) {
+                    if (virBitmapParse(tmp, &nodeset, VIR_DOMAIN_CPUMASK_LEN) < 0)
+ goto cleanup;
+ VIR_FREE(tmp);
+ }
+
+                /* Get the fictitious <partition> cells='#count' attribute */
+ if ((tmp = virXMLPropString(node, "cells"))) {
+ if (virStrToLong_ui(tmp, NULL, 10, &maxvcell) < 0) {
+ virReportError(VIR_ERR_XML_ERROR, "%s",
+                                       _("maximum vcpus count must be an integer"));
+ goto cleanup;
+ }
+ VIR_FREE(tmp);
+ }
+ }
+ break;
+
+ case VIR_DOMAIN_VNUMA_MODE_HOST:
+ default:
+ break;
+ }
+
+ /* Get the <memory> size to render the <numa> nodes with */
+ if (virDomainParseMemory("./memory[1]", NULL, ctxt,
+ &mem, false, true) < 0)
+ goto cleanup;
+
+ /* We're set and good to go */
+ avnuma->mode = mode;
+ avnuma->distribution = distribution;
+ avnuma->nodeset = nodeset;
+ avnuma->mem = mem;
+ avnuma->vcell = maxvcell;
+
+ numa->avnuma = avnuma;
+ }
+ ret = 0;
+
+ cleanup:
+ if (ret) {
+ VIR_FREE(tmp);
+        VIR_FREE(avnuma);
+ }
+ ctxt->node = oldnode;
+
+ return avnuma;
+}
+
void
virDomainNumaFree(virDomainNumaPtr numa)
{
@@ -572,6 +737,76 @@ virDomainNumatuneSet(virDomainNumaPtr numa,
return ret;
}
+int
+virDomainNumatuneSetmemset(virDomainNumaPtr numa,
+ size_t cell,
+ size_t node,
+ int mode)
+{
+ int ret = -1;
+ virDomainNumaNodePtr mem_node = &numa->mem_nodes[cell];
+
+ /* Get out if this is under control of numad! */
+ if (numa->memory.specified)
+ goto cleanup;
+
+ /* Get out if numa does not apply */
+    if (cell >= numa->nmem_nodes)
+ goto cleanup;
+
+ /* Get out if mode is out of range */
+ if (mode < 0 || mode >= VIR_DOMAIN_NUMATUNE_MEM_LAST) {
+ virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+ _("Unsupported numatune mode '%d'"),
+ mode);
+ goto cleanup;
+ }
+
+ /* Force the numatune/memset setting */
+ if (!(mem_node->nodeset = virBitmapNew(numa->nmem_nodes)) ||
+ virBitmapSetBitExpand(mem_node->nodeset, node) < 0) {
+ virBitmapFree(mem_node->nodeset);
+ goto cleanup;
+ }
+ mem_node->mode = mode;
+
+ ret = 0;
+
+ cleanup:
+ return ret;
+}
+
+bool
+virDomainVnumaIsEnabled(virDomainNumaPtr numa)
+{
+ if (numa && numa->avnuma)
+ return numa->avnuma->specified;
+
+ return false;
+}
+
+void
+virDomainVnumaSetEnabled(virDomainNumaPtr numa,
+ virDomainAutoPartitionPtr avnuma)
+{
+ if (numa && avnuma) {
+ numa->avnuma = avnuma;
+ numa->avnuma->specified = true;
+ }
+}
+
+int
+virDomainVnumaSetMemory(virDomainNumaPtr numa,
+ unsigned long long size)
+{
+    if (!numa || !numa->avnuma)
+        return -1;
+
+ numa->avnuma->mem = size;
+
+ return 0;
+}
+
static bool
virDomainNumaNodesEqual(virDomainNumaPtr n1,
virDomainNumaPtr n2)
@@ -1273,7 +1508,7 @@ virDomainNumaSetNodeDistance(virDomainNumaPtr numa,
}
-size_t
+int
virDomainNumaSetNodeDistanceCount(virDomainNumaPtr numa,
size_t node,
size_t ndistances)
@@ -1285,11 +1520,11 @@ virDomainNumaSetNodeDistanceCount(virDomainNumaPtr numa,
virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Cannot alter an existing nmem_nodes distances set for node: %zu"),
node);
- return 0;
+ return -1;
}
if (VIR_ALLOC_N(distances, ndistances) < 0)
- return 0;
+ return -1;
numa->mem_nodes[node].distances = distances;
numa->mem_nodes[node].ndistances = ndistances;
diff --git a/src/conf/numa_conf.h b/src/conf/numa_conf.h
index e76a09c20cdc..bdc1deb6e143 100644
--- a/src/conf/numa_conf.h
+++ b/src/conf/numa_conf.h
@@ -32,6 +32,9 @@
typedef struct _virDomainNuma virDomainNuma;
typedef virDomainNuma *virDomainNumaPtr;
+typedef struct _virDomainAutoPartition virDomainAutoPartition;
+typedef virDomainAutoPartition *virDomainAutoPartitionPtr;
+
typedef enum {
VIR_DOMAIN_NUMATUNE_PLACEMENT_DEFAULT = 0,
VIR_DOMAIN_NUMATUNE_PLACEMENT_STATIC,
@@ -43,6 +46,24 @@ typedef enum {
VIR_ENUM_DECL(virDomainNumatunePlacement);
VIR_ENUM_DECL(virDomainNumatuneMemMode);
+typedef enum {
+ VIR_DOMAIN_VNUMA_MODE_HOST = 0,
+ VIR_DOMAIN_VNUMA_MODE_NODE,
+
+ VIR_DOMAIN_VNUMA_MODE_LAST
+} virDomainVnumaMode;
+VIR_ENUM_DECL(virDomainVnumaMode);
+
+typedef enum {
+ VIR_DOMAIN_VNUMA_DISTRIBUTION_CONTIGUOUS = 0,
+ VIR_DOMAIN_VNUMA_DISTRIBUTION_SIBLINGS,
+ VIR_DOMAIN_VNUMA_DISTRIBUTION_ROUNDROBIN,
+ VIR_DOMAIN_VNUMA_DISTRIBUTION_INTERLEAVE,
+
+ VIR_DOMAIN_VNUMA_DISTRIBUTION_LAST
+} virDomainVnumaDistribution;
+VIR_ENUM_DECL(virDomainVnumaDistribution);
+
typedef enum {
VIR_DOMAIN_MEMORY_ACCESS_DEFAULT = 0, /* No memory access defined */
VIR_DOMAIN_MEMORY_ACCESS_SHARED, /* Memory access is set as shared */
@@ -52,6 +73,14 @@ typedef enum {
} virDomainMemoryAccess;
VIR_ENUM_DECL(virDomainMemoryAccess);
+struct _virDomainAutoPartition {
+ bool specified; /* Auto vNUMA active */
+ int mode; /* Auto vNUMA mode */
+ int distribution; /* Auto vNUMA distribution */
+ unsigned long long mem; /* Auto vNUMA total memory */
+ unsigned int vcell; /* Auto vNUMA node Cell */
+ virBitmapPtr nodeset; /* Auto vNUMA host nodes where this guest node resides */
+};
virDomainNumaPtr virDomainNumaNew(void);
void virDomainNumaFree(virDomainNumaPtr numa);
@@ -67,9 +96,19 @@ int virDomainNumatuneParseXML(virDomainNumaPtr numa,
int virDomainNumatuneFormatXML(virBufferPtr buf, virDomainNumaPtr numatune)
ATTRIBUTE_NONNULL(1);
+virDomainAutoPartitionPtr virDomainVnumaParseXML(virDomainNumaPtr numa,
+ xmlXPathContextPtr ctxt)
+ ATTRIBUTE_NONNULL(1);
+
+int virDomainVnumaFormatXML(virBufferPtr buf, virDomainNumaPtr numa)
+ ATTRIBUTE_NONNULL(1);
+
/*
* Getters
*/
+bool virDomainVnumaIsEnabled(virDomainNumaPtr numa)
+ ATTRIBUTE_NONNULL(1);
+
int virDomainNumatuneGetMode(virDomainNumaPtr numatune,
int cellid,
virDomainNumatuneMemMode *mode);
@@ -134,6 +173,19 @@ int virDomainNumatuneSet(virDomainNumaPtr numa,
virBitmapPtr nodeset)
ATTRIBUTE_NONNULL(1);
+void virDomainVnumaSetEnabled(virDomainNumaPtr numa,
+ virDomainAutoPartitionPtr avnuma)
+ ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2);
+int virDomainVnumaSetMemory(virDomainNumaPtr numa,
+ unsigned long long size)
+ ATTRIBUTE_NONNULL(1);
+
+int virDomainNumatuneSetmemset(virDomainNumaPtr numa,
+ size_t cell,
+ size_t node,
+ int mode)
+ ATTRIBUTE_NONNULL(1);
+
size_t virDomainNumaSetNodeCount(virDomainNumaPtr numa,
size_t nmem_nodes)
ATTRIBUTE_NONNULL(1);
@@ -149,9 +201,9 @@ int virDomainNumaSetNodeDistance(virDomainNumaPtr numa,
unsigned int value)
ATTRIBUTE_NONNULL(1);
-size_t virDomainNumaSetNodeDistanceCount(virDomainNumaPtr numa,
- size_t node,
- size_t ndistances)
+int virDomainNumaSetNodeDistanceCount(virDomainNumaPtr numa,
+ size_t node,
+ size_t ndistances)
ATTRIBUTE_NONNULL(1);
virBitmapPtr virDomainNumaSetNodeCpumask(virDomainNumaPtr numa,
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
index 17977229d18f..7f7c3fdeafaa 100644
--- a/src/libvirt_private.syms
+++ b/src/libvirt_private.syms
@@ -311,8 +311,10 @@ virDomainDefParseNode;
virDomainDefParseString;
virDomainDefPostParse;
virDomainDefSetMemoryTotal;
+virDomainDefSetNUMAMemoryTotal;
virDomainDefSetVcpus;
virDomainDefSetVcpusMax;
+virDomainDefSetVcpusVnuma;
virDomainDefValidate;
virDomainDefVcpuOrderClear;
virDomainDeleteConfig;
@@ -828,7 +830,13 @@ virDomainNumatuneParseXML;
virDomainNumatunePlacementTypeFromString;
virDomainNumatunePlacementTypeToString;
virDomainNumatuneSet;
+virDomainNumatuneSetmemset;
virDomainNumatuneSpecifiedMaxNode;
+virDomainVnumaFormatXML;
+virDomainVnumaIsEnabled;
+virDomainVnumaParseXML;
+virDomainVnumaSetEnabled;
+virDomainVnumaSetMemory;
# conf/nwfilter_conf.h
--
2.21.0