https://bugzilla.redhat.com/show_bug.cgi?id=1480668
QEMU has this new feature memory-backend-file.discard-data=yes
which is a nifty optimization. Basically, when qemu is quitting
or on memory hotplug it calls munmap() and close() on the file
that is backing the memory. However, this does not mean kernel
won't stop touching that part of memory. It still might. With
this feature enabled we tell kernel: "we don't need this memory
nor data stored in it". This makes kernel drop the memory
immediately without trying to sync memory with the mapped file.
Unfortunately, this cannot be turned on by default because we
can't be sure when users really don't care about what happens to
data after qemu dies. So it has to be opt-in. As usual, there are
three places where one can configure memory attributes. This
patch adds the feature to all of them.
Signed-off-by: Michal Privoznik <mprivozn(a)redhat.com>
---
docs/formatdomain.html.in | 40 +++++++++++++++++++++++++---
docs/schemas/cputypes.rng | 5 ++++
docs/schemas/domaincommon.rng | 10 +++++++
src/conf/domain_conf.c | 39 +++++++++++++++++++++++++--
src/conf/domain_conf.h | 3 +++
src/conf/numa_conf.c | 27 +++++++++++++++++++
src/conf/numa_conf.h | 3 +++
src/libvirt_private.syms | 1 +
src/qemu/qemu_command.c | 27 ++++++++++++++++---
tests/qemuxml2argvdata/hugepages-pages7.args | 3 ++-
tests/qemuxml2argvdata/hugepages-pages7.xml | 4 +--
tests/qemuxml2argvtest.c | 3 ++-
12 files changed, 152 insertions(+), 13 deletions(-)
diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in
index 5e99884dc5..d40536d06f 100644
--- a/docs/formatdomain.html.in
+++ b/docs/formatdomain.html.in
@@ -1016,6 +1016,7 @@
<source type="file|anonymous"/>
<access mode="shared|private"/>
<allocation mode="immediate|ondemand"/>
+ <discard/>
</memoryBacking>
...
</domain>
@@ -1063,11 +1064,21 @@
suitable for the specific environment at the same time to mitigate
the risks described above. <span class="since">Since
1.0.6</span></dd>
<dt><code>source</code></dt>
- <dd>In this attribute you can switch to file memorybacking or keep default
anonymous.</dd>
+ <dd>In this attribute you can switch to file memorybacking or keep
+ default anonymous.</dd>
<dt><code>access</code></dt>
- <dd>Specify if memory is shared or private. This can be overridden per numa
node by <code>memAccess</code></dd>
+ <dd>Specify if memory is shared or private. This can be overridden per
+ numa node by <code>memAccess</code></dd>
<dt><code>allocation</code></dt>
<dd>Specify when allocate the memory</dd>
+ <dt><code>discard</code></dt>
+ <dd>When set an supported by hypervisor the memory
+ content is discarded just before guest shuts down (or
+ when DIMM module is unplugged). Please note that this is
+ just an optimization and is not guaranteed to work in
+ all cases (e.g. when hypervisor crashes).
+ <span class="since">Since 4.3.0</span> (QEMU/KVM only)
+ </dd>
</dl>
@@ -1606,7 +1617,7 @@
<cpu>
...
<numa>
- <cell id='0' cpus='0-3' memory='512000'
unit='KiB'/>
+ <cell id='0' cpus='0-3' memory='512000'
unit='KiB' discard='yes'/>
<cell id='1' cpus='4-7' memory='512000'
unit='KiB' memAccess='shared'/>
</numa>
...
@@ -1632,6 +1643,13 @@
<code>memAccess</code> can control whether the memory is to be
mapped as "shared" or "private". This is valid only for
hugepages-backed memory and nvdimm modules.
+
+ Each <code>cell</code> element can have an optional
+ attribute <code>discard</code> which fine tunes the discard
+ feature for given numa node as described under
+ <a href="#elementsMemoryBacking">Memory Backing</a>.
+ Accepted values are <code>yes</code> and <code>no</code>.
+ <span class='since'>Since 4.3.0</span>
</p>
<p>
@@ -7847,7 +7865,7 @@ qemu-kvm -net nic,model=? /dev/null
<pre>
...
<devices>
- <memory model='dimm' access='private'>
+ <memory model='dimm' access='private'
discard='yes'>
<target>
<size unit='KiB'>524287</size>
<node>0</node>
@@ -7901,6 +7919,20 @@ qemu-kvm -net nic,model=? /dev/null
</p>
</dd>
+ <dt><code>discard</code></dt>
+ <dd>
+ <p>
+ An optional attribute <code>discard</code>
+ (<span class="since">since 4.3.0</span>) that provides
+ capability to fine tune discard of data on per module
+ basis. Accepted values are <code>yes</code> and
+ <code>no</code>. The feature is described here:
+ <a href="#elementsMemoryBacking">Memory Backing</a>.
+ This attribute is allowed only for
+ <code>model='dimm'</code>.
+ </p>
+ </dd>
+
<dt><code>source</code></dt>
<dd>
<p>
diff --git a/docs/schemas/cputypes.rng b/docs/schemas/cputypes.rng
index c45b6dfb28..1f1e0e36d5 100644
--- a/docs/schemas/cputypes.rng
+++ b/docs/schemas/cputypes.rng
@@ -129,6 +129,11 @@
</choice>
</attribute>
</optional>
+ <optional>
+ <attribute name="discard">
+ <ref name="virYesNo"/>
+ </attribute>
+ </optional>
<optional>
<element name="distances">
<oneOrMore>
diff --git a/docs/schemas/domaincommon.rng b/docs/schemas/domaincommon.rng
index 4cab55f05d..9650a779b7 100644
--- a/docs/schemas/domaincommon.rng
+++ b/docs/schemas/domaincommon.rng
@@ -633,6 +633,11 @@
</attribute>
</element>
</optional>
+ <optional>
+ <element name="discard">
+ <empty/>
+ </element>
+ </optional>
</interleave>
</element>
</optional>
@@ -5138,6 +5143,11 @@
</choice>
</attribute>
</optional>
+ <optional>
+ <attribute name="discard">
+ <ref name="virYesNo"/>
+ </attribute>
+ </optional>
<interleave>
<optional>
<ref name="memorydev-source"/>
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
index 35666c1347..9585e38bc1 100644
--- a/src/conf/domain_conf.c
+++ b/src/conf/domain_conf.c
@@ -5508,6 +5508,20 @@ virDomainVideoDefValidate(const virDomainVideoDef *video)
}
+static int
+virDomainMemoryDefValidate(const virDomainMemoryDef *mem)
+{
+ if (mem->model == VIR_DOMAIN_MEMORY_MODEL_NVDIMM &&
+ mem->discard == VIR_TRISTATE_BOOL_YES) {
+ virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+ _("discard is not supported for nvdimms"));
+ return -1;
+ }
+
+ return 0;
+}
+
+
static int
virDomainDeviceDefValidateInternal(const virDomainDeviceDef *dev,
const virDomainDef *def)
@@ -5540,6 +5554,9 @@ virDomainDeviceDefValidateInternal(const virDomainDeviceDef *dev,
case VIR_DOMAIN_DEVICE_VIDEO:
return virDomainVideoDefValidate(dev->data.video);
+ case VIR_DOMAIN_DEVICE_MEMORY:
+ return virDomainMemoryDefValidate(dev->data.memory);
+
case VIR_DOMAIN_DEVICE_LEASE:
case VIR_DOMAIN_DEVICE_FS:
case VIR_DOMAIN_DEVICE_INPUT:
@@ -5552,7 +5569,6 @@ virDomainDeviceDefValidateInternal(const virDomainDeviceDef *dev,
case VIR_DOMAIN_DEVICE_SHMEM:
case VIR_DOMAIN_DEVICE_TPM:
case VIR_DOMAIN_DEVICE_PANIC:
- case VIR_DOMAIN_DEVICE_MEMORY:
case VIR_DOMAIN_DEVICE_IOMMU:
case VIR_DOMAIN_DEVICE_NONE:
case VIR_DOMAIN_DEVICE_LAST:
@@ -15613,6 +15629,16 @@ virDomainMemoryDefParseXML(virDomainXMLOptionPtr xmlopt,
}
VIR_FREE(tmp);
+ if ((tmp = virXMLPropString(memdevNode, "discard"))) {
+ if ((val = virTristateBoolTypeFromString(tmp)) <= 0) {
+ virReportError(VIR_ERR_XML_ERROR,
+ _("invalid discard value '%s'"), tmp);
+ goto error;
+ }
+
+ def->discard = val;
+ }
+
/* source */
if ((node = virXPathNode("./source", ctxt)) &&
virDomainMemorySourceDefParseXML(node, ctxt, def) < 0)
@@ -18939,6 +18965,9 @@ virDomainDefParseXML(xmlDocPtr xml,
if (virXPathBoolean("boolean(./memoryBacking/locked)", ctxt))
def->mem.locked = true;
+ if (virXPathBoolean("boolean(./memoryBacking/discard)", ctxt))
+ def->mem.discard = VIR_TRISTATE_BOOL_YES;
+
/* Extract blkio cgroup tunables */
if (virXPathUInt("string(./blkiotune/weight)", ctxt,
&def->blkio.weight) < 0)
@@ -25196,6 +25225,9 @@ virDomainMemoryDefFormat(virBufferPtr buf,
if (def->access)
virBufferAsprintf(buf, " access='%s'",
virDomainMemoryAccessTypeToString(def->access));
+ if (def->discard)
+ virBufferAsprintf(buf, " discard='%s'",
+ virTristateBoolTypeToString(def->discard));
virBufferAddLit(buf, ">\n");
virBufferAdjustIndent(buf, 2);
@@ -26658,7 +26690,8 @@ virDomainDefFormatInternal(virDomainDefPtr def,
}
if (def->mem.nhugepages || def->mem.nosharepages || def->mem.locked
- || def->mem.source || def->mem.access || def->mem.allocation)
+ || def->mem.source || def->mem.access || def->mem.allocation
+ || def->mem.discard)
{
virBufferAddLit(buf, "<memoryBacking>\n");
virBufferAdjustIndent(buf, 2);
@@ -26677,6 +26710,8 @@ virDomainDefFormatInternal(virDomainDefPtr def,
if (def->mem.allocation)
virBufferAsprintf(buf, "<allocation mode='%s'/>\n",
virDomainMemoryAllocationTypeToString(def->mem.allocation));
+ if (def->mem.discard)
+ virBufferAddLit(buf, "<discard/>\n");
virBufferAdjustIndent(buf, -2);
virBufferAddLit(buf, "</memoryBacking>\n");
diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h
index 3c7eccb8ca..52d29124f1 100644
--- a/src/conf/domain_conf.h
+++ b/src/conf/domain_conf.h
@@ -2107,6 +2107,7 @@ typedef enum {
struct _virDomainMemoryDef {
virDomainMemoryAccess access;
+ int discard; /* enum virTristateBool */
/* source */
virBitmapPtr sourceNodes;
@@ -2269,6 +2270,8 @@ struct _virDomainMemtune {
int source; /* enum virDomainMemorySource */
int access; /* enum virDomainMemoryAccess */
int allocation; /* enum virDomainMemoryAllocation */
+
+ int discard; /* enum virTristateBool */
};
typedef struct _virDomainPowerManagement virDomainPowerManagement;
diff --git a/src/conf/numa_conf.c b/src/conf/numa_conf.c
index 9307dd93d3..a1bbcfa945 100644
--- a/src/conf/numa_conf.c
+++ b/src/conf/numa_conf.c
@@ -77,6 +77,7 @@ struct _virDomainNuma {
virBitmapPtr nodeset; /* host memory nodes where this guest node resides */
virDomainNumatuneMemMode mode; /* memory mode selection */
virDomainMemoryAccess memAccess; /* shared memory access configuration */
+ int discard; /* discard-data for memory-backend-file, virTristateBool */
struct _virDomainNumaDistance {
unsigned int value; /* locality value for node i->j or j->i */
@@ -947,6 +948,18 @@ virDomainNumaDefCPUParseXML(virDomainNumaPtr def,
VIR_FREE(tmp);
}
+ if ((tmp = virXMLPropString(nodes[i], "discard"))) {
+ if ((rc = virTristateBoolTypeFromString(tmp)) <= 0) {
+ virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+ _("Invalid 'discard' attribute value
'%s'"),
+ tmp);
+ goto cleanup;
+ }
+
+ def->mem_nodes[cur_cell].discard = rc;
+ VIR_FREE(tmp);
+ }
+
/* Parse NUMA distances info */
if (virDomainNumaDefNodeDistanceParseXML(def, ctxt, cur_cell) < 0)
goto cleanup;
@@ -967,6 +980,7 @@ virDomainNumaDefCPUFormatXML(virBufferPtr buf,
virDomainNumaPtr def)
{
virDomainMemoryAccess memAccess;
+ int discard;
char *cpustr;
size_t ncells = virDomainNumaGetNodeCount(def);
size_t i;
@@ -980,6 +994,7 @@ virDomainNumaDefCPUFormatXML(virBufferPtr buf,
int ndistances;
memAccess = virDomainNumaGetNodeMemoryAccessMode(def, i);
+ discard = virDomainNumaGetNodeDiscard(def, i);
if (!(cpustr = virBitmapFormat(virDomainNumaGetNodeCpumask(def, i))))
return -1;
@@ -994,6 +1009,10 @@ virDomainNumaDefCPUFormatXML(virBufferPtr buf,
virBufferAsprintf(buf, " memAccess='%s'",
virDomainMemoryAccessTypeToString(memAccess));
+ if (discard)
+ virBufferAsprintf(buf, " discard='%s'",
+ virTristateBoolTypeToString(discard));
+
ndistances = def->mem_nodes[i].ndistances;
if (ndistances == 0) {
virBufferAddLit(buf, "/>\n");
@@ -1304,6 +1323,14 @@ virDomainNumaGetNodeMemoryAccessMode(virDomainNumaPtr numa,
}
+int
+virDomainNumaGetNodeDiscard(virDomainNumaPtr numa,
+ size_t node)
+{
+ return numa->mem_nodes[node].discard;
+}
+
+
unsigned long long
virDomainNumaGetNodeMemorySize(virDomainNumaPtr numa,
size_t node)
diff --git a/src/conf/numa_conf.h b/src/conf/numa_conf.h
index 7947fdb219..6d8f484f73 100644
--- a/src/conf/numa_conf.h
+++ b/src/conf/numa_conf.h
@@ -102,6 +102,9 @@ virBitmapPtr virDomainNumaGetNodeCpumask(virDomainNumaPtr numa,
virDomainMemoryAccess virDomainNumaGetNodeMemoryAccessMode(virDomainNumaPtr numa,
size_t node)
ATTRIBUTE_NONNULL(1);
+int virDomainNumaGetNodeDiscard(virDomainNumaPtr numa,
+ size_t node)
+ ATTRIBUTE_NONNULL(1);
unsigned long long virDomainNumaGetNodeMemorySize(virDomainNumaPtr numa,
size_t node)
ATTRIBUTE_NONNULL(1);
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
index b31f599bd2..d3d0495e42 100644
--- a/src/libvirt_private.syms
+++ b/src/libvirt_private.syms
@@ -750,6 +750,7 @@ virDomainNumaGetMaxCPUID;
virDomainNumaGetMemorySize;
virDomainNumaGetNodeCount;
virDomainNumaGetNodeCpumask;
+virDomainNumaGetNodeDiscard;
virDomainNumaGetNodeDistance;
virDomainNumaGetNodeMemoryAccessMode;
virDomainNumaGetNodeMemorySize;
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
index b666f3715f..4964c32aeb 100644
--- a/src/qemu/qemu_command.c
+++ b/src/qemu/qemu_command.c
@@ -3010,6 +3010,7 @@ qemuBuildMemoryBackendStr(virJSONValuePtr *backendProps,
unsigned long long pagesize = mem->pagesize;
bool needHugepage = !!pagesize;
bool useHugepage = !!pagesize;
+ int discard = mem->discard;
/* The difference between @needHugepage and @useHugepage is that the latter
* is true whenever huge page is defined for the current memory cell.
@@ -3020,8 +3021,7 @@ qemuBuildMemoryBackendStr(virJSONValuePtr *backendProps,
*backendProps = NULL;
*backendType = NULL;
- if (memAccess == VIR_DOMAIN_MEMORY_ACCESS_DEFAULT &&
- mem->targetNode >= 0) {
+ if (mem->targetNode >= 0) {
/* memory devices could provide a invalid guest node */
if (mem->targetNode >= virDomainNumaGetNodeCount(def->numa)) {
virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
@@ -3031,12 +3031,19 @@ qemuBuildMemoryBackendStr(virJSONValuePtr *backendProps,
return -1;
}
- memAccess = virDomainNumaGetNodeMemoryAccessMode(def->numa,
mem->targetNode);
+ if (memAccess == VIR_DOMAIN_MEMORY_ACCESS_DEFAULT)
+ memAccess = virDomainNumaGetNodeMemoryAccessMode(def->numa,
mem->targetNode);
+
+ if (discard == VIR_TRISTATE_BOOL_ABSENT)
+ discard = virDomainNumaGetNodeDiscard(def->numa, mem->targetNode);
}
if (memAccess == VIR_DOMAIN_MEMORY_ACCESS_DEFAULT)
memAccess = def->mem.access;
+ if (discard == VIR_TRISTATE_BOOL_ABSENT)
+ discard = def->mem.discard;
+
if (virDomainNumatuneGetMode(def->numa, mem->targetNode, &mode) < 0
&&
virDomainNumatuneGetMode(def->numa, -1, &mode) < 0)
mode = VIR_DOMAIN_NUMATUNE_MEM_STRICT;
@@ -3124,6 +3131,20 @@ qemuBuildMemoryBackendStr(virJSONValuePtr *backendProps,
NULL) < 0)
goto cleanup;
+ if (!mem->nvdimmPath &&
+ discard == VIR_TRISTATE_BOOL_YES) {
+ if (!virQEMUCapsGet(qemuCaps, QEMU_CAPS_OBJECT_MEMORY_FILE_DISCARD)) {
+ virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+ _("this QEMU doesn't support memory
discard"));
+ goto cleanup;
+ }
+
+ if (virJSONValueObjectAdd(props,
+ "B:discard-data", true,
+ NULL) < 0)
+ goto cleanup;
+ }
+
switch (memAccess) {
case VIR_DOMAIN_MEMORY_ACCESS_SHARED:
if (virJSONValueObjectAdd(props, "b:share", true, NULL) < 0)
diff --git a/tests/qemuxml2argvdata/hugepages-pages7.args
b/tests/qemuxml2argvdata/hugepages-pages7.args
index 1cb598d692..02a98026eb 100644
--- a/tests/qemuxml2argvdata/hugepages-pages7.args
+++ b/tests/qemuxml2argvdata/hugepages-pages7.args
@@ -18,7 +18,8 @@ mem-path=/dev/hugepages1G/libvirt/qemu/-1-fedora,size=1073741824,\
host-nodes=1-3,policy=bind \
-device pc-dimm,node=0,memdev=memdimm0,id=dimm0,slot=0 \
-object memory-backend-file,id=memdimm1,prealloc=yes,\
-mem-path=/dev/hugepages2M/libvirt/qemu/-1-fedora,share=no,size=536870912 \
+mem-path=/dev/hugepages2M/libvirt/qemu/-1-fedora,discard-data=yes,share=no,\
+size=536870912 \
-device pc-dimm,node=0,memdev=memdimm1,id=dimm1,slot=1 \
-uuid 63840878-0deb-4095-97e6-fc444d9bc9fa \
-display none \
diff --git a/tests/qemuxml2argvdata/hugepages-pages7.xml
b/tests/qemuxml2argvdata/hugepages-pages7.xml
index d75cf5afa3..28c72f85a7 100644
--- a/tests/qemuxml2argvdata/hugepages-pages7.xml
+++ b/tests/qemuxml2argvdata/hugepages-pages7.xml
@@ -43,7 +43,7 @@
<memballoon model='virtio'>
<address type='pci' domain='0x0000' bus='0x00'
slot='0x03' function='0x0'/>
</memballoon>
- <memory model='dimm'>
+ <memory model='dimm' discard='no'>
<source>
<nodemask>1-3</nodemask>
<pagesize unit='KiB'>1048576</pagesize>
@@ -54,7 +54,7 @@
</target>
<address type='dimm' slot='0'/>
</memory>
- <memory model='dimm' access='private'>
+ <memory model='dimm' access='private' discard='yes'>
<target>
<size unit='KiB'>524287</size>
<node>0</node>
diff --git a/tests/qemuxml2argvtest.c b/tests/qemuxml2argvtest.c
index 74d930ebe2..481c1ec8bc 100644
--- a/tests/qemuxml2argvtest.c
+++ b/tests/qemuxml2argvtest.c
@@ -951,7 +951,8 @@ mymain(void)
DO_TEST("hugepages-pages5", NONE);
DO_TEST("hugepages-pages6", NONE);
DO_TEST("hugepages-pages7",
- QEMU_CAPS_DEVICE_PC_DIMM, QEMU_CAPS_OBJECT_MEMORY_FILE);
+ QEMU_CAPS_DEVICE_PC_DIMM, QEMU_CAPS_OBJECT_MEMORY_FILE,
+ QEMU_CAPS_OBJECT_MEMORY_FILE_DISCARD);
DO_TEST("hugepages-memaccess", QEMU_CAPS_OBJECT_MEMORY_FILE,
QEMU_CAPS_OBJECT_MEMORY_RAM, QEMU_CAPS_DEVICE_PC_DIMM,
QEMU_CAPS_NUMA);
--
2.16.1