[PATCH 0/5] multiple memory backend support for CPR Live Updates

From: Michael Galaxy <mgalaxy@akamai.com> CPR-based support for whole-hypervisor kexec-based live updates is making progress on qemu-devel. In support of this, we need NUMA to be support in these kinds of environments. To do this we use a technology called PMEM (persistent memory), which underpin the ability for CPR Live Updates to work so that QEMU memory can remain in RAM and be recovered after the kexec has completed. Our systems are highly NUMA-aware, and so this patch series enables NUMA awareness for live update and also allows live migrations to work. Further, we make a small change that allows live migrations to work between *non* PMEM-based systems and PMEM-based systems (and vice-versa). This allows for seemless upgrades from non-live-compatible systems to live-update-compatible sytems without any downtime. Michael Galaxy (5): qemu.conf changes to support multiple memory backend directories Update cleanup routines to handle multiple memory backing paths instead of just one. Implement multiple memory backing paths Support live migration between file-backed memory and anonymous memory. Update unit test to support multiple memory backends src/qemu/qemu_command.c | 8 ++- src/qemu/qemu_conf.c | 140 ++++++++++++++++++++++++++++++++++++---- src/qemu/qemu_conf.h | 14 ++-- src/qemu/qemu_domain.c | 24 +++++-- src/qemu/qemu_driver.c | 31 +++++---- src/qemu/qemu_hotplug.c | 6 +- src/qemu/qemu_process.c | 45 +++++++------ src/qemu/qemu_process.h | 13 ++-- tests/testutilsqemu.c | 5 +- 9 files changed, 217 insertions(+), 69 deletions(-) -- 2.25.1

From: Michael Galaxy <mgalaxy@akamai.com> We start by introducing a backwards-compatible, comma-separated specification that will not break existing installations, such as in the following example: $ cat qemu.conf | grep memory_backing_dir memory_backing_dir = "/path/to/pmem/0,/path/to/pmem/1" In our case, we almost always have two NUMA nodes, so in that example, we have two PMEM regions which are created on the Linux kernel command line that get mounted into those two locations for libvirt to use. Later patches will recognize this configuration and activate it. Signed-off-by: Michael Galaxy <mgalaxy@akamai.com> --- src/qemu/qemu_conf.c | 39 +++++++++++++++++++++++++++++++++------ src/qemu/qemu_conf.h | 3 ++- 2 files changed, 35 insertions(+), 7 deletions(-) diff --git a/src/qemu/qemu_conf.c b/src/qemu/qemu_conf.c index 4050a82341..aae9f316d8 100644 --- a/src/qemu/qemu_conf.c +++ b/src/qemu/qemu_conf.c @@ -43,6 +43,7 @@ #include "virfile.h" #include "virstring.h" #include "virutil.h" +#include "virnuma.h" #include "configmake.h" #include "security/security_util.h" @@ -137,6 +138,9 @@ virQEMUDriverConfig *virQEMUDriverConfigNew(bool privileged, cfg->cgroupControllers = -1; /* -1 == auto-detect */ + cfg->memoryBackingDirs = g_new0(char *, 1); + cfg->nb_memoryBackingDirs = 1; + if (root != NULL) { cfg->logDir = g_strdup_printf("%s/log/qemu", root); cfg->swtpmLogDir = g_strdup_printf("%s/log/swtpm", root); @@ -153,7 +157,7 @@ virQEMUDriverConfig *virQEMUDriverConfigNew(bool privileged, cfg->checkpointDir = g_strdup_printf("%s/checkpoint", cfg->libDir); cfg->autoDumpPath = g_strdup_printf("%s/dump", cfg->libDir); cfg->nvramDir = g_strdup_printf("%s/nvram", cfg->libDir); - cfg->memoryBackingDir = g_strdup_printf("%s/ram", cfg->libDir); + cfg->memoryBackingDirs[0] = g_strdup_printf("%s/ram", cfg->libDir); } else if (privileged) { cfg->logDir = g_strdup_printf("%s/log/libvirt/qemu", LOCALSTATEDIR); @@ -174,7 +178,8 @@ virQEMUDriverConfig *virQEMUDriverConfigNew(bool privileged, cfg->checkpointDir = g_strdup_printf("%s/checkpoint", cfg->libDir); cfg->autoDumpPath = g_strdup_printf("%s/dump", cfg->libDir); cfg->nvramDir = g_strdup_printf("%s/nvram", cfg->libDir); - cfg->memoryBackingDir = g_strdup_printf("%s/ram", cfg->libDir); + + cfg->memoryBackingDirs[0] = g_strdup_printf("%s/ram", cfg->libDir); cfg->swtpmStorageDir = g_strdup_printf("%s/lib/libvirt/swtpm", LOCALSTATEDIR); } else { @@ -201,7 +206,7 @@ virQEMUDriverConfig *virQEMUDriverConfigNew(bool privileged, cfg->configBaseDir); cfg->autoDumpPath = g_strdup_printf("%s/qemu/dump", cfg->configBaseDir); cfg->nvramDir = g_strdup_printf("%s/qemu/nvram", cfg->configBaseDir); - cfg->memoryBackingDir = g_strdup_printf("%s/qemu/ram", cfg->configBaseDir); + cfg->memoryBackingDirs[0] = g_strdup_printf("%s/qemu/ram", cfg->configBaseDir); cfg->swtpmStorageDir = g_strdup_printf("%s/qemu/swtpm", cfg->configBaseDir); } @@ -369,7 +374,12 @@ static void virQEMUDriverConfigDispose(void *obj) virFirmwareFreeList(cfg->firmwares, cfg->nfirmwares); - g_free(cfg->memoryBackingDir); + for (size_t i = 0; i < cfg->nb_memoryBackingDirs; i++) { + g_free(cfg->memoryBackingDirs[i]); + } + + g_free(cfg->memoryBackingDirs); + g_free(cfg->swtpmStorageDir); g_strfreev(cfg->capabilityfilters); @@ -1019,14 +1029,31 @@ virQEMUDriverConfigLoadMemoryEntry(virQEMUDriverConfig *cfg, virConf *conf) { g_autofree char *dir = NULL; + g_auto(GStrv) params = NULL; + GStrv next; int rc; if ((rc = virConfGetValueString(conf, "memory_backing_dir", &dir)) < 0) return -1; if (rc > 0) { - VIR_FREE(cfg->memoryBackingDir); - cfg->memoryBackingDir = g_strdup_printf("%s/libvirt/qemu", dir); + size_t i; + for (i = 0; i < cfg->nb_memoryBackingDirs; i++) + VIR_FREE(cfg->memoryBackingDirs[i]); + + if (!(params = g_strsplit(dir, ",", 0))) { + return -1; + } + + cfg->nb_memoryBackingDirs = 0; + for (next = params; *next; next++) { + cfg->nb_memoryBackingDirs++; + } + + cfg->memoryBackingDirs = g_new0(char *, cfg->nb_memoryBackingDirs); + for (i = 0, next = params; *next; next++, i++) { + cfg->memoryBackingDirs[i] = g_strdup_printf("%s/libvirt/qemu", *next); + } } return 0; diff --git a/src/qemu/qemu_conf.h b/src/qemu/qemu_conf.h index 36049b4bfa..2b8d540df0 100644 --- a/src/qemu/qemu_conf.h +++ b/src/qemu/qemu_conf.h @@ -221,7 +221,8 @@ struct _virQEMUDriverConfig { unsigned int glusterDebugLevel; bool virtiofsdDebug; - char *memoryBackingDir; + char **memoryBackingDirs; + size_t nb_memoryBackingDirs; uid_t swtpm_user; gid_t swtpm_group; -- 2.25.1

On Mon, Jan 29, 2024 at 04:43:53PM -0500, mgalaxy@akamai.com wrote:
From: Michael Galaxy <mgalaxy@akamai.com>
We start by introducing a backwards-compatible, comma-separated specification that will not break existing installations, such as in the following example:
$ cat qemu.conf | grep memory_backing_dir memory_backing_dir = "/path/to/pmem/0,/path/to/pmem/1"
This would be better as an array, but I understand you thought it would be easier to code for backward compatibility.
In our case, we almost always have two NUMA nodes, so in that example, we have two PMEM regions which are created on the Linux kernel command line that get mounted into those two locations for libvirt to use.
There are PMEM devices which you then expose as filesystems to use for libvirt as a backing for VM's PMEMs. Do I understand that correctly? If yes, how are these different? Can't they be passed through?
Later patches will recognize this configuration and activate it.
Signed-off-by: Michael Galaxy <mgalaxy@akamai.com> --- src/qemu/qemu_conf.c | 39 +++++++++++++++++++++++++++++++++------ src/qemu/qemu_conf.h | 3 ++- 2 files changed, 35 insertions(+), 7 deletions(-)
diff --git a/src/qemu/qemu_conf.c b/src/qemu/qemu_conf.c index 4050a82341..aae9f316d8 100644 --- a/src/qemu/qemu_conf.c +++ b/src/qemu/qemu_conf.c @@ -43,6 +43,7 @@ #include "virfile.h" #include "virstring.h" #include "virutil.h" +#include "virnuma.h" #include "configmake.h" #include "security/security_util.h"
@@ -137,6 +138,9 @@ virQEMUDriverConfig *virQEMUDriverConfigNew(bool privileged,
cfg->cgroupControllers = -1; /* -1 == auto-detect */
+ cfg->memoryBackingDirs = g_new0(char *, 1); + cfg->nb_memoryBackingDirs = 1; + if (root != NULL) { cfg->logDir = g_strdup_printf("%s/log/qemu", root); cfg->swtpmLogDir = g_strdup_printf("%s/log/swtpm", root); @@ -153,7 +157,7 @@ virQEMUDriverConfig *virQEMUDriverConfigNew(bool privileged, cfg->checkpointDir = g_strdup_printf("%s/checkpoint", cfg->libDir); cfg->autoDumpPath = g_strdup_printf("%s/dump", cfg->libDir); cfg->nvramDir = g_strdup_printf("%s/nvram", cfg->libDir); - cfg->memoryBackingDir = g_strdup_printf("%s/ram", cfg->libDir); + cfg->memoryBackingDirs[0] = g_strdup_printf("%s/ram", cfg->libDir); } else if (privileged) { cfg->logDir = g_strdup_printf("%s/log/libvirt/qemu", LOCALSTATEDIR);
@@ -174,7 +178,8 @@ virQEMUDriverConfig *virQEMUDriverConfigNew(bool privileged, cfg->checkpointDir = g_strdup_printf("%s/checkpoint", cfg->libDir); cfg->autoDumpPath = g_strdup_printf("%s/dump", cfg->libDir); cfg->nvramDir = g_strdup_printf("%s/nvram", cfg->libDir); - cfg->memoryBackingDir = g_strdup_printf("%s/ram", cfg->libDir); + + cfg->memoryBackingDirs[0] = g_strdup_printf("%s/ram", cfg->libDir); cfg->swtpmStorageDir = g_strdup_printf("%s/lib/libvirt/swtpm", LOCALSTATEDIR); } else { @@ -201,7 +206,7 @@ virQEMUDriverConfig *virQEMUDriverConfigNew(bool privileged, cfg->configBaseDir); cfg->autoDumpPath = g_strdup_printf("%s/qemu/dump", cfg->configBaseDir); cfg->nvramDir = g_strdup_printf("%s/qemu/nvram", cfg->configBaseDir); - cfg->memoryBackingDir = g_strdup_printf("%s/qemu/ram", cfg->configBaseDir); + cfg->memoryBackingDirs[0] = g_strdup_printf("%s/qemu/ram", cfg->configBaseDir); cfg->swtpmStorageDir = g_strdup_printf("%s/qemu/swtpm", cfg->configBaseDir); } @@ -369,7 +374,12 @@ static void virQEMUDriverConfigDispose(void *obj)
virFirmwareFreeList(cfg->firmwares, cfg->nfirmwares);
- g_free(cfg->memoryBackingDir); + for (size_t i = 0; i < cfg->nb_memoryBackingDirs; i++) {
We don't allow variable declarations in loops, make sure you run our test suite, more tips are available here: https://libvirt.org/hacking.html
+ g_free(cfg->memoryBackingDirs[i]);
TAB in indentation
+ }
Extra braces around one-line body, make sure you follow our coding style, linked from the above article, available here: https://libvirt.org/coding-style.html
+ + g_free(cfg->memoryBackingDirs); + g_free(cfg->swtpmStorageDir);
g_strfreev(cfg->capabilityfilters); @@ -1019,14 +1029,31 @@ virQEMUDriverConfigLoadMemoryEntry(virQEMUDriverConfig *cfg, virConf *conf) { g_autofree char *dir = NULL; + g_auto(GStrv) params = NULL; + GStrv next; int rc;
if ((rc = virConfGetValueString(conf, "memory_backing_dir", &dir)) < 0) return -1;
if (rc > 0) { - VIR_FREE(cfg->memoryBackingDir); - cfg->memoryBackingDir = g_strdup_printf("%s/libvirt/qemu", dir); + size_t i; + for (i = 0; i < cfg->nb_memoryBackingDirs; i++) + VIR_FREE(cfg->memoryBackingDirs[i]);
Please don't mix the old VIR_FREE() and new g_free() as described in our glib adoption page: https://www.libvirt.org/glib-adoption.html
+ + if (!(params = g_strsplit(dir, ",", 0))) { + return -1; + } + + cfg->nb_memoryBackingDirs = 0; + for (next = params; *next; next++) { + cfg->nb_memoryBackingDirs++; + }
g_strv_length()?
+ + cfg->memoryBackingDirs = g_new0(char *, cfg->nb_memoryBackingDirs); + for (i = 0, next = params; *next; next++, i++) { + cfg->memoryBackingDirs[i] = g_strdup_printf("%s/libvirt/qemu", *next); + }
You can do this with one loop, the extra allocation at the start of the program won't ever hurt. Also if you stored the values as GStrv, you could then work with it using g_strv*() functions. See glib-adoption linked above.
}
return 0; diff --git a/src/qemu/qemu_conf.h b/src/qemu/qemu_conf.h index 36049b4bfa..2b8d540df0 100644 --- a/src/qemu/qemu_conf.h +++ b/src/qemu/qemu_conf.h @@ -221,7 +221,8 @@ struct _virQEMUDriverConfig { unsigned int glusterDebugLevel; bool virtiofsdDebug;
- char *memoryBackingDir; + char **memoryBackingDirs; + size_t nb_memoryBackingDirs;
You change this here, but the other uses of the old member is fixed in following patches. I like that you split the series, but the tree should be buildable (with passing tests) after each and every commit. On top of all this, I still don't understand what's the benefit of having multiple backing dirs, so please first enlighten me with that as I'm clearly missing some info.
uid_t swtpm_user; gid_t swtpm_group; -- 2.25.1 _______________________________________________ Devel mailing list -- devel@lists.libvirt.org To unsubscribe send an email to devel-leave@lists.libvirt.org

Hi Martin, Answers inline. Thanks for helping with the review and all the tips! On 3/1/24 04:00, Martin Kletzander wrote:
On Mon, Jan 29, 2024 at 04:43:53PM -0500, mgalaxy@akamai.com wrote:
From: Michael Galaxy <mgalaxy@akamai.com>
In our case, we almost always have two NUMA nodes, so in that example, we have two PMEM regions which are created on the Linux kernel command line that get mounted into those two locations for libvirt to use.
There are PMEM devices which you then expose as filesystems to use for libvirt as a backing for VM's PMEMs. Do I understand that correctly?
If yes, how are these different? Can't they be passed through?
So, these are very different. QEMU currently already supports passing through PMEM for guest internal use (The guest puts its own filesystem onto the passed-through PMEM device). In our case, we are using the PMEM area only in the host to place the QEMU memory backing for all guests into a single PMEM area. To support NUMA correctly, QEMU needs to support mutiple host-level PMEM areas which have been pre-configured to be NUMA aware. This is strictly for the purposes of doing live updates, not as a mechanism for guests to internally take advantage of persistent memory... that's a completely different use case (which in and of itself is very interesting, but not what we are using it for). That's how it works. Does that make sense? (I'll work on those other requests, thank you) - Michael

On Mon, Mar 04, 2024 at 03:54:23PM -0600, Michael Galaxy wrote:
Hi Martin,
Answers inline. Thanks for helping with the review and all the tips!
On 3/1/24 04:00, Martin Kletzander wrote:
On Mon, Jan 29, 2024 at 04:43:53PM -0500, mgalaxy@akamai.com wrote:
From: Michael Galaxy <mgalaxy@akamai.com>
In our case, we almost always have two NUMA nodes, so in that example, we have two PMEM regions which are created on the Linux kernel command line that get mounted into those two locations for libvirt to use.
There are PMEM devices which you then expose as filesystems to use for libvirt as a backing for VM's PMEMs. Do I understand that correctly?
If yes, how are these different? Can't they be passed through?
So, these are very different. QEMU currently already supports passing through PMEM for guest internal use (The guest puts its own filesystem onto the passed-through PMEM device).
In our case, we are using the PMEM area only in the host to place the QEMU memory backing for all guests into a single PMEM area.
To support NUMA correctly, QEMU needs to support mutiple host-level PMEM areas which have been pre-configured to be NUMA aware. This is strictly for the
Is this preconfiguration something that libvirt should be able to do as well? How would anyone know which region is tied to which NUMA node? Shouldn't there be some probing for that?
purposes of doing live updates, not as a mechanism for guests to internally take advantage of persistent memory... that's a completely different use case (which in and of itself is very interesting, but not what we are using it for).
That's how it works. Does that make sense?
(I'll work on those other requests, thank you)
- Michael

Hi Martin, On 3/6/24 04:41, Martin Kletzander wrote:
On Mon, Mar 04, 2024 at 03:54:23PM -0600, Michael Galaxy wrote:
Hi Martin,
Answers inline. Thanks for helping with the review and all the tips!
On 3/1/24 04:00, Martin Kletzander wrote:
On Mon, Jan 29, 2024 at 04:43:53PM -0500, mgalaxy@akamai.com wrote:
From: Michael Galaxy <mgalaxy@akamai.com>
In our case, we almost always have two NUMA nodes, so in that example, we have two PMEM regions which are created on the Linux kernel command line that get mounted into those two locations for libvirt to use.
There are PMEM devices which you then expose as filesystems to use for libvirt as a backing for VM's PMEMs. Do I understand that correctly?
If yes, how are these different? Can't they be passed through?
So, these are very different. QEMU currently already supports passing through PMEM for guest internal use (The guest puts its own filesystem onto the passed-through PMEM device).
In our case, we are using the PMEM area only in the host to place the QEMU memory backing for all guests into a single PMEM area.
To support NUMA correctly, QEMU needs to support mutiple host-level PMEM areas which have been pre-configured to be NUMA aware. This is strictly for the
Is this preconfiguration something that libvirt should be able to do as well? How would anyone know which region is tied to which NUMA node? Shouldn't there be some probing for that?
That's a good question, but probably no. The preconfiguration is done via the memmap=XXXX paramter on the kernel command line. I doubt we want libvirt in the business of managing something like that. I do have checks in the patchset to handle different corner cases where the user provides invalid input in a way that does not match the existing number of NUMA nodes, but beyond that, the reserved PMEM areas are strictly configured at kernel boot time. - Michael

On 3/1/24 11:00, Martin Kletzander wrote:
On Mon, Jan 29, 2024 at 04:43:53PM -0500, mgalaxy@akamai.com wrote:
From: Michael Galaxy <mgalaxy@akamai.com>
We start by introducing a backwards-compatible, comma-separated specification that will not break existing installations, such as in the following example:
$ cat qemu.conf | grep memory_backing_dir memory_backing_dir = "/path/to/pmem/0,/path/to/pmem/1"
This would be better as an array, but I understand you thought it would be easier to code for backward compatibility.
This have to be array. virConfGetValueStringList() can handle both cases just fine. And in fact, when used it'll result in cleaner code. Michal

On 3/5/24 03:46, Michal Prívozník wrote:
On 3/1/24 11:00, Martin Kletzander wrote:
On Mon, Jan 29, 2024 at 04:43:53PM -0500, mgalaxy@akamai.com wrote:
From: Michael Galaxy <mgalaxy@akamai.com>
We start by introducing a backwards-compatible, comma-separated specification that will not break existing installations, such as in the following example:
$ cat qemu.conf | grep memory_backing_dir memory_backing_dir = "/path/to/pmem/0,/path/to/pmem/1"
This would be better as an array, but I understand you thought it would be easier to code for backward compatibility. This have to be array. virConfGetValueStringList() can handle both cases just fine. And in fact, when used it'll result in cleaner code.
Acknowledged!

From: Michael Galaxy <mgalaxy@akamai.com> Signed-off-by: Michael Galaxy <mgalaxy@akamai.com> --- src/qemu/qemu_driver.c | 31 +++++++++++++++++----------- src/qemu/qemu_hotplug.c | 6 +++--- src/qemu/qemu_process.c | 45 ++++++++++++++++++++++++----------------- src/qemu/qemu_process.h | 13 ++++++------ 4 files changed, 54 insertions(+), 41 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 448e6b1591..98742f83c8 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -653,11 +653,14 @@ qemuStateInitialize(bool privileged, cfg->nvramDir); goto error; } - if (g_mkdir_with_parents(cfg->memoryBackingDir, 0777) < 0) { - virReportSystemError(errno, _("Failed to create memory backing dir %1$s"), - cfg->memoryBackingDir); - goto error; + for (i = 0; i < cfg->nb_memoryBackingDirs; i++) { + if (g_mkdir_with_parents(cfg->memoryBackingDirs[i], 0777) < 0) { + virReportSystemError(errno, _("Failed to create memory backing dir # %1$lu @ %2$s, total: %3$lu"), + i, cfg->memoryBackingDirs[i], cfg->nb_memoryBackingDirs); + goto error; + } } + if (g_mkdir_with_parents(cfg->slirpStateDir, 0777) < 0) { virReportSystemError(errno, _("Failed to create slirp state dir %1$s"), cfg->slirpStateDir); @@ -802,13 +805,15 @@ qemuStateInitialize(bool privileged, (int)cfg->group); goto error; } - if (chown(cfg->memoryBackingDir, cfg->user, cfg->group) < 0) { - virReportSystemError(errno, + for (i = 0; i < cfg->nb_memoryBackingDirs; i++) { + if (chown(cfg->memoryBackingDirs[i], cfg->user, cfg->group) < 0) { + virReportSystemError(errno, _("unable to set ownership of '%1$s' to %2$d:%3$d"), - cfg->memoryBackingDir, (int)cfg->user, + cfg->memoryBackingDirs[i], (int)cfg->user, (int)cfg->group); - goto error; - } + goto error; + } + } if (chown(cfg->slirpStateDir, cfg->user, cfg->group) < 0) { virReportSystemError(errno, _("unable to set ownership of '%1$s' to %2$d:%3$d"), @@ -855,10 +860,12 @@ qemuStateInitialize(bool privileged, goto error; } - if (privileged && - virFileUpdatePerm(cfg->memoryBackingDir, + for (i = 0; i < cfg->nb_memoryBackingDirs; i++) { + if (privileged && + virFileUpdatePerm(cfg->memoryBackingDirs[i], 0, S_IXGRP | S_IXOTH) < 0) - goto error; + goto error; + } /* Get all the running persistent or transient configs first */ if (virDomainObjListLoadAllConfigs(qemu_driver->domains, diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c index 31b00e05ca..c3d636d167 100644 --- a/src/qemu/qemu_hotplug.c +++ b/src/qemu/qemu_hotplug.c @@ -2289,7 +2289,7 @@ qemuDomainAttachMemory(virQEMUDriver *driver, priv, vm->def, mem, true, false, NULL) < 0) goto cleanup; - if (qemuProcessBuildDestroyMemoryPaths(driver, vm, mem, true) < 0) + if (qemuProcessBuildDestroyMemoryPaths(vm, mem, true) < 0) goto cleanup; if (qemuDomainNamespaceSetupMemory(vm, mem, &teardowndevice) < 0) @@ -2364,7 +2364,7 @@ qemuDomainAttachMemory(virQEMUDriver *driver, qemuDomainObjExitMonitor(vm); if (objAdded && mem) - ignore_value(qemuProcessDestroyMemoryBackingPath(driver, vm, mem)); + ignore_value(qemuProcessDestroyMemoryBackingPath(vm->def, priv, mem)); virErrorRestore(&orig_err); if (!mem) @@ -4640,7 +4640,7 @@ qemuDomainRemoveMemoryDevice(virQEMUDriver *driver, if (qemuDomainNamespaceTeardownMemory(vm, mem) < 0) VIR_WARN("Unable to remove memory device from /dev"); - if (qemuProcessDestroyMemoryBackingPath(driver, vm, mem) < 0) + if (qemuProcessDestroyMemoryBackingPath(vm->def, priv, mem) < 0) VIR_WARN("Unable to destroy memory backing path"); qemuDomainReleaseMemoryDeviceSlot(vm, mem); diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index c1115b440f..48681ef97f 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -4002,12 +4002,12 @@ qemuProcessBuildDestroyMemoryPathsImpl(virQEMUDriver *driver, int -qemuProcessBuildDestroyMemoryPaths(virQEMUDriver *driver, - virDomainObj *vm, +qemuProcessBuildDestroyMemoryPaths(virDomainObj *vm, virDomainMemoryDef *mem, bool build) { - + qemuDomainObjPrivate *priv = vm->privateData; + virQEMUDriver *driver = priv->driver; g_autoptr(virQEMUDriverConfig) cfg = virQEMUDriverGetConfig(driver); size_t i; bool shouldBuildHP = false; @@ -4037,13 +4037,15 @@ qemuProcessBuildDestroyMemoryPaths(virQEMUDriver *driver, } if (!build || shouldBuildMB) { - g_autofree char *path = NULL; - if (qemuGetMemoryBackingDomainPath(driver, vm->def, &path) < 0) - return -1; + for (i = 0; i < cfg->nb_memoryBackingDirs; i++) { + g_autofree char *path = NULL; + if (qemuGetMemoryBackingDomainPath(vm->def, vm->privateData, i, &path) < 0) + return -1; - if (qemuProcessBuildDestroyMemoryPathsImpl(driver, vm, - path, build) < 0) - return -1; + if (qemuProcessBuildDestroyMemoryPathsImpl(driver, vm, + path, build) < 0) + return -1; + } } return 0; @@ -4051,19 +4053,24 @@ qemuProcessBuildDestroyMemoryPaths(virQEMUDriver *driver, int -qemuProcessDestroyMemoryBackingPath(virQEMUDriver *driver, - virDomainObj *vm, +qemuProcessDestroyMemoryBackingPath(virDomainDef *def, + qemuDomainObjPrivate *priv, virDomainMemoryDef *mem) { + virQEMUDriver *driver = priv->driver; g_autofree char *path = NULL; + g_autoptr(virQEMUDriverConfig) cfg = virQEMUDriverGetConfig(driver); - if (qemuGetMemoryBackingPath(driver, vm->def, mem->info.alias, &path) < 0) - return -1; + for (size_t i = 0; i < cfg->nb_memoryBackingDirs; i++) { + virDomainXMLPrivateDataCallbacks *privateData = (virDomainXMLPrivateDataCallbacks *) priv; + if (qemuGetMemoryBackingPath(def, privateData, i, mem->info.alias, &path) < 0) + return -1; - if (unlink(path) < 0 && - errno != ENOENT) { - virReportSystemError(errno, _("Unable to remove %1$s"), path); - return -1; + if (unlink(path) < 0 && + errno != ENOENT) { + virReportSystemError(errno, _("Unable to remove %1$s"), path); + return -1; + } } return 0; @@ -7225,7 +7232,7 @@ qemuProcessPrepareHost(virQEMUDriver *driver, if (qemuProcessPrepareHostBackendChardev(vm) < 0) return -1; - if (qemuProcessBuildDestroyMemoryPaths(driver, vm, NULL, true) < 0) + if (qemuProcessBuildDestroyMemoryPaths(vm, NULL, true) < 0) return -1; /* Ensure no historical cgroup for this VM is lying around bogus @@ -8438,7 +8445,7 @@ void qemuProcessStop(virQEMUDriver *driver, goto endjob; } - qemuProcessBuildDestroyMemoryPaths(driver, vm, NULL, false); + qemuProcessBuildDestroyMemoryPaths(vm, NULL, false); if (!!g_atomic_int_dec_and_test(&driver->nactive) && driver->inhibitCallback) driver->inhibitCallback(false, driver->inhibitOpaque); diff --git a/src/qemu/qemu_process.h b/src/qemu/qemu_process.h index c1ea949215..2993ef518a 100644 --- a/src/qemu/qemu_process.h +++ b/src/qemu/qemu_process.h @@ -38,14 +38,13 @@ int qemuProcessStopCPUs(virQEMUDriver *driver, virDomainPausedReason reason, virDomainAsyncJob asyncJob); -int qemuProcessBuildDestroyMemoryPaths(virQEMUDriver *driver, - virDomainObj *vm, - virDomainMemoryDef *mem, - bool build); +int qemuProcessBuildDestroyMemoryPaths(virDomainObj *vm, + virDomainMemoryDef *mem, + bool build); -int qemuProcessDestroyMemoryBackingPath(virQEMUDriver *driver, - virDomainObj *vm, - virDomainMemoryDef *mem); +int qemuProcessDestroyMemoryBackingPath(virDomainDef *def, + qemuDomainObjPrivate *priv, + virDomainMemoryDef *mem); void qemuProcessReconnectAll(virQEMUDriver *driver); -- 2.25.1

From: Michael Galaxy <mgalaxy@akamai.com> We have different use cases: 1. The domain has multiple NUMA nodes, but they have only specified a single directory pathin qemu.conf (Original default behavior) 2. Domain has multiple NUMA nodes, but we have asked for multiple directory paths as well (new behavior). 3. Domain has single NUMA node, but we have asked for multiple directory paths (new behavior). Each one is elaborated more inline below in the comments. Signed-off-by: Michael Galaxy <mgalaxy@akamai.com> --- src/qemu/qemu_command.c | 8 +++- src/qemu/qemu_conf.c | 101 ++++++++++++++++++++++++++++++++++++---- src/qemu/qemu_conf.h | 11 +++-- 3 files changed, 106 insertions(+), 14 deletions(-) diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index b8f071ff2a..818e409d20 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -3448,7 +3448,9 @@ qemuBuildMemoryBackendProps(virJSONValue **backendProps, } else { /* We can have both pagesize and mem source. If that's the case, * prefer hugepages as those are more specific. */ - if (qemuGetMemoryBackingPath(priv->driver, def, mem->info.alias, &memPath) < 0) + + virDomainXMLPrivateDataCallbacks *privateData = (virDomainXMLPrivateDataCallbacks *) priv; + if (qemuGetMemoryBackingPath(def, privateData, mem->targetNode, mem->info.alias, &memPath) < 0) return -1; } @@ -7291,7 +7293,9 @@ qemuBuildMemPathStr(const virDomainDef *def, return -1; prealloc = true; } else if (def->mem.source == VIR_DOMAIN_MEMORY_SOURCE_FILE) { - if (qemuGetMemoryBackingPath(priv->driver, def, "ram", &mem_path) < 0) + // This path should not be reached if NUMA is requested + virDomainXMLPrivateDataCallbacks *privateData = (virDomainXMLPrivateDataCallbacks *) priv; + if (qemuGetMemoryBackingPath(def, privateData, 0, "ram", &mem_path) < 0) return -1; } diff --git a/src/qemu/qemu_conf.c b/src/qemu/qemu_conf.c index aae9f316d8..e327a906a3 100644 --- a/src/qemu/qemu_conf.c +++ b/src/qemu/qemu_conf.c @@ -1622,22 +1622,106 @@ qemuGetDomainHupageMemPath(virQEMUDriver *driver, int -qemuGetMemoryBackingDomainPath(virQEMUDriver *driver, - const virDomainDef *def, +qemuGetMemoryBackingDomainPath(const virDomainDef *def, + virDomainXMLPrivateDataCallbacks *priv, + const size_t targetNode, char **path) { + qemuDomainObjPrivate *privateData = (qemuDomainObjPrivate *) priv; + virQEMUDriver *driver = privateData->driver; g_autoptr(virQEMUDriverConfig) cfg = virQEMUDriverGetConfig(driver); const char *root = driver->embeddedRoot; g_autofree char *shortName = NULL; + size_t path_index = 0; // original behavior, described below if (!(shortName = virDomainDefGetShortName(def))) return -1; - if (root && !STRPREFIX(cfg->memoryBackingDir, root)) { + /* + * We have three use cases: + * + * 1. Domain has multiple NUMA nodes, but they have only specified + * a single directory path in qemu.conf. (Original default behavior). + * + * In this case, we already placed the memory backing path for each NUMA node + * into the same path location. Preserve the established default behavior. + * + * 2. Domain has multiple NUMA nodes, but we have asked for multiple directory + * paths as well. + * + * In this case, we will have a one-to-one relationship between the number + * of NUMA nodes and the order in which the paths are provided. + * If the user does not specify enough paths, then we need to throw an error. + * NOTE: This is open to comment. The "ordering" of the paths here is not intially + * configurable to preserve backwards compatibility with the original qemu.conf syntax. + * If controlling the ordering is desired, we would need to revise the syntax in + * qemu.conf to make that possible. That hasn't been needed so far. + * + * NOTE A): We must check with numatune here, if requested. The number of NUMA nodes + * may be less than or equal to the number of provided paths. If it is less, + * we have to respect the choices made by numatune. In this case, we will map the + * physical NUMA nodes (0, 1, 2...) in the order in which they appear in qemu.conf + * + * 3. Domain has a single NUMA node, but we have asked for multiple directory paths. + * + * In this case we also need to check if numatune is requested. If so, + * we want to pick the path indicated by numatune. + * + * NOTE B): In both cases 2 and 3, if numatune is requested, the path obviously cannot + * be changed on the fly, like it normally would be in "restrictive" mode + * during runtime. So, we will only do this is the mode requested is "strict". + * + * NOTE C): Furthermore, in both cases 2 and 3, if the number of directory paths provided + * is more than one, and one of either: a) no numatune is provided at all or + * b) numatune is in fact provided, but the mode is not strict, + * then we must thrown error. This is because we cannot know which backing + * directory path to choose without the user's input. + * + * NOTE D): If one or more directory paths is requested in any of the cases 1, 2, or 3, + * the numatune cannot specifiy more than one NUMA node, because the only mode + * possible with directory paths is "strict" (e.g. automatic numa balancing of + * memory will not work). Only one numa node can be requested by numatune, else + * we must throw an error. + */ + + if (cfg->nb_memoryBackingDirs > 1) { + virDomainNuma *numatune = def->numa; + virBitmap *numaBitmap = virDomainNumatuneGetNodeset(numatune, privateData->autoNodeset, targetNode); + size_t numa_node_count = virDomainNumaGetNodeCount(def->numa); + virDomainNumatuneMemMode mode; + + if ((numatune && numaBitmap && virNumaNodesetIsAvailable(numaBitmap)) && + virDomainNumatuneGetMode(def->numa, -1, &mode) == 0 && + mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT && + virBitmapCountBits(numaBitmap) == 1) { + // Is numatune provided? + // Is it strict? + // Does it only specify a single pinning for this target? + // Yes to all 3? then good to go. + + if (cfg->nb_memoryBackingDirs < numa_node_count) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("Domain requesting configuration for %lu NUMA nodes, but memory backing directory only has (%lu) directory paths available. Either reduce this to one directory or provide more paths to use."), numa_node_count, cfg->nb_memoryBackingDirs); + return -1; + } + + path_index = virBitmapNextSetBit(numaBitmap, -1); + } else if (numa_node_count > 1 && numa_node_count == cfg->nb_memoryBackingDirs) { + // Be nice. A valid numatune and pinning has not been specified, but the number + // of paths matches up exactly, so just assign them one-to-one. + path_index = targetNode; + } else { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("There are (%lu) memory directory directories configured. Domain must use a 'strict' numatune as well as an associated pinning configuration for each NUMA node before proceeding. An individual NUMA node can only be pinned to a single backing directory. Please correct the domain configuration or remove the memory backing directories and try again."), cfg->nb_memoryBackingDirs); + return -1; + } + } + + if (root && !STRPREFIX(cfg->memoryBackingDirs[path_index], root)) { g_autofree char * hash = virDomainDriverGenerateRootHash("qemu", root); - *path = g_strdup_printf("%s/%s-%s", cfg->memoryBackingDir, hash, shortName); + *path = g_strdup_printf("%s/%s-%s", cfg->memoryBackingDirs[path_index], hash, shortName); } else { - *path = g_strdup_printf("%s/%s", cfg->memoryBackingDir, shortName); + *path = g_strdup_printf("%s/%s", cfg->memoryBackingDirs[path_index], shortName); } return 0; @@ -1657,8 +1741,9 @@ qemuGetMemoryBackingDomainPath(virQEMUDriver *driver, * -1 otherwise (with error reported). */ int -qemuGetMemoryBackingPath(virQEMUDriver *driver, - const virDomainDef *def, +qemuGetMemoryBackingPath(const virDomainDef *def, + virDomainXMLPrivateDataCallbacks *priv, + const size_t targetNode, const char *alias, char **memPath) { @@ -1671,7 +1756,7 @@ qemuGetMemoryBackingPath(virQEMUDriver *driver, return -1; } - if (qemuGetMemoryBackingDomainPath(driver, def, &domainPath) < 0) + if (qemuGetMemoryBackingDomainPath(def, priv, targetNode, &domainPath) < 0) return -1; *memPath = g_strdup_printf("%s/%s", domainPath, alias); diff --git a/src/qemu/qemu_conf.h b/src/qemu/qemu_conf.h index 2b8d540df0..4ae21524f7 100644 --- a/src/qemu/qemu_conf.h +++ b/src/qemu/qemu_conf.h @@ -370,11 +370,14 @@ int qemuGetDomainHupageMemPath(virQEMUDriver *driver, unsigned long long pagesize, char **memPath); -int qemuGetMemoryBackingDomainPath(virQEMUDriver *driver, - const virDomainDef *def, +int qemuGetMemoryBackingDomainPath(const virDomainDef *def, + virDomainXMLPrivateDataCallbacks *priv, + const size_t targetNode, char **path); -int qemuGetMemoryBackingPath(virQEMUDriver *driver, - const virDomainDef *def, + +int qemuGetMemoryBackingPath(const virDomainDef *def, + virDomainXMLPrivateDataCallbacks *priv, + const size_t targetNode, const char *alias, char **memPath); -- 2.25.1

From: Michael Galaxy <mgalaxy@akamai.com> In our environment, we need to convert VMs into a live-update-comptabile configuration "on-the-fly" (via live migration). So, for this very specific case, this needs to work when PMEM is being enabled. QEMU does not have a problem with this at all, but we need to relax the rules here a bit. Signed-off-by: Michael Galaxy <mgalaxy@akamai.com> --- src/qemu/qemu_domain.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index de36641137..5f2058c58d 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -8520,11 +8520,25 @@ qemuDomainABIStabilityCheck(const virDomainDef *src, size_t i; if (src->mem.source != dst->mem.source) { - virReportError(VIR_ERR_CONFIG_UNSUPPORTED, - _("Target memoryBacking source '%1$s' doesn't match source memoryBacking source'%2$s'"), - virDomainMemorySourceTypeToString(dst->mem.source), - virDomainMemorySourceTypeToString(src->mem.source)); - return false; + /* + * The current use case for this is the live migration of live-update + * capable CPR guests mounted on PMEM devices at the host + * level (not in-guest PMEM). QEMU has no problem doing these kinds of + * live migrations between these two memory backends, so let them go through. + * This allows us to "upgrade" guests from regular memory to file-backed + * memory seemlessly without taking them down. + */ + if (!((src->mem.source == VIR_DOMAIN_MEMORY_SOURCE_NONE + && dst->mem.source == VIR_DOMAIN_MEMORY_SOURCE_FILE) || + (src->mem.source == VIR_DOMAIN_MEMORY_SOURCE_FILE + && dst->mem.source == VIR_DOMAIN_MEMORY_SOURCE_NONE))) { + + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, + _("Target memoryBacking source '%1$s' doesn't match source memoryBacking source'%2$s'"), + virDomainMemorySourceTypeToString(dst->mem.source), + virDomainMemorySourceTypeToString(src->mem.source)); + return false; + } } for (i = 0; i < src->nmems; i++) { -- 2.25.1

From: Michael Galaxy <mgalaxy@akamai.com> Signed-off-by: Michael Galaxy <mgalaxy@akamai.com> --- tests/testutilsqemu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/testutilsqemu.c b/tests/testutilsqemu.c index 9c12a165b1..0da11d83c0 100644 --- a/tests/testutilsqemu.c +++ b/tests/testutilsqemu.c @@ -320,8 +320,9 @@ int qemuTestDriverInit(virQEMUDriver *driver) cfg->libDir = g_strdup("/var/lib/libvirt/qemu"); VIR_FREE(cfg->channelTargetDir); cfg->channelTargetDir = g_strdup("/var/run/libvirt/qemu/channel"); - VIR_FREE(cfg->memoryBackingDir); - cfg->memoryBackingDir = g_strdup("/var/lib/libvirt/qemu/ram"); + VIR_FREE(cfg->memoryBackingDirs); + cfg->memoryBackingDirs = g_new0(char *, 1); + cfg->memoryBackingDirs[0] = g_strdup("/var/lib/libvirt/qemu/ram"); VIR_FREE(cfg->nvramDir); cfg->nvramDir = g_strdup("/var/lib/libvirt/qemu/nvram"); VIR_FREE(cfg->passtStateDir); -- 2.25.1
participants (4)
-
Martin Kletzander
-
mgalaxy@akamai.com
-
Michael Galaxy
-
Michal Prívozník