[libvirt] [PATCH] qemu: Rework setting process affinity

https://bugzilla.redhat.com/show_bug.cgi?id=1503284 The way we currently start qemu from CPU affinity POV is as follows: 1) the child process is set affinity to all online CPUs (unless some vcpu pinning was given in the domain XML) 2) Once qemu is running, cpuset cgroup is configured taking memory pinning into account Problem is that we let qemu allocate its memory just anywhere in 1) and then rely in 2) to be able to move the memory to configured NUMA nodes. This might not be always possible (e.g. qemu might lock some parts of its memory) and is very suboptimal (copying large memory between NUMA nodes takes significant amount of time). The solution is to set the affinity correctly from the beginning and then possibly refine it later via cgroups. Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/qemu/qemu_process.c | 152 ++++++++++++++++++++++------------------ 1 file changed, 83 insertions(+), 69 deletions(-) diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 9ccc3601a2..a4668f6773 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -2435,6 +2435,44 @@ qemuProcessDetectIOThreadPIDs(virQEMUDriverPtr driver, } +static int +qemuProcessGetAllCpuAffinity(pid_t pid, + virBitmapPtr *cpumapRet) +{ + VIR_AUTOPTR(virBitmap) cpumap = NULL; + VIR_AUTOPTR(virBitmap) hostcpumap = NULL; + int hostcpus; + + *cpumapRet = NULL; + + if (!virHostCPUHasBitmap()) + return 0; + + if (!(hostcpumap = virHostCPUGetOnlineBitmap()) || + !(cpumap = virProcessGetAffinity(pid))) + return -1; + + if (!virBitmapEqual(hostcpumap, cpumap)) { + /* setaffinity fails if you set bits for CPUs which + * aren't present, so we have to limit ourselves */ + if ((hostcpus = virHostCPUGetCount()) < 0) + return -1; + + if (hostcpus > QEMUD_CPUMASK_LEN) + hostcpus = QEMUD_CPUMASK_LEN; + + virBitmapFree(cpumap); + if (!(cpumap = virBitmapNew(hostcpus))) + return -1; + + virBitmapSetAll(cpumap); + } + + VIR_STEAL_PTR(*cpumapRet, cpumap); + return 0; +} + + /* * To be run between fork/exec of QEMU only */ @@ -2443,9 +2481,9 @@ static int qemuProcessInitCpuAffinity(virDomainObjPtr vm) { int ret = -1; - virBitmapPtr cpumap = NULL; virBitmapPtr cpumapToSet = NULL; - virBitmapPtr hostcpumap = NULL; + VIR_AUTOPTR(virBitmap) hostcpumap = NULL; + virDomainNumatuneMemMode mem_mode; qemuDomainObjPrivatePtr priv = vm->privateData; if (!vm->pid) { @@ -2454,59 +2492,36 @@ qemuProcessInitCpuAffinity(virDomainObjPtr vm) return -1; } - if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) { - VIR_DEBUG("Set CPU affinity with advisory nodeset from numad"); - cpumapToSet = priv->autoCpuset; + /* Here is the deal, we can't set cpuset.mems before qemu is + * started as it clashes with KVM allocation. Therefore we + * used to let qemu allocate its memory anywhere as we would + * then move the memory to desired NUMA node via CGroups. + * However, that might not be always possible because qemu + * might lock some parts of its memory (e.g. due to VFIO). + * Solution is to set some temporary affinity now and then + * fix it later, once qemu is already running. */ + if (virDomainNumaGetNodeCount(vm->def->numa) <= 1 && + virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 && + mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) { + if (virDomainNumatuneMaybeGetNodeset(vm->def->numa, + priv->autoNodeset, + &cpumapToSet, + -1) < 0) + goto cleanup; + } else if (vm->def->cputune.emulatorpin) { + cpumapToSet = vm->def->cputune.emulatorpin; } else { - VIR_DEBUG("Set CPU affinity with specified cpuset"); - if (vm->def->cpumask) { - cpumapToSet = vm->def->cpumask; - } else { - /* You may think this is redundant, but we can't assume libvirtd - * itself is running on all pCPUs, so we need to explicitly set - * the spawned QEMU instance to all pCPUs if no map is given in - * its config file */ - int hostcpus; - - if (virHostCPUHasBitmap()) { - hostcpumap = virHostCPUGetOnlineBitmap(); - cpumap = virProcessGetAffinity(vm->pid); - } - - if (hostcpumap && cpumap && virBitmapEqual(hostcpumap, cpumap)) { - /* we're using all available CPUs, no reason to set - * mask. If libvirtd is running without explicit - * affinity, we can use hotplugged CPUs for this VM */ - ret = 0; - goto cleanup; - } else { - /* setaffinity fails if you set bits for CPUs which - * aren't present, so we have to limit ourselves */ - if ((hostcpus = virHostCPUGetCount()) < 0) - goto cleanup; - - if (hostcpus > QEMUD_CPUMASK_LEN) - hostcpus = QEMUD_CPUMASK_LEN; - - virBitmapFree(cpumap); - if (!(cpumap = virBitmapNew(hostcpus))) - goto cleanup; - - virBitmapSetAll(cpumap); - - cpumapToSet = cpumap; - } - } + if (qemuProcessGetAllCpuAffinity(vm->pid, &hostcpumap) < 0) + goto cleanup; + cpumapToSet = hostcpumap; } - if (virProcessSetAffinity(vm->pid, cpumapToSet) < 0) + if (cpumapToSet && + virProcessSetAffinity(vm->pid, cpumapToSet) < 0) goto cleanup; ret = 0; - cleanup: - virBitmapFree(cpumap); - virBitmapFree(hostcpumap); return ret; } #else /* !defined(HAVE_SCHED_GETAFFINITY) && !defined(HAVE_BSD_CPU_AFFINITY) */ @@ -2586,7 +2601,8 @@ qemuProcessSetupPid(virDomainObjPtr vm, qemuDomainObjPrivatePtr priv = vm->privateData; virDomainNumatuneMemMode mem_mode; virCgroupPtr cgroup = NULL; - virBitmapPtr use_cpumask; + virBitmapPtr use_cpumask = NULL; + VIR_AUTOPTR(virBitmap) hostcpumap = NULL; char *mem_mask = NULL; int ret = -1; @@ -2598,12 +2614,21 @@ qemuProcessSetupPid(virDomainObjPtr vm, } /* Infer which cpumask shall be used. */ - if (cpumask) + if (cpumask) { use_cpumask = cpumask; - else if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) + } else if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) { use_cpumask = priv->autoCpuset; - else + } else if (vm->def->cpumask) { use_cpumask = vm->def->cpumask; + } else if (virHostCPUHasBitmap()) { + /* You may think this is redundant, but we can't assume libvirtd + * itself is running on all pCPUs, so we need to explicitly set + * the spawned QEMU instance to all pCPUs if no map is given in + * its config file */ + if (qemuProcessGetAllCpuAffinity(pid, &hostcpumap) < 0) + goto cleanup; + use_cpumask = hostcpumap; + } /* * If CPU cgroup controller is not initialized here, then we need @@ -2628,13 +2653,7 @@ qemuProcessSetupPid(virDomainObjPtr vm, qemuSetupCgroupCpusetCpus(cgroup, use_cpumask) < 0) goto cleanup; - /* - * Don't setup cpuset.mems for the emulator, they need to - * be set up after initialization in order for kvm - * allocations to succeed. - */ - if (nameval != VIR_CGROUP_THREAD_EMULATOR && - mem_mask && virCgroupSetCpusetMems(cgroup, mem_mask) < 0) + if (mem_mask && virCgroupSetCpusetMems(cgroup, mem_mask) < 0) goto cleanup; } @@ -6634,12 +6653,7 @@ qemuProcessLaunch(virConnectPtr conn, /* This must be done after cgroup placement to avoid resetting CPU * affinity */ - if (!vm->def->cputune.emulatorpin && - qemuProcessInitCpuAffinity(vm) < 0) - goto cleanup; - - VIR_DEBUG("Setting emulator tuning/settings"); - if (qemuProcessSetupEmulator(vm) < 0) + if (qemuProcessInitCpuAffinity(vm) < 0) goto cleanup; VIR_DEBUG("Setting cgroup for external devices (if required)"); @@ -6708,10 +6722,6 @@ qemuProcessLaunch(virConnectPtr conn, if (qemuProcessUpdateAndVerifyCPU(driver, vm, asyncJob) < 0) goto cleanup; - VIR_DEBUG("Setting up post-init cgroup restrictions"); - if (qemuSetupCpusetMems(vm) < 0) - goto cleanup; - VIR_DEBUG("setting up hotpluggable cpus"); if (qemuDomainHasHotpluggableStartupVcpus(vm->def)) { if (qemuDomainRefreshVcpuInfo(driver, vm, asyncJob, false) < 0) @@ -6737,6 +6747,10 @@ qemuProcessLaunch(virConnectPtr conn, if (qemuProcessDetectIOThreadPIDs(driver, vm, asyncJob) < 0) goto cleanup; + VIR_DEBUG("Setting emulator tuning/settings"); + if (qemuProcessSetupEmulator(vm) < 0) + goto cleanup; + VIR_DEBUG("Setting global CPU cgroup (if required)"); if (qemuSetupGlobalCpuCgroup(vm) < 0) goto cleanup; -- 2.19.2

On Wed, Jan 30, 2019 at 02:56:46PM +0100, Michal Privoznik wrote:
https://bugzilla.redhat.com/show_bug.cgi?id=1503284
The way we currently start qemu from CPU affinity POV is as follows:
1) the child process is set affinity to all online CPUs (unless some vcpu pinning was given in the domain XML)
2) Once qemu is running, cpuset cgroup is configured taking memory pinning into account
Problem is that we let qemu allocate its memory just anywhere in 1) and then rely in 2) to be able to move the memory to configured NUMA nodes. This might not be always possible (e.g. qemu might lock some parts of its memory) and is very suboptimal (copying large memory between NUMA nodes takes significant amount of time). The solution is to set the affinity correctly from the beginning and then possibly refine it later via cgroups.
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/qemu/qemu_process.c | 152 ++++++++++++++++++++++------------------ 1 file changed, 83 insertions(+), 69 deletions(-)
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 9ccc3601a2..a4668f6773 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -2435,6 +2435,44 @@ qemuProcessDetectIOThreadPIDs(virQEMUDriverPtr driver, }
+static int +qemuProcessGetAllCpuAffinity(pid_t pid, + virBitmapPtr *cpumapRet) +{ + VIR_AUTOPTR(virBitmap) cpumap = NULL; + VIR_AUTOPTR(virBitmap) hostcpumap = NULL; + int hostcpus; + + *cpumapRet = NULL; + + if (!virHostCPUHasBitmap()) + return 0; + + if (!(hostcpumap = virHostCPUGetOnlineBitmap()) || + !(cpumap = virProcessGetAffinity(pid))) + return -1; + + if (!virBitmapEqual(hostcpumap, cpumap)) { + /* setaffinity fails if you set bits for CPUs which + * aren't present, so we have to limit ourselves */ + if ((hostcpus = virHostCPUGetCount()) < 0) + return -1; + + if (hostcpus > QEMUD_CPUMASK_LEN) + hostcpus = QEMUD_CPUMASK_LEN; + + virBitmapFree(cpumap); + if (!(cpumap = virBitmapNew(hostcpus))) + return -1; + + virBitmapSetAll(cpumap); + } + + VIR_STEAL_PTR(*cpumapRet, cpumap);
IIUC, if the QEMU process affinity matches the current set of online host CPUs, we just return the current QEMU affinity. Otherwise we construct a mask of all host CPUs, but seemingly ignoring whether the host CPUs are online or not. Seems a bit odd to return list of online host CPUs in one case, or a list of all possible host CPUs in the other cases. So I guess I'm wondering why this method is not simply reduced to 1 single line *cpumapRet = virHostCPUGetOnlineBitmap(); ?
@@ -2454,59 +2492,36 @@ qemuProcessInitCpuAffinity(virDomainObjPtr vm) return -1; }
- if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) { - VIR_DEBUG("Set CPU affinity with advisory nodeset from numad"); - cpumapToSet = priv->autoCpuset; + /* Here is the deal, we can't set cpuset.mems before qemu is + * started as it clashes with KVM allocation. Therefore we + * used to let qemu allocate its memory anywhere as we would + * then move the memory to desired NUMA node via CGroups. + * However, that might not be always possible because qemu + * might lock some parts of its memory (e.g. due to VFIO). + * Solution is to set some temporary affinity now and then + * fix it later, once qemu is already running. */ + if (virDomainNumaGetNodeCount(vm->def->numa) <= 1 && + virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 && + mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) { + if (virDomainNumatuneMaybeGetNodeset(vm->def->numa, + priv->autoNodeset, + &cpumapToSet, + -1) < 0) + goto cleanup; + } else if (vm->def->cputune.emulatorpin) { + cpumapToSet = vm->def->cputune.emulatorpin; } else { - VIR_DEBUG("Set CPU affinity with specified cpuset"); - if (vm->def->cpumask) { - cpumapToSet = vm->def->cpumask; - } else { - /* You may think this is redundant, but we can't assume libvirtd - * itself is running on all pCPUs, so we need to explicitly set - * the spawned QEMU instance to all pCPUs if no map is given in - * its config file */ - int hostcpus; - - if (virHostCPUHasBitmap()) { - hostcpumap = virHostCPUGetOnlineBitmap(); - cpumap = virProcessGetAffinity(vm->pid); - } - - if (hostcpumap && cpumap && virBitmapEqual(hostcpumap, cpumap)) { - /* we're using all available CPUs, no reason to set - * mask. If libvirtd is running without explicit - * affinity, we can use hotplugged CPUs for this VM */ - ret = 0; - goto cleanup; - } else { - /* setaffinity fails if you set bits for CPUs which - * aren't present, so we have to limit ourselves */ - if ((hostcpus = virHostCPUGetCount()) < 0) - goto cleanup; - - if (hostcpus > QEMUD_CPUMASK_LEN) - hostcpus = QEMUD_CPUMASK_LEN; - - virBitmapFree(cpumap); - if (!(cpumap = virBitmapNew(hostcpus))) - goto cleanup; - - virBitmapSetAll(cpumap); - - cpumapToSet = cpumap; - } - } + if (qemuProcessGetAllCpuAffinity(vm->pid, &hostcpumap) < 0) + goto cleanup; + cpumapToSet = hostcpumap;
IIUC, the end up setting affinity to one of (in priority order) - The CPUs associated with NUMA memory affinity mask - The CPUs associated with emulator pinning - All online host CPUs Later (once QEMU has allocated its memory) we then change this again to be simpler - The CPUs associated with emulator pinning - All online host CPUs I think it would be nice to mention this explicitly in the commit message, as it clarifies how we're solving the problem the commit message mentions.
}
- if (virProcessSetAffinity(vm->pid, cpumapToSet) < 0) + if (cpumapToSet && + virProcessSetAffinity(vm->pid, cpumapToSet) < 0) goto cleanup;
ret = 0; - cleanup: - virBitmapFree(cpumap); - virBitmapFree(hostcpumap); return ret; } #else /* !defined(HAVE_SCHED_GETAFFINITY) && !defined(HAVE_BSD_CPU_AFFINITY) */ @@ -2586,7 +2601,8 @@ qemuProcessSetupPid(virDomainObjPtr vm, qemuDomainObjPrivatePtr priv = vm->privateData; virDomainNumatuneMemMode mem_mode; virCgroupPtr cgroup = NULL; - virBitmapPtr use_cpumask; + virBitmapPtr use_cpumask = NULL; + VIR_AUTOPTR(virBitmap) hostcpumap = NULL; char *mem_mask = NULL; int ret = -1;
@@ -2598,12 +2614,21 @@ qemuProcessSetupPid(virDomainObjPtr vm, }
/* Infer which cpumask shall be used. */ - if (cpumask) + if (cpumask) { use_cpumask = cpumask; - else if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) + } else if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) { use_cpumask = priv->autoCpuset; - else + } else if (vm->def->cpumask) { use_cpumask = vm->def->cpumask; + } else if (virHostCPUHasBitmap()) { + /* You may think this is redundant, but we can't assume libvirtd + * itself is running on all pCPUs, so we need to explicitly set + * the spawned QEMU instance to all pCPUs if no map is given in + * its config file */ + if (qemuProcessGetAllCpuAffinity(pid, &hostcpumap) < 0) + goto cleanup; + use_cpumask = hostcpumap; + }
/* * If CPU cgroup controller is not initialized here, then we need @@ -2628,13 +2653,7 @@ qemuProcessSetupPid(virDomainObjPtr vm, qemuSetupCgroupCpusetCpus(cgroup, use_cpumask) < 0) goto cleanup;
- /* - * Don't setup cpuset.mems for the emulator, they need to - * be set up after initialization in order for kvm - * allocations to succeed. - */ - if (nameval != VIR_CGROUP_THREAD_EMULATOR && - mem_mask && virCgroupSetCpusetMems(cgroup, mem_mask) < 0) + if (mem_mask && virCgroupSetCpusetMems(cgroup, mem_mask) < 0) goto cleanup;
} @@ -6634,12 +6653,7 @@ qemuProcessLaunch(virConnectPtr conn,
/* This must be done after cgroup placement to avoid resetting CPU * affinity */ - if (!vm->def->cputune.emulatorpin && - qemuProcessInitCpuAffinity(vm) < 0) - goto cleanup; - - VIR_DEBUG("Setting emulator tuning/settings"); - if (qemuProcessSetupEmulator(vm) < 0) + if (qemuProcessInitCpuAffinity(vm) < 0) goto cleanup;
VIR_DEBUG("Setting cgroup for external devices (if required)"); @@ -6708,10 +6722,6 @@ qemuProcessLaunch(virConnectPtr conn, if (qemuProcessUpdateAndVerifyCPU(driver, vm, asyncJob) < 0) goto cleanup;
- VIR_DEBUG("Setting up post-init cgroup restrictions"); - if (qemuSetupCpusetMems(vm) < 0) - goto cleanup; - VIR_DEBUG("setting up hotpluggable cpus"); if (qemuDomainHasHotpluggableStartupVcpus(vm->def)) { if (qemuDomainRefreshVcpuInfo(driver, vm, asyncJob, false) < 0) @@ -6737,6 +6747,10 @@ qemuProcessLaunch(virConnectPtr conn, if (qemuProcessDetectIOThreadPIDs(driver, vm, asyncJob) < 0) goto cleanup;
+ VIR_DEBUG("Setting emulator tuning/settings"); + if (qemuProcessSetupEmulator(vm) < 0) + goto cleanup; + VIR_DEBUG("Setting global CPU cgroup (if required)"); if (qemuSetupGlobalCpuCgroup(vm) < 0) goto cleanup; -- 2.19.2
-- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Regards, Daniel -- |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o- https://fstop138.berrange.com :| |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
participants (2)
-
Daniel P. Berrangé
-
Michal Privoznik