[libvirt] [RFC PATCH 00/10] use cpuset to manage numa

The goal of this series is to use cpuset to manage numa. Currently numa tuning is achieved by libnuma, the disadvantage is numa parameters can not be modified when the domain is running. By using cpuset, we can do this and more: - a new command, numatune, to get/set numa parameters - cpuset can migrate pages to new nodes if a domain's numa nodes is changed when running This is a draft for early reviews. Comments are welcome. TODO: - add --migrate option to numatune - docs for numatune Hu Tao (10): enable cgroup cpuset by default Add functions to set/get cpuset parameters use cpuset to manage numa remove dependence on libnuma new numa parameters add new API virDomain{G,S}etNumaParameters Implement main entries of virDomain{G,S}etNumaParameters Add virDomain{G,S}etNumaParameters support to the remote driver Implement virDomain{G,S}etNumaParameters for the qemu driver add new command numatune to virsh daemon/remote.c | 63 +++++++++ include/libvirt/libvirt.h.in | 23 +++ python/generator.py | 2 + src/conf/domain_conf.c | 35 ++--- src/conf/domain_conf.h | 3 +- src/driver.h | 15 ++ src/libvirt.c | 106 +++++++++++++++ src/libvirt_private.syms | 6 + src/libvirt_public.syms | 6 + src/qemu/qemu.conf | 5 +- src/qemu/qemu_cgroup.c | 37 +++++ src/qemu/qemu_conf.c | 3 +- src/qemu/qemu_driver.c | 307 ++++++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_process.c | 111 --------------- src/remote/remote_driver.c | 50 +++++++ src/remote/remote_protocol.x | 25 ++++- src/remote_protocol-structs | 16 +++ src/util/cgroup.c | 80 +++++++++++ src/util/cgroup.h | 7 + tools/virsh.c | 187 +++++++++++++++++++++++++ 20 files changed, 950 insertions(+), 137 deletions(-) -- 1.7.3.1

This prepares for subsequent patches which introduce dependence on cgroup cpuset. Enable cgroup cpuset by default so users don't have to modify configuration file before encountering a cpuset error. --- src/qemu/qemu.conf | 5 +++-- src/qemu/qemu_conf.c | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/qemu/qemu.conf b/src/qemu/qemu.conf index 4da5d5a..1c14d8f 100644 --- a/src/qemu/qemu.conf +++ b/src/qemu/qemu.conf @@ -157,18 +157,19 @@ # - 'devices' - use for device whitelisting # - 'memory' - use for memory tunables # - 'blkio' - use for block devices I/O tunables +# - 'cpuset' - use for CPUs and memory nodes # # NB, even if configured here, they won't be used unless # the administrator has mounted cgroups, e.g.: # # mkdir /dev/cgroup -# mount -t cgroup -o devices,cpu,memory,blkio none /dev/cgroup +# mount -t cgroup -o devices,cpu,memory,blkio,cpuset none /dev/cgroup # # They can be mounted anywhere, and different controllers # can be mounted in different locations. libvirt will detect # where they are located. # -# cgroup_controllers = [ "cpu", "devices", "memory", "blkio" ] +# cgroup_controllers = [ "cpu", "devices", "memory", "blkio", "cpuset" ] # This is the basic set of devices allowed / required by # all virtual machines. diff --git a/src/qemu/qemu_conf.c b/src/qemu/qemu_conf.c index d1bf075..854e8e7 100644 --- a/src/qemu/qemu_conf.c +++ b/src/qemu/qemu_conf.c @@ -306,7 +306,8 @@ int qemudLoadDriverConfig(struct qemud_driver *driver, (1 << VIR_CGROUP_CONTROLLER_CPU) | (1 << VIR_CGROUP_CONTROLLER_DEVICES) | (1 << VIR_CGROUP_CONTROLLER_MEMORY) | - (1 << VIR_CGROUP_CONTROLLER_BLKIO); + (1 << VIR_CGROUP_CONTROLLER_BLKIO) | + (1 << VIR_CGROUP_CONTROLLER_CPUSET); } for (i = 0 ; i < VIR_CGROUP_CONTROLLER_LAST ; i++) { if (driver->cgroupControllers & (1 << i)) { -- 1.7.3.1

--- src/libvirt_private.syms | 6 +++ src/util/cgroup.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++ src/util/cgroup.h | 7 ++++ 3 files changed, 93 insertions(+), 0 deletions(-) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 6a1562e..64da63d 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -78,6 +78,9 @@ virCgroupGetCpuShares; virCgroupGetCpuCfsPeriod; virCgroupGetCpuCfsQuota; virCgroupGetCpuacctUsage; +virCgroupGetCpusetHardwall; +virCgroupGetCpusetMemExclusive; +virCgroupGetCpusetMems; virCgroupGetFreezerState; virCgroupGetMemoryHardLimit; virCgroupGetMemorySoftLimit; @@ -93,6 +96,9 @@ virCgroupSetBlkioWeight; virCgroupSetCpuShares; virCgroupSetCpuCfsPeriod; virCgroupSetCpuCfsQuota; +virCgroupSetCpusetHardwall; +virCgroupSetCpusetMemExclusive; +virCgroupSetCpusetMems; virCgroupSetFreezerState; virCgroupSetMemory; virCgroupSetMemoryHardLimit; diff --git a/src/util/cgroup.c b/src/util/cgroup.c index c8d1f33..7b01a1c 100644 --- a/src/util/cgroup.c +++ b/src/util/cgroup.c @@ -1154,6 +1154,86 @@ int virCgroupGetMemSwapHardLimit(virCgroupPtr group, unsigned long long *kb) } /** + * virCgroupSetCpusetMems: + * + * @group: The cgroup to set cpuset.mems for + * @mems: the numa nodes to set + * + * Returns: 0 on success + */ +int virCgroupSetCpusetMems(virCgroupPtr group, const char *mems) +{ + return virCgroupSetValueStr(group, + VIR_CGROUP_CONTROLLER_CPUSET, + "cpuset.mems", + mems); +} + +/** + * virCgroupGetCpusetMems: + * + * @group: The cgroup to get cpuset.mems for + * @mems: the numa nodes to get + * + * Returns: 0 on success + */ +int virCgroupGetCpusetMems(virCgroupPtr group, char **mems) +{ + return virCgroupGetValueStr(group, + VIR_CGROUP_CONTROLLER_CPUSET, + "cpuset.mems", + mems); +} + +/** + * virCgroupSetCpusetHardwall: + * + * @group: The cgroup to set cpuset.mems for + * @hardwall: the hardwall to set + * + * Returns: 0 on success + */ +int virCgroupSetCpusetHardwall(virCgroupPtr group, unsigned long 
long hardwall) +{ + return virCgroupSetValueU64(group, + VIR_CGROUP_CONTROLLER_CPUSET, + "cpuset.mem_hardwall", + hardwall); +} + +/** + * virCgroupGetCpusetHardwall: + * + * @group: The cgroup to get cpuset.mem_hardwall for + * @hardwall: the hardwall to get + * + * Returns: 0 on success + */ +int virCgroupGetCpusetHardwall(virCgroupPtr group, unsigned long long *hardwall) +{ + return virCgroupGetValueU64(group, + VIR_CGROUP_CONTROLLER_CPUSET, + "cpuset.mem_hardwall", + hardwall); +} + +int virCgroupSetCpusetMemExclusive(virCgroupPtr group, unsigned long long memExclusive) +{ + return virCgroupSetValueU64(group, + VIR_CGROUP_CONTROLLER_CPUSET, + "cpuset.mem_exclusive", + memExclusive); +} + +int virCgroupGetCpusetMemExclusive(virCgroupPtr group, unsigned long long *memExclusive) +{ + return virCgroupGetValueU64(group, + VIR_CGROUP_CONTROLLER_CPUSET, + "cpuset.mem_exclusive", + memExclusive); +} + +/** * virCgroupDenyAllDevices: * * @group: The cgroup to deny all permissions, for all devices diff --git a/src/util/cgroup.h b/src/util/cgroup.h index d190bb3..c9d891b 100644 --- a/src/util/cgroup.h +++ b/src/util/cgroup.h @@ -115,6 +115,13 @@ int virCgroupGetCpuacctUsage(virCgroupPtr group, unsigned long long *usage); int virCgroupSetFreezerState(virCgroupPtr group, const char *state); int virCgroupGetFreezerState(virCgroupPtr group, char **state); +int virCgroupSetCpusetMems(virCgroupPtr group, const char *mems); +int virCgroupGetCpusetMems(virCgroupPtr group, char **mems); +int virCgroupSetCpusetMemExclusive(virCgroupPtr group, unsigned long long memExclusive); +int virCgroupGetCpusetMemExclusive(virCgroupPtr group, unsigned long long *memExclusive); +int virCgroupSetCpusetHardwall(virCgroupPtr group, unsigned long long hardwall); +int virCgroupGetCpusetHardwall(virCgroupPtr group, unsigned long long *hardwall); + int virCgroupRemove(virCgroupPtr group); void virCgroupFree(virCgroupPtr *group); -- 1.7.3.1

This patch deprecates libnuma and uses cpuset to manage numa. We can further add a `numatune' command to adjust numa parameters through cpuset. --- src/qemu/qemu_cgroup.c | 23 +++++++++++++++++++++++ 1 files changed, 23 insertions(+), 0 deletions(-) diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index 2a10bd2..49307b2 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -365,6 +365,29 @@ int qemuSetupCgroup(struct qemud_driver *driver, } } + if (vm->def->numatune.memory.nodemask) { + if (qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPUSET)) { + char *mask = virDomainCpuSetFormat(vm->def->numatune.memory.nodemask, VIR_DOMAIN_CPUMASK_LEN); + if (!mask) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + _("failed to convert memory nodemask")); + goto cleanup; + } + + rc = virCgroupSetCpusetMems(cgroup, mask); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to set cpuset.mems for domain %s"), + vm->def->name); + goto cleanup; + } + } else { + qemuReportError(VIR_ERR_CONFIG_UNSUPPORTED, + _("Unable to set numa for domain %s since cpuset cgroup is " + "not available on this host"), vm->def->name); + goto cleanup; + } + } done: virCgroupFree(&cgroup); return 0; -- 1.7.3.1

Since we use cpuset to manage numa, we can safely remove dependence on libnuma. --- src/conf/domain_conf.c | 24 +---------- src/conf/domain_conf.h | 1 - src/qemu/qemu_process.c | 111 ----------------------------------------------- 3 files changed, 1 insertions(+), 135 deletions(-) diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 6e2d421..0cf3bb7 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -583,11 +583,6 @@ VIR_ENUM_IMPL(virDomainTimerMode, VIR_DOMAIN_TIMER_MODE_LAST, "paravirt", "smpsafe"); -VIR_ENUM_IMPL(virDomainNumatuneMemMode, VIR_DOMAIN_NUMATUNE_MEM_LAST, - "strict", - "preferred", - "interleave"); - VIR_ENUM_IMPL(virDomainStartupPolicy, VIR_DOMAIN_STARTUP_POLICY_LAST, "default", "mandatory", @@ -6834,20 +6829,6 @@ static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps, "%s", _("nodeset for NUMA memory tuning must be set")); goto error; } - - tmp = virXPathString("string(./numatune/memory/@mode)", ctxt); - if (tmp) { - if ((def->numatune.memory.mode = - virDomainNumatuneMemModeTypeFromString(tmp)) < 0) { - virDomainReportError(VIR_ERR_INTERNAL_ERROR, - _("Unsupported NUMA memory tuning mode '%s'"), - tmp); - goto error; - } - VIR_FREE(tmp); - } else { - def->numatune.memory.mode = VIR_DOMAIN_NUMATUNE_MEM_STRICT; - } } n = virXPathNodeSet("./features/*", ctxt, &nodes); @@ -10882,7 +10863,6 @@ virDomainDefFormatInternal(virDomainDefPtr def, virBufferAddLit(buf, " </cputune>\n"); if (def->numatune.memory.nodemask) { - const char *mode; char *nodemask = NULL; virBufferAddLit(buf, " <numatune>\n"); @@ -10894,9 +10874,7 @@ virDomainDefFormatInternal(virDomainDefPtr def, goto cleanup; } - mode = virDomainNumatuneMemModeTypeToString(def->numatune.memory.mode); - virBufferAsprintf(buf, " <memory mode='%s' nodeset='%s'/>\n", - mode, nodemask); + virBufferAsprintf(buf, " <memory nodeset='%s'/>\n", nodemask); VIR_FREE(nodemask); virBufferAddLit(buf, " </numatune>\n"); } diff --git a/src/conf/domain_conf.h 
b/src/conf/domain_conf.h index f74f4bb..ca68437 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -1353,7 +1353,6 @@ typedef virDomainNumatuneDef *virDomainNumatuneDefPtr; struct _virDomainNumatuneDef { struct { char *nodemask; - int mode; } memory; /* Future NUMA tuning related stuff should go here. */ diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 47164f7..5969b34 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -39,11 +39,6 @@ #include "qemu_bridge_filter.h" #include "qemu_migration.h" -#if HAVE_NUMACTL -# define NUMA_VERSION1_COMPATIBILITY 1 -# include <numa.h> -#endif - #include "datatypes.h" #include "logging.h" #include "virterror_internal.h" @@ -1314,109 +1309,6 @@ qemuProcessDetectVcpuPIDs(struct qemud_driver *driver, return 0; } - -/* - * Set NUMA memory policy for qemu process, to be run between - * fork/exec of QEMU only. - */ -#if HAVE_NUMACTL -static int -qemuProcessInitNumaMemoryPolicy(virDomainObjPtr vm) -{ - nodemask_t mask; - int mode = -1; - int node = -1; - int ret = -1; - int i = 0; - int maxnode = 0; - bool warned = false; - - if (!vm->def->numatune.memory.nodemask) - return 0; - - VIR_DEBUG("Setting NUMA memory policy"); - - if (numa_available() < 0) { - qemuReportError(VIR_ERR_INTERNAL_ERROR, - "%s", _("Host kernel is not aware of NUMA.")); - return -1; - } - - maxnode = numa_max_node() + 1; - - /* Convert nodemask to NUMA bitmask. 
*/ - nodemask_zero(&mask); - for (i = 0; i < VIR_DOMAIN_CPUMASK_LEN; i++) { - if (vm->def->numatune.memory.nodemask[i]) { - if (i > NUMA_NUM_NODES) { - qemuReportError(VIR_ERR_INTERNAL_ERROR, - _("Host cannot support NUMA node %d"), i); - return -1; - } - if (i > maxnode && !warned) { - VIR_WARN("nodeset is out of range, there is only %d NUMA " - "nodes on host", maxnode); - warned = true; - } - nodemask_set(&mask, i); - } - } - - mode = vm->def->numatune.memory.mode; - - if (mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) { - numa_set_bind_policy(1); - numa_set_membind(&mask); - numa_set_bind_policy(0); - } else if (mode == VIR_DOMAIN_NUMATUNE_MEM_PREFERRED) { - int nnodes = 0; - for (i = 0; i < NUMA_NUM_NODES; i++) { - if (nodemask_isset(&mask, i)) { - node = i; - nnodes++; - } - } - - if (nnodes != 1) { - qemuReportError(VIR_ERR_INTERNAL_ERROR, - "%s", _("NUMA memory tuning in 'preferred' mode " - "only supports single node")); - goto cleanup; - } - - numa_set_bind_policy(0); - numa_set_preferred(node); - } else if (mode == VIR_DOMAIN_NUMATUNE_MEM_INTERLEAVE) { - numa_set_interleave_mask(&mask); - } else { - /* XXX: Shouldn't go here, as we already do checking when - * parsing domain XML. 
- */ - qemuReportError(VIR_ERR_XML_ERROR, - "%s", _("Invalid mode for memory NUMA tuning.")); - goto cleanup; - } - - ret = 0; - -cleanup: - return ret; -} -#else -static int -qemuProcessInitNumaMemoryPolicy(virDomainObjPtr vm) -{ - if (vm->def->numatune.memory.nodemask) { - qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("libvirt is compiled without NUMA tuning support")); - - return -1; - } - - return 0; -} -#endif - /* * To be run between fork/exec of QEMU only */ @@ -2179,9 +2071,6 @@ static int qemuProcessHook(void *data) if (qemuProcessInitCpuAffinity(h->vm) < 0) goto cleanup; - if (qemuProcessInitNumaMemoryPolicy(h->vm) < 0) - return -1; - VIR_DEBUG("Setting up security labelling"); if (virSecurityManagerSetProcessLabel(h->driver->securityManager, h->vm) < 0) goto cleanup; -- 1.7.3.1

On Thu, Nov 03, 2011 at 07:55:19PM +0800, Hu Tao wrote:
Since we use cpuset to manage numa, we can safely remove dependence on libnuma. --- src/conf/domain_conf.c | 24 +---------- src/conf/domain_conf.h | 1 - src/qemu/qemu_process.c | 111 ----------------------------------------------- 3 files changed, 1 insertions(+), 135 deletions(-)
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 6e2d421..0cf3bb7 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -583,11 +583,6 @@ VIR_ENUM_IMPL(virDomainTimerMode, VIR_DOMAIN_TIMER_MODE_LAST, "paravirt", "smpsafe");
-VIR_ENUM_IMPL(virDomainNumatuneMemMode, VIR_DOMAIN_NUMATUNE_MEM_LAST, - "strict", - "preferred", - "interleave"); - VIR_ENUM_IMPL(virDomainStartupPolicy, VIR_DOMAIN_STARTUP_POLICY_LAST, "default", "mandatory", @@ -6834,20 +6829,6 @@ static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps, "%s", _("nodeset for NUMA memory tuning must be set")); goto error; } - - tmp = virXPathString("string(./numatune/memory/@mode)", ctxt); - if (tmp) { - if ((def->numatune.memory.mode = - virDomainNumatuneMemModeTypeFromString(tmp)) < 0) { - virDomainReportError(VIR_ERR_INTERNAL_ERROR, - _("Unsupported NUMA memory tuning mode '%s'"), - tmp); - goto error; - } - VIR_FREE(tmp); - } else { - def->numatune.memory.mode = VIR_DOMAIN_NUMATUNE_MEM_STRICT; - } }
n = virXPathNodeSet("./features/*", ctxt, &nodes); @@ -10882,7 +10863,6 @@ virDomainDefFormatInternal(virDomainDefPtr def, virBufferAddLit(buf, " </cputune>\n");
if (def->numatune.memory.nodemask) { - const char *mode; char *nodemask = NULL;
virBufferAddLit(buf, " <numatune>\n"); @@ -10894,9 +10874,7 @@ virDomainDefFormatInternal(virDomainDefPtr def, goto cleanup; }
- mode = virDomainNumatuneMemModeTypeToString(def->numatune.memory.mode); - virBufferAsprintf(buf, " <memory mode='%s' nodeset='%s'/>\n", - mode, nodemask); + virBufferAsprintf(buf, " <memory nodeset='%s'/>\n", nodemask); VIR_FREE(nodemask); virBufferAddLit(buf, " </numatune>\n"); } diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index f74f4bb..ca68437 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -1353,7 +1353,6 @@ typedef virDomainNumatuneDef *virDomainNumatuneDefPtr; struct _virDomainNumatuneDef { struct { char *nodemask; - int mode; } memory;
/* Future NUMA tuning related stuff should go here. */
You can't remove this stuff from the XML - this is part of our public stability guarantee. The way it is modelled is not specific to libnuma anyway, so there shouldn't be this tie.
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 47164f7..5969b34 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -39,11 +39,6 @@ #include "qemu_bridge_filter.h" #include "qemu_migration.h"
-#if HAVE_NUMACTL -# define NUMA_VERSION1_COMPATIBILITY 1 -# include <numa.h> -#endif - #include "datatypes.h" #include "logging.h" #include "virterror_internal.h" @@ -1314,109 +1309,6 @@ qemuProcessDetectVcpuPIDs(struct qemud_driver *driver, return 0; }
- -/* - * Set NUMA memory policy for qemu process, to be run between - * fork/exec of QEMU only. - */ -#if HAVE_NUMACTL -static int -qemuProcessInitNumaMemoryPolicy(virDomainObjPtr vm) -{ - nodemask_t mask; - int mode = -1; - int node = -1; - int ret = -1; - int i = 0; - int maxnode = 0; - bool warned = false; - - if (!vm->def->numatune.memory.nodemask) - return 0; - - VIR_DEBUG("Setting NUMA memory policy"); - - if (numa_available() < 0) { - qemuReportError(VIR_ERR_INTERNAL_ERROR, - "%s", _("Host kernel is not aware of NUMA.")); - return -1; - } - - maxnode = numa_max_node() + 1; - - /* Convert nodemask to NUMA bitmask. */ - nodemask_zero(&mask); - for (i = 0; i < VIR_DOMAIN_CPUMASK_LEN; i++) { - if (vm->def->numatune.memory.nodemask[i]) { - if (i > NUMA_NUM_NODES) { - qemuReportError(VIR_ERR_INTERNAL_ERROR, - _("Host cannot support NUMA node %d"), i); - return -1; - } - if (i > maxnode && !warned) { - VIR_WARN("nodeset is out of range, there is only %d NUMA " - "nodes on host", maxnode); - warned = true; - } - nodemask_set(&mask, i); - } - } - - mode = vm->def->numatune.memory.mode; - - if (mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) { - numa_set_bind_policy(1); - numa_set_membind(&mask); - numa_set_bind_policy(0); - } else if (mode == VIR_DOMAIN_NUMATUNE_MEM_PREFERRED) { - int nnodes = 0; - for (i = 0; i < NUMA_NUM_NODES; i++) { - if (nodemask_isset(&mask, i)) { - node = i; - nnodes++; - } - } - - if (nnodes != 1) { - qemuReportError(VIR_ERR_INTERNAL_ERROR, - "%s", _("NUMA memory tuning in 'preferred' mode " - "only supports single node")); - goto cleanup; - } - - numa_set_bind_policy(0); - numa_set_preferred(node); - } else if (mode == VIR_DOMAIN_NUMATUNE_MEM_INTERLEAVE) { - numa_set_interleave_mask(&mask); - } else { - /* XXX: Shouldn't go here, as we already do checking when - * parsing domain XML. 
- */ - qemuReportError(VIR_ERR_XML_ERROR, - "%s", _("Invalid mode for memory NUMA tuning.")); - goto cleanup; - } - - ret = 0; - -cleanup: - return ret; -} -#else -static int -qemuProcessInitNumaMemoryPolicy(virDomainObjPtr vm) -{ - if (vm->def->numatune.memory.nodemask) { - qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("libvirt is compiled without NUMA tuning support")); - - return -1; - } - - return 0; -} -#endif - /* * To be run between fork/exec of QEMU only */ @@ -2179,9 +2071,6 @@ static int qemuProcessHook(void *data) if (qemuProcessInitCpuAffinity(h->vm) < 0) goto cleanup;
- if (qemuProcessInitNumaMemoryPolicy(h->vm) < 0) - return -1; - VIR_DEBUG("Setting up security labelling"); if (virSecurityManagerSetProcessLabel(h->driver->securityManager, h->vm) < 0) goto cleanup;
NACK to this removal of libnuma. The libvirt QEMU driver still supports deployment on RHEL5 vintage hosts where there is no cgroups mechanism at all. I'm all for using the cpuset controller *if* it is available. We must fallback to libnuma if it is not present though, to prevent functional regressions for existing users of libvirt Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On Thu, Nov 03, 2011 at 12:13:27PM +0000, Daniel P. Berrange wrote:
On Thu, Nov 03, 2011 at 07:55:19PM +0800, Hu Tao wrote:
Since we use cpuset to manage numa, we can safely remove dependence on libnuma. --- src/conf/domain_conf.c | 24 +---------- src/conf/domain_conf.h | 1 - src/qemu/qemu_process.c | 111 ----------------------------------------------- 3 files changed, 1 insertions(+), 135 deletions(-)
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 6e2d421..0cf3bb7 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -583,11 +583,6 @@ VIR_ENUM_IMPL(virDomainTimerMode, VIR_DOMAIN_TIMER_MODE_LAST, "paravirt", "smpsafe");
-VIR_ENUM_IMPL(virDomainNumatuneMemMode, VIR_DOMAIN_NUMATUNE_MEM_LAST, - "strict", - "preferred", - "interleave"); - VIR_ENUM_IMPL(virDomainStartupPolicy, VIR_DOMAIN_STARTUP_POLICY_LAST, "default", "mandatory", @@ -6834,20 +6829,6 @@ static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps, "%s", _("nodeset for NUMA memory tuning must be set")); goto error; } - - tmp = virXPathString("string(./numatune/memory/@mode)", ctxt); - if (tmp) { - if ((def->numatune.memory.mode = - virDomainNumatuneMemModeTypeFromString(tmp)) < 0) { - virDomainReportError(VIR_ERR_INTERNAL_ERROR, - _("Unsupported NUMA memory tuning mode '%s'"), - tmp); - goto error; - } - VIR_FREE(tmp); - } else { - def->numatune.memory.mode = VIR_DOMAIN_NUMATUNE_MEM_STRICT; - } }
n = virXPathNodeSet("./features/*", ctxt, &nodes); @@ -10882,7 +10863,6 @@ virDomainDefFormatInternal(virDomainDefPtr def, virBufferAddLit(buf, " </cputune>\n");
if (def->numatune.memory.nodemask) { - const char *mode; char *nodemask = NULL;
virBufferAddLit(buf, " <numatune>\n"); @@ -10894,9 +10874,7 @@ virDomainDefFormatInternal(virDomainDefPtr def, goto cleanup; }
- mode = virDomainNumatuneMemModeTypeToString(def->numatune.memory.mode); - virBufferAsprintf(buf, " <memory mode='%s' nodeset='%s'/>\n", - mode, nodemask); + virBufferAsprintf(buf, " <memory nodeset='%s'/>\n", nodemask); VIR_FREE(nodemask); virBufferAddLit(buf, " </numatune>\n"); } diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index f74f4bb..ca68437 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -1353,7 +1353,6 @@ typedef virDomainNumatuneDef *virDomainNumatuneDefPtr; struct _virDomainNumatuneDef { struct { char *nodemask; - int mode; } memory;
/* Future NUMA tuning related stuff should go here. */
You can't remove this stuff from the XML - this is part of our public stability guarantee. The way it is modelled is not specific to libnuma anyway, so there shouldn't be this tie.
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 47164f7..5969b34 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -39,11 +39,6 @@ #include "qemu_bridge_filter.h" #include "qemu_migration.h"
-#if HAVE_NUMACTL -# define NUMA_VERSION1_COMPATIBILITY 1 -# include <numa.h> -#endif - #include "datatypes.h" #include "logging.h" #include "virterror_internal.h" @@ -1314,109 +1309,6 @@ qemuProcessDetectVcpuPIDs(struct qemud_driver *driver, return 0; }
- -/* - * Set NUMA memory policy for qemu process, to be run between - * fork/exec of QEMU only. - */ -#if HAVE_NUMACTL -static int -qemuProcessInitNumaMemoryPolicy(virDomainObjPtr vm) -{ - nodemask_t mask; - int mode = -1; - int node = -1; - int ret = -1; - int i = 0; - int maxnode = 0; - bool warned = false; - - if (!vm->def->numatune.memory.nodemask) - return 0; - - VIR_DEBUG("Setting NUMA memory policy"); - - if (numa_available() < 0) { - qemuReportError(VIR_ERR_INTERNAL_ERROR, - "%s", _("Host kernel is not aware of NUMA.")); - return -1; - } - - maxnode = numa_max_node() + 1; - - /* Convert nodemask to NUMA bitmask. */ - nodemask_zero(&mask); - for (i = 0; i < VIR_DOMAIN_CPUMASK_LEN; i++) { - if (vm->def->numatune.memory.nodemask[i]) { - if (i > NUMA_NUM_NODES) { - qemuReportError(VIR_ERR_INTERNAL_ERROR, - _("Host cannot support NUMA node %d"), i); - return -1; - } - if (i > maxnode && !warned) { - VIR_WARN("nodeset is out of range, there is only %d NUMA " - "nodes on host", maxnode); - warned = true; - } - nodemask_set(&mask, i); - } - } - - mode = vm->def->numatune.memory.mode; - - if (mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) { - numa_set_bind_policy(1); - numa_set_membind(&mask); - numa_set_bind_policy(0); - } else if (mode == VIR_DOMAIN_NUMATUNE_MEM_PREFERRED) { - int nnodes = 0; - for (i = 0; i < NUMA_NUM_NODES; i++) { - if (nodemask_isset(&mask, i)) { - node = i; - nnodes++; - } - } - - if (nnodes != 1) { - qemuReportError(VIR_ERR_INTERNAL_ERROR, - "%s", _("NUMA memory tuning in 'preferred' mode " - "only supports single node")); - goto cleanup; - } - - numa_set_bind_policy(0); - numa_set_preferred(node); - } else if (mode == VIR_DOMAIN_NUMATUNE_MEM_INTERLEAVE) { - numa_set_interleave_mask(&mask); - } else { - /* XXX: Shouldn't go here, as we already do checking when - * parsing domain XML. 
- */ - qemuReportError(VIR_ERR_XML_ERROR, - "%s", _("Invalid mode for memory NUMA tuning.")); - goto cleanup; - } - - ret = 0; - -cleanup: - return ret; -} -#else -static int -qemuProcessInitNumaMemoryPolicy(virDomainObjPtr vm) -{ - if (vm->def->numatune.memory.nodemask) { - qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("libvirt is compiled without NUMA tuning support")); - - return -1; - } - - return 0; -} -#endif - /* * To be run between fork/exec of QEMU only */ @@ -2179,9 +2071,6 @@ static int qemuProcessHook(void *data) if (qemuProcessInitCpuAffinity(h->vm) < 0) goto cleanup;
- if (qemuProcessInitNumaMemoryPolicy(h->vm) < 0) - return -1; - VIR_DEBUG("Setting up security labelling"); if (virSecurityManagerSetProcessLabel(h->driver->securityManager, h->vm) < 0) goto cleanup;
NACK to this removal of libnuma. The libvirt QEMU driver still supports deployment on RHEL5 vintage hosts where there is no cgroups mechanism at all.
I'm all for using the cpuset controller *if* it is available. We must fallback to libnuma if it is not present though, to prevent functional regressions for existing users of libvirt
I will add the fallback code in v2. Thanks for the comment. -- Thanks, Hu Tao

This patch adds two parameters: strict and exclusive that we can get/set through cpuset. --- src/conf/domain_conf.c | 15 +++++++++++++++ src/conf/domain_conf.h | 2 ++ 2 files changed, 17 insertions(+), 0 deletions(-) diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 0cf3bb7..02a144b 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -6802,6 +6802,10 @@ static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps, VIR_FREE(nodes); /* Extract numatune if exists. */ + + def->numatune.strict = 0; + def->numatune.exclusive = 0; + if ((n = virXPathNodeSet("./numatune", ctxt, NULL)) < 0) { virDomainReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("cannot extract numatune nodes")); @@ -6829,6 +6833,13 @@ static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps, "%s", _("nodeset for NUMA memory tuning must be set")); goto error; } + + if (virXPathNode("./numatune/strict", ctxt)) { + def->numatune.strict = 1; + } + if (virXPathNode("./numatune/exclusive", ctxt)) { + def->numatune.exclusive = 1; + } } n = virXPathNodeSet("./features/*", ctxt, &nodes); @@ -10876,6 +10887,10 @@ virDomainDefFormatInternal(virDomainDefPtr def, virBufferAsprintf(buf, " <memory nodeset='%s'/>\n", nodemask); VIR_FREE(nodemask); + if (def->numatune.strict) + virBufferAsprintf(buf, " <strict/>\n"); + if (def->numatune.exclusive) + virBufferAsprintf(buf, " <exclusive/>\n"); virBufferAddLit(buf, " </numatune>\n"); } diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index ca68437..f3dbece 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -1355,6 +1355,8 @@ struct _virDomainNumatuneDef { char *nodemask; } memory; + int strict; + int exclusive; /* Future NUMA tuning related stuff should go here. */ }; -- 1.7.3.1

On Thu, Nov 03, 2011 at 07:55:20PM +0800, Hu Tao wrote:
This patch adds two parameters: strict and exclusive that we can get/set through cpuset. --- src/conf/domain_conf.c | 15 +++++++++++++++ src/conf/domain_conf.h | 2 ++ 2 files changed, 17 insertions(+), 0 deletions(-)
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 0cf3bb7..02a144b 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -6802,6 +6802,10 @@ static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps, VIR_FREE(nodes);
/* Extract numatune if exists. */ + + def->numatune.strict = 0; + def->numatune.exclusive = 0; + if ((n = virXPathNodeSet("./numatune", ctxt, NULL)) < 0) { virDomainReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("cannot extract numatune nodes")); @@ -6829,6 +6833,13 @@ static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps, "%s", _("nodeset for NUMA memory tuning must be set")); goto error; } + + if (virXPathNode("./numatune/strict", ctxt)) { + def->numatune.strict = 1; + } + if (virXPathNode("./numatune/exclusive", ctxt)) { + def->numatune.exclusive = 1; + } }
n = virXPathNodeSet("./features/*", ctxt, &nodes); @@ -10876,6 +10887,10 @@ virDomainDefFormatInternal(virDomainDefPtr def,
virBufferAsprintf(buf, " <memory nodeset='%s'/>\n", nodemask); VIR_FREE(nodemask); + if (def->numatune.strict) + virBufferAsprintf(buf, " <strict/>\n"); + if (def->numatune.exclusive) + virBufferAsprintf(buf, " <exclusive/>\n"); virBufferAddLit(buf, " </numatune>\n"); }
diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index ca68437..f3dbece 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -1355,6 +1355,8 @@ struct _virDomainNumatuneDef { char *nodemask; } memory;
+ int strict; + int exclusive; /* Future NUMA tuning related stuff should go here. */ };
NACK to this for the same reason as the previous patch - we can't simply change the XML format at will to suit a new backend implementation. Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

Set up the types for the numa functions and insert them into the virDriver structure definition. --- include/libvirt/libvirt.h.in | 23 +++++++++++++++++++++++ python/generator.py | 2 ++ src/driver.h | 15 +++++++++++++++ src/libvirt_public.syms | 6 ++++++ 4 files changed, 46 insertions(+), 0 deletions(-) diff --git a/include/libvirt/libvirt.h.in b/include/libvirt/libvirt.h.in index 7181f62..7802508 100644 --- a/include/libvirt/libvirt.h.in +++ b/include/libvirt/libvirt.h.in @@ -1281,6 +1281,29 @@ typedef enum { } virDomainMemoryModFlags; +/* Manage numa parameters */ + +/** + * VIR_DOMAIN_NUMA_STRICT: + * + * Whether memory allocations are restricted to the numa nodes + */ +#define VIR_DOMAIN_NUMA_STRICT "numa_strict" + +/** + * VIR_DOMAIN_NUMA_EXCLUSIVE: + * + * Whether others can share the numa nodes + */ +#define VIR_DOMAIN_NUMA_EXCLUSIVE "numa_exclusive" + +int virDomainSetNumaParameters(virDomainPtr domain, + virTypedParameterPtr params, + int nparams, unsigned int flags); +int virDomainGetNumaParameters(virDomainPtr domain, + virTypedParameterPtr params, + int *nparams, unsigned int flags); + /* * Dynamic control of domains */ diff --git a/python/generator.py b/python/generator.py index 71afdb7..5b09123 100755 --- a/python/generator.py +++ b/python/generator.py @@ -382,6 +382,8 @@ skip_impl = ( 'virDomainGetBlkioParameters', 'virDomainSetMemoryParameters', 'virDomainGetMemoryParameters', + 'virDomainSetNumaParameters', + 'virDomainGetNumaParameters', 'virDomainGetVcpus', 'virDomainPinVcpu', 'virDomainPinVcpuFlags', diff --git a/src/driver.h b/src/driver.h index 4c14aaa..76cf966 100644 --- a/src/driver.h +++ b/src/driver.h @@ -158,6 +158,19 @@ typedef int int *nparams, unsigned int flags); typedef int + (*virDrvDomainSetNumaParameters) + (virDomainPtr domain, + virTypedParameterPtr params, + int nparams, + unsigned int flags); +typedef int + (*virDrvDomainGetNumaParameters) + (virDomainPtr domain, + virTypedParameterPtr params, + int *nparams, + unsigned int 
flags); + +typedef int (*virDrvDomainSetBlkioParameters) (virDomainPtr domain, virTypedParameterPtr params, @@ -787,6 +800,8 @@ struct _virDriver { virDrvDomainSetMemoryFlags domainSetMemoryFlags; virDrvDomainSetMemoryParameters domainSetMemoryParameters; virDrvDomainGetMemoryParameters domainGetMemoryParameters; + virDrvDomainSetNumaParameters domainSetNumaParameters; + virDrvDomainGetNumaParameters domainGetNumaParameters; virDrvDomainSetBlkioParameters domainSetBlkioParameters; virDrvDomainGetBlkioParameters domainGetBlkioParameters; virDrvDomainGetInfo domainGetInfo; diff --git a/src/libvirt_public.syms b/src/libvirt_public.syms index bcefb10..d830974 100644 --- a/src/libvirt_public.syms +++ b/src/libvirt_public.syms @@ -498,4 +498,10 @@ LIBVIRT_0.9.7 { virDomainSnapshotNumChildren; } LIBVIRT_0.9.5; +LIBVIRT_0.9.8 { + global: + virDomainGetNumaParameters; + virDomainSetNumaParameters; +} LIBVIRT_0.9.7; + # .... define new API here using predicted next version number .... -- 1.7.3.1

--- src/libvirt.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 files changed, 106 insertions(+), 0 deletions(-) diff --git a/src/libvirt.c b/src/libvirt.c index b0d1e01..af791bf 100644 --- a/src/libvirt.c +++ b/src/libvirt.c @@ -3712,6 +3712,112 @@ error: } /** + * virDomainSetNumaParameters: + * @domain: pointer to domain object + * @params: pointer to numa parameter objects + * @nparams: number of numa parameter (this value can be the same or + * less than the number of parameters supported) + * @flags: bitwise-OR of virDomainModificationImpact + * + * Change all or a subset of the numa tunables. + * This function may require privileged access to the hypervisor. + * + * Returns -1 in case of error, 0 in case of success. + */ +int virDomainSetNumaParameters(virDomainPtr domain, + virTypedParameterPtr params, + int nparams, unsigned int flags) +{ + virConnectPtr conn; + + VIR_DOMAIN_DEBUG(domain, "params=%p, nparams=%d, flags=%x", + params, nparams, flags); + + virResetLastError(); + + if (!VIR_IS_CONNECTED_DOMAIN(domain)) { + virLibDomainError(VIR_ERR_INVALID_DOMAIN, __FUNCTION__); + virDispatchError(NULL); + return -1; + } + if (domain->conn->flags & VIR_CONNECT_RO) { + virLibDomainError(VIR_ERR_OPERATION_DENIED, __FUNCTION__); + goto error; + } + if ((nparams <= 0) || (params == NULL)) { + virLibDomainError(VIR_ERR_INVALID_ARG, __FUNCTION__); + goto error; + } + conn = domain->conn; + + if (conn->driver->domainSetNumaParameters) { + int ret; + ret = conn->driver->domainSetNumaParameters (domain, params, nparams, flags); + if (ret < 0) + goto error; + return ret; + } + + virLibConnError(VIR_ERR_NO_SUPPORT, __FUNCTION__); + +error: + virDispatchError(domain->conn); + return -1; +} + +/** + * virDomainGetNumaParameters: + * @domain: pointer to domain object + * @params: pointer to numa parameter object + * (return value, allocated by the caller) + * @nparams: pointer to number of numa parameters + * @flags: one of virDomainModificationImpact + 
* + * This function may require privileged access to the hypervisor. This function + * expects the caller to allocate the @params. + * + * Returns -1 in case of error, 0 in case of success. + */ + +int +virDomainGetNumaParameters(virDomainPtr domain, + virTypedParameterPtr params, + int *nparams, unsigned int flags) +{ + virConnectPtr conn; + + VIR_DOMAIN_DEBUG(domain, "params=%p, nparams=%d, flags=%x", + params, (nparams) ? *nparams : -1, flags); + + virResetLastError(); + + if (!VIR_IS_CONNECTED_DOMAIN(domain)) { + virLibDomainError(VIR_ERR_INVALID_DOMAIN, __FUNCTION__); + virDispatchError(NULL); + return -1; + } + if ((nparams == NULL) || (*nparams < 0) || + (params == NULL && *nparams != 0)) { + virLibDomainError(VIR_ERR_INVALID_ARG, __FUNCTION__); + goto error; + } + conn = domain->conn; + + if (conn->driver->domainGetNumaParameters) { + int ret; + ret = conn->driver->domainGetNumaParameters (domain, params, nparams, flags); + if (ret < 0) + goto error; + return ret; + } + virLibConnError(VIR_ERR_NO_SUPPORT, __FUNCTION__); + +error: + virDispatchError(domain->conn); + return -1; +} + +/** * virDomainSetBlkioParameters: * @domain: pointer to domain object * @params: pointer to blkio parameter objects -- 1.7.3.1

--- daemon/remote.c | 63 ++++++++++++++++++++++++++++++++++++++++++ src/remote/remote_driver.c | 50 +++++++++++++++++++++++++++++++++ src/remote/remote_protocol.x | 25 ++++++++++++++++- src/remote_protocol-structs | 16 ++++++++++ 4 files changed, 153 insertions(+), 1 deletions(-) diff --git a/daemon/remote.c b/daemon/remote.c index bd0c3e3..de6bbc8 100644 --- a/daemon/remote.c +++ b/daemon/remote.c @@ -1591,6 +1591,69 @@ cleanup: } static int +remoteDispatchDomainGetNumaParameters(virNetServerPtr server ATTRIBUTE_UNUSED, + virNetServerClientPtr client ATTRIBUTE_UNUSED, + virNetMessagePtr msg ATTRIBUTE_UNUSED, + virNetMessageErrorPtr rerr, + remote_domain_get_numa_parameters_args *args, + remote_domain_get_numa_parameters_ret *ret) +{ + virDomainPtr dom = NULL; + virTypedParameterPtr params = NULL; + int nparams = args->nparams; + unsigned int flags; + int rv = -1; + struct daemonClientPrivate *priv = + virNetServerClientGetPrivateData(client); + + if (!priv->conn) { + virNetError(VIR_ERR_INTERNAL_ERROR, "%s", _("connection not open")); + goto cleanup; + } + + flags = args->flags; + + if (nparams > REMOTE_DOMAIN_MEMORY_PARAMETERS_MAX) { + virNetError(VIR_ERR_INTERNAL_ERROR, "%s", _("nparams too large")); + goto cleanup; + } + if (VIR_ALLOC_N(params, nparams) < 0) { + virReportOOMError(); + goto cleanup; + } + + if (!(dom = get_nonnull_domain(priv->conn, args->dom))) + goto cleanup; + + if (virDomainGetNumaParameters(dom, params, &nparams, flags) < 0) + goto cleanup; + + /* In this case, we need to send back the number of parameters + * supported + */ + if (args->nparams == 0) { + ret->nparams = nparams; + goto success; + } + + if (remoteSerializeTypedParameters(params, nparams, + &ret->params.params_val, + &ret->params.params_len) < 0) + goto cleanup; + +success: + rv = 0; + +cleanup: + if (rv < 0) + virNetMessageSaveError(rerr); + if (dom) + virDomainFree(dom); + VIR_FREE(params); + return rv; +} + +static int remoteDispatchDomainGetBlkioParameters(virNetServerPtr 
server ATTRIBUTE_UNUSED, virNetServerClientPtr client ATTRIBUTE_UNUSED, virNetMessagePtr msg ATTRIBUTE_UNUSED, diff --git a/src/remote/remote_driver.c b/src/remote/remote_driver.c index ea7fb24..29a1d49 100644 --- a/src/remote/remote_driver.c +++ b/src/remote/remote_driver.c @@ -1490,6 +1490,54 @@ done: } static int +remoteDomainGetNumaParameters (virDomainPtr domain, + virTypedParameterPtr params, int *nparams, + unsigned int flags) +{ + int rv = -1; + remote_domain_get_numa_parameters_args args; + remote_domain_get_numa_parameters_ret ret; + struct private_data *priv = domain->conn->privateData; + + remoteDriverLock(priv); + + make_nonnull_domain (&args.dom, domain); + args.nparams = *nparams; + args.flags = flags; + + memset (&ret, 0, sizeof ret); + if (call (domain->conn, priv, 0, REMOTE_PROC_DOMAIN_GET_NUMA_PARAMETERS, + (xdrproc_t) xdr_remote_domain_get_numa_parameters_args, (char *) &args, + (xdrproc_t) xdr_remote_domain_get_numa_parameters_ret, (char *) &ret) == -1) + goto done; + + /* Handle the case when the caller does not know the number of parameters + * and is asking for the number of parameters supported + */ + if (*nparams == 0) { + *nparams = ret.nparams; + rv = 0; + goto cleanup; + } + + if (remoteDeserializeTypedParameters(ret.params.params_val, + ret.params.params_len, + REMOTE_DOMAIN_NUMA_PARAMETERS_MAX, + params, + nparams) < 0) + goto cleanup; + + rv = 0; + +cleanup: + xdr_free ((xdrproc_t) xdr_remote_domain_get_numa_parameters_ret, + (char *) &ret); +done: + remoteDriverUnlock(priv); + return rv; +} + +static int remoteDomainGetBlkioParameters (virDomainPtr domain, virTypedParameterPtr params, int *nparams, unsigned int flags) @@ -4526,6 +4574,8 @@ static virDriver remote_driver = { .domainGetBlockJobInfo = remoteDomainGetBlockJobInfo, /* 0.9.4 */ .domainBlockJobSetSpeed = remoteDomainBlockJobSetSpeed, /* 0.9.4 */ .domainBlockPull = remoteDomainBlockPull, /* 0.9.4 */ + .domainSetNumaParameters = remoteDomainSetNumaParameters, /* 0.9.8 */ + 
.domainGetNumaParameters = remoteDomainGetNumaParameters, /* 0.9.8 */ }; static virNetworkDriver network_driver = { diff --git a/src/remote/remote_protocol.x b/src/remote/remote_protocol.x index a174af8..3cdbc40 100644 --- a/src/remote/remote_protocol.x +++ b/src/remote/remote_protocol.x @@ -125,6 +125,9 @@ const REMOTE_DOMAIN_BLKIO_PARAMETERS_MAX = 16; /* Upper limit on list of memory parameters. */ const REMOTE_DOMAIN_MEMORY_PARAMETERS_MAX = 16; +/* Upper limit on list of numa parameters. */ +const REMOTE_DOMAIN_NUMA_PARAMETERS_MAX = 16; + /* Upper limit on list of node cpu stats. */ const REMOTE_NODE_CPU_STATS_MAX = 16; @@ -535,6 +538,23 @@ struct remote_domain_get_memory_parameters_ret { int nparams; }; +struct remote_domain_set_numa_parameters_args { + remote_nonnull_domain dom; + remote_typed_param params<REMOTE_DOMAIN_MEMORY_PARAMETERS_MAX>; + unsigned int flags; +}; + +struct remote_domain_get_numa_parameters_args { + remote_nonnull_domain dom; + int nparams; + unsigned int flags; +}; + +struct remote_domain_get_numa_parameters_ret { + remote_typed_param params<REMOTE_DOMAIN_MEMORY_PARAMETERS_MAX>; + int nparams; +}; + struct remote_domain_block_stats_args { remote_nonnull_domain dom; remote_nonnull_string path; @@ -2562,7 +2582,10 @@ enum remote_procedure { REMOTE_PROC_DOMAIN_SNAPSHOT_NUM_CHILDREN = 246, /* autogen autogen priority:high */ REMOTE_PROC_DOMAIN_SNAPSHOT_LIST_CHILDREN_NAMES = 247, /* autogen autogen priority:high */ REMOTE_PROC_DOMAIN_EVENT_DISK_CHANGE = 248, /* skipgen skipgen */ - REMOTE_PROC_DOMAIN_OPEN_GRAPHICS = 249 /* skipgen skipgen */ + REMOTE_PROC_DOMAIN_OPEN_GRAPHICS = 249, /* skipgen skipgen */ + REMOTE_PROC_DOMAIN_SET_NUMA_PARAMETERS = 250, /* autogen autogen */ + + REMOTE_PROC_DOMAIN_GET_NUMA_PARAMETERS = 251 /* skipgen skipgen */ /* * Notice how the entries are grouped in sets of 10 ? 
diff --git a/src/remote_protocol-structs b/src/remote_protocol-structs index 12cedef..65d4c70 100644 --- a/src/remote_protocol-structs +++ b/src/remote_protocol-structs @@ -166,6 +166,22 @@ struct remote_node_get_memory_stats_ret { } params; int nparams; }; +struct remote_domain_set_numa_parameters_args { + remote_nonnull_domain dom; + remote_typed_param params<REMOTE_DOMAIN_MEMORY_PARAMETERS_MAX>; + unsigned int flags; +}; + +struct remote_domain_get_numa_parameters_args { + remote_nonnull_domain dom; + int nparams; + unsigned int flags; +}; + +struct remote_domain_get_numa_parameters_ret { + remote_typed_param params<REMOTE_DOMAIN_MEMORY_PARAMETERS_MAX>; + int nparams; +}; struct remote_node_get_cells_free_memory_args { int startCell; int maxcells; -- 1.7.3.1

--- src/qemu/qemu_cgroup.c | 14 ++ src/qemu/qemu_driver.c | 307 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 321 insertions(+), 0 deletions(-) diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index 49307b2..99714ab 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -381,6 +381,20 @@ int qemuSetupCgroup(struct qemud_driver *driver, vm->def->name); goto cleanup; } + rc = virCgroupSetCpusetHardwall(cgroup, vm->def->numatune.strict); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to set cpuset.mem_hardwall for domain %s"), + vm->def->name); + goto cleanup; + } + rc = virCgroupSetCpusetMemExclusive(cgroup, vm->def->numatune.exclusive); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to set cpuset.mem_exclusive for domain %s"), + vm->def->name); + goto cleanup; + } } else { qemuReportError(VIR_ERR_CONFIG_UNSUPPORTED, _("Unable to set numa for domain %s since cpuset cgroup is " diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 5e49ff4..be9697c 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -94,6 +94,8 @@ #define QEMU_NB_MEM_PARAM 3 +#define QEMU_NB_NUMA_PARAM 2 + #if HAVE_LINUX_KVM_H # include <linux/kvm.h> #endif @@ -6518,6 +6520,309 @@ cleanup: return ret; } +static int qemuDomainSetNumaParameters(virDomainPtr dom, + virTypedParameterPtr params, + int nparams, + unsigned int flags) +{ + struct qemud_driver *driver = dom->conn->privateData; + int i; + virDomainDefPtr persistentDef = NULL; + virCgroupPtr group = NULL; + virDomainObjPtr vm = NULL; + int ret = -1; + bool isActive; + + virCheckFlags(VIR_DOMAIN_AFFECT_LIVE | + VIR_DOMAIN_AFFECT_CONFIG, -1); + + qemuDriverLock(driver); + + vm = virDomainFindByUUID(&driver->domains, dom->uuid); + + if (vm == NULL) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + _("No such domain %s"), dom->uuid); + goto cleanup; + } + + isActive = virDomainObjIsActive(vm); + + if (flags == VIR_DOMAIN_AFFECT_CURRENT) { + if (isActive) + flags 
= VIR_DOMAIN_AFFECT_LIVE; + else + flags = VIR_DOMAIN_AFFECT_CONFIG; + } + + if (flags & VIR_DOMAIN_AFFECT_LIVE) { + if (!isActive) { + qemuReportError(VIR_ERR_OPERATION_INVALID, + "%s", _("domain is not running")); + goto cleanup; + } + + if (!qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPUSET)) { + qemuReportError(VIR_ERR_OPERATION_INVALID, + "%s", _("cgroup cpuset controller is not mounted")); + goto cleanup; + } + + if (virCgroupForDomain(driver->cgroup, vm->def->name, &group, 0) != 0) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + _("cannot find cgroup for domain %s"), vm->def->name); + goto cleanup; + } + } + + if (flags & VIR_DOMAIN_AFFECT_CONFIG) { + if (!vm->persistent) { + qemuReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("cannot change persistent config of a transient domain")); + goto cleanup; + } + if (!(persistentDef = virDomainObjGetPersistentDef(driver->caps, vm))) + goto cleanup; + } + + ret = 0; + for (i = 0; i < nparams; i++) { + virTypedParameterPtr param = ¶ms[i]; + + if (STREQ(param->field, VIR_DOMAIN_NUMA_STRICT)) { + int rc; + if (param->type != VIR_TYPED_PARAM_ULLONG) { + qemuReportError(VIR_ERR_INVALID_ARG, "%s", + _("invalid type for numa strict tunable, expected a 'ullong'")); + ret = -1; + continue; + } + + if (flags & VIR_DOMAIN_AFFECT_LIVE) { + rc = virCgroupSetCpusetHardwall(group, params[i].value.i); + if (rc != 0) { + virReportSystemError(-rc, "%s", + _("unable to set memory hard_limit tunable")); + ret = -1; + } + } + + if (flags & VIR_DOMAIN_AFFECT_CONFIG) { + persistentDef->numatune.strict = params[i].value.i; + } + } else if (STREQ(param->field, VIR_DOMAIN_NUMA_EXCLUSIVE)) { + int rc; + if (param->type != VIR_TYPED_PARAM_ULLONG) { + qemuReportError(VIR_ERR_INVALID_ARG, "%s", + _("invalid type for numa strict tunable, expected a 'ullong'")); + ret = -1; + continue; + } + + if (flags & VIR_DOMAIN_AFFECT_LIVE) { + rc = virCgroupSetCpusetMemExclusive(group, params[i].value.i); + if (rc != 0) { + 
virReportSystemError(-rc, "%s", + _("unable to set memory hard_limit tunable")); + ret = -1; + } + } + + if (flags & VIR_DOMAIN_AFFECT_CONFIG) { + persistentDef->numatune.exclusive = params[i].value.i; + } + } else { + qemuReportError(VIR_ERR_INVALID_ARG, + _("Parameter `%s' not supported"), param->field); + ret = -1; + } + } + + if (flags & VIR_DOMAIN_AFFECT_CONFIG) { + if (virDomainSaveConfig(driver->configDir, persistentDef) < 0) + ret = -1; + } + +cleanup: + virCgroupFree(&group); + if (vm) + virDomainObjUnlock(vm); + qemuDriverUnlock(driver); + return ret; +} + +static int qemuDomainGetNumaParameters(virDomainPtr dom, + virTypedParameterPtr params, + int *nparams, + unsigned int flags) +{ + struct qemud_driver *driver = dom->conn->privateData; + int i; + virCgroupPtr group = NULL; + virDomainObjPtr vm = NULL; + virDomainDefPtr persistentDef = NULL; + unsigned long long val; + int ret = -1; + int rc; + bool isActive; + + virCheckFlags(VIR_DOMAIN_AFFECT_LIVE | + VIR_DOMAIN_AFFECT_CONFIG, -1); + + qemuDriverLock(driver); + + vm = virDomainFindByUUID(&driver->domains, dom->uuid); + + if (vm == NULL) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + _("No such domain %s"), dom->uuid); + goto cleanup; + } + + isActive = virDomainObjIsActive(vm); + + if (flags == VIR_DOMAIN_AFFECT_CURRENT) { + if (isActive) + flags = VIR_DOMAIN_AFFECT_LIVE; + else + flags = VIR_DOMAIN_AFFECT_CONFIG; + } + + if (flags & VIR_DOMAIN_AFFECT_LIVE) { + if (!isActive) { + qemuReportError(VIR_ERR_OPERATION_INVALID, + "%s", _("domain is not running")); + goto cleanup; + } + + if (!qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_MEMORY)) { + qemuReportError(VIR_ERR_OPERATION_INVALID, + "%s", _("cgroup memory controller is not mounted")); + goto cleanup; + } + + if (virCgroupForDomain(driver->cgroup, vm->def->name, &group, 0) != 0) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + _("cannot find cgroup for domain %s"), vm->def->name); + goto cleanup; + } + } + + if (flags & 
VIR_DOMAIN_AFFECT_CONFIG) { + if (!vm->persistent) { + qemuReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("cannot change persistent config of a transient domain")); + goto cleanup; + } + if (!(persistentDef = virDomainObjGetPersistentDef(driver->caps, vm))) + goto cleanup; + } + + if ((*nparams) == 0) { + *nparams = QEMU_NB_NUMA_PARAM; + ret = 0; + goto cleanup; + } + + if ((*nparams) < QEMU_NB_NUMA_PARAM) { + qemuReportError(VIR_ERR_INVALID_ARG, + "%s", _("Invalid parameter count")); + goto cleanup; + } + + if (flags & VIR_DOMAIN_AFFECT_CONFIG) { + for (i = 0; i < *nparams; i++) { + virMemoryParameterPtr param = ¶ms[i]; + val = 0; + param->value.ul = 0; + param->type = VIR_TYPED_PARAM_ULLONG; + + switch (i) { + case 0: /* fill numa strict here */ + if (virStrcpyStatic(param->field, VIR_DOMAIN_NUMA_STRICT) == NULL) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + "%s", _("Field numa strict too long for destination")); + goto cleanup; + } + param->value.i = persistentDef->numatune.strict; + break; + + case 1: + if (virStrcpyStatic(param->field, VIR_DOMAIN_NUMA_EXCLUSIVE) == NULL) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + "%s", _("Field numa exclusive too long for destination")); + goto cleanup; + } + param->value.i = persistentDef->numatune.exclusive; + break; + + default: + break; + /* should not hit here */ + } + } + goto out; + } + + for (i = 0; i < QEMU_NB_NUMA_PARAM; i++) { + virTypedParameterPtr param = ¶ms[i]; + val = 0; + param->value.ul = 0; + param->type = VIR_TYPED_PARAM_ULLONG; + + /* Coverity does not realize that if we get here, group is set. 
*/ + sa_assert(group); + + switch (i) { + case 0: /* fill numa strict here */ + rc = virCgroupGetCpusetHardwall(group, &val); + if (rc != 0) { + virReportSystemError(-rc, "%s", + _("unable to get numa strict")); + goto cleanup; + } + if (virStrcpyStatic(param->field, VIR_DOMAIN_NUMA_STRICT) == NULL) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + "%s", _("Field numa strict too long for destination")); + goto cleanup; + } + param->value.ul = val; + break; + + case 1: /* file numa exclusive here */ + rc = virCgroupGetCpusetMemExclusive(group, &val); + if (rc != 0) { + virReportSystemError(-rc, "%s", + _("unable to get numa exclusive")); + goto cleanup; + } + if (virStrcpyStatic(param->field, VIR_DOMAIN_NUMA_EXCLUSIVE) == NULL) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + "%s", _("Field numa exclusive too long for destination")); + goto cleanup; + } + param->value.ul = val; + break; + + default: + break; + /* should not hit here */ + } + } + +out: + *nparams = QEMU_NB_NUMA_PARAM; + ret = 0; + +cleanup: + if (group) + virCgroupFree(&group); + if (vm) + virDomainObjUnlock(vm); + qemuDriverUnlock(driver); + return ret; +} + static int qemuSetVcpusBWLive(virDomainObjPtr vm, virCgroupPtr cgroup, unsigned long long period, long long quota) @@ -10904,6 +11209,8 @@ static virDriver qemuDriver = { .domainGetBlockJobInfo = qemuDomainGetBlockJobInfo, /* 0.9.4 */ .domainBlockJobSetSpeed = qemuDomainBlockJobSetSpeed, /* 0.9.4 */ .domainBlockPull = qemuDomainBlockPull, /* 0.9.4 */ + .domainSetNumaParameters = qemuDomainSetNumaParameters, /* 0.9.8 */ + .domainGetNumaParameters = qemuDomainGetNumaParameters, /* 0.9.8 */ }; -- 1.7.3.1

add new command numatune to virsh to get/set numa parameters --- tools/virsh.c | 187 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 files changed, 187 insertions(+), 0 deletions(-) diff --git a/tools/virsh.c b/tools/virsh.c index 5544a41..e2c3cc2 100644 --- a/tools/virsh.c +++ b/tools/virsh.c @@ -4974,6 +4974,192 @@ cmdMemtune(vshControl * ctl, const vshCmd * cmd) } /* + * "numatune" command + */ +static const vshCmdInfo info_numatune[] = { + {"help", N_("Get or set numa parameters")}, + {"desc", N_("Get or set the current numa parameters for a guest" \ + " domain.\n" \ + " To get the numa parameters use following command: \n\n" \ + " virsh # numatune <domain>")}, + {NULL, NULL} + +}; + +static const vshCmdOptDef opts_numatune[] = { + {"domain", VSH_OT_DATA, VSH_OFLAG_REQ, N_("domain name, id or uuid")}, + {"nodeset", VSH_OT_DATA, VSH_OFLAG_NONE, + N_("NUMA node to set")}, + {"strict", VSH_OT_INT, VSH_OFLAG_NONE, + N_("whether memory allocation should be restricted to the memory nodes ")}, + {"exclusive", VSH_OT_INT, VSH_OFLAG_NONE, + N_("XXX")}, + {"config", VSH_OT_BOOL, 0, N_("affect next boot")}, + {"live", VSH_OT_BOOL, 0, N_("affect running domain")}, + {"current", VSH_OT_BOOL, 0, N_("affect current domain")}, + {NULL, 0, 0, NULL} +}; + +static bool +cmdNumatune(vshControl * ctl, const vshCmd * cmd) +{ + virDomainPtr dom; + int rc = -1; + int nparams = 0; + unsigned int i = 0; + virTypedParameterPtr params = NULL, temp = NULL; + const char *nodeset = NULL; + unsigned long strict = 0; + unsigned long exclusive = 0; + bool strictParam = false; + bool exclusiveParam = false; + bool ret = false; + unsigned int flags = 0; + int current = vshCommandOptBool(cmd, "current"); + int config = vshCommandOptBool(cmd, "config"); + int live = vshCommandOptBool(cmd, "live"); + + if (current) { + if (live || config) { + vshError(ctl, "%s", _("--current must be specified exclusively")); + return false; + } + flags = VIR_DOMAIN_AFFECT_CURRENT; + } else { + if 
(config) + flags |= VIR_DOMAIN_AFFECT_CONFIG; + if (live) + flags |= VIR_DOMAIN_AFFECT_LIVE; + } + + if (!vshConnectionUsability(ctl, ctl->conn)) + return false; + + if (!(dom = vshCommandOptDomain(ctl, cmd, NULL))) + return false; + + if (vshCommandOptString(cmd, "nodeset", &nodeset) < 0) { + vshError(ctl, "%s", _("Unable to parse nodeset.")); + virDomainFree(dom); + return false; + } + rc = vshCommandOptUL(cmd, "strict", &strict); + if (rc > 0) { + nparams++; + strictParam = true; + } else if (rc < 0) { + vshError(ctl, "%s", _("Unable to parse parameter --strict.")); + virDomainFree(dom); + return false; + } + + rc = vshCommandOptUL(cmd, "exclusive", &exclusive); + if (rc > 0) { + nparams++; + exclusiveParam = true; + } else if (rc < 0) { + vshError(ctl, "%s", _("Unable to parse parameter --exclusive.")); + virDomainFree(dom); + return false; + } + + if (!nodeset) { + /* get nodeset information */ + } + + if (nparams == 0) { + /* get the number of numa parameters */ + if (virDomainGetNumaParameters(dom, NULL, &nparams, flags) != 0) { + vshError(ctl, "%s", + _("Unable to get number of memory parameters")); + goto cleanup; + } + + if (nparams == 0) { + /* nothing to output */ + ret = true; + goto cleanup; + } + + /* now go get all the numa parameters */ + params = vshCalloc(ctl, nparams, sizeof(*params)); + if (virDomainGetNumaParameters(dom, params, &nparams, flags) != 0) { + vshError(ctl, "%s", _("Unable to get numa parameters")); + goto cleanup; + } + + for (i = 0; i < nparams; i++) { + switch (params[i].type) { + case VIR_TYPED_PARAM_INT: + vshPrint(ctl, "%-15s: %d\n", params[i].field, + params[i].value.i); + break; + case VIR_TYPED_PARAM_UINT: + vshPrint(ctl, "%-15s: %u\n", params[i].field, + params[i].value.ui); + break; + case VIR_TYPED_PARAM_LLONG: + vshPrint(ctl, "%-15s: %lld\n", params[i].field, + params[i].value.l); + break; + case VIR_TYPED_PARAM_ULLONG: + vshPrint(ctl, "%-15s: %llu\n", params[i].field, + params[i].value.ul); + break; + case 
VIR_TYPED_PARAM_DOUBLE: + vshPrint(ctl, "%-15s: %f\n", params[i].field, + params[i].value.d); + break; + case VIR_TYPED_PARAM_BOOLEAN: + vshPrint(ctl, "%-15s: %d\n", params[i].field, + params[i].value.b); + break; + default: + vshPrint(ctl, "unimplemented numa parameter type\n"); + } + } + + ret = true; + } else { + /* set the numa parameters */ + params = vshCalloc(ctl, nparams, sizeof(*params)); + + for (i = 0; i < nparams; i++) { + temp = ¶ms[i]; + temp->type = VIR_TYPED_PARAM_ULLONG; + + /* + * Some magic here, this is used to fill the params structure with + * the valid arguments passed, after filling the particular + * argument we purposely make them 0, so on the next pass it goes + * to the next valid argument and so on. + */ + if (strictParam) { + temp->value.i = strict; + strncpy(temp->field, VIR_DOMAIN_NUMA_STRICT, + sizeof(temp->field)); + strictParam = false; + } + if (exclusiveParam) { + temp->value.i = exclusive; + strncpy(temp->field, VIR_DOMAIN_NUMA_EXCLUSIVE, + sizeof(temp->field)); + exclusiveParam = false; + } + } + if (virDomainSetNumaParameters(dom, params, nparams, flags) != 0) + vshError(ctl, "%s", _("Unable to change numa parameters")); + else + ret = true; + } + + cleanup: + VIR_FREE(params); + virDomainFree(dom); + return ret; +} + +/* * "nodeinfo" command */ static const vshCmdInfo info_nodeinfo[] = { @@ -14050,6 +14236,7 @@ static const vshCmdDef domManagementCmds[] = { info_managedsaveremove, 0}, {"maxvcpus", cmdMaxvcpus, opts_maxvcpus, info_maxvcpus, 0}, {"memtune", cmdMemtune, opts_memtune, info_memtune, 0}, + {"numatune", cmdNumatune, opts_numatune, info_numatune, 0}, {"migrate", cmdMigrate, opts_migrate, info_migrate, 0}, {"migrate-setmaxdowntime", cmdMigrateSetMaxDowntime, opts_migrate_setmaxdowntime, info_migrate_setmaxdowntime, 0}, -- 1.7.3.1
participants (2)
-
Daniel P. Berrange
-
Hu Tao