[libvirt] PATCH 0/2]: Fix CPU affinity management for NR_CPUS > 1024

Daniel P. Berrange

16 Nov 2009 16 Nov '09

5:49 p.m.

The RHEL-6 Alpha kernels have defined NR_CPUS=4096 The glibc static cpu_set_t data type used with sched_[gs]etaffinity() has been sized with # define __CPU_SETSIZE 1024 So any attempt to use sched_[gs]etaffinity() with a static cpu_set_t on a RHEL-6 kernel ends up with -EINVAL, even if the machine does not physically have > 1024 CPUs. The syscall mandates that the cpu set arg is large enough to hold entire of NR_CPUS. Fairly recent gLibc's now provide an alternative dynamically allocated cpu_set_t data type, where the app tells glibc how many CPUs it has to be able to hold. Great, except, the kernel does not tell userspace what NR_CPUS it has been compiled with. So apps have to a pick a starting size, try the sched_[gs]etaffinity() call, catch EINVAL, enlarge the cpuset, repeat, repeat, repeat until it works. This was going to be horrible to put into the qemu driver code, so I've introduced a new util/processinfo.[h,c] file providing APis for getting/setting CPU affinity that use the dynamically allocate mask libvirt defines in its public API. Daniel

Show replies by date

Daniel P. Berrange

16 Nov 16 Nov

5:49 p.m.

New subject: [libvirt] [PATCH 1/2] Pull schedular affinity code out into a separate module

* src/Makefile.am: Add processinfo.h/processinfo.c * src/util/processinfo.c, src/util/processinfo.h: Module providing APIs for getting/setting process CPU affinity * src/qemu/qemu_driver.c: Switch over to new APIs for schedular affinity * src/libvirt_private.syms: Export virProcessInfoSetAffinity and virProcessInfoGetAffinity to internal drivers --- src/Makefile.am | 4 +- src/libvirt_private.syms | 5 ++ src/qemu/qemu_driver.c | 73 +++++++++++++-------------------- src/util/processinfo.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++ src/util/processinfo.h | 37 +++++++++++++++++ 5 files changed, 175 insertions(+), 45 deletions(-) create mode 100644 src/util/processinfo.c create mode 100644 src/util/processinfo.h diff --git a/src/Makefile.am b/src/Makefile.am index d22a103..9a3c9c8 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -55,6 +55,7 @@ UTIL_SOURCES = \ util/logging.c util/logging.h \ util/memory.c util/memory.h \ util/pci.c util/pci.h \ + util/processinfo.c util/processinfo.h \ util/hostusb.c util/hostusb.h \ util/network.c util/network.h \ util/qparams.c util/qparams.h \ @@ -268,7 +269,8 @@ NODE_DEVICE_DRIVER_HAL_SOURCES = \ node_device/node_device_hal.h NODE_DEVICE_DRIVER_UDEV_SOURCES = \ - node_device/node_device_udev.c + node_device/node_device_udev.c \ + node_device/node_device_udev.h ######################### diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index c473d49..e880c2e 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -374,6 +374,11 @@ pciDeviceListUnlock; pciDeviceListSteal; +# processinfo.h +virProcessInfoSetAffinity; +virProcessInfoGetAffinity; + + # qparams.h qparam_get_query; qparam_query_parse; diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 968118e..b50cf6f 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -47,10 +47,6 @@ #include <sys/ioctl.h> #include <sys/un.h> -#if HAVE_SCHED_H -#include <sched.h> -#endif - #include "virterror_internal.h" #include "logging.h" #include "datatypes.h" @@ -72,6 +68,7 @@ #include "node_device_conf.h" #include "pci.h" #include "hostusb.h" +#include "processinfo.h" #include "security/security_driver.h" #include "cgroup.h" #include "libvirt_internal.h" @@ -1359,11 +1356,11 @@ static int qemudInitCpus(virConnectPtr conn, virDomainObjPtr vm, const char *migrateFrom) { -#if HAVE_SCHED_GETAFFINITY - cpu_set_t mask; int i, hostcpus, maxcpu = QEMUD_CPUMASK_LEN; virNodeInfo nodeinfo; qemuDomainObjPrivatePtr priv = vm->privateData; + unsigned char *cpumap; + int cpumaplen; if (nodeGetInfo(conn, &nodeinfo) < 0) return -1; @@ -1374,25 +1371,37 @@ qemudInitCpus(virConnectPtr conn, if (maxcpu > hostcpus) maxcpu = hostcpus; - CPU_ZERO(&mask); + cpumaplen = VIR_CPU_MAPLEN(maxcpu); + if (VIR_ALLOC_N(cpumap, cpumaplen) < 0) { + virReportOOMError(conn); + return -1; + } + if (vm->def->cpumask) { - for (i = 0 ; i < maxcpu ; i++) + /* XXX why don't we keep 'cpumask' in the libvirt cpumap + * format to start with ?!?! */ + for (i = 0 ; i < maxcpu && i < vm->def->cpumasklen ; i++) if (vm->def->cpumask[i]) - CPU_SET(i, &mask); + VIR_USE_CPU(cpumap, i); } else { + /* You may think this is redundant, but we can't assume libvirtd + * itself is running on all pCPUs, so we need to explicitly set + * the spawned QEMU instance to all pCPUs if no map is given in + * its config file */ for (i = 0 ; i < maxcpu ; i++) - CPU_SET(i, &mask); + VIR_USE_CPU(cpumap, i); } + /* The XML config only gives a per-VM affinity, so we apply + * the same mapping to all vCPUs */ for (i = 0 ; i < vm->nvcpupids ; i++) { - if (sched_setaffinity(vm->vcpupids[i], - sizeof(mask), &mask) < 0) { - virReportSystemError(conn, errno, "%s", - _("failed to set CPU affinity")); + if (virProcessInfoSetAffinity(vm->vcpupids[i], + cpumap, cpumaplen, maxcpu) < 0) { + VIR_FREE(cpumap); return -1; } } -#endif /* HAVE_SCHED_GETAFFINITY */ + VIR_FREE(cpumap); /* XXX This resume doesn't really belong here. Move it up to caller */ if (migrateFrom == NULL) { @@ -3657,7 +3666,6 @@ cleanup: } -#if HAVE_SCHED_GETAFFINITY static int qemudDomainPinVcpu(virDomainPtr dom, unsigned int vcpu, @@ -3665,8 +3673,7 @@ qemudDomainPinVcpu(virDomainPtr dom, int maplen) { struct qemud_driver *driver = dom->conn->privateData; virDomainObjPtr vm; - cpu_set_t mask; - int i, maxcpu, hostcpus; + int maxcpu, hostcpus; virNodeInfo nodeinfo; int ret = -1; @@ -3703,18 +3710,10 @@ qemudDomainPinVcpu(virDomainPtr dom, if (maxcpu > hostcpus) maxcpu = hostcpus; - CPU_ZERO(&mask); - for (i = 0 ; i < maxcpu ; i++) { - if (VIR_CPU_USABLE(cpumap, maplen, 0, i)) - CPU_SET(i, &mask); - } - if (vm->vcpupids != NULL) { - if (sched_setaffinity(vm->vcpupids[vcpu], sizeof(mask), &mask) < 0) { - virReportSystemError(dom->conn, errno, "%s", - _("cannot set affinity")); + if (virProcessInfoSetAffinity(vm->vcpupids[vcpu], + cpumap, maplen, maxcpu) < 0) goto cleanup; - } } else { qemudReportError(dom->conn, dom, NULL, VIR_ERR_NO_SUPPORT, "%s", _("cpu affinity is not supported")); @@ -3794,19 +3793,11 @@ qemudDomainGetVcpus(virDomainPtr dom, memset(cpumaps, 0, maplen * maxinfo); if (vm->vcpupids != NULL) { for (v = 0 ; v < maxinfo ; v++) { - cpu_set_t mask; unsigned char *cpumap = VIR_GET_CPUMAP(cpumaps, maplen, v); - CPU_ZERO(&mask); - if (sched_getaffinity(vm->vcpupids[v], sizeof(mask), &mask) < 0) { - virReportSystemError(dom->conn, errno, "%s", - _("cannot get affinity")); + if (virProcessInfoGetAffinity(vm->vcpupids[v], + cpumap, maplen, maxcpu) < 0) goto cleanup; - } - - for (i = 0 ; i < maxcpu ; i++) - if (CPU_ISSET(i, &mask)) - VIR_USE_CPU(cpumap, i); } } else { qemudReportError(dom->conn, dom, NULL, VIR_ERR_NO_SUPPORT, @@ -3822,7 +3813,6 @@ cleanup: virDomainObjUnlock(vm); return ret; } -#endif /* HAVE_SCHED_GETAFFINITY */ static int qemudDomainGetMaxVcpus(virDomainPtr dom) { @@ -7510,13 +7500,8 @@ static virDriver qemuDriver = { qemudDomainRestore, /* domainRestore */ qemudDomainCoreDump, /* domainCoreDump */ qemudDomainSetVcpus, /* domainSetVcpus */ -#if HAVE_SCHED_GETAFFINITY qemudDomainPinVcpu, /* domainPinVcpu */ qemudDomainGetVcpus, /* domainGetVcpus */ -#else - NULL, /* domainPinVcpu */ - NULL, /* domainGetVcpus */ -#endif qemudDomainGetMaxVcpus, /* domainGetMaxVcpus */ qemudDomainGetSecurityLabel, /* domainGetSecurityLabel */ qemudNodeGetSecurityModel, /* nodeGetSecurityModel */ diff --git a/src/util/processinfo.c b/src/util/processinfo.c new file mode 100644 index 0000000..aaffd88 --- /dev/null +++ b/src/util/processinfo.c @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2009 Red Hat, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Authors: + * Daniel P. Berrange <berrange@redhat.com> + */ + +#include <config.h> + +#if HAVE_SCHED_H +#include <sched.h> +#endif + +#include "processinfo.h" +#include "virterror_internal.h" + +#define VIR_FROM_THIS VIR_FROM_NONE + +#if HAVE_SCHED_GETAFFINITY + +int virProcessInfoSetAffinity(pid_t pid, + const unsigned char *map, + size_t maplen, + int maxcpu) +{ + int i; + cpu_set_t mask; + + CPU_ZERO(&mask); + for (i = 0 ; i < maxcpu ; i++) { + if (VIR_CPU_USABLE(map, maplen, 0, i)) + CPU_SET(i, &mask); + } + + if (sched_setaffinity(pid, sizeof(mask), &mask) < 0) { + virReportSystemError(NULL, errno, + _("cannot set CPU affinity on process %d"), pid); + return -1; + } + + return 0; +} + +int virProcessInfoGetAffinity(pid_t pid, + unsigned char *map, + size_t maplen ATTRIBUTE_UNUSED, + int maxcpu) +{ + int i; + cpu_set_t mask; + + CPU_ZERO(&mask); + if (sched_getaffinity(pid, sizeof(mask), &mask) < 0) { + virReportSystemError(NULL, errno, + _("cannot set CPU affinity on process %d"), pid); + return -1; + } + + for (i = 0 ; i < maxcpu ; i++) + if (CPU_ISSET(i, &mask)) + VIR_USE_CPU(map, i); + + return 0; +} + +#else /* HAVE_SCHED_GETAFFINITY */ + +int virProcessInfoSetAffinity(pid_t pid ATTRIBUTE_UNUSED, + unsigned char *map ATTRIBUTE_UNUSED, + size_t maplen ATTRIBUTE_UNUSED, + int maxcpu ATTRIBUTE_UNUSED) +{ + virReportSystemError(NULL, ENOSYS, "%s", + _("Process CPU affinity is not supported on this platform")); + return -1; +} + +int virProcessInfoGetAffinity(pid_t pid ATTRIBUTE_UNUSED, + unsigned char *map ATTRIBUTE_UNUSED, + size_t maplen ATTRIBUTE_UNUSED, + int maxcpu ATTRIBUTE_UNUSED) +{ + virReportSystemError(NULL, ENOSYS, "%s", + _("Process CPU affinity is not supported on this platform")); + return -1; +} +#endif /* HAVE_SCHED_GETAFFINITY */ diff --git a/src/util/processinfo.h b/src/util/processinfo.h new file mode 100644 index 0000000..17800bd --- /dev/null +++ b/src/util/processinfo.h @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2009 Red Hat, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Authors: + * Daniel P. Berrange <berrange@redhat.com> + */ + +#ifndef __VIR_PROCESSINFO_H__ +#define __VIR_PROCESSINFO_H__ + +#include "internal.h" + +int virProcessInfoSetAffinity(pid_t pid, + const unsigned char *map, + size_t maplen, + int maxcpu); + +int virProcessInfoGetAffinity(pid_t pid, + unsigned char *map, + size_t maplen, + int maxcpu); + +#endif /* __VIR_PROCESSINFO_H__ */ -- 1.6.5.2

Daniel Veillard

6:07 p.m.

New subject: [libvirt] [PATCH 1/2] Pull schedular affinity code out into a separate module

On Mon, Nov 16, 2009 at 04:49:29PM +0000, Daniel P. Berrange wrote:

...

* src/Makefile.am: Add processinfo.h/processinfo.c * src/util/processinfo.c, src/util/processinfo.h: Module providing APIs for getting/setting process CPU affinity * src/qemu/qemu_driver.c: Switch over to new APIs for schedular affinity * src/libvirt_private.syms: Export virProcessInfoSetAffinity and virProcessInfoGetAffinity to internal drivers --- src/Makefile.am | 4 +- src/libvirt_private.syms | 5 ++ src/qemu/qemu_driver.c | 73 +++++++++++++-------------------- src/util/processinfo.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++ src/util/processinfo.h | 37 +++++++++++++++++ 5 files changed, 175 insertions(+), 45 deletions(-) create mode 100644 src/util/processinfo.c create mode 100644 src/util/processinfo.h

diff --git a/src/Makefile.am b/src/Makefile.am @@ -268,7 +269,8 @@ NODE_DEVICE_DRIVER_HAL_SOURCES = \ node_device/node_device_hal.h

NODE_DEVICE_DRIVER_UDEV_SOURCES = \ - node_device/node_device_udev.c + node_device/node_device_udev.c \ + node_device/node_device_udev.h

#########################

Can you commit that separately as this is a bug fix for 0.7.3 in any case ?

...

diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index c473d49..e880c2e 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -374,6 +374,11 @@ pciDeviceListUnlock; pciDeviceListSteal;

+# processinfo.h +virProcessInfoSetAffinity; +virProcessInfoGetAffinity;

On one hand this is a bug fix and will affect testers, on the other hand this is also a relatively heavy cleanup, but it's probably a good idea to commit this before 0.7.3 ACK, Daniel -- Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/ daniel@veillard.com | Rpmfind RPM search engine http://rpmfind.net/ http://veillard.com/ | virtualization library http://libvirt.org/

Daniel P. Berrange

17 Nov 17 Nov

6:38 p.m.

New subject: [libvirt] [PATCH 1/2] Pull schedular affinity code out into a separate module

On Mon, Nov 16, 2009 at 06:07:25PM +0100, Daniel Veillard wrote:

...

On Mon, Nov 16, 2009 at 04:49:29PM +0000, Daniel P. Berrange wrote:

...
* src/Makefile.am: Add processinfo.h/processinfo.c * src/util/processinfo.c, src/util/processinfo.h: Module providing APIs for getting/setting process CPU affinity * src/qemu/qemu_driver.c: Switch over to new APIs for schedular affinity * src/libvirt_private.syms: Export virProcessInfoSetAffinity and virProcessInfoGetAffinity to internal drivers --- src/Makefile.am | 4 +- src/libvirt_private.syms | 5 ++ src/qemu/qemu_driver.c | 73 +++++++++++++-------------------- src/util/processinfo.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++ src/util/processinfo.h | 37 +++++++++++++++++ 5 files changed, 175 insertions(+), 45 deletions(-) create mode 100644 src/util/processinfo.c create mode 100644 src/util/processinfo.h

diff --git a/src/Makefile.am b/src/Makefile.am @@ -268,7 +269,8 @@ NODE_DEVICE_DRIVER_HAL_SOURCES = \ node_device/node_device_hal.h

NODE_DEVICE_DRIVER_UDEV_SOURCES = \ - node_device/node_device_udev.c + node_device/node_device_udev.c \ + node_device/node_device_udev.h

#########################

Can you commit that separately as this is a bug fix for 0.7.3 in any case ?

Opps yes, I didn't notice that got in here !

...

...
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index c473d49..e880c2e 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -374,6 +374,11 @@ pciDeviceListUnlock; pciDeviceListSteal;

+# processinfo.h +virProcessInfoSetAffinity; +virProcessInfoGetAffinity;

On one hand this is a bug fix and will affect testers, on the other hand this is also a relatively heavy cleanup, but it's probably a good idea to commit this before 0.7.3

ACK,

Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

Daniel P. Berrange

16 Nov 16 Nov

5:49 p.m.

New subject: [libvirt] [PATCH 2/2] Alternate CPU affinity impl to cope with NR_CPUS > 1024

The cpu_set_t type can only cope with NR_CPUS <= 1024, beyond this it is neccessary to use alternate CPU_SET maps with a dynamically allocated CPU map * src/util/processinfo.c: Support new unlimited size CPU set type --- src/util/processinfo.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 files changed, 84 insertions(+), 0 deletions(-) diff --git a/src/util/processinfo.c b/src/util/processinfo.c index aaffd88..a0dac34 100644 --- a/src/util/processinfo.c +++ b/src/util/processinfo.c @@ -38,6 +38,48 @@ int virProcessInfoSetAffinity(pid_t pid, int maxcpu) { int i; +#ifdef CPU_ALLOC + /* New method dynamically allocates cpu mask, allowing unlimted cpus */ + int numcpus = 1024; + size_t masklen; + cpu_set_t *mask; + + /* Not only may the statically allocated cpu_set_t be too small, + * but there is no way to ask the kernel what size is large enough. + * So you have no option but to pick a size, try, catch EINVAL, + * enlarge, and re-try. + * + * http://lkml.org/lkml/2009/7/28/620 + */ +realloc: + masklen = CPU_ALLOC_SIZE(numcpus); + mask = CPU_ALLOC(numcpus); + + if (!mask) { + virReportOOMError(NULL); + return -1; + } + + CPU_ZERO_S(masklen, mask); + for (i = 0 ; i < maxcpu ; i++) { + if (VIR_CPU_USABLE(map, maplen, 0, i)) + CPU_SET_S(i, masklen, mask); + } + + if (sched_setaffinity(pid, masklen, mask) < 0) { + CPU_FREE(mask); + if (errno == EINVAL && + numcpus < (1024 << 8)) { /* 262144 cpus ought to be enough for anyone */ + numcpus = numcpus << 2; + goto realloc; + } + virReportSystemError(NULL, errno, + _("cannot set CPU affinity on process %d"), pid); + return -1; + } + CPU_FREE(mask); +#else + /* Legacy method uses a fixed size cpu mask, only allows upto 1024 cpus */ cpu_set_t mask; CPU_ZERO(&mask); @@ -51,6 +93,7 @@ int virProcessInfoSetAffinity(pid_t pid, _("cannot set CPU affinity on process %d"), pid); return -1; } +#endif return 0; } @@ -61,6 +104,46 @@ int virProcessInfoGetAffinity(pid_t pid, int maxcpu) { int i; +#ifdef CPU_ALLOC + /* New method dynamically allocates cpu mask, allowing unlimted cpus */ + int numcpus = 1024; + size_t masklen; + cpu_set_t *mask; + + /* Not only may the statically allocated cpu_set_t be too small, + * but there is no way to ask the kernel what size is large enough. + * So you have no option but to pick a size, try, catch EINVAL, + * enlarge, and re-try. + * + * http://lkml.org/lkml/2009/7/28/620 + */ +realloc: + masklen = CPU_ALLOC_SIZE(numcpus); + mask = CPU_ALLOC(numcpus); + + if (!mask) { + virReportOOMError(NULL); + return -1; + } + + CPU_ZERO_S(masklen, mask); + if (sched_getaffinity(pid, masklen, mask) < 0) { + CPU_FREE(mask); + if (errno == EINVAL && + numcpus < (1024 << 8)) { /* 262144 cpus ought to be enough for anyone */ + numcpus = numcpus << 2; + goto realloc; + } + virReportSystemError(NULL, errno, + _("cannot set CPU affinity on process %d"), pid); + return -1; + } + + for (i = 0 ; i < maxcpu ; i++) + if (CPU_ISSET_S(i, masklen, mask)) + VIR_USE_CPU(map, i); +#else + /* Legacy method uses a fixed size cpu mask, only allows upto 1024 cpus */ cpu_set_t mask; CPU_ZERO(&mask); @@ -73,6 +156,7 @@ int virProcessInfoGetAffinity(pid_t pid, for (i = 0 ; i < maxcpu ; i++) if (CPU_ISSET(i, &mask)) VIR_USE_CPU(map, i); +#endif return 0; } -- 1.6.5.2

Daniel Veillard

6:13 p.m.

New subject: [libvirt] [PATCH 2/2] Alternate CPU affinity impl to cope with NR_CPUS > 1024

On Mon, Nov 16, 2009 at 04:49:30PM +0000, Daniel P. Berrange wrote:

...

The cpu_set_t type can only cope with NR_CPUS <= 1024, beyond this it is neccessary to use alternate CPU_SET maps with a dynamically allocated CPU map

* src/util/processinfo.c: Support new unlimited size CPU set type --- src/util/processinfo.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 files changed, 84 insertions(+), 0 deletions(-)

diff --git a/src/util/processinfo.c b/src/util/processinfo.c index aaffd88..a0dac34 100644 --- a/src/util/processinfo.c +++ b/src/util/processinfo.c @@ -38,6 +38,48 @@ int virProcessInfoSetAffinity(pid_t pid, int maxcpu) { int i; +#ifdef CPU_ALLOC + /* New method dynamically allocates cpu mask, allowing unlimted cpus */ + int numcpus = 1024; + size_t masklen; + cpu_set_t *mask; + + /* Not only may the statically allocated cpu_set_t be too small, + * but there is no way to ask the kernel what size is large enough. + * So you have no option but to pick a size, try, catch EINVAL, + * enlarge, and re-try. + * + * http://lkml.org/lkml/2009/7/28/620 + */

Urgh ....

...

+realloc: + masklen = CPU_ALLOC_SIZE(numcpus); + mask = CPU_ALLOC(numcpus); + + if (!mask) { + virReportOOMError(NULL); + return -1; + } + + CPU_ZERO_S(masklen, mask); + for (i = 0 ; i < maxcpu ; i++) { + if (VIR_CPU_USABLE(map, maplen, 0, i)) + CPU_SET_S(i, masklen, mask); + } + + if (sched_setaffinity(pid, masklen, mask) < 0) { + CPU_FREE(mask); + if (errno == EINVAL && + numcpus < (1024 << 8)) { /* 262144 cpus ought to be enough for anyone */ + numcpus = numcpus << 2;

let's just numcpus *= 2; or numcpus *= 4; it's not like we want to shave a microsecond, makes code less readable.

...

+ goto realloc; + } + virReportSystemError(NULL, errno, + _("cannot set CPU affinity on process %d"), pid); + return -1; + } + CPU_FREE(mask); +#else + /* Legacy method uses a fixed size cpu mask, only allows upto 1024 cpus */ cpu_set_t mask;

CPU_ZERO(&mask); @@ -51,6 +93,7 @@ int virProcessInfoSetAffinity(pid_t pid, _("cannot set CPU affinity on process %d"), pid); return -1; } +#endif

return 0; } @@ -61,6 +104,46 @@ int virProcessInfoGetAffinity(pid_t pid, int maxcpu) { int i; +#ifdef CPU_ALLOC + /* New method dynamically allocates cpu mask, allowing unlimted cpus */ + int numcpus = 1024; + size_t masklen; + cpu_set_t *mask; + + /* Not only may the statically allocated cpu_set_t be too small, + * but there is no way to ask the kernel what size is large enough. + * So you have no option but to pick a size, try, catch EINVAL, + * enlarge, and re-try. + * + * http://lkml.org/lkml/2009/7/28/620 + */ +realloc: + masklen = CPU_ALLOC_SIZE(numcpus); + mask = CPU_ALLOC(numcpus); + + if (!mask) { + virReportOOMError(NULL); + return -1; + } + + CPU_ZERO_S(masklen, mask); + if (sched_getaffinity(pid, masklen, mask) < 0) { + CPU_FREE(mask); + if (errno == EINVAL && + numcpus < (1024 << 8)) { /* 262144 cpus ought to be enough for anyone */ + numcpus = numcpus << 2;

same I would also make numcpus a static variable, so that you don't repeat he loop each time you go though one of those APIs.

...

+ goto realloc; + } + virReportSystemError(NULL, errno, + _("cannot set CPU affinity on process %d"), pid); + return -1; + } + + for (i = 0 ; i < maxcpu ; i++) + if (CPU_ISSET_S(i, masklen, mask)) + VIR_USE_CPU(map, i); +#else + /* Legacy method uses a fixed size cpu mask, only allows upto 1024 cpus */ cpu_set_t mask;

CPU_ZERO(&mask); @@ -73,6 +156,7 @@ int virProcessInfoGetAffinity(pid_t pid, for (i = 0 ; i < maxcpu ; i++) if (CPU_ISSET(i, &mask)) VIR_USE_CPU(map, i); +#endif

return 0; }

That patch sounds less urgent might be worth waiting past 0.7.3, ACK though Daniel -- Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/ daniel@veillard.com | Rpmfind RPM search engine http://rpmfind.net/ http://veillard.com/ | virtualization library http://libvirt.org/

Daniel P. Berrange

17 Nov 17 Nov

6:41 p.m.

New subject: [libvirt] [PATCH 2/2] Alternate CPU affinity impl to cope with NR_CPUS > 1024

On Mon, Nov 16, 2009 at 06:13:05PM +0100, Daniel Veillard wrote:

...

On Mon, Nov 16, 2009 at 04:49:30PM +0000, Daniel P. Berrange wrote:

...
The cpu_set_t type can only cope with NR_CPUS <= 1024, beyond this it is neccessary to use alternate CPU_SET maps with a dynamically allocated CPU map

+realloc: + masklen = CPU_ALLOC_SIZE(numcpus); + mask = CPU_ALLOC(numcpus); + + if (!mask) { + virReportOOMError(NULL); + return -1; + } + + CPU_ZERO_S(masklen, mask); + for (i = 0 ; i < maxcpu ; i++) { + if (VIR_CPU_USABLE(map, maplen, 0, i)) + CPU_SET_S(i, masklen, mask); + } + + if (sched_setaffinity(pid, masklen, mask) < 0) { + CPU_FREE(mask); + if (errno == EINVAL && + numcpus < (1024 << 8)) { /* 262144 cpus ought to be enough for anyone */ + numcpus = numcpus << 2;

let's just numcpus *= 2; or numcpus *= 4; it's not like we want to shave a microsecond, makes code less readable.

...

...
+ goto realloc; + } + virReportSystemError(NULL, errno, + _("cannot set CPU affinity on process %d"), pid); + return -1; + } + CPU_FREE(mask); +#else + /* Legacy method uses a fixed size cpu mask, only allows upto 1024 cpus */ cpu_set_t mask;

CPU_ZERO(&mask); @@ -51,6 +93,7 @@ int virProcessInfoSetAffinity(pid_t pid, _("cannot set CPU affinity on process %d"), pid); return -1; } +#endif

return 0; } @@ -61,6 +104,46 @@ int virProcessInfoGetAffinity(pid_t pid, int maxcpu) { int i; +#ifdef CPU_ALLOC + /* New method dynamically allocates cpu mask, allowing unlimted cpus */ + int numcpus = 1024; + size_t masklen; + cpu_set_t *mask; + + /* Not only may the statically allocated cpu_set_t be too small, + * but there is no way to ask the kernel what size is large enough. + * So you have no option but to pick a size, try, catch EINVAL, + * enlarge, and re-try. + * + * http://lkml.org/lkml/2009/7/28/620 + */ +realloc: + masklen = CPU_ALLOC_SIZE(numcpus); + mask = CPU_ALLOC(numcpus); + + if (!mask) { + virReportOOMError(NULL); + return -1; + } + + CPU_ZERO_S(masklen, mask); + if (sched_getaffinity(pid, masklen, mask) < 0) { + CPU_FREE(mask); + if (errno == EINVAL && + numcpus < (1024 << 8)) { /* 262144 cpus ought to be enough for anyone */ + numcpus = numcpus << 2;

same I would also make numcpus a static variable, so that you don't repeat he loop each time you go though one of those APIs.

Using static variables in this kind of context are not thread-safe and I don't really want to introduce locking in here. FYI, in the common case of kernels compiled with a sensible NR_CPUS, there will only ever be a single pass in the loop. In the uncommon case of using a NR_CPUS=4096, I picked 1024 and the '<< 2', to ensure there is only two passes in the loop (first fails, second succeeds). So i don't think it needs optimizing further Regards, Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

5831

Age (days ago)

5832

Last active (days ago)

List overview

Download

6 comments

2 participants

participants (2)

Daniel P. Berrange
Daniel Veillard