[libvirt] [PATCH] nodeinfo: skip offline CPUs

https://bugzilla.redhat.com/622515 - When hot-unplugging CPUs, libvirt failed to start a guest that had been pinned to CPUs that were still online, because it was failing to read status from unrelated offline CPUs. Tested on a dual-core laptop, where I also discovered that, per http://www.cyberciti.biz/files/linux-kernel/Documentation/cpu-hotplug.txt, /sys/devices/system/cpu/cpu0/online does not exist on systems where it cannot be hot-unplugged. * src/nodeinfo.c (linuxNodeInfoCPUPopulate): Ignore CPUs that are currently offline. Detect readdir failure. (parse_socket): Move guts... (get_cpu_value): ...to new function, shared with... (cpu_online): New function. --- src/nodeinfo.c | 109 +++++++++++++++++++++++++++++++++++++------------------ 1 files changed, 73 insertions(+), 36 deletions(-) diff --git a/src/nodeinfo.c b/src/nodeinfo.c index 5ec1bcf..6286aa2 100644 --- a/src/nodeinfo.c +++ b/src/nodeinfo.c @@ -23,6 +23,7 @@ #include <config.h> +#include <stdbool.h> #include <stdio.h> #include <string.h> #include <stdlib.h> @@ -60,6 +61,58 @@ int linuxNodeInfoCPUPopulate(FILE *cpuinfo, virNodeInfoPtr nodeinfo); +/* Return the positive decimal contents of the given + * CPU_SYS_PATH/cpu%u/FILE, or -1 on error. If MISSING_OK and the + * file could not be found, return 1 instead of an error; this is + * because some machines cannot hot-unplug cpu0. */ +static int get_cpu_value(unsigned int cpu, const char *file, bool missing_ok) +{ + char *path; + FILE *pathfp; + char value_str[1024]; + char *tmp; + int value = -1; + + if (virAsprintf(&path, CPU_SYS_PATH "/cpu%u/%s", cpu, file) < 0) { + virReportOOMError(); + return -1; + } + + pathfp = fopen(path, "r"); + if (pathfp == NULL) { + if (missing_ok && errno == ENOENT) + value = 1; + else + virReportSystemError(errno, _("cannot open %s"), path); + goto cleanup; + } + + if (fgets(value_str, sizeof(value_str), pathfp) == NULL) { + virReportSystemError(errno, _("cannot read from %s"), path); + goto cleanup; + } + if (virStrToLong_i(value_str, &tmp, 10, &value) < 0) { + nodeReportError(VIR_ERR_INTERNAL_ERROR, + _("could not convert '%s' to an integer"), + value_str); + goto cleanup; + } + +cleanup: + if (pathfp) + fclose(pathfp); + VIR_FREE(path); + + return value; +} + +/* Check if CPU is online via CPU_SYS_PATH/cpu%u/online. Return 1 if online, + 0 if offline, and -1 on error. */ +static int cpu_online(unsigned int cpu) +{ + return get_cpu_value(cpu, "online", cpu == 0); +} + static unsigned long count_thread_siblings(unsigned int cpu) { unsigned long ret = 0; @@ -106,41 +159,7 @@ cleanup: static int parse_socket(unsigned int cpu) { - char *path; - FILE *pathfp; - char socket_str[1024]; - char *tmp; - int socket = -1; - - if (virAsprintf(&path, CPU_SYS_PATH "/cpu%u/topology/physical_package_id", - cpu) < 0) { - virReportOOMError(); - return -1; - } - - pathfp = fopen(path, "r"); - if (pathfp == NULL) { - virReportSystemError(errno, _("cannot open %s"), path); - VIR_FREE(path); - return -1; - } - - if (fgets(socket_str, sizeof(socket_str), pathfp) == NULL) { - virReportSystemError(errno, _("cannot read from %s"), path); - goto cleanup; - } - if (virStrToLong_i(socket_str, &tmp, 10, &socket) < 0) { - nodeReportError(VIR_ERR_INTERNAL_ERROR, - _("could not convert '%s' to an integer"), - socket_str); - goto cleanup; - } - -cleanup: - fclose(pathfp); - VIR_FREE(path); - - return socket; + return get_cpu_value(cpu, "topology/physical_package_id", false); } int linuxNodeInfoCPUPopulate(FILE *cpuinfo, @@ -153,6 +172,8 @@ int linuxNodeInfoCPUPopulate(FILE *cpuinfo, unsigned long cur_threads; int socket; unsigned long long socket_mask = 0; + unsigned int remaining; + int online; nodeinfo->cpus = 0; nodeinfo->mhz = 0; @@ -222,15 +243,25 @@ int linuxNodeInfoCPUPopulate(FILE *cpuinfo, /* OK, we've parsed what we can out of /proc/cpuinfo. Get the socket * and thread information from /sys */ + remaining = nodeinfo->cpus; cpudir = opendir(CPU_SYS_PATH); if (cpudir == NULL) { virReportSystemError(errno, _("cannot opendir %s"), CPU_SYS_PATH); return -1; } - while ((cpudirent = readdir(cpudir))) { + while (errno = 0, remaining && (cpudirent = readdir(cpudir))) { if (sscanf(cpudirent->d_name, "cpu%u", &cpu) != 1) continue; + online = cpu_online(cpu); + if (online < 0) { + closedir(cpudir); + return -1; + } + if (!online) + continue; + remaining--; + socket = parse_socket(cpu); if (socket < 0) { closedir(cpudir); @@ -249,6 +280,12 @@ int linuxNodeInfoCPUPopulate(FILE *cpuinfo, if (cur_threads > nodeinfo->threads) nodeinfo->threads = cur_threads; } + if (errno) { + virReportSystemError(errno, + _("problem reading %s"), CPU_SYS_PATH); + closedir(cpudir); + return -1; + } closedir(cpudir); -- 1.7.1

On Tue, Aug 10, 2010 at 03:44:37PM -0600, Eric Blake wrote:
https://bugzilla.redhat.com/622515 - When hot-unplugging CPUs, libvirt failed to start a guest that had been pinned to CPUs that were still online, because it was failing to read status from unrelated offline CPUs.
Argh, yes that's a nasty problem !
Tested on a dual-core laptop, where I also discovered that, per http://www.cyberciti.biz/files/linux-kernel/Documentation/cpu-hotplug.txt, /sys/devices/system/cpu/cpu0/online does not exist on systems where it cannot be hot-unplugged.
* src/nodeinfo.c (linuxNodeInfoCPUPopulate): Ignore CPUs that are currently offline. Detect readdir failure. (parse_socket): Move guts... (get_cpu_value): ...to new function, shared with... (cpu_online): New function. --- src/nodeinfo.c | 109 +++++++++++++++++++++++++++++++++++++------------------ 1 files changed, 73 insertions(+), 36 deletions(-)
diff --git a/src/nodeinfo.c b/src/nodeinfo.c index 5ec1bcf..6286aa2 100644 --- a/src/nodeinfo.c +++ b/src/nodeinfo.c @@ -23,6 +23,7 @@
#include <config.h>
+#include <stdbool.h> #include <stdio.h> #include <string.h> #include <stdlib.h> @@ -60,6 +61,58 @@ int linuxNodeInfoCPUPopulate(FILE *cpuinfo, virNodeInfoPtr nodeinfo);
+/* Return the positive decimal contents of the given + * CPU_SYS_PATH/cpu%u/FILE, or -1 on error. If MISSING_OK and the + * file could not be found, return 1 instead of an error; this is + * because some machines cannot hot-unplug cpu0. */ +static int get_cpu_value(unsigned int cpu, const char *file, bool missing_ok) +{ + char *path; + FILE *pathfp; + char value_str[1024]; + char *tmp; + int value = -1; + + if (virAsprintf(&path, CPU_SYS_PATH "/cpu%u/%s", cpu, file) < 0) { + virReportOOMError(); + return -1; + } + + pathfp = fopen(path, "r"); + if (pathfp == NULL) { + if (missing_ok && errno == ENOENT) + value = 1; + else + virReportSystemError(errno, _("cannot open %s"), path); + goto cleanup; + } + + if (fgets(value_str, sizeof(value_str), pathfp) == NULL) { + virReportSystemError(errno, _("cannot read from %s"), path); + goto cleanup; + } + if (virStrToLong_i(value_str, &tmp, 10, &value) < 0) { + nodeReportError(VIR_ERR_INTERNAL_ERROR, + _("could not convert '%s' to an integer"), + value_str); + goto cleanup; + } + +cleanup: + if (pathfp) + fclose(pathfp); + VIR_FREE(path); + + return value; +} + +/* Check if CPU is online via CPU_SYS_PATH/cpu%u/online. Return 1 if online, + 0 if offline, and -1 on error. */ +static int cpu_online(unsigned int cpu) +{ + return get_cpu_value(cpu, "online", cpu == 0); +} + static unsigned long count_thread_siblings(unsigned int cpu) { unsigned long ret = 0; @@ -106,41 +159,7 @@ cleanup:
static int parse_socket(unsigned int cpu) { - char *path; - FILE *pathfp; - char socket_str[1024]; - char *tmp; - int socket = -1; - - if (virAsprintf(&path, CPU_SYS_PATH "/cpu%u/topology/physical_package_id", - cpu) < 0) { - virReportOOMError(); - return -1; - } - - pathfp = fopen(path, "r"); - if (pathfp == NULL) { - virReportSystemError(errno, _("cannot open %s"), path); - VIR_FREE(path); - return -1; - } - - if (fgets(socket_str, sizeof(socket_str), pathfp) == NULL) { - virReportSystemError(errno, _("cannot read from %s"), path); - goto cleanup; - } - if (virStrToLong_i(socket_str, &tmp, 10, &socket) < 0) { - nodeReportError(VIR_ERR_INTERNAL_ERROR, - _("could not convert '%s' to an integer"), - socket_str); - goto cleanup; - } - -cleanup: - fclose(pathfp); - VIR_FREE(path); - - return socket; + return get_cpu_value(cpu, "topology/physical_package_id", false); }
int linuxNodeInfoCPUPopulate(FILE *cpuinfo, @@ -153,6 +172,8 @@ int linuxNodeInfoCPUPopulate(FILE *cpuinfo, unsigned long cur_threads; int socket; unsigned long long socket_mask = 0; + unsigned int remaining; + int online;
nodeinfo->cpus = 0; nodeinfo->mhz = 0; @@ -222,15 +243,25 @@ int linuxNodeInfoCPUPopulate(FILE *cpuinfo, /* OK, we've parsed what we can out of /proc/cpuinfo. Get the socket * and thread information from /sys */ + remaining = nodeinfo->cpus; cpudir = opendir(CPU_SYS_PATH); if (cpudir == NULL) { virReportSystemError(errno, _("cannot opendir %s"), CPU_SYS_PATH); return -1; } - while ((cpudirent = readdir(cpudir))) { + while (errno = 0, remaining && (cpudirent = readdir(cpudir))) { if (sscanf(cpudirent->d_name, "cpu%u", &cpu) != 1) continue;
+ online = cpu_online(cpu); + if (online < 0) { + closedir(cpudir); + return -1; + } + if (!online) + continue; + remaining--; + socket = parse_socket(cpu); if (socket < 0) { closedir(cpudir); @@ -249,6 +280,12 @@ int linuxNodeInfoCPUPopulate(FILE *cpuinfo, if (cur_threads > nodeinfo->threads) nodeinfo->threads = cur_threads; } + if (errno) { + virReportSystemError(errno, + _("problem reading %s"), CPU_SYS_PATH); + closedir(cpudir); + return -1; + }
closedir(cpudir);
ACK, that looks fine ! thanks ! Daniel -- Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/ daniel@veillard.com | Rpmfind RPM search engine http://rpmfind.net/ http://veillard.com/ | virtualization library http://libvirt.org/

On 08/10/2010 04:01 PM, Daniel Veillard wrote:
On Tue, Aug 10, 2010 at 03:44:37PM -0600, Eric Blake wrote:
https://bugzilla.redhat.com/622515 - When hot-unplugging CPUs, libvirt failed to start a guest that had been pinned to CPUs that were still online, because it was failing to read status from unrelated offline CPUs.
Argh, yes that's a nasty problem !
ACK, that looks fine !
thanks !
Thanks for the review. I'm squashing this in before applying, based on an IRC comment by Dave Allan that 1024 is an awfully big stack allocation when we know that the value will fit in an int. diff --git i/src/nodeinfo.c w/src/nodeinfo.c index 2a20679..59e0163 100644 --- i/src/nodeinfo.c +++ w/src/nodeinfo.c @@ -48,6 +48,7 @@ #include "logging.h" #include "virterror_internal.h" #include "count-one-bits.h" +#include "intprops.h" #define VIR_FROM_THIS VIR_FROM_NONE @@ -72,7 +73,7 @@ static int get_cpu_value(unsigned int cpu, const char *file, bool missing_ok) { char *path; FILE *pathfp; - char value_str[1024]; + char value_str[INT_BUFSIZE_BOUND(int)]; char *tmp; int value = -1; -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org

On Tue, Aug 10, 2010 at 04:17:56PM -0600, Eric Blake wrote:
On 08/10/2010 04:01 PM, Daniel Veillard wrote:
On Tue, Aug 10, 2010 at 03:44:37PM -0600, Eric Blake wrote:
https://bugzilla.redhat.com/622515 - When hot-unplugging CPUs, libvirt failed to start a guest that had been pinned to CPUs that were still online, because it was failing to read status from unrelated offline CPUs.
Argh, yes that's a nasty problem !
ACK, that looks fine !
thanks !
Thanks for the review. I'm squashing this in before applying, based on an IRC comment by Dave Allan that 1024 is an awfully big stack allocation when we know that the value will fit in an int.
diff --git i/src/nodeinfo.c w/src/nodeinfo.c index 2a20679..59e0163 100644 --- i/src/nodeinfo.c +++ w/src/nodeinfo.c @@ -48,6 +48,7 @@ #include "logging.h" #include "virterror_internal.h" #include "count-one-bits.h" +#include "intprops.h"
#define VIR_FROM_THIS VIR_FROM_NONE @@ -72,7 +73,7 @@ static int get_cpu_value(unsigned int cpu, const char *file, bool missing_ok) { char *path; FILE *pathfp; - char value_str[1024]; + char value_str[INT_BUFSIZE_BOUND(int)]; char *tmp; int value = -1;
Whoops I didn't catch that ! Actually I wonder if we could catch sequences like \[[0-9]..[0-9]+\] in make syntax-check, maybe this could make sure we always dynamically allocate such large buffers. Daniel -- Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/ daniel@veillard.com | Rpmfind RPM search engine http://rpmfind.net/ http://veillard.com/ | virtualization library http://libvirt.org/
participants (2)
-
Daniel Veillard
-
Eric Blake