From: Shivaprasad G Bhat <sbhat(a)linux.vnet.ibm.com>
The nodeinfo is reporting incorrect number of cpus and incorrect host
topology on PPC64 KVM hosts. The KVM hypervisor on PPC64 needs only
the primary thread in a core to be online, and the secondaries offlined.
While scheduling a guest in, the kvm scheduler wakes up the secondaries to
run in guest context.
The host scheduling of the guests happen at the core level(as only primary
thread is online). The kvm scheduler exploits as many threads of the core
as needed by guest. Further, starting POWER8, the processor allows splitting
a physical core into multiple subcores with 2 or 4 threads each. Again, only
the primary thread in a subcore is online in the host. The KVM-PPC
scheduler allows guests to exploit all the offline threads in the subcore,
by bringing them online when needed.
(Kernel patches on split-core
http://www.spinics.net/lists/kvm-ppc/msg09121.html)
Recently with dynamic micro-threading changes in ppc-kvm, makes sure
to utilize all the offline cpus across guests, and across guests with
different cpu topologies.
(
https://www.mail-archive.com/kvm@vger.kernel.org/msg115978.html)
Since the offline cpus are brought online in the guest context, it is safe
to count them as online. Nodeinfo today discounts these offline cpus from
cpu count/topology calclulation, and the nodeinfo output is not of any help
and the host appears overcommited when it is actually not.
The patch carefully counts those offline threads whose primary threads are
online. The host topology displayed by the nodeinfo is also fixed when the
host is in valid kvm state.
Signed-off-by: Shivaprasad G Bhat <sbhat(a)linux.vnet.ibm.com>
Signed-off-by: Andrea Bolognani <abologna(a)redhat.com>
---
src/libvirt_private.syms | 1 +
src/nodeinfo.c | 140 ++++++++++++++++++++++++++++++++++++++++++-----
src/nodeinfo.h | 1 +
3 files changed, 129 insertions(+), 13 deletions(-)
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
index 1566d11..64644a2 100644
--- a/src/libvirt_private.syms
+++ b/src/libvirt_private.syms
@@ -1008,6 +1008,7 @@ nodeGetInfo;
nodeGetMemory;
nodeGetMemoryParameters;
nodeGetMemoryStats;
+nodeGetThreadsPerSubcore;
nodeSetMemoryParameters;
diff --git a/src/nodeinfo.c b/src/nodeinfo.c
index 2fafe2d..25a6471 100644
--- a/src/nodeinfo.c
+++ b/src/nodeinfo.c
@@ -32,6 +32,12 @@
#include <sys/utsname.h>
#include <sched.h>
#include "conf/domain_conf.h"
+#include <fcntl.h>
+#include <sys/ioctl.h>
+
+#if HAVE_LINUX_KVM_H
+# include <linux/kvm.h>
+#endif
#if defined(__FreeBSD__) || defined(__APPLE__)
# include <sys/time.h>
@@ -428,28 +434,88 @@ virNodeParseNode(const char *node,
unsigned int cpu;
int online;
int direrr;
+ int lastonline;
+ virBitmapPtr cpu_map = NULL;
+ int threads_per_subcore = 0;
*threads = 0;
*cores = 0;
*sockets = 0;
+ /* PPC-KVM needs the secondary threads of a core to be offline on the
+ * host. The kvm scheduler brings the secondary threads online in the
+ * guest context. Moreover, P8 processor has split-core capability
+ * where, there can be 1,2 or 4 subcores per core. The primaries of the
+ * subcores alone will be online on the host for a subcore in the
+ * host. Even though the actual threads per core for P8 processor is 8,
+ * depending on the subcores_per_core = 1, 2 or 4, the threads per
+ * subcore will vary accordingly to 8, 4 and 2 repectively.
+ * So, On host threads_per_core what is arrived at from sysfs in the
+ * current logic is actually the subcores_per_core. Threads per subcore
+ * can only be obtained from the kvm device. For example, on P8 wih 1
+ * core having 8 threads, sub_cores_percore=4, the threads 0,2,4 & 6
+ * will be online. The sysfs reflects this and in the current logic
+ * variable 'threads' will be 4 which is nothing but subcores_per_core.
+ * If the user tampers the cpu online/offline states using chcpu or other
+ * means, then it is an unsupported configuration for kvm.
+ * The code below tries to keep in mind
+ * - when the libvirtd is run inside a KVM guest or Phyp based guest.
+ * - Or on the kvm host where user manually tampers the cpu states to
+ * offline/online randomly.
+ * On hosts other than POWER this will be 0, in which case a simpler
+ * thread-counting logic will be used */
+ if ((threads_per_subcore = nodeGetThreadsPerSubcore(arch)) < 0)
+ goto cleanup;
+
+ /* Keep track of node CPUs in a bitmap so that we can iterate
+ * through them in guaranteed numeric order, which is required to
+ * find out whether a thread is primary or secondary */
+ if ((cpu_map = virBitmapNew(VIR_DOMAIN_CPUMASK_LEN)) == NULL)
+ goto cleanup;
+
if (!(cpudir = opendir(node))) {
virReportSystemError(errno, _("cannot opendir %s"), node);
goto cleanup;
}
- /* enumerate sockets in the node */
- CPU_ZERO(&sock_map);
while ((direrr = virDirRead(cpudir, &cpudirent, node)) > 0) {
if (sscanf(cpudirent->d_name, "cpu%u", &cpu) != 1)
continue;
+ if (virBitmapSetBit(cpu_map, cpu) < 0) {
+ printf("virBitmapSetBit(%d)\n", cpu);
+ goto cleanup;
+ }
+ }
+
+ if (direrr < 0)
+ goto cleanup;
+
+ /* enumerate sockets in the node */
+ CPU_ZERO(&sock_map);
+ lastonline = -1;
+ for (cpu = 0; cpu < virBitmapSize(cpu_map); cpu++) {
+ if (!virBitmapIsBitSet(cpu_map, cpu))
+ continue;
+
if ((online = virNodeGetCpuValue(node, cpu, "online", 1)) < 0)
goto cleanup;
if (!online)
continue;
+ /* If subcores are in use, only the primary thread in each subcore
+ * can be online. If a secondary thread is found online, it means
+ * the configuration has been tampered with by the user and we can't
+ * rely on our logic to count threads properly */
+ if (threads_per_subcore > 0 &&
+ lastonline >= 0 &&
+ (cpu - lastonline) < threads_per_subcore) {
+ /* Fallback to the subcore-unaware logic */
+ threads_per_subcore = 0;
+ }
+ lastonline = cpu;
+
/* Parse socket */
if ((sock = virNodeParseSocket(node, arch, cpu)) < 0)
goto cleanup;
@@ -459,9 +525,6 @@ virNodeParseNode(const char *node,
sock_max = sock;
}
- if (direrr < 0)
- goto cleanup;
-
sock_max++;
/* allocate cpu maps for each socket */
@@ -471,21 +534,31 @@ virNodeParseNode(const char *node,
for (i = 0; i < sock_max; i++)
CPU_ZERO(&core_maps[i]);
- /* iterate over all CPU's in the node */
- rewinddir(cpudir);
- while ((direrr = virDirRead(cpudir, &cpudirent, node)) > 0) {
- if (sscanf(cpudirent->d_name, "cpu%u", &cpu) != 1)
+ /* Iterate over all CPUs in the node */
+ lastonline = -1;
+ for (cpu = 0; cpu < virBitmapSize(cpu_map); cpu++) {
+ if (!virBitmapIsBitSet(cpu_map, cpu))
continue;
if ((online = virNodeGetCpuValue(node, cpu, "online", 1)) < 0)
goto cleanup;
if (!online) {
- (*offline)++;
+ if (threads_per_subcore > 0 &&
+ lastonline >= 0 &&
+ (cpu-lastonline) < threads_per_subcore) {
+ /* Secondary offline threads are counted as online when
+ * subcores are in use */
+ processors++;
+ } else {
+ /* But they are counted as offline otherwise */
+ (*offline)++;
+ }
continue;
}
processors++;
+ lastonline = cpu;
/* Parse socket */
if ((sock = virNodeParseSocket(node, arch, cpu)) < 0)
@@ -513,9 +586,6 @@ virNodeParseNode(const char *node,
*threads = siblings;
}
- if (direrr < 0)
- goto cleanup;
-
/* finalize the returned data */
*sockets = CPU_COUNT(&sock_map);
@@ -528,6 +598,12 @@ virNodeParseNode(const char *node,
*cores = core;
}
+ if (threads_per_subcore > 0) {
+ /* The thread count ignores offline threads, which means that only
+ * only primary threads have been considered so far. If subcores
+ * are in use, we need to also account for secondary threads */
+ *threads *= threads_per_subcore;
+ }
ret = processors;
cleanup:
@@ -537,6 +613,7 @@ virNodeParseNode(const char *node,
ret = -1;
}
VIR_FREE(core_maps);
+ virBitmapFree(cpu_map);
return ret;
}
@@ -2116,3 +2193,40 @@ nodeAllocPages(unsigned int npages,
cleanup:
return ret;
}
+
+/* Get the number of threads per subcore.
+ *
+ * This will be 2, 4 or 8 on POWER hosts, depending on the current
+ * micro-threading configuration, and 0 everywhere else.
+ *
+ * Returns the number of threads per subcore if subcores are in use, zero
+ * if subcores are not in use, and a negative value on error */
+int
+nodeGetThreadsPerSubcore(virArch arch)
+{
+ int kvmfd;
+ int threads_per_subcore = 0;
+
+#if HAVE_LINUX_KVM_H && defined(KVM_CAP_PPC_SMT)
+ if (ARCH_IS_PPC64(arch)) {
+
+ kvmfd = open("/dev/kvm", O_RDONLY);
+ if (kvmfd < 0) {
+ threads_per_subcore = -1;
+ goto out;
+ }
+
+ /* For Phyp and KVM based guests the ioctl for KVM_CAP_PPC_SMT
+ * returns zero and both primary and secondary threads will be
+ * online */
+ threads_per_subcore = ioctl(kvmfd,
+ KVM_CHECK_EXTENSION,
+ KVM_CAP_PPC_SMT);
+
+ out:
+ VIR_FORCE_CLOSE(kvmfd);
+ }
+#endif /* HAVE_LINUX_KVM_H && defined(KVM_CAP_PPC_SMT) */
+
+ return threads_per_subcore;
+}
diff --git a/src/nodeinfo.h b/src/nodeinfo.h
index 047bd5c..41739a1 100644
--- a/src/nodeinfo.h
+++ b/src/nodeinfo.h
@@ -46,6 +46,7 @@ int nodeGetMemory(unsigned long long *mem,
virBitmapPtr nodeGetPresentCPUBitmap(void);
virBitmapPtr nodeGetCPUBitmap(int *max_id);
int nodeGetCPUCount(void);
+int nodeGetThreadsPerSubcore(virArch arch);
int nodeGetMemoryParameters(virTypedParameterPtr params,
int *nparams,
--
2.4.3