On Mon, Jun 16, 2014 at 05:08:26PM +0200, Michal Privoznik wrote:
For future work we need a function that fetches the total number of
pages and the number of free pages for a given NUMA node and page size
(virNumaGetPageInfo()).
Then we need a function that learns which page sizes are supported
on a given node (virNumaGetPages()).
Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
---
src/libvirt_private.syms | 2 +
src/util/virnuma.c | 325 +++++++++++++++++++++++++++++++++++++++++++++++
src/util/virnuma.h | 10 ++
3 files changed, 337 insertions(+)
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
index 18fde54..a7834ed 100644
--- a/src/libvirt_private.syms
+++ b/src/libvirt_private.syms
@@ -1662,6 +1662,8 @@ virNumaGetAutoPlacementAdvice;
virNumaGetDistances;
virNumaGetMaxNode;
virNumaGetNodeMemory;
+virNumaGetPageInfo;
+virNumaGetPages;
virNumaIsAvailable;
virNumaNodeIsAvailable;
virNumaSetupMemoryPolicy;
diff --git a/src/util/virnuma.c b/src/util/virnuma.c
index 5814cba..a59feca 100644
--- a/src/util/virnuma.c
+++ b/src/util/virnuma.c
@@ -34,12 +34,18 @@
#endif /* WITH_NUMACTL */
+#include <sys/types.h>
+#include <dirent.h>
+
#include "virnuma.h"
#include "vircommand.h"
#include "virerror.h"
#include "virlog.h"
#include "viralloc.h"
#include "virbitmap.h"
+#include "virstring.h"
+#include "virfile.h"
+#include "nodeinfo.h"
#define VIR_FROM_THIS VIR_FROM_NONE
@@ -504,3 +510,322 @@ virNumaGetDistances(int node ATTRIBUTE_UNUSED,
return 0;
}
#endif
+
+
+#define HUGEPAGES_NUMA_PREFIX "/sys/devices/system/node/"
+#define HUGEPAGES_SYSTEM_PREFIX "/sys/kernel/mm/hugepages/"
+#define HUGEPAGES_PREFIX "hugepages-"
+
+static int
+virNumaGetHugePageInfoPath(char **path,
+                           int node,
+                           unsigned int page_size,
+                           const char *suffix)
+{
+    int ret = -1;
+
+    if (node == -1) {
+        /* We are aiming at overall system info */
+        if (page_size) {
+            /* And even at a specific huge page size */
+            if (virAsprintf(path,
+                            HUGEPAGES_SYSTEM_PREFIX HUGEPAGES_PREFIX
+                            "%ukB/%s",
+                            page_size, suffix ? suffix : "") < 0)
+                goto cleanup;
+        } else {
+            if (VIR_STRDUP(*path, HUGEPAGES_SYSTEM_PREFIX) < 0)
+                goto cleanup;
+        }
+
+    } else {
+        /* We are aiming at a specific NUMA node */
+        if (page_size) {
+            /* And even at a specific huge page size */
+            if (virAsprintf(path,
+                            HUGEPAGES_NUMA_PREFIX "node%d/hugepages/"
+                            HUGEPAGES_PREFIX "%ukB/%s",
+                            node, page_size, suffix ? suffix : "") < 0)
+                goto cleanup;
+        } else {
+            if (virAsprintf(path,
+                            HUGEPAGES_NUMA_PREFIX "node%d/hugepages/",
+                            node) < 0)
+                goto cleanup;
+        }
+    }
+
+    ret = 0;
+ cleanup:
+    return ret;
+}
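
For reference, the paths this helper builds end up looking like this
(taking a 2 MiB page on node 0 and the "nr_hugepages" suffix as an
example):

  /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages          (node == -1)
  /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages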
+
+
+/**
+ * virNumaGetHugePageInfo:
+ * @node: NUMA node id
+ * @page_size: which huge page size are we interested in (in KiB)
+ * @page_avail: total number of huge pages in the pool
+ * @page_free: the number of free huge pages in the pool
+ *
+ * For the given NUMA node and huge page size, fetch the total
+ * number of huge pages in the pool (both free and taken) and the
+ * number of free huge pages in the pool.
+ *
+ * If you're interested in just one of the counts, pass NULL for
+ * the other.
+ *
+ * As a special case, if @node == -1, overall info is fetched
+ * from the system.
+ *
+ * Returns 0 on success, -1 otherwise (with error reported).
+ */
+static int
+virNumaGetHugePageInfo(int node,
+                       unsigned int page_size,
+                       unsigned int *page_avail,
+                       unsigned int *page_free)
+{
+    int ret = -1;
+    char *path = NULL;
+    char *buf = NULL;
+    char *end;
+
+    if (page_avail) {
+        if (virNumaGetHugePageInfoPath(&path, node,
+                                       page_size, "nr_hugepages") < 0)
+            goto cleanup;
+
+        if (virFileReadAll(path, 1024, &buf) < 0)
+            goto cleanup;
+
+        if (virStrToLong_ui(buf, &end, 10, page_avail) < 0 ||
+            *end != '\n') {
+            virReportError(VIR_ERR_INTERNAL_ERROR,
+                           _("unable to parse: %s"),
+                           buf);
+            goto cleanup;
+        }
It would probably be worth our while to introduce a helper
API virFileReadLong_ui() and likewise for the other
virStrToLong variants - we have this need in quite a few
places that interact with sysfs.

No requirement to do it in this patch unless you want to,
though - it's a long-standing issue we can fix separately.
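
Roughly something along these lines - the name and exact signature
are just a sketch, of course:

int
virFileReadLong_ui(const char *path, unsigned int *value)
{
    int ret = -1;
    char *buf = NULL;
    char *end;

    /* read the (small) sysfs file into memory */
    if (virFileReadAll(path, 1024, &buf) < 0)
        goto cleanup;

    /* parse a single decimal number terminated by a newline */
    if (virStrToLong_ui(buf, &end, 10, value) < 0 ||
        *end != '\n') {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("unable to parse: %s"), buf);
        goto cleanup;
    }

    ret = 0;
 cleanup:
    VIR_FREE(buf);
    return ret;
}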
+/**
+ * virNumaGetPageInfo:
+ * @node: NUMA node id
+ * @page_size: which page size are we interested in (in KiB)
+ * @page_avail: total number of pages in the pool
+ * @page_free: the number of free pages in the pool
+ *
+ * For the given NUMA node and page size, fetch the total number
+ * of pages in the pool (both free and taken) and the number of
+ * free pages in the pool.
+ *
+ * If you're interested in just one of the counts, pass NULL for
+ * the other.
+ *
+ * As a special case, if @node == -1, overall info is fetched
+ * from the system.
+ *
+ * Returns 0 on success, -1 otherwise (with error reported).
+ */
+int
+virNumaGetPageInfo(int node,
+                   unsigned int page_size,
+                   unsigned int *page_avail,
+                   unsigned int *page_free)
+{
+    int ret = -1;
+    long system_page_size = sysconf(_SC_PAGESIZE);
+
+    /* sysconf() returns the page size in bytes,
+     * but @page_size is in kibibytes */
+    if (page_size == system_page_size / 1024) {
+        unsigned long long memsize, memfree;
+
+        /* TODO: come up with a better algorithm that takes huge
+         * pages into account, since huge pages are carved out of
+         * regular memory. */
Hmm, so this code returns a normal page count that ignores the fact
that some pages are not in fact usable, because they have been stolen
for huge pages? I was thinking that the total memory reported by the
kernel was reduced when you allocated huge pages, but testing now, it
seems I was mistaken in that belief. So this is a bit of a nasty
gotcha, because a user of this API would probably expect that the sum
of page size * page count across all page sizes equals total physical
RAM (give or take).

I still like the idea of including the default page size in this info,
but perhaps we should disable reporting the default system page size
for now and revisit later if we can figure out a way to report it
accurately, rather than returning misleading info.
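
In other words, something like this (purely illustrative, and the
choice of error code is up for debate):

    /* Illustrative sketch, not part of the posted patch: refuse
     * the plain system page size until we can report it without
     * the skew described above */
    if (page_size == system_page_size / 1024) {
        virReportError(VIR_ERR_OPERATION_UNSUPPORTED,
                       _("page size %u KiB is not supported"),
                       page_size);
        goto cleanup;
    }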
+        if (node == -1) {
+            if (nodeGetMemory(&memsize, &memfree) < 0)
+                goto cleanup;
+        } else {
+            if (virNumaGetNodeMemory(node, &memsize, &memfree) < 0)
+                goto cleanup;
+        }
+
+        if (page_avail)
+            *page_avail = memsize / system_page_size;
+
+        if (page_free)
+            *page_free = memfree / system_page_size;
+    } else {
+        if (virNumaGetHugePageInfo(node, page_size,
+                                   page_avail, page_free) < 0)
+            goto cleanup;
+    }
+
+    ret = 0;
+ cleanup:
+    return ret;
+}
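
For context, a hypothetical caller of this function would look
roughly like:

    unsigned int avail = 0, free_count = 0;

    /* hypothetical example: query 2 MiB huge pages on NUMA node 0 */
    if (virNumaGetPageInfo(0, 2048, &avail, &free_count) < 0)
        return -1;

    VIR_DEBUG("node 0: %u pages of 2048 KiB in pool, %u free",
              avail, free_count);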
+
+
+/**
+ * virNumaGetPages:
+ * @node: NUMA node id
+ * @pages_size: list of pages supported on @node
+ * @pages_avail: list of the pool sizes on @node
+ * @pages_free: list of free pages on @node
+ * @npages: the length of the lists above
+ *
+ * For the given NUMA node, fetch info on the supported pages.
+ * The page sizes (e.g. 4K, 2M, 1G) are stored into @pages_size,
+ * the pool sizes are stored into @pages_avail, and the number of
+ * free pages in each pool is stored into @pages_free.
+ *
+ * If you're interested in only some of the lists, pass NULL for
+ * the others.
+ *
+ * As a special case, if @node == -1, overall info is fetched
+ * from the system.
+ *
+ * Returns 0 on success, -1 otherwise.
+ */
+int
+virNumaGetPages(int node,
+                unsigned int **pages_size,
+                unsigned int **pages_avail,
+                unsigned int **pages_free,
+                size_t *npages)
+{
+    int ret = -1;
+    char *path = NULL;
+    DIR *dir = NULL;
+    struct dirent *entry;
+    unsigned int *tmp_size = NULL, *tmp_avail = NULL, *tmp_free = NULL;
+    unsigned int ntmp = 0;
+    size_t i;
+    bool exchange;
+    long system_page_size;
+
+    /* sysconf() returns the page size in bytes,
+     * but we are storing the page size in kibibytes. */
+    system_page_size = sysconf(_SC_PAGESIZE) / 1024;
+
+    /* We know that ordinary system pages are supported
+     * if nothing else is. */
+    if (VIR_REALLOC_N(tmp_size, 1) < 0 ||
+        VIR_REALLOC_N(tmp_avail, 1) < 0 ||
+        VIR_REALLOC_N(tmp_free, 1) < 0)
+        goto cleanup;
+
+    if (virNumaGetPageInfo(node, system_page_size,
+                           &tmp_avail[ntmp], &tmp_free[ntmp]) < 0)
+        goto cleanup;
+    tmp_size[ntmp] = system_page_size;
+    ntmp++;
+
+    /* Now that we have the ordinary system pages, let's get
+     * info on huge pages */
+    if (virNumaGetHugePageInfoPath(&path, node, 0, NULL) < 0)
+        goto cleanup;
+
+    if (!(dir = opendir(path))) {
+        virReportSystemError(errno,
+                             _("unable to open path: %s"),
+                             path);
+        goto cleanup;
+    }
+
+    while (virDirRead(dir, &entry, path) > 0) {
+        const char *page_name = entry->d_name;
+        unsigned int page_size, page_avail = 0, page_free = 0;
+        char *end;
+
+        /* Just to give you a hint, we're dealing with this:
+         * hugepages-2048kB/ or hugepages-1048576kB/ */
+        if (!STRPREFIX(entry->d_name, HUGEPAGES_PREFIX))
+            continue;
+
+        page_name += strlen(HUGEPAGES_PREFIX);
+
+        if (virStrToLong_ui(page_name, &end, 10, &page_size) < 0 ||
+            STRCASENEQ(end, "kB")) {
+            virReportError(VIR_ERR_INTERNAL_ERROR,
+                           _("unable to parse %s"),
+                           entry->d_name);
+            goto cleanup;
+        }
+
+        /* Querying more detailed info makes sense only sometimes */
+        if ((pages_avail || pages_free) &&
+            virNumaGetHugePageInfo(node, page_size,
+                                   &page_avail, &page_free) < 0)
+            goto cleanup;
+
+        if (VIR_REALLOC_N(tmp_size, ntmp + 1) < 0 ||
+            VIR_REALLOC_N(tmp_avail, ntmp + 1) < 0 ||
+            VIR_REALLOC_N(tmp_free, ntmp + 1) < 0)
+            goto cleanup;
+
+        tmp_size[ntmp] = page_size;
+        tmp_avail[ntmp] = page_avail;
+        tmp_free[ntmp] = page_free;
+        ntmp++;
+    }
+
+    /* Just to produce nice output, sort the arrays by increasing
+     * page size */
+    do {
+        exchange = false;
+        for (i = 0; i < ntmp - 1; i++) {
+            if (tmp_size[i] > tmp_size[i + 1]) {
+                exchange = true;
+                SWAP(tmp_size[i], tmp_size[i + 1]);
+                SWAP(tmp_avail[i], tmp_avail[i + 1]);
+                SWAP(tmp_free[i], tmp_free[i + 1]);
+            }
+        }
+    } while (exchange);
+
+    if (pages_size) {
+        *pages_size = tmp_size;
+        tmp_size = NULL;
+    }
+    if (pages_avail) {
+        *pages_avail = tmp_avail;
+        tmp_avail = NULL;
+    }
+    if (pages_free) {
+        *pages_free = tmp_free;
+        tmp_free = NULL;
+    }
+    *npages = ntmp;
+    ret = 0;
+ cleanup:
+    VIR_FREE(tmp_free);
+    VIR_FREE(tmp_avail);
+    VIR_FREE(tmp_size);
+    if (dir)
+        closedir(dir);
+    VIR_FREE(path);
+    return ret;
+}
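
And for completeness, a hypothetical caller of the new API, just to
show the contract:

    unsigned int *sizes = NULL, *avail = NULL, *free_pages = NULL;
    size_t npages = 0;
    size_t i;

    /* hypothetical example: list all page sizes on NUMA node 0 */
    if (virNumaGetPages(0, &sizes, &avail, &free_pages, &npages) < 0)
        return -1;

    for (i = 0; i < npages; i++)
        VIR_DEBUG("node 0: %u KiB pages: %u in pool, %u free",
                  sizes[i], avail[i], free_pages[i]);

    VIR_FREE(sizes);
    VIR_FREE(avail);
    VIR_FREE(free_pages);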
Regards,
Daniel
--
|: http://berrange.com      -o-  http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org       -o-  http://virt-manager.org :|
|: http://autobuild.org     -o-  http://search.cpan.org/~danberr/ :|
|: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|