[libvirt] [PATCH 0/5] Expose host's huge pages capability

*** BLURB HERE *** Michal Privoznik (5): virnuma: Introduce huge pages helpers virCaps: expose huge page info Introduce virNodeGetFreePages virsh: Expose virNodeGetFreePages qemu: Implement virNodeGetFreePages daemon/remote.c | 52 ++++++++++ docs/schemas/capability.rng | 21 ++++ include/libvirt/libvirt.h.in | 7 ++ src/conf/capabilities.c | 25 ++++- src/conf/capabilities.h | 15 ++- src/driver.h | 10 ++ src/libvirt.c | 95 +++++++++++++++++ src/libvirt_private.syms | 2 + src/libvirt_public.syms | 4 + src/libxl/libxl_conf.c | 1 + src/nodeinfo.c | 41 +++++++- src/qemu/qemu_capabilities.c | 29 +++++- src/qemu/qemu_driver.c | 43 ++++++++ src/remote/remote_driver.c | 50 +++++++++ src/remote/remote_protocol.x | 20 +++- src/remote_protocol-structs | 16 +++ src/test/test_driver.c | 2 +- src/util/virnuma.c | 242 +++++++++++++++++++++++++++++++++++++++++++ src/util/virnuma.h | 10 ++ src/xen/xend_internal.c | 1 + tests/vircaps2xmltest.c | 3 +- tests/vircapstest.c | 1 + tools/virsh-host.c | 167 +++++++++++++++++++++++++++++ tools/virsh.pod | 8 ++ 24 files changed, 856 insertions(+), 9 deletions(-) -- 1.8.5.5

For future work we need two functions that fetches total number of huge pages and number of free pages for given numa node and page size (virNumaGetHugePageInfo()). Then we need to learn which huge pages are supported on given node (virNumaGetHugePages()). Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/libvirt_private.syms | 2 + src/util/virnuma.c | 242 +++++++++++++++++++++++++++++++++++++++++++++++ src/util/virnuma.h | 10 ++ 3 files changed, 254 insertions(+) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index d1d6ff3..cb0d5b1 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1655,6 +1655,8 @@ virDomainNumatuneMemModeTypeFromString; virDomainNumatuneMemModeTypeToString; virNumaGetAutoPlacementAdvice; virNumaGetDistances; +virNumaGetHugePageInfo; +virNumaGetHugePages; virNumaGetMaxNode; virNumaGetNodeMemory; virNumaIsAvailable; diff --git a/src/util/virnuma.c b/src/util/virnuma.c index 1676208..6a586e9 100644 --- a/src/util/virnuma.c +++ b/src/util/virnuma.c @@ -34,12 +34,17 @@ #endif /* WITH_NUMACTL */ +#include <sys/types.h> +#include <dirent.h> + #include "virnuma.h" #include "vircommand.h" #include "virerror.h" #include "virlog.h" #include "viralloc.h" #include "virbitmap.h" +#include "virstring.h" +#include "virfile.h" #define VIR_FROM_THIS VIR_FROM_NONE @@ -472,3 +477,240 @@ virNumaGetDistances(int node ATTRIBUTE_UNUSED, return 0; } #endif + + +#define HUGEPAGES_NUMA_PREFIX "/sys/devices/system/node/" +#define HUGEPAGES_SYSTEM_PREFIX "/sys/kernel/mm/hugepages/" +#define HUGEPAGES_PREFIX "hugepages-" + +static int +virNumaGetHugePageInfoPath(char **path, + int node, + unsigned int page_size, + const char *suffix) +{ + + int ret = -1; + + if (node == -1) { + /* We are aiming at overall system info */ + if (page_size) { + /* And even on specific huge page size */ + if (virAsprintf(path, + HUGEPAGES_SYSTEM_PREFIX HUGEPAGES_PREFIX "%ukB/%s", + page_size, suffix ? suffix : "") < 0) + goto cleanup; + } else { + if (VIR_STRDUP(*path, HUGEPAGES_SYSTEM_PREFIX) < 0) + goto cleanup; + } + + } else { + /* We are aiming on specific NUMA node */ + if (page_size) { + /* And even on specific huge page size */ + if (virAsprintf(path, + HUGEPAGES_NUMA_PREFIX "node%d/hugepages/" + HUGEPAGES_PREFIX "%ukB/%s", + node, page_size, suffix ? suffix : "") < 0) + goto cleanup; + } else { + if (virAsprintf(path, + HUGEPAGES_NUMA_PREFIX "node%d/hugepages/", + node) < 0) + goto cleanup; + } + } + + ret = 0; + cleanup: + return ret; +} + + +/** + * virNumaGetHugePageInfo: + * @node: NUMA node id + * @page_size: which huge page are we interested in + * @page_avail: total number of huge pages in the pool + * @page_free: the number of free huge pages in the pool + * + * For given NUMA node and huge page size fetch information on + * total number of huge pages in the pool (both free and taken) + * and count for free huge pages in the pool. + * + * If you're interested in just one bit, pass NULL to the other one. + * + * As a special case, if @node == -1, overall info is fetched + * from the system. + * + * Returns 0 on success, -1 otherwise (with error reported). 
+ */ +int +virNumaGetHugePageInfo(int node, + unsigned int page_size, + unsigned int *page_avail, + unsigned int *page_free) +{ + int ret = -1; + char *path = NULL; + char *buf = NULL; + char *end; + + if (page_avail) { + if (virNumaGetHugePageInfoPath(&path, node, + page_size, "nr_hugepages") < 0) + goto cleanup; + + if (virFileReadAll(path, 1024, &buf) < 0) { + virReportSystemError(errno, + _("unable to read %s"), + path); + goto cleanup; + } + + if (virStrToLong_ui(buf, &end, 10, page_avail) < 0 || + *end != '\n') { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("unable to parse: %s"), + buf); + goto cleanup; + } + VIR_FREE(buf); + VIR_FREE(path); + } + + if (page_free) { + if (virNumaGetHugePageInfoPath(&path, node, + page_size, "free_hugepages") < 0) + goto cleanup; + + if (virFileReadAll(path, 1024, &buf) < 0) { + virReportSystemError(errno, + _("unable to read %s"), + path); + goto cleanup; + } + + if (virStrToLong_ui(buf, &end, 10, page_free) < 0 || + *end != '\n') { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("unable to parse: %s"), + buf); + goto cleanup; + } + } + + ret = 0; + cleanup: + VIR_FREE(buf); + VIR_FREE(path); + return ret; +} + + +/** + * virNumaGetHugePages: + * @node: NUMA node id + * @hugepages_size: list of huge pages supported on @node + * @hugepages_avail: list of the pool sizes on @node + * @hugepages_free: list of free huge pages on @node + * @nhugepages: the lists size + * + * For given NUMA node fetch info on huge pages. The size of huge + * pages (e.g. 4K, 2M, 1G) is stored into @hugepages_size, the + * size of the pool is then stored into @hugepages_avail and the + * number of free huge pages in the pool is stored into + * @hugepages_free. + * + * If you're interested only in some lists, pass NULL to the + * other ones. + * + * As a special case, if @node == -1, overall info is fetched + * from the system. + * + * Returns 0 on success, -1 otherwise. 
+ */ +int +virNumaGetHugePages(int node, + unsigned int **hugepages_size, + unsigned int **hugepages_avail, + unsigned int **hugepages_free, + size_t *nhugepages) +{ + int ret = -1; + char *path = NULL; + DIR *dir = NULL; + struct dirent *entry; + unsigned int *tmp_size = NULL, *tmp_avail = NULL, *tmp_free = NULL; + unsigned int ntmp = 0; + + if (virNumaGetHugePageInfoPath(&path, node, 0, NULL) < 0) + goto cleanup; + + if (!(dir = opendir(path))) { + virReportSystemError(errno, + _("unable to open path: %s"), + path); + goto cleanup; + } + + while (virDirRead(dir, &entry, path) > 0) { + const char *page_name = entry->d_name; + unsigned int page_size, page_avail = 0, page_free = 0; + char *end; + + /* Just to give you a hint, we're dealing with this: + * hugepages-2048kB/ or hugepages-1048576kB/ */ + if (!STRPREFIX(entry->d_name, HUGEPAGES_PREFIX)) + continue; + + page_name += strlen(HUGEPAGES_PREFIX); + + if (virStrToLong_ui(page_name, &end, 10, &page_size) < 0 || + STRCASENEQ(end, "kB")) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("unable to parse %s"), + entry->d_name); + goto cleanup; + } + + /* Querying more detailed info makes sense only sometimes */ + if ((hugepages_avail || hugepages_free) && + virNumaGetHugePageInfo(node, page_size, + &page_avail, &page_free) < 0) + goto cleanup; + + if (VIR_REALLOC_N(tmp_size, ntmp + 1) < 0 || + VIR_REALLOC_N(tmp_avail, ntmp + 1) < 0 || + VIR_REALLOC_N(tmp_free, ntmp + 1) < 0) + goto cleanup; + + tmp_size[ntmp] = page_size; + tmp_avail[ntmp] = page_avail; + tmp_free[ntmp] = page_free; + ntmp++; + } + + if (hugepages_size) { + *hugepages_size = tmp_size; + tmp_size = NULL; + } + if (hugepages_avail) { + *hugepages_avail = tmp_avail; + tmp_avail = NULL; + } + if (hugepages_free) { + *hugepages_free = tmp_free; + tmp_free = NULL; + } + *nhugepages = ntmp; + ret = 0; + cleanup: + VIR_FREE(tmp_free); + VIR_FREE(tmp_avail); + VIR_FREE(tmp_size); + closedir(dir); + VIR_FREE(path); + return ret; +} diff --git a/src/util/virnuma.h b/src/util/virnuma.h index fe1e966..3461935 100644 --- a/src/util/virnuma.h +++ b/src/util/virnuma.h @@ -69,4 +69,14 @@ unsigned int virNumaGetMaxCPUs(void); int virNumaGetNodeCPUs(int node, virBitmapPtr *cpus); +int virNumaGetHugePageInfo(int node, + unsigned int page_size, + unsigned int *page_avail, + unsigned int *page_free); +int virNumaGetHugePages(int node, + unsigned int **hugepages_size, + unsigned int **hugepages_avail, + unsigned int **hugepages_free, + size_t *nhugepages) + ATTRIBUTE_NONNULL(5); #endif /* __VIR_NUMA_H__ */ -- 1.8.5.5
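To make the intended usage concrete, here is a minimal caller sketch for the two helpers above (not part of the patch; it assumes the prototypes from virnuma.h as posted and libvirt's usual allocation/logging helpers):

    #include "virnuma.h"
    #include "viralloc.h"
    #include "virlog.h"

    VIR_LOG_INIT("sketch.hugepages");

    /* Dump every huge page pool known for @node (-1 = whole system),
     * using the helpers introduced in this patch. */
    static int
    dumpNodeHugePages(int node)
    {
        unsigned int *sizes = NULL, *avail = NULL, *freeCount = NULL;
        size_t npages = 0;
        size_t i;
        int ret = -1;

        /* Enumerate the supported huge page sizes together with the
         * pool size and the number of free pages in each pool. */
        if (virNumaGetHugePages(node, &sizes, &avail, &freeCount, &npages) < 0)
            goto cleanup;

        for (i = 0; i < npages; i++)
            VIR_DEBUG("node %d: %ukB pages: %u in pool, %u free",
                      node, sizes[i], avail[i], freeCount[i]);

        ret = 0;
     cleanup:
        VIR_FREE(freeCount);
        VIR_FREE(avail);
        VIR_FREE(sizes);
        return ret;
    }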

On Tue, Jun 10, 2014 at 07:21:11PM +0200, Michal Privoznik wrote:
For future work we need two functions that fetch the total number of huge pages and the number of free pages for a given NUMA node and page size (virNumaGetHugePageInfo()).
Then we need to learn which huge pages are supported on a given node (virNumaGetHugePages()).
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/libvirt_private.syms | 2 + src/util/virnuma.c | 242 +++++++++++++++++++++++++++++++++++++++++++++++ src/util/virnuma.h | 10 ++ 3 files changed, 254 insertions(+)
ACK Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On Tue, Jun 10, 2014 at 07:21:11PM +0200, Michal Privoznik wrote:
For future work we need two functions that fetch the total number of huge pages and the number of free pages for a given NUMA node and page size (virNumaGetHugePageInfo()).
Then we need to learn which huge pages are supported on a given node (virNumaGetHugePages()).
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/libvirt_private.syms | 2 + src/util/virnuma.c | 242 +++++++++++++++++++++++++++++++++++++++++++++++ src/util/virnuma.h | 10 ++ 3 files changed, 254 insertions(+)
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index d1d6ff3..cb0d5b1 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1655,6 +1655,8 @@ virDomainNumatuneMemModeTypeFromString; virDomainNumatuneMemModeTypeToString; virNumaGetAutoPlacementAdvice; virNumaGetDistances; +virNumaGetHugePageInfo; +virNumaGetHugePages;
Actually this should really have s/Huge// in the name and return all page sizes, not merely huge ones. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|
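For illustration, the rename suggested here would leave the helpers looking roughly like this (a sketch, not final code), with the base page size (4 KiB on x86) reported alongside the huge page sizes:

    int virNumaGetPageInfo(int node,
                           unsigned int page_size,
                           unsigned int *page_avail,
                           unsigned int *page_free);
    int virNumaGetPages(int node,
                        unsigned int **pages_size,
                        unsigned int **pages_avail,
                        unsigned int **pages_free,
                        size_t *npages);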

There are two places where you'll find info on huge pages. The first one is under <cpu/> element, where all supported huge page sizes are listed. Then the second one is under each <cell/> element which refers to concrete NUMA node. At this place, the size of huge page's pool is reported. So the capabilities XML looks something like this: <capabilities> <host> <uuid>01281cda-f352-cb11-a9db-e905fe22010c</uuid> <cpu> <arch>x86_64</arch> <model>Westmere</model> <vendor>Intel</vendor> <topology sockets='1' cores='1' threads='1'/> ... <pages unit='KiB' size='1048576'/> <pages unit='KiB' size='2048'/> </cpu> ... <topology> <cells num='4'> <cell id='0'> <memory unit='KiB'>4054408</memory> <pages unit='KiB' size='1048576'>1</pages> <pages unit='KiB' size='2048'>3</pages> <distances/> <cpus num='1'> <cpu id='0' socket_id='0' core_id='0' siblings='0'/> </cpus> </cell> <cell id='1'> <memory unit='KiB'>4071072</memory> <pages unit='KiB' size='1048576'>2</pages> <pages unit='KiB' size='2048'>1024</pages> <distances/> <cpus num='1'> <cpu id='1' socket_id='0' core_id='0' siblings='1'/> </cpus> </cell> ... </cells> </topology> ... </host> <guest/> </capabilities> Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- docs/schemas/capability.rng | 21 +++++++++++++++++++++ src/conf/capabilities.c | 25 ++++++++++++++++++++++--- src/conf/capabilities.h | 15 ++++++++++++++- src/libxl/libxl_conf.c | 1 + src/nodeinfo.c | 41 ++++++++++++++++++++++++++++++++++++++++- src/qemu/qemu_capabilities.c | 29 ++++++++++++++++++++++++++++- src/test/test_driver.c | 2 +- src/xen/xend_internal.c | 1 + tests/vircaps2xmltest.c | 3 ++- tests/vircapstest.c | 1 + 10 files changed, 131 insertions(+), 8 deletions(-) diff --git a/docs/schemas/capability.rng b/docs/schemas/capability.rng index 0c95c05..ee6cb2e 100644 --- a/docs/schemas/capability.rng +++ b/docs/schemas/capability.rng @@ -118,6 +118,9 @@ <empty/> </element> </zeroOrMore> + <zeroOrMore> + <ref name='hugepagesElem'/> + </zeroOrMore> </define> <define name='power_management'> @@ -188,6 +191,10 @@ <ref name='memory'/> </optional> + <zeroOrMore> + <ref name='hugepagesElem'/> + </zeroOrMore> + <optional> <element name='distances'> <zeroOrMore> @@ -416,4 +423,18 @@ <param name='pattern'>[a-zA-Z0-9\-_]+</param> </data> </define> + + <define name='hugepagesElem'> + <element name='pages'> + <optional> + <attribute name='unit'> + <ref name='unit'/> + </attribute> + </optional> + <attribute name='size'> + <ref name='unsignedInt'/> + </attribute> + <ref name='unsignedInt'/> + </element> + </define> </grammar> diff --git a/src/conf/capabilities.c b/src/conf/capabilities.c index 954456b..c773752 100644 --- a/src/conf/capabilities.c +++ b/src/conf/capabilities.c @@ -108,6 +108,7 @@ virCapabilitiesFreeHostNUMACell(virCapsHostNUMACellPtr cell) VIR_FREE(cell->cpus); VIR_FREE(cell->siblings); + VIR_FREE(cell->hugepages); VIR_FREE(cell); } @@ -223,6 +224,7 @@ virCapabilitiesDispose(void *object) } VIR_FREE(caps->host.secModels); + VIR_FREE(caps->host.hugePagesSize); virCPUDefFree(caps->host.cpu); } @@ -281,6 +283,8 @@ virCapabilitiesAddHostMigrateTransport(virCapsPtr caps, * @cpus: array of CPU definition structures, the pointer is stolen * @nsiblings: number of sibling NUMA nodes * @siblings: info on sibling NUMA nodes + * @nhugepages: number of hugepages at node @num + * @hugepages: info on each single huge page * * Registers a new NUMA cell for a host, passing in a * array of CPU IDs belonging to the cell @@ -292,7 +296,9 @@ virCapabilitiesAddHostNUMACell(virCapsPtr caps, int ncpus, 
virCapsHostNUMACellCPUPtr cpus, int nsiblings, - virCapsHostNUMACellSiblingInfoPtr siblings) + virCapsHostNUMACellSiblingInfoPtr siblings, + int nhugepages, + virCapsHostNUMACellHugePageInfoPtr hugepages) { virCapsHostNUMACellPtr cell; @@ -303,12 +309,14 @@ virCapabilitiesAddHostNUMACell(virCapsPtr caps, if (VIR_ALLOC(cell) < 0) return -1; - cell->ncpus = ncpus; cell->num = num; cell->mem = mem; + cell->ncpus = ncpus; cell->cpus = cpus; - cell->siblings = siblings; cell->nsiblings = nsiblings; + cell->siblings = siblings; + cell->nhugepages = nhugepages; + cell->hugepages = hugepages; caps->host.numaCell[caps->host.nnumaCell++] = cell; @@ -773,6 +781,12 @@ virCapabilitiesFormatNUMATopology(virBufferPtr buf, virBufferAsprintf(buf, "<memory unit='KiB'>%llu</memory>\n", cells[i]->mem); + for (j = 0; j < cells[i]->nhugepages; j++) { + virBufferAsprintf(buf, "<pages unit='KiB' size='%u'>%zu</pages>\n", + cells[i]->hugepages[j].size, + cells[i]->hugepages[j].avail); + } + if (cells[i]->nsiblings) { virBufferAddLit(buf, "<distances>\n"); virBufferAdjustIndent(buf, 2); @@ -856,6 +870,11 @@ virCapabilitiesFormatXML(virCapsPtr caps) } virCPUDefFormatBuf(&buf, caps->host.cpu, 0); + for (i = 0; i < caps->host.nhugePagesSize; i++) { + virBufferAsprintf(&buf, "<pages unit='KiB' size='%u'/>\n", + caps->host.hugePagesSize[i]); + } + virBufferAdjustIndent(&buf, -2); virBufferAddLit(&buf, "</cpu>\n"); diff --git a/src/conf/capabilities.h b/src/conf/capabilities.h index 53a83c9..384e256 100644 --- a/src/conf/capabilities.h +++ b/src/conf/capabilities.h @@ -102,6 +102,13 @@ struct _virCapsHostNUMACellSiblingInfo { unsigned int distance; /* distance to the node */ }; +typedef struct _virCapsHostNUMACellHugePageInfo virCapsHostNUMACellHugePageInfo; +typedef virCapsHostNUMACellHugePageInfo *virCapsHostNUMACellHugePageInfoPtr; +struct _virCapsHostNUMACellHugePageInfo { + unsigned int size; /* huge page size in kibibytes */ + size_t avail; /* the size of pool */ +}; + typedef struct _virCapsHostNUMACell virCapsHostNUMACell; typedef virCapsHostNUMACell *virCapsHostNUMACellPtr; struct _virCapsHostNUMACell { @@ -111,6 +118,8 @@ struct _virCapsHostNUMACell { virCapsHostNUMACellCPUPtr cpus; int nsiblings; virCapsHostNUMACellSiblingInfoPtr siblings; + int nhugepages; + virCapsHostNUMACellHugePageInfoPtr hugepages; }; typedef struct _virCapsHostSecModelLabel virCapsHostSecModelLabel; @@ -152,6 +161,8 @@ struct _virCapsHost { virCapsHostSecModelPtr secModels; virCPUDefPtr cpu; + int nhugePagesSize; /* size of hugePagesSize array */ + unsigned int *hugePagesSize; /* huge page sizes support on the system */ unsigned char host_uuid[VIR_UUID_BUFLEN]; }; @@ -206,7 +217,9 @@ virCapabilitiesAddHostNUMACell(virCapsPtr caps, int ncpus, virCapsHostNUMACellCPUPtr cpus, int nsiblings, - virCapsHostNUMACellSiblingInfoPtr siblings); + virCapsHostNUMACellSiblingInfoPtr siblings, + int nhugepages, + virCapsHostNUMACellHugePageInfoPtr hugepages); extern int diff --git a/src/libxl/libxl_conf.c b/src/libxl/libxl_conf.c index cec37d6..eca51a1 100644 --- a/src/libxl/libxl_conf.c +++ b/src/libxl/libxl_conf.c @@ -210,6 +210,7 @@ libxlCapsInitNuma(libxl_ctx *ctx, virCapsPtr caps) if (virCapabilitiesAddHostNUMACell(caps, i, numa_info[i].size / 1024, nr_cpus_node[i], cpus[i], + 0, NULL, 0, NULL) < 0) { virCapabilitiesClearHostNUMACellCPUTopology(cpus[i], nr_cpus_node[i]); diff --git a/src/nodeinfo.c b/src/nodeinfo.c index 5eef42e..fbf2e3c 100644 --- a/src/nodeinfo.c +++ b/src/nodeinfo.c @@ -1646,6 +1646,7 @@ nodeCapsInitNUMAFake(virCapsPtr caps 
ATTRIBUTE_UNUSED) if (virCapabilitiesAddHostNUMACell(caps, 0, nodeinfo.memory, ncpus, cpus, + 0, NULL, 0, NULL) < 0) goto error; @@ -1795,6 +1796,36 @@ virNodeCapsGetSiblingInfo(int node, return ret; } +static int +virNodeCapsGetHugePagesInfo(int node, + virCapsHostNUMACellHugePageInfoPtr *hugepages, + int *nhugepages) +{ + int ret = -1; + unsigned int *pages_size = NULL, *pages_avail = NULL; + size_t npages, i; + + if (virNumaGetHugePages(node, &pages_size, + &pages_avail, NULL, &npages) < 0) + goto cleanup; + + if (VIR_ALLOC_N(*hugepages, npages) < 0) + goto cleanup; + *nhugepages = npages; + + for (i = 0; i < npages; i++) { + (*hugepages)[i].size = pages_size[i]; + (*hugepages)[i].avail = pages_avail[i]; + } + + ret = 0; + + cleanup: + VIR_FREE(pages_avail); + VIR_FREE(pages_size); + return ret; +} + int nodeCapsInitNUMA(virCapsPtr caps) { @@ -1804,6 +1835,8 @@ nodeCapsInitNUMA(virCapsPtr caps) virBitmapPtr cpumap = NULL; virCapsHostNUMACellSiblingInfoPtr siblings = NULL; int nsiblings; + virCapsHostNUMACellHugePageInfoPtr hugepages = NULL; + int nhugepages; int ret = -1; int ncpus = 0; int cpu; @@ -1846,17 +1879,22 @@ nodeCapsInitNUMA(virCapsPtr caps) if (virNodeCapsGetSiblingInfo(n, &siblings, &nsiblings) < 0) goto cleanup; + if (virNodeCapsGetHugePagesInfo(n, &hugepages, &nhugepages) < 0) + goto cleanup; + /* Detect the amount of memory in the numa cell in KiB */ virNumaGetNodeMemory(n, &memory, NULL); memory >>= 10; if (virCapabilitiesAddHostNUMACell(caps, n, memory, ncpus, cpus, - nsiblings, siblings) < 0) + nsiblings, siblings, + nhugepages, hugepages) < 0) goto cleanup; cpus = NULL; siblings = NULL; + hugepages = NULL; } ret = 0; @@ -1868,6 +1906,7 @@ nodeCapsInitNUMA(virCapsPtr caps) virBitmapFree(cpumap); VIR_FREE(cpus); VIR_FREE(siblings); + VIR_FREE(hugepages); if (ret < 0) VIR_FREE(cpus); diff --git a/src/qemu/qemu_capabilities.c b/src/qemu/qemu_capabilities.c index 08c3d04..d07deeb 100644 --- a/src/qemu/qemu_capabilities.c +++ b/src/qemu/qemu_capabilities.c @@ -920,6 +920,29 @@ virQEMUCapsInitCPU(virCapsPtr caps, } +static int +virQEMUCapsInitHugePages(virCapsPtr caps) +{ + int ret = -1; + unsigned int *pages_size = NULL; + size_t npages; + + if (virNumaGetHugePages(-1 /* Magic constant for overall info */, + &pages_size, NULL, NULL, &npages) < 0) + goto cleanup; + + caps->host.hugePagesSize = pages_size; + pages_size = NULL; + caps->host.nhugePagesSize = npages; + npages = 0; + + ret = 0; + cleanup: + VIR_FREE(pages_size); + return ret; +} + + virCapsPtr virQEMUCapsInit(virQEMUCapsCachePtr cache) { virCapsPtr caps; @@ -943,10 +966,14 @@ virCapsPtr virQEMUCapsInit(virQEMUCapsCachePtr cache) VIR_WARN("Failed to get host CPU"); /* Add the power management features of the host */ - if (virNodeSuspendGetTargetMask(&caps->host.powerMgmt) < 0) VIR_WARN("Failed to get host power management capabilities"); + /* Add huge pages info */ + if (virQEMUCapsInitHugePages(caps) < 0) + VIR_WARN("Failed to get huge pages info"); + + /* Add domain migration transport URI */ virCapabilitiesAddHostMigrateTransport(caps, "tcp"); diff --git a/src/test/test_driver.c b/src/test/test_driver.c index f9e2b3d..0bf710a 100644 --- a/src/test/test_driver.c +++ b/src/test/test_driver.c @@ -338,7 +338,7 @@ testBuildCapabilities(virConnectPtr conn) if (virCapabilitiesAddHostNUMACell(caps, i, 0, privconn->cells[i].numCpus, - cpu_cells, 0, NULL) < 0) + cpu_cells, 0, NULL, 0, NULL) < 0) goto error; } diff --git a/src/xen/xend_internal.c b/src/xen/xend_internal.c index 5ddf71a..03fdde1 100644 --- 
a/src/xen/xend_internal.c +++ b/src/xen/xend_internal.c @@ -1102,6 +1102,7 @@ sexpr_to_xend_topology(const struct sexpr *root, virCapsPtr caps) if (virCapabilitiesAddHostNUMACell(caps, cell, 0, nb_cpus, cpuInfo, + 0, NULL, 0, NULL) < 0) goto error; cpuInfo = NULL; diff --git a/tests/vircaps2xmltest.c b/tests/vircaps2xmltest.c index fa02534..7166c98 100644 --- a/tests/vircaps2xmltest.c +++ b/tests/vircaps2xmltest.c @@ -74,7 +74,8 @@ buildVirCapabilities(int max_cells, if (virCapabilitiesAddHostNUMACell(caps, cell_id, max_mem_in_cell, max_cpus_in_cell, cell_cpus, - nsiblings, siblings) < 0) + nsiblings, siblings, + 0, NULL) < 0) goto error; cell_cpus = NULL; diff --git a/tests/vircapstest.c b/tests/vircapstest.c index 3edebba..59e9c2b 100644 --- a/tests/vircapstest.c +++ b/tests/vircapstest.c @@ -66,6 +66,7 @@ buildNUMATopology(int seq) if (virCapabilitiesAddHostNUMACell(caps, cell_id + seq, MAX_MEM_IN_CELL, MAX_CPUS_IN_CELL, cell_cpus, + 0, NULL, 0, NULL) < 0) goto error; -- 1.8.5.5
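For applications consuming this, a minimal client-side sketch (not part of the patch) that lists the advertised page sizes from the capabilities XML via the public API and libxml2; error handling is trimmed for brevity:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <libvirt/libvirt.h>
    #include <libxml/parser.h>
    #include <libxml/xpath.h>

    int main(void)
    {
        virConnectPtr conn = virConnectOpenReadOnly(NULL);
        char *caps = virConnectGetCapabilities(conn);
        xmlDocPtr doc = xmlReadMemory(caps, strlen(caps), "caps.xml", NULL, 0);
        xmlXPathContextPtr ctxt = xmlXPathNewContext(doc);
        /* Pick the size= attribute of each <pages> element under <cpu> */
        xmlXPathObjectPtr obj =
            xmlXPathEvalExpression((const xmlChar *)
                                   "/capabilities/host/cpu/pages/@size", ctxt);
        int i;

        for (i = 0; obj->nodesetval && i < obj->nodesetval->nodeNr; i++)
            printf("supported page size: %s kB\n",
                   (const char *) obj->nodesetval->nodeTab[i]->children->content);

        xmlXPathFreeObject(obj);
        xmlXPathFreeContext(ctxt);
        xmlFreeDoc(doc);
        free(caps);
        virConnectClose(conn);
        return 0;
    }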

On Tue, Jun 10, 2014 at 07:21:12PM +0200, Michal Privoznik wrote:
There are two places where you'll find info on huge pages. The first one is under the <cpu/> element, where all supported huge page sizes are listed. The second one is under each <cell/> element, which refers to a concrete NUMA node; there, the size of each huge page pool is reported. So the capabilities XML looks something like this:
<capabilities>
<host> <uuid>01281cda-f352-cb11-a9db-e905fe22010c</uuid> <cpu> <arch>x86_64</arch> <model>Westmere</model> <vendor>Intel</vendor> <topology sockets='1' cores='1' threads='1'/> ... <pages unit='KiB' size='1048576'/> <pages unit='KiB' size='2048'/>
Should have normal-sized pages (i.e. 4k on x86) too, to avoid apps having to special-case small pages.
</cpu> ... <topology> <cells num='4'> <cell id='0'> <memory unit='KiB'>4054408</memory> <pages unit='KiB' size='1048576'>1</pages> <pages unit='KiB' size='2048'>3</pages>
Should have normal-sized pages here too. We should also declare whether '<memory>' refers to total memory across all page sizes, or just the total memory of the smallest page size. I'd say it should refer to all memory, since conceptually this is really about telling you how many RAM DIMMs are in each node.
<distances/> <cpus num='1'> <cpu id='0' socket_id='0' core_id='0' siblings='0'/> </cpus> </cell> <cell id='1'> <memory unit='KiB'>4071072</memory> <pages unit='KiB' size='1048576'>2</pages> <pages unit='KiB' size='2048'>1024</pages> <distances/> <cpus num='1'> <cpu id='1' socket_id='0' core_id='0' siblings='1'/> </cpus> </cell> ... </cells> </topology> ... </host>
<guest/>
</capabilities>
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- docs/schemas/capability.rng | 21 +++++++++++++++++++++ src/conf/capabilities.c | 25 ++++++++++++++++++++++--- src/conf/capabilities.h | 15 ++++++++++++++- src/libxl/libxl_conf.c | 1 + src/nodeinfo.c | 41 ++++++++++++++++++++++++++++++++++++++++- src/qemu/qemu_capabilities.c | 29 ++++++++++++++++++++++++++++- src/test/test_driver.c | 2 +- src/xen/xend_internal.c | 1 + tests/vircaps2xmltest.c | 3 ++- tests/vircapstest.c | 1 + 10 files changed, 131 insertions(+), 8 deletions(-)
diff --git a/src/conf/capabilities.h b/src/conf/capabilities.h index 53a83c9..384e256 100644 --- a/src/conf/capabilities.h +++ b/src/conf/capabilities.h @@ -102,6 +102,13 @@ struct _virCapsHostNUMACellSiblingInfo { unsigned int distance; /* distance to the node */ };
+typedef struct _virCapsHostNUMACellHugePageInfo virCapsHostNUMACellHugePageInfo; +typedef virCapsHostNUMACellHugePageInfo *virCapsHostNUMACellHugePageInfoPtr; +struct _virCapsHostNUMACellHugePageInfo { + unsigned int size; /* huge page size in kibibytes */ + size_t avail; /* the size of pool */ +};
s/Huge//. since this should be used to report on all page sizes
+ typedef struct _virCapsHostNUMACell virCapsHostNUMACell; typedef virCapsHostNUMACell *virCapsHostNUMACellPtr; struct _virCapsHostNUMACell { @@ -111,6 +118,8 @@ struct _virCapsHostNUMACell { virCapsHostNUMACellCPUPtr cpus; int nsiblings; virCapsHostNUMACellSiblingInfoPtr siblings; + int nhugepages; + virCapsHostNUMACellHugePageInfoPtr hugepages;
Better named as 'pageinfo' rather than 'hugepages' Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On Thu, Jun 12, 2014 at 02:30:50PM +0100, Daniel P. Berrange wrote:
On Tue, Jun 10, 2014 at 07:21:12PM +0200, Michal Privoznik wrote:
There are two places where you'll find info on huge pages. The first one is under the <cpu/> element, where all supported huge page sizes are listed. The second one is under each <cell/> element, which refers to a concrete NUMA node; there, the size of each huge page pool is reported. So the capabilities XML looks something like this:
<capabilities>
<host> <uuid>01281cda-f352-cb11-a9db-e905fe22010c</uuid> <cpu> <arch>x86_64</arch> <model>Westmere</model> <vendor>Intel</vendor> <topology sockets='1' cores='1' threads='1'/> ... <pages unit='KiB' size='1048576'/> <pages unit='KiB' size='2048'/>
Should have normal-sized pages (i.e. 4k on x86) too, to avoid apps having to special-case small pages.
Since we have to special-case small pages and the kernel (at least to my knowledge) doesn't expose that information by classic means, I think reporting only hugepages is actually what we want here. For normal memory there are existing APIs already. Hugepages are different mainly because of one thing: the fact that there are some hugepages allocated is known by the user of the machine (be it a mgmt app or an admin), and these hugepages were allocated for some purpose. It is fairly OK to presume that the number of hugepages (free or total) will change only when and if the user wants it to (e.g. when running a machine with a specified size and hugepages). That cannot be said about small pages, though, and I think that is a fair reason to special-case normal pages like this. Martin
</cpu> ... <topology> <cells num='4'> <cell id='0'> <memory unit='KiB'>4054408</memory> <pages unit='KiB' size='1048576'>1</pages> <pages unit='KiB' size='2048'>3</pages>
Should have normal-sized pages here too.
We should also declare whether '<memory>' refers to total memory across all page sizes, or just the total memory of the smallest page size. I'd say it should refer to all memory, since conceptually this is really about telling you how many RAM DIMMs are in each node.
<distances/> <cpus num='1'> <cpu id='0' socket_id='0' core_id='0' siblings='0'/> </cpus> </cell> <cell id='1'> <memory unit='KiB'>4071072</memory> <pages unit='KiB' size='1048576'>2</pages> <pages unit='KiB' size='2048'>1024</pages> <distances/> <cpus num='1'> <cpu id='1' socket_id='0' core_id='0' siblings='1'/> </cpus> </cell> ... </cells> </topology> ... </host>
<guest/>
</capabilities>
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- docs/schemas/capability.rng | 21 +++++++++++++++++++++ src/conf/capabilities.c | 25 ++++++++++++++++++++++--- src/conf/capabilities.h | 15 ++++++++++++++- src/libxl/libxl_conf.c | 1 + src/nodeinfo.c | 41 ++++++++++++++++++++++++++++++++++++++++- src/qemu/qemu_capabilities.c | 29 ++++++++++++++++++++++++++++- src/test/test_driver.c | 2 +- src/xen/xend_internal.c | 1 + tests/vircaps2xmltest.c | 3 ++- tests/vircapstest.c | 1 + 10 files changed, 131 insertions(+), 8 deletions(-)
diff --git a/src/conf/capabilities.h b/src/conf/capabilities.h index 53a83c9..384e256 100644 --- a/src/conf/capabilities.h +++ b/src/conf/capabilities.h @@ -102,6 +102,13 @@ struct _virCapsHostNUMACellSiblingInfo { unsigned int distance; /* distance to the node */ };
+typedef struct _virCapsHostNUMACellHugePageInfo virCapsHostNUMACellHugePageInfo; +typedef virCapsHostNUMACellHugePageInfo *virCapsHostNUMACellHugePageInfoPtr; +struct _virCapsHostNUMACellHugePageInfo { + unsigned int size; /* huge page size in kibibytes */ + size_t avail; /* the size of pool */ +};
s/Huge//. since this should be used to report on all page sizes
+ typedef struct _virCapsHostNUMACell virCapsHostNUMACell; typedef virCapsHostNUMACell *virCapsHostNUMACellPtr; struct _virCapsHostNUMACell { @@ -111,6 +118,8 @@ struct _virCapsHostNUMACell { virCapsHostNUMACellCPUPtr cpus; int nsiblings; virCapsHostNUMACellSiblingInfoPtr siblings; + int nhugepages; + virCapsHostNUMACellHugePageInfoPtr hugepages;
Better named as 'pageinfo' rather than 'hugepages'
Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On Thu, Jun 12, 2014 at 07:21:47PM +0200, Martin Kletzander wrote:
On Thu, Jun 12, 2014 at 02:30:50PM +0100, Daniel P. Berrange wrote:
On Tue, Jun 10, 2014 at 07:21:12PM +0200, Michal Privoznik wrote:
There are two places where you'll find info on huge pages. The first one is under the <cpu/> element, where all supported huge page sizes are listed. The second one is under each <cell/> element, which refers to a concrete NUMA node; there, the size of each huge page pool is reported. So the capabilities XML looks something like this:
<capabilities>
<host> <uuid>01281cda-f352-cb11-a9db-e905fe22010c</uuid> <cpu> <arch>x86_64</arch> <model>Westmere</model> <vendor>Intel</vendor> <topology sockets='1' cores='1' threads='1'/> ... <pages unit='KiB' size='1048576'/> <pages unit='KiB' size='2048'/>
Should have normal-sized pages (i.e. 4k on x86) too, to avoid apps having to special-case small pages.
Since we have to special-case small pages and the kernel (at least to my knowledge) doesn't expose that information by classic means, I think reporting only hugepages is actually what we want here. For normal memory there are existing APIs already.
Hugepages are different mainly because of one thing: the fact that there are some hugepages allocated is known by the user of the machine (be it a mgmt app or an admin), and these hugepages were allocated for some purpose. It is fairly OK to presume that the number of hugepages (free or total) will change only when and if the user wants it to (e.g. when running a machine with a specified size and hugepages). That cannot be said about small pages, though, and I think that is a fair reason to special-case normal pages like this.
That difference is something that's only relevant to the person who is provisioning the machine, though. For applications consuming the libvirt APIs it is not relevant. For OpenStack we really want to have normal-sized pages dealt with in the same way as huge pages, since it will simplify our scheduler/placement logic. So I really want these APIs to do this in libvirt so that OpenStack doesn't have to reverse-engineer this itself. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 13.06.2014 10:28, Daniel P. Berrange wrote:
On Thu, Jun 12, 2014 at 07:21:47PM +0200, Martin Kletzander wrote:
On Thu, Jun 12, 2014 at 02:30:50PM +0100, Daniel P. Berrange wrote:
On Tue, Jun 10, 2014 at 07:21:12PM +0200, Michal Privoznik wrote:
There are two places where you'll find info on huge pages. The first one is under the <cpu/> element, where all supported huge page sizes are listed. The second one is under each <cell/> element, which refers to a concrete NUMA node; there, the size of each huge page pool is reported. So the capabilities XML looks something like this:
<capabilities>
<host> <uuid>01281cda-f352-cb11-a9db-e905fe22010c</uuid> <cpu> <arch>x86_64</arch> <model>Westmere</model> <vendor>Intel</vendor> <topology sockets='1' cores='1' threads='1'/> ... <pages unit='KiB' size='1048576'/> <pages unit='KiB' size='2048'/>
Should have normal-sized pages (i.e. 4k on x86) too, to avoid apps having to special-case small pages.
Since we have to special-case small pages and the kernel (at least to my knowledge) doesn't expose that information by classic means, I think reporting only hugepages is actually what we want here. For normal memory there are existing APIs already.
Hugepages are different mainly because of one thing: the fact that there are some hugepages allocated is known by the user of the machine (be it a mgmt app or an admin), and these hugepages were allocated for some purpose. It is fairly OK to presume that the number of hugepages (free or total) will change only when and if the user wants it to (e.g. when running a machine with a specified size and hugepages). That cannot be said about small pages, though, and I think that is a fair reason to special-case normal pages like this.
That difference is something that's only relevant to the person who is provisioning the machine, though. For applications consuming the libvirt APIs it is not relevant. For OpenStack we really want to have normal-sized pages dealt with in the same way as huge pages, since it will simplify our scheduler/placement logic. So I really want these APIs to do this in libvirt so that OpenStack doesn't have to reverse-engineer this itself.
But if we go this way, there are hidden black holes. For instance, the size of the ordinary page pool: it is not accessible anywhere, and the only algorithm I can think of is to take [(MemTotal on NODE #i) - sum(mem taken by all huge pages)] / PAGE_SIZE. So for instance on my machine, where I have one 1GB huge page per NUMA node and three 2MB ones per NUMA node:

# grep MemTotal /sys/devices/system/node/node0/meminfo
Node 0 MemTotal: 4054408 kB
# cat /sys/devices/system/node/node0/hugepages/hugepages-1048576kB/nr_hugepages
1
# cat /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages
3
# getconf PAGESIZE
4096

(4054408 - (1*1048576 + 3*2048)) / 4 = 2999688 / 4 = 749922 ordinary pages. But it's not that simple, as not all pages are available: some are reserved for DMA transfers, some for the kernel itself, etc. Without overcommit it's impossible to actually allocate those nearly 3GB. Is this something we really want to do? Michal
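For reference, the estimate described above boils down to the following arithmetic; a sketch only, using the numbers from the example host, and, as noted, it overstates what is actually allocatable:

    #include <stddef.h>

    /* Derive the number of ordinary (base-size) pages of a node from
     * MemTotal minus the memory claimed by the huge page pools. */
    static unsigned long long
    estimateOrdinaryPages(unsigned long long memTotalKiB,       /* from node meminfo */
                          const unsigned int *hugeSizesKiB,     /* e.g. {1048576, 2048} */
                          const unsigned long long *hugeCounts, /* e.g. {1, 3} */
                          size_t nHuge,
                          unsigned int pageSizeKiB)             /* getconf PAGESIZE / 1024 */
    {
        unsigned long long hugeKiB = 0;
        size_t i;

        for (i = 0; i < nHuge; i++)
            hugeKiB += (unsigned long long) hugeSizesKiB[i] * hugeCounts[i];

        /* (4054408 - (1*1048576 + 3*2048)) / 4 = 749922 in the example above */
        return (memTotalKiB - hugeKiB) / pageSizeKiB;
    }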

On Fri, Jun 13, 2014 at 04:30:41PM +0200, Michal Privoznik wrote:
On 13.06.2014 10:28, Daniel P. Berrange wrote:
On Thu, Jun 12, 2014 at 07:21:47PM +0200, Martin Kletzander wrote:
On Thu, Jun 12, 2014 at 02:30:50PM +0100, Daniel P. Berrange wrote:
On Tue, Jun 10, 2014 at 07:21:12PM +0200, Michal Privoznik wrote:
There are two places where you'll find info on huge pages. The first one is under the <cpu/> element, where all supported huge page sizes are listed. The second one is under each <cell/> element, which refers to a concrete NUMA node; there, the size of each huge page pool is reported. So the capabilities XML looks something like this:
<capabilities>
<host> <uuid>01281cda-f352-cb11-a9db-e905fe22010c</uuid> <cpu> <arch>x86_64</arch> <model>Westmere</model> <vendor>Intel</vendor> <topology sockets='1' cores='1' threads='1'/> ... <pages unit='KiB' size='1048576'/> <pages unit='KiB' size='2048'/>
Should have normal-sized pages (i.e. 4k on x86) too, to avoid apps having to special-case small pages.
Since we have to special-case small pages and the kernel (at least to my knowledge) doesn't expose that information by classic means, I think reporting only hugepages is actually what we want here. For normal memory there are existing APIs already.
Hugepages are different mainly because of one thing: the fact that there are some hugepages allocated is known by the user of the machine (be it a mgmt app or an admin), and these hugepages were allocated for some purpose. It is fairly OK to presume that the number of hugepages (free or total) will change only when and if the user wants it to (e.g. when running a machine with a specified size and hugepages). That cannot be said about small pages, though, and I think that is a fair reason to special-case normal pages like this.
That difference is something that's only relevant to the person who is provisioning the machine, though. For applications consuming the libvirt APIs it is not relevant. For OpenStack we really want to have normal-sized pages dealt with in the same way as huge pages, since it will simplify our scheduler/placement logic. So I really want these APIs to do this in libvirt so that OpenStack doesn't have to reverse-engineer this itself.
But if we go this way, there are hidden black holes. For instance, the size of the ordinary page pool: it is not accessible anywhere, and the only algorithm I can think of is to take [(MemTotal on NODE #i) - sum(mem taken by all huge pages)] / PAGE_SIZE. So for instance on my machine, where I have one 1GB huge page per NUMA node and three 2MB ones per NUMA node:
# grep MemTotal /sys/devices/system/node/node0/meminfo Node 0 MemTotal: 4054408 kB
# cat /sys/devices/system/node/node0/hugepages/hugepages-1048576kB/nr_hugepages 1
# cat /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages 3
# getconf PAGESIZE 4096
(4054408 - (1*1048576 + 3*2048)) / 4 = 2999688 / 4 = 749922 ordinary pages. But it's not that simple, as not all pages are available: some are reserved for DMA transfers, some for the kernel itself, etc. Without overcommit it's impossible to actually allocate those nearly 3GB. Is this something we really want to do?
I've found one other way to get the number of free normal pages. It looks like nr_free_pages in /proc/zoneinfo is probably what Daniel wants to report. But given that this value might no longer be true even by the time the file has been parsed, I'm still not convinced it's something you want to report. A bit more accurate would be reporting the amount of memory that might be made available to the machine, although with overcommit settings and file caches this might not be feasible. Martin

On 16.06.2014 08:38, Martin Kletzander wrote:
On Fri, Jun 13, 2014 at 04:30:41PM +0200, Michal Privoznik wrote:
On 13.06.2014 10:28, Daniel P. Berrange wrote:
On Thu, Jun 12, 2014 at 07:21:47PM +0200, Martin Kletzander wrote:
On Thu, Jun 12, 2014 at 02:30:50PM +0100, Daniel P. Berrange wrote:
On Tue, Jun 10, 2014 at 07:21:12PM +0200, Michal Privoznik wrote:
There are two places where you'll find info on huge pages. The first one is under the <cpu/> element, where all supported huge page sizes are listed. The second one is under each <cell/> element, which refers to a concrete NUMA node; there, the size of each huge page pool is reported. So the capabilities XML looks something like this:
<capabilities>
<host> <uuid>01281cda-f352-cb11-a9db-e905fe22010c</uuid> <cpu> <arch>x86_64</arch> <model>Westmere</model> <vendor>Intel</vendor> <topology sockets='1' cores='1' threads='1'/> ... <pages unit='KiB' size='1048576'/> <pages unit='KiB' size='2048'/>
Should have normal-sized pages (i.e. 4k on x86) too, to avoid apps having to special-case small pages.
Since we have to special-case small pages and the kernel (at least to my knowledge) doesn't expose that information by classic means, I think reporting only hugepages is actually what we want here. For normal memory there are existing APIs already.
Hugepages are different mainly because of one thing: the fact that there are some hugepages allocated is known by the user of the machine (be it a mgmt app or an admin), and these hugepages were allocated for some purpose. It is fairly OK to presume that the number of hugepages (free or total) will change only when and if the user wants it to (e.g. when running a machine with a specified size and hugepages). That cannot be said about small pages, though, and I think that is a fair reason to special-case normal pages like this.
That difference is something that's only relevant to the person who is provisioning the machine, though. For applications consuming the libvirt APIs it is not relevant. For OpenStack we really want to have normal-sized pages dealt with in the same way as huge pages, since it will simplify our scheduler/placement logic. So I really want these APIs to do this in libvirt so that OpenStack doesn't have to reverse-engineer this itself.
But if we go this way, there are hidden black holes. For instance, the size of the ordinary page pool: it is not accessible anywhere, and the only algorithm I can think of is to take [(MemTotal on NODE #i) - sum(mem taken by all huge pages)] / PAGE_SIZE. So for instance on my machine, where I have one 1GB huge page per NUMA node and three 2MB ones per NUMA node:
# grep MemTotal /sys/devices/system/node/node0/meminfo Node 0 MemTotal: 4054408 kB
# cat /sys/devices/system/node/node0/hugepages/hugepages-1048576kB/nr_hugepages 1
# cat /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages 3
# getconf PAGESIZE 4096
(4054408 - (1*1048576 + 3*2048)) / 4 = 2999688 / 4 = 749922 ordinary pages. But it's not that simple, as not all pages are available: some are reserved for DMA transfers, some for the kernel itself, etc. Without overcommit it's impossible to actually allocate those nearly 3GB. Is this something we really want to do?
I've found one other way to get the number of free normal pages. It looks like nr_free_pages in /proc/zoneinfo is probably what Daniel wants to report. But given that this value might no longer be true even by the time the file has been parsed, I'm still not convinced it's something you want to report. A bit more accurate would be reporting the amount of memory that might be made available to the machine, although with overcommit settings and file caches this might not be feasible.
No, the zoneinfo file provides essentially the same info as /sys/devices/system/node/node*/meminfo:

# getconf PAGESIZE
4096
# grep -i memfree /sys/devices/system/node/node3/meminfo
Node 3 MemFree: 2370272 kB

which is then 2370272/4 = 592568 free pages. And the corresponding field in zoneinfo shows:

# grep nr_free_pages /proc/zoneinfo | tail -n 1
nr_free_pages 592639

which is essentially the same number (I wonder where the slight difference comes from, though). And the problem is not getting info on free pages, but rather getting the size of the page pool. Michal
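For completeness, reading a node's MemFree and converting it to base-size pages, as done by hand above, could look like this in code (a sketch only; the file format assumption matches the grep output shown, and error handling is minimal):

    #include <stdio.h>

    /* Return the number of free base-size pages of @node, parsed from
     * /sys/devices/system/node/node<N>/meminfo, or -1 on failure. */
    static long long
    nodeFreeBasePages(int node, unsigned int pageSizeKiB)
    {
        char path[256];
        char line[256];
        long long freeKiB = -1;
        FILE *fp;

        snprintf(path, sizeof(path),
                 "/sys/devices/system/node/node%d/meminfo", node);
        if (!(fp = fopen(path, "r")))
            return -1;

        while (fgets(line, sizeof(line), fp)) {
            if (sscanf(line, "Node %*d MemFree: %lld kB", &freeKiB) == 1)
                break;
        }
        fclose(fp);

        /* e.g. 2370272 kB / 4 kB = 592568 pages, as in the example above */
        return freeKiB < 0 ? -1 : freeKiB / pageSizeKiB;
    }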

The aim of the API is to get information on number of free huge pages on the system. The API behaves similar to the virNodeGetCellsFreeMemory(). User passes starting NUMA cell, the count of nodes that he's interested in, huge pages sizes (yes, multiple sizes can be queried at once) and the counts are returned in an array. Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- daemon/remote.c | 52 ++++++++++++++++++++++++ include/libvirt/libvirt.h.in | 7 ++++ src/driver.h | 10 +++++ src/libvirt.c | 95 ++++++++++++++++++++++++++++++++++++++++++++ src/libvirt_public.syms | 4 ++ src/remote/remote_driver.c | 50 +++++++++++++++++++++++ src/remote/remote_protocol.x | 20 +++++++++- src/remote_protocol-structs | 16 ++++++++ 8 files changed, 253 insertions(+), 1 deletion(-) diff --git a/daemon/remote.c b/daemon/remote.c index 34c96c9..11ae758 100644 --- a/daemon/remote.c +++ b/daemon/remote.c @@ -6115,6 +6115,58 @@ remoteDispatchDomainGetTime(virNetServerPtr server ATTRIBUTE_UNUSED, return rv; } + +static int +remoteDispatchNodeGetFreePages(virNetServerPtr server ATTRIBUTE_UNUSED, + virNetServerClientPtr client, + virNetMessagePtr msg ATTRIBUTE_UNUSED, + virNetMessageErrorPtr rerr, + remote_node_get_free_pages_args *args, + remote_node_get_free_pages_ret *ret) +{ + int rv = -1; + int len; + struct daemonClientPrivate *priv = + virNetServerClientGetPrivateData(client); + + if (!priv->conn) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("connection not open")); + goto cleanup; + } + + if (args->pages.pages_len * args->cellCount > REMOTE_NODE_MAX_CELLS) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("the result won't fit into REMOTE_NODE_MAX_CELLS")); + goto cleanup; + } + + /* Allocate return buffer. */ + if (VIR_ALLOC_N(ret->counts.counts_val, + args->pages.pages_len * args->cellCount) < 0) + goto cleanup; + + if ((len = virNodeGetFreePages(priv->conn, + args->pages.pages_len, + args->pages.pages_val, + args->startCell, + args->cellCount, + (unsigned long long *) ret->counts.counts_val, + args->flags)) <= 0) + goto cleanup; + + ret->counts.counts_len = len; + rv = 0; + + cleanup: + if (rv < 0) { + virNetMessageSaveError(rerr); + VIR_FREE(ret->counts.counts_val); + } + return rv; + +} + + /*----- Helpers. 
-----*/ /* get_nonnull_domain and get_nonnull_network turn an on-wire diff --git a/include/libvirt/libvirt.h.in b/include/libvirt/libvirt.h.in index 127de11..b939848 100644 --- a/include/libvirt/libvirt.h.in +++ b/include/libvirt/libvirt.h.in @@ -5307,6 +5307,13 @@ int virDomainSetTime(virDomainPtr dom, unsigned int nseconds, unsigned int flags); +int virNodeGetFreePages(virConnectPtr conn, + unsigned int npages, + unsigned int *pages, + int startcell, + unsigned int cellcount, + unsigned long long *counts, + unsigned int flags); /** * virSchedParameterType: * diff --git a/src/driver.h b/src/driver.h index 5ac89d6..a0b258a 100644 --- a/src/driver.h +++ b/src/driver.h @@ -1173,6 +1173,15 @@ typedef int unsigned int nmountpoints, unsigned int flags); +typedef int +(*virDrvNodeGetFreePages)(virConnectPtr conn, + unsigned int npages, + unsigned int *pages, + int startCell, + unsigned int cellCount, + unsigned long long *counts, + unsigned int flags); + typedef struct _virDriver virDriver; typedef virDriver *virDriverPtr; @@ -1391,6 +1400,7 @@ struct _virDriver { virDrvDomainFSThaw domainFSThaw; virDrvDomainGetTime domainGetTime; virDrvDomainSetTime domainSetTime; + virDrvNodeGetFreePages nodeGetFreePages; }; diff --git a/src/libvirt.c b/src/libvirt.c index 6c4a124..4ce4e82 100644 --- a/src/libvirt.c +++ b/src/libvirt.c @@ -20910,3 +20910,98 @@ virDomainSetTime(virDomainPtr dom, virDispatchError(dom->conn); return -1; } + + +/** + * virNodeGetFreePages: + * @conn: pointer to the hypervisor connection + * @npages: number of items in the @pages array + * @pages: page sizes to query + * @startCell: index of first cell to return free pages info on. + * @cellCount: maximum number of cells for which free pages + * information can be returned. + * @counts: returned counts of free pages + * @flags: extra flags; not used yet, so callers should always pass 0 + * + * This calls queries the host system on free huge pages of + * specified size. Ont the input, @pages is expected to be + * filled with huge pages that caller is interested in (the size + * unit is kibibytes, so e.g. pass 2048 for 2MB), then @startcell + * refers to the first NUMA node that info should be collected + * from, and @cellcount tells how many consecutive nodes should + * be queried. On the function output, @counts is filled with + * desired information, where items are grouped by NUMA node. + * So from @counts[0] till @counts[@npages - 1] you'll find count + * for the first node (@startcell), then from @counts[@npages] + * till @count[2 * @npages - 1] you'll find info for the + * (@startcell + 1) node, and so on. It's callers responsibility + * to allocate the @counts array. 
+ * + * Example how to use this API: + * + * unsigned int pages[] = { 4, 2048, 1048576} + * unsigned int npages = ARRAY_CARDINALITY(pages); + * int startcell = 0; + * unsigned int cellcount = 2; + * + * unsigned long long counts = malloc(sizeof(long long) * npages * cellcount); + * + * virNodeGetFreePages(conn, pages, npages, + * startcell, cellcount, counts, 0); + * + * for (i = 0 ; i < cellcount ; i++) { + * fprintf(stdout, "Cell %d\n", startcell + i); + * for (j = 0 ; j < npages ; j++) { + * fprintf(stdout, " Page size=%d count=%d bytes=%llu\n", + * pages[j], counts[(i * npages) + j], + * pages[j] * counts[(i * npages) + j]); + * } + * } + * + * This little code snippet will produce something like this: + * Cell 0 + * Page size=4096 count=300 bytes=1228800 + * Page size=2097152 count=0 bytes=0 + * Page size=1073741824 count=1 bytes=1073741824 + * Cell 1 + * Page size=4096 count=0 bytes=0 + * Page size=2097152 count=20 bytes=41943040 + * Page size=1073741824 count=0 bytes=0 + * + * Returns: the number of entries filled in @counts or -1 in case of error. + */ +int +virNodeGetFreePages(virConnectPtr conn, + unsigned int npages, + unsigned int *pages, + int startCell, + unsigned int cellCount, + unsigned long long *counts, + unsigned int flags) +{ + VIR_DEBUG("conn=%p, npages=%u, pages=%p, startCell=%u, " + "cellCount=%u, counts=%p, flags=%x", + conn, npages, pages, startCell, cellCount, counts, flags); + + virResetLastError(); + + virCheckConnectReturn(conn, -1); + virCheckNonZeroArgGoto(npages, error); + virCheckNonNullArgGoto(pages, error); + virCheckNonZeroArgGoto(cellCount, error); + virCheckNonNullArgGoto(counts, error); + + if (conn->driver->nodeGetFreePages) { + int ret; + ret = conn->driver->nodeGetFreePages(conn, npages, pages, startCell, + cellCount, counts, flags); + if (ret < 0) + goto error; + return ret; + } + + virReportUnsupportedError(); + error: + virDispatchError(conn); + return -1; +} diff --git a/src/libvirt_public.syms b/src/libvirt_public.syms index cce6bdf..40d2c1a 100644 --- a/src/libvirt_public.syms +++ b/src/libvirt_public.syms @@ -658,5 +658,9 @@ LIBVIRT_1.2.5 { virDomainSetTime; } LIBVIRT_1.2.3; +LIBVIRT_1.2.6 { + global: + virNodeGetFreePages; +} LIBVIRT_1.2.5; # .... define new API here using predicted next version number .... 
diff --git a/src/remote/remote_driver.c b/src/remote/remote_driver.c index 85fe597..563fac0 100644 --- a/src/remote/remote_driver.c +++ b/src/remote/remote_driver.c @@ -7469,6 +7469,55 @@ remoteDomainGetTime(virDomainPtr dom, } +static int +remoteNodeGetFreePages(virConnectPtr conn, + unsigned int npages, + unsigned int *pages, + int startCell, + unsigned int cellCount, + unsigned long long *counts, + unsigned int flags) +{ + int rv = -1; + remote_node_get_free_pages_args args; + remote_node_get_free_pages_ret ret; + struct private_data *priv = conn->privateData; + + remoteDriverLock(priv); + + if (npages * cellCount > REMOTE_NODE_MAX_CELLS) { + virReportError(VIR_ERR_RPC, + _("too many NUMA cells: %d > %d"), + npages * cellCount, REMOTE_NODE_MAX_CELLS); + goto done; + } + + if (VIR_ALLOC_N(args.pages.pages_val, npages) < 0) + goto done; + memcpy(args.pages.pages_val, pages, npages * sizeof(*pages)); + args.pages.pages_len = npages; + args.startCell = startCell; + args.cellCount = cellCount; + args.flags = flags; + + memset(&ret, 0, sizeof(ret)); + if (call(conn, priv, 0, REMOTE_PROC_NODE_GET_FREE_PAGES, + (xdrproc_t) xdr_remote_node_get_free_pages_args, (char *)&args, + (xdrproc_t) xdr_remote_node_get_free_pages_ret, (char *)&ret) == -1) + goto done; + + memcpy(counts, ret.counts.counts_val, ret.counts.counts_len * sizeof(*counts)); + + xdr_free((xdrproc_t) xdr_remote_node_get_free_pages_ret, (char *) &ret); + + rv = ret.counts.counts_len; + + done: + remoteDriverUnlock(priv); + return rv; +} + + /* get_nonnull_domain and get_nonnull_network turn an on-wire * (name, uuid) pair into virDomainPtr or virNetworkPtr object. * These can return NULL if underlying memory allocations fail, @@ -7805,6 +7854,7 @@ static virDriver remote_driver = { .domainFSThaw = remoteDomainFSThaw, /* 1.2.5 */ .domainGetTime = remoteDomainGetTime, /* 1.2.5 */ .domainSetTime = remoteDomainSetTime, /* 1.2.5 */ + .nodeGetFreePages = remoteNodeGetFreePages, /* 1.2.6 */ }; static virNetworkDriver network_driver = { diff --git a/src/remote/remote_protocol.x b/src/remote/remote_protocol.x index 1f9d583..ec4f3e2 100644 --- a/src/remote/remote_protocol.x +++ b/src/remote/remote_protocol.x @@ -2999,6 +2999,17 @@ struct remote_domain_fsthaw_ret { int filesystems; }; +struct remote_node_get_free_pages_args { + unsigned int pages<REMOTE_NODE_MAX_CELLS>; + int startCell; + unsigned int cellCount; + unsigned int flags; +}; + +struct remote_node_get_free_pages_ret { + unsigned hyper counts<REMOTE_NODE_MAX_CELLS>; +}; + /*----- Protocol. 
-----*/ @@ -5338,5 +5349,12 @@ enum remote_procedure { * @generate: both * @acl: domain:set_time */ - REMOTE_PROC_DOMAIN_SET_TIME = 338 + REMOTE_PROC_DOMAIN_SET_TIME = 338, + + /** + * @generate: none + * @priority: high + * @acl: connect:read + */ + REMOTE_PROC_NODE_GET_FREE_PAGES = 339 }; diff --git a/src/remote_protocol-structs b/src/remote_protocol-structs index 5b22049..6c51e75 100644 --- a/src/remote_protocol-structs +++ b/src/remote_protocol-structs @@ -2463,6 +2463,21 @@ struct remote_domain_fsthaw_args { struct remote_domain_fsthaw_ret { int filesystems; }; +struct remote_node_get_free_pages_args { + struct { + u_int pages_len; + u_int * pages_val; + } pages; + int startCell; + u_int cellCount; + u_int flags; +}; +struct remote_node_get_free_pages_ret { + struct { + u_int counts_len; + uint64_t * counts_val; + } counts; +}; enum remote_procedure { REMOTE_PROC_CONNECT_OPEN = 1, REMOTE_PROC_CONNECT_CLOSE = 2, @@ -2802,4 +2817,5 @@ enum remote_procedure { REMOTE_PROC_DOMAIN_FSTHAW = 336, REMOTE_PROC_DOMAIN_GET_TIME = 337, REMOTE_PROC_DOMAIN_SET_TIME = 338, + REMOTE_PROC_NODE_GET_FREE_PAGES = 339, }; -- 1.8.5.5
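A self-contained caller sketch for the new API (not from the patch; it assumes a host with at least two NUMA cells and omits error handling for brevity; page sizes are passed in KiB and @counts is laid out cell-major, as documented above):

    #include <stdio.h>
    #include <stdlib.h>
    #include <libvirt/libvirt.h>

    int main(void)
    {
        virConnectPtr conn = virConnectOpenReadOnly(NULL);
        unsigned int pages[] = { 4, 2048, 1048576 };   /* KiB */
        unsigned int npages = sizeof(pages) / sizeof(pages[0]);
        int startCell = 0;
        unsigned int cellCount = 2;
        unsigned long long *counts = calloc(npages * cellCount, sizeof(*counts));
        int i, j;

        if (virNodeGetFreePages(conn, npages, pages,
                                startCell, cellCount, counts, 0) < 0)
            goto cleanup;

        for (i = 0; i < (int) cellCount; i++) {
            printf("Cell %d\n", startCell + i);
            for (j = 0; j < (int) npages; j++)
                printf("  %u KiB pages free: %llu\n",
                       pages[j], counts[i * npages + j]);
        }

     cleanup:
        free(counts);
        virConnectClose(conn);
        return 0;
    }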

On Tue, Jun 10, 2014 at 07:21:13PM +0200, Michal Privoznik wrote:
The aim of the API is to get information on the number of free huge pages on the system. The API behaves similarly to virNodeGetCellsFreeMemory(): the user passes the starting NUMA cell, the count of nodes they are interested in, and the huge page sizes (yes, multiple sizes can be queried at once), and the counts are returned in an array.
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- daemon/remote.c | 52 ++++++++++++++++++++++++ include/libvirt/libvirt.h.in | 7 ++++ src/driver.h | 10 +++++ src/libvirt.c | 95 ++++++++++++++++++++++++++++++++++++++++++++ src/libvirt_public.syms | 4 ++ src/remote/remote_driver.c | 50 +++++++++++++++++++++++ src/remote/remote_protocol.x | 20 +++++++++- src/remote_protocol-structs | 16 ++++++++ 8 files changed, 253 insertions(+), 1 deletion(-)
ACK Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

The new API is exposed under 'freepages' command. Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- tools/virsh-host.c | 167 +++++++++++++++++++++++++++++++++++++++++++++++++++++ tools/virsh.pod | 8 +++ 2 files changed, 175 insertions(+) diff --git a/tools/virsh-host.c b/tools/virsh-host.c index 8091437..19a8d87 100644 --- a/tools/virsh-host.c +++ b/tools/virsh-host.c @@ -193,6 +193,167 @@ cmdFreecell(vshControl *ctl, const vshCmd *cmd) return ret; } + +/* + * "freepages" command + */ +static const vshCmdInfo info_freepages[] = { + {.name = "help", + .data = N_("NUMA free memory") + }, + {.name = "desc", + .data = N_("display available free memory for the NUMA cell.") + }, + {.name = NULL} +}; + +static const vshCmdOptDef opts_freepages[] = { + {.name = "cellno", + .type = VSH_OT_INT, + .help = N_("NUMA cell number") + }, + {.name = "pagesize", + .type = VSH_OT_INT, + .help = N_("huge page size (in kibibites)") + }, + {.name = "all", + .type = VSH_OT_BOOL, + .help = N_("show free huge pages for all NUMA cells") + }, + {.name = NULL} +}; + +static bool +cmdFreepages(vshControl *ctl, const vshCmd *cmd) +{ + bool ret = false; + unsigned int npages; + unsigned int *pagesize = NULL; + int cell; + unsigned long long *counts = NULL; + size_t i, j; + xmlNodePtr *nodes = NULL; + int nodes_cnt; + char *cap_xml = NULL; + xmlDocPtr doc = NULL; + xmlXPathContextPtr ctxt = NULL; + bool all = vshCommandOptBool(cmd, "all"); + bool cellno = vshCommandOptBool(cmd, "cellno"); + + VSH_EXCLUSIVE_OPTIONS_VAR(all, cellno); + + if (all) { + if (!(cap_xml = virConnectGetCapabilities(ctl->conn))) { + vshError(ctl, "%s", _("unable to get node capabilities")); + goto cleanup; + } + + if (!(doc = virXMLParseStringCtxt(cap_xml, _("capabilities"), &ctxt))) { + vshError(ctl, "%s", _("unable to parse node capabilities")); + goto cleanup; + } + + nodes_cnt = virXPathNodeSet("/capabilities/host/cpu/pages", ctxt, &nodes); + + if (nodes_cnt <= 0) { + vshError(ctl, "%s", _("could not get information about " + "supported huge pages")); + goto cleanup; + } + + pagesize = vshMalloc(ctl, nodes_cnt * sizeof(*pagesize)); + + for (i = 0; i < nodes_cnt; i++) { + char *val = virXMLPropString(nodes[i], "size"); + + if (virStrToLong_ui(val, NULL, 10, &pagesize[i]) < 0) { + vshError(ctl, _("unable to parse page size: %s"), val); + VIR_FREE(val); + goto cleanup; + } + + VIR_FREE(val); + } + + npages = nodes_cnt; + VIR_FREE(nodes); + + counts = vshMalloc(ctl, npages * sizeof(*counts)); + + nodes_cnt = virXPathNodeSet("/capabilities/host/topology/cells/cell", + ctxt, &nodes); + for (i = 0; i < nodes_cnt; i++) { + char *val = virXMLPropString(nodes[i], "id"); + + if (virStrToLong_i(val, NULL, 10, &cell) < 0) { + vshError(ctl, _("unable to parse numa node id: %s"), val); + VIR_FREE(val); + goto cleanup; + } + VIR_FREE(val); + + if (virNodeGetFreePages(ctl->conn, npages, pagesize, + cell, 1, counts, 0) < 0) + goto cleanup; + + vshPrint(ctl, _("Node %d:\n"), cell); + for (j = 0; j < npages; j++) { + vshPrint(ctl, "%uKiB: %lld\n", pagesize[j], counts[j]); + } + vshPrint(ctl, "%c", '\n'); + } + + } else { + if (!cellno) { + vshError(ctl, "%s", _("missing cellno argument")); + goto cleanup; + } + + if (vshCommandOptInt(cmd, "cellno", &cell) < 0) { + vshError(ctl, "%s", _("Invalid cellno argument")); + goto cleanup; + } + + if (cell < -1) { + vshError(ctl, "%s", _("cell number must be non-negative integer or -1")); + goto cleanup; + } + + pagesize = vshMalloc(ctl, sizeof(*pagesize)); + if (vshCommandOptScaledInt(cmd, "pagesize", (unsigned 
long long *) pagesize, + 1, UINT_MAX) < 0) { + vshError(ctl, "%s", _("page size has to be a number")); + goto cleanup; + } + + /* page size is expected in kibibytes */ + pagesize[0] /= 1024; + + if (!pagesize[0]) { + vshError(ctl, "%s", _("page size must be at least 1KiB")); + goto cleanup; + } + + counts = vshMalloc(ctl, sizeof(*counts)); + + if (virNodeGetFreePages(ctl->conn, 1, pagesize, cell, 1, counts, 0) < 0) + goto cleanup; + + vshPrint(ctl, "%uKiB: %lld\n", *pagesize, counts[0]); + } + + ret = true; + cleanup: + xmlXPathFreeContext(ctxt); + xmlFreeDoc(doc); + VIR_FREE(cap_xml); + VIR_FREE(nodes); + VIR_FREE(counts); + VIR_FREE(pagesize); + return ret; +} + + /* * "maxvcpus" command */ @@ -977,6 +1138,12 @@ const vshCmdDef hostAndHypervisorCmds[] = { .info = info_freecell, .flags = 0 }, + {.name = "freepages", + .handler = cmdFreepages, + .opts = opts_freepages, + .info = info_freepages, + .flags = 0 + }, {.name = "hostname", .handler = cmdHostname, .opts = NULL, diff --git a/tools/virsh.pod b/tools/virsh.pod index 80501f9..3aede8d 100644 --- a/tools/virsh.pod +++ b/tools/virsh.pod @@ -511,6 +511,14 @@ cell and the total free memory on the machine. Finally, with a numeric argument or with --cellno plus a cell number it will display the free memory for the specified cell only. +=item B<freepages> [{ [I<--cellno>] I<cellno> [I<--pagesize>] I<pagesize> | + I<--all> }] + +Prints the available amount of huge pages within a NUMA cell. I<cellno> refers +to the NUMA cell you're interested in. I<pagesize> is a scaled integer (see +B<NOTES> above). Alternatively, if I<--all> is used, info on each possible +combination of NUMA cell and huge page size is printed out. + =item B<cpu-baseline> I<FILE> [I<--features>] Compute baseline CPU which will be supported by all host CPUs given in <file>. -- 1.8.5.5
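Going by the vshPrint() calls above, a hypothetical invocation such as 'virsh freepages --cellno 0 --pagesize 2M' prints a single line of the form '2048KiB: <count>' (the scaled byte value is converted to KiB internally), while 'virsh freepages --all' prints a 'Node <n>:' header per NUMA cell followed by one '<size>KiB: <count>' line for each huge page size advertised in the capabilities XML.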

Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/qemu/qemu_driver.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 1191255..4d6a300 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -16868,6 +16868,48 @@ qemuDomainFSThaw(virDomainPtr dom, } +static int +qemuNodeGetFreePages(virConnectPtr conn, + unsigned int npages, + unsigned int *pages, + int startCell, + unsigned int cellCount, + unsigned long long *counts, + unsigned int flags) +{ + int ret = -1; + int cell; + size_t i, ncounts = 0; + + virCheckFlags(0, ret); + + if (virNodeGetFreePagesEnsureACL(conn) < 0) + return ret; + + for (cell = startCell; cell < (int) (startCell + cellCount); cell++) { + for (i = 0; i < npages; i++) { + unsigned int page_size = pages[i]; + unsigned int page_free; + + if (virNumaGetHugePageInfo(cell, page_size, NULL, &page_free) < 0) + goto cleanup; + + counts[ncounts++] = page_free; + } + } + + if (!ncounts) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("no suitable info found")); + goto cleanup; + } + + ret = ncounts; + cleanup: + return ret; +} + + static virDriver qemuDriver = { .no = VIR_DRV_QEMU, .name = QEMU_DRIVER_NAME, @@ -17062,6 +17104,7 @@ static virDriver qemuDriver = { .domainFSThaw = qemuDomainFSThaw, /* 1.2.5 */ .domainGetTime = qemuDomainGetTime, /* 1.2.5 */ .domainSetTime = qemuDomainSetTime, /* 1.2.5 */ + .nodeGetFreePages = qemuNodeGetFreePages, /* 1.2.6 */ }; -- 1.8.5.5

On Tue, Jun 10, 2014 at 07:21:15PM +0200, Michal Privoznik wrote:
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/qemu/qemu_driver.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+)
This should probably live in nodeinfo.c and be shared across QEMU, lxc and uml drivers. Can't share it with Xen, since the hypervisor owns page info, not Dom0 Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|
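For illustration, the shared helper Daniel suggests might look roughly like this in src/nodeinfo.c (a sketch; the name and placement are assumptions, and each driver's nodeGetFreePages callback would wrap it with its own ACL check, as the QEMU patch above does):

    /* Hypothetical shared implementation, factored out of
     * qemuNodeGetFreePages() above. */
    int
    nodeGetFreePages(unsigned int npages,
                     unsigned int *pages,
                     int startCell,
                     unsigned int cellCount,
                     unsigned long long *counts)
    {
        int cell;
        size_t i, ncounts = 0;

        for (cell = startCell; cell < (int) (startCell + cellCount); cell++) {
            for (i = 0; i < npages; i++) {
                unsigned int page_free;

                if (virNumaGetHugePageInfo(cell, pages[i], NULL, &page_free) < 0)
                    return -1;

                counts[ncounts++] = page_free;
            }
        }

        if (!ncounts) {
            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                           _("no suitable info found"));
            return -1;
        }

        return ncounts;
    }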
Participants (3): Daniel P. Berrange, Martin Kletzander, Michal Privoznik