[libvirt] [PATCH 0/3] Couple of huge pages improvements

*** BLURB HERE ***

Michal Privoznik (3):
  virNumaGetPageInfo: Take huge pages into account
  virNumaGetPages: Don't fail on huge page-less systems
  virnuma: Actually build huge page code

 src/nodeinfo.c     |  2 +-
 src/util/virnuma.c | 87 +++++++++++++++++++++++++++++++-----------------------
 src/util/virnuma.h |  1 +
 3 files changed, 52 insertions(+), 38 deletions(-)

--
1.8.5.5

On the Linux kernel, if huge pages are allocated, the size they cut off from memory is accounted under 'MemUsed' in the meminfo file. However, we want the sum to be subtracted from 'MemTotal'. This patch implements that. After this change, we can enable reporting of the ordinary system pages in the capability XML:

<capabilities>
  <host>
    <uuid>01281cda-f352-cb11-a9db-e905fe22010c</uuid>
    <cpu>
      <arch>x86_64</arch>
      <model>Haswell</model>
      <vendor>Intel</vendor>
      <topology sockets='1' cores='1' threads='1'/>
      <feature/>
      <pages unit='KiB' size='4'/>
      <pages unit='KiB' size='2048'/>
      <pages unit='KiB' size='1048576'/>
    </cpu>
    <power_management/>
    <migration_features/>
    <topology>
      <cells num='4'>
        <cell id='0'>
          <memory unit='KiB'>4048248</memory>
          <pages unit='KiB' size='4'>748382</pages>
          <pages unit='KiB' size='2048'>3</pages>
          <pages unit='KiB' size='1048576'>1</pages>
          <distances/>
          <cpus num='1'>
            <cpu id='0' socket_id='0' core_id='0' siblings='0'/>
          </cpus>
        </cell>
        ...
      </cells>
    </topology>
  </host>
</capabilities>

You can see the beautiful thing about this: if you sum up all the <pages/> you'll get <memory/>.

Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
---
 src/nodeinfo.c     |  2 +-
 src/util/virnuma.c | 65 +++++++++++++++++++++++++++++++-----------------------
 src/util/virnuma.h |  1 +
 3 files changed, 39 insertions(+), 29 deletions(-)

diff --git a/src/nodeinfo.c b/src/nodeinfo.c
index 8fe7adc..2c1c437 100644
--- a/src/nodeinfo.c
+++ b/src/nodeinfo.c
@@ -2038,7 +2038,7 @@ nodeGetFreePages(unsigned int npages,
         unsigned int page_size = pages[i];
         unsigned int page_free;

-        if (virNumaGetPageInfo(cell, page_size, NULL, &page_free) < 0)
+        if (virNumaGetPageInfo(cell, page_size, 0, NULL, &page_free) < 0)
             goto cleanup;

         counts[ncounts++] = page_free;
diff --git a/src/util/virnuma.c b/src/util/virnuma.c
index 1745649..a176852 100644
--- a/src/util/virnuma.c
+++ b/src/util/virnuma.c
@@ -640,6 +640,8 @@ virNumaGetHugePageInfo(int node,
 * virNumaGetPageInfo:
 * @node: NUMA node id
 * @page_size: which huge page are we interested in (in KiB)
+ * @huge_page_sum: the sum of memory taken by huge pages (in
+ * bytes)
 * @page_avail: total number of huge pages in the pool
 * @page_free: the number of free huge pages in the pool
 *
@@ -647,6 +649,13 @@ virNumaGetHugePageInfo(int node,
 * total number of pages in the pool (both free and taken)
 * and count for free pages in the pool.
 *
+ * The @huge_page_sum parameter exists there due to the Linux
+ * kernel limitation. The problem is, if there are some huge
+ * pages allocated, they are accounted under the 'MemUsed' field
+ * in the meminfo file instead of being subtracted from the
+ * 'MemTotal'. We must do the subtraction ourselves.
+ * If unsure, pass 0.
+ *
 * If you're interested in just one bit, pass NULL to the other one.
 *
 * As a special case, if @node == -1, overall info is fetched
@@ -657,6 +666,7 @@ virNumaGetHugePageInfo(int node,
 int
 virNumaGetPageInfo(int node,
                    unsigned int page_size,
+                   unsigned long long huge_page_sum,
                    unsigned int *page_avail,
                    unsigned int *page_free)
 {
@@ -666,7 +676,6 @@ virNumaGetPageInfo(int node,
     /* sysconf() returns page size in bytes,
      * the @page_size is however in kibibytes */
     if (page_size == system_page_size / 1024) {
-# if 0
         unsigned long long memsize, memfree;

         /* TODO: come up with better algorithm that takes huge pages into
@@ -679,16 +688,14 @@ virNumaGetPageInfo(int node,
             goto cleanup;
         }

+        /* see description above */
+        memsize -= huge_page_sum;
+
         if (page_avail)
             *page_avail = memsize / system_page_size;

         if (page_free)
             *page_free = memfree / system_page_size;
-# else
-        virReportError(VIR_ERR_ARGUMENT_UNSUPPORTED, "%s",
-                       _("system page size are not supported yet"));
-        goto cleanup;
-# endif /* 0 */
     } else {
         if (virNumaGetHugePageInfo(node, page_size,
                                    page_avail, page_free) < 0)
             goto cleanup;
@@ -737,31 +744,18 @@ virNumaGetPages(int node,
     unsigned int ntmp = 0;
     size_t i;
     bool exchange;
-
-# if 0
-    /* This has to be disabled until the time the issue in
-     * virNumaGetPageInfo is resolved. Sorry. */
     long system_page_size;
+    unsigned long long huge_page_sum = 0;

     /* sysconf() returns page size in bytes,
      * but we are storing the page size in kibibytes. */
     system_page_size = sysconf(_SC_PAGESIZE) / 1024;

-    /* We know that ordinary system pages are supported
-     * if nothing else is. */
-    if (VIR_REALLOC_N(tmp_size, 1) < 0 ||
-        VIR_REALLOC_N(tmp_avail, 1) < 0 ||
-        VIR_REALLOC_N(tmp_free, 1) < 0)
-        goto cleanup;
-
-    if (virNumaGetPageInfo(node, system_page_size,
-                           &tmp_avail[ntmp], &tmp_free[ntmp]) < 0)
-        goto cleanup;
-    tmp_size[ntmp] = system_page_size;
-    ntmp++;
-# endif /* 0 */
-
-    /* Now that we got ordinary system pages, lets get info on huge pages */
+    /* Query huge pages at first.
+     * On Linux systems, the huge pages pool cuts off the available memory and
+     * is always shown as used memory. Here, however, we want to report
+     * slightly different information. So we take the total memory on a node
+     * and subtract memory taken by the huge pages. */
     if (virNumaGetHugePageInfoPath(&path, node, 0, NULL) < 0)
         goto cleanup;
@@ -792,9 +786,7 @@ virNumaGetPages(int node,
             goto cleanup;
         }

-        /* Querying more detailed info makes sense only sometimes */
-        if ((pages_avail || pages_free) &&
-            virNumaGetHugePageInfo(node, page_size,
+        if (virNumaGetHugePageInfo(node, page_size,
                                    &page_avail, &page_free) < 0)
             goto cleanup;
@@ -807,11 +799,27 @@ virNumaGetPages(int node,
         tmp_avail[ntmp] = page_avail;
         tmp_free[ntmp] = page_free;
         ntmp++;
+
+        /* page_size is in kibibytes while we want huge_page_sum
+         * in just bytes. */
+        huge_page_sum += 1024 * page_size * page_avail;
     }

     if (direrr < 0)
         goto cleanup;

+    /* Now append the ordinary system pages */
+    if (VIR_REALLOC_N(tmp_size, ntmp + 1) < 0 ||
+        VIR_REALLOC_N(tmp_avail, ntmp + 1) < 0 ||
+        VIR_REALLOC_N(tmp_free, ntmp + 1) < 0)
+        goto cleanup;
+
+    if (virNumaGetPageInfo(node, system_page_size, huge_page_sum,
+                           &tmp_avail[ntmp], &tmp_free[ntmp]) < 0)
+        goto cleanup;
+    tmp_size[ntmp] = system_page_size;
+    ntmp++;
+
     /* Just to produce nice output, sort the arrays by increasing page size */
     do {
         exchange = false;
@@ -853,6 +861,7 @@ virNumaGetPages(int node,
 int
 virNumaGetPageInfo(int node ATTRIBUTE_UNUSED,
                    unsigned int page_size ATTRIBUTE_UNUSED,
+                   unsigned long long huge_page_sum ATTRIBUTE_UNUSED,
                    unsigned int *page_avail ATTRIBUTE_UNUSED,
                    unsigned int *page_free ATTRIBUTE_UNUSED)
 {
diff --git a/src/util/virnuma.h b/src/util/virnuma.h
index a7435dc..fbeb7a9 100644
--- a/src/util/virnuma.h
+++ b/src/util/virnuma.h
@@ -72,6 +72,7 @@ int virNumaGetNodeCPUs(int node, virBitmapPtr *cpus);
 int virNumaGetPageInfo(int node,
                        unsigned int page_size,
+                       unsigned long long huge_page_sum,
                        unsigned int *page_avail,
                        unsigned int *page_free);
 int virNumaGetPages(int node,
--
1.8.5.5
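[Editor's note: the arithmetic the commit message promises does check out for the sample cell: 748382 * 4 KiB + 3 * 2048 KiB + 1 * 1048576 KiB = 4048248 KiB, which is exactly the <memory/> value. The standalone sketch below is an illustration only, not part of the patch; it redoes the same accounting that virNumaGetPages() now performs: accumulate huge_page_sum in bytes from the per-pool totals, subtract it from the node's total memory, and express the remainder in ordinary system pages.]

    #include <stdio.h>

    int main(void)
    {
        /* Figures taken from cell 0 of the example capability XML above. */
        unsigned long long mem_total_kib = 4048248;  /* <memory unit='KiB'> */
        struct { unsigned int size_kib; unsigned int avail; } pools[] = {
            { 2048, 3 },       /* 2 MiB huge page pool */
            { 1048576, 1 },    /* 1 GiB huge page pool */
        };
        unsigned long long huge_page_sum = 0;  /* in bytes, as in the patch */
        size_t i;

        /* Same formula as the patch: 1024 * page_size(KiB) * page_avail. */
        for (i = 0; i < sizeof(pools) / sizeof(pools[0]); i++)
            huge_page_sum += 1024ULL * pools[i].size_kib * pools[i].avail;

        /* MemTotal still contains the huge page pools, so subtract them
         * before dividing by the system page size (4 KiB here). */
        unsigned long long ordinary =
            (mem_total_kib * 1024 - huge_page_sum) / 4096;

        printf("huge_page_sum: %llu bytes\n", huge_page_sum);
        printf("ordinary 4 KiB pages: %llu\n", ordinary);  /* prints 748382 */
        return 0;
    }

Note that the sum uses page_avail (the whole pool), not just the free pages: memory reserved for huge pages is gone from the ordinary pool whether or not a guest currently uses it.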

On 06/23/2014 03:59 PM, Michal Privoznik wrote:
On the Linux kernel, if huge pages are allocated, the size they cut off from memory is accounted under 'MemUsed' in the meminfo file. However, we want the sum to be subtracted from 'MemTotal'. This patch implements that. After this change, we can enable reporting of the ordinary system pages in the capability XML:
<capabilities>
  <host>
    <uuid>01281cda-f352-cb11-a9db-e905fe22010c</uuid>
    <cpu>
      <arch>x86_64</arch>
      <model>Haswell</model>
      <vendor>Intel</vendor>
      <topology sockets='1' cores='1' threads='1'/>
      <feature/>
      <pages unit='KiB' size='4'/>
      <pages unit='KiB' size='2048'/>
      <pages unit='KiB' size='1048576'/>
    </cpu>
    <power_management/>
    <migration_features/>
    <topology>
      <cells num='4'>
        <cell id='0'>
          <memory unit='KiB'>4048248</memory>
          <pages unit='KiB' size='4'>748382</pages>
          <pages unit='KiB' size='2048'>3</pages>
          <pages unit='KiB' size='1048576'>1</pages>
          <distances/>
          <cpus num='1'>
            <cpu id='0' socket_id='0' core_id='0' siblings='0'/>
          </cpus>
        </cell>
        ...
      </cells>
    </topology>
  </host>
</capabilities>
You can see the beautiful thing about this: if you sum up all the <pages/> you'll get <memory/>.
Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
---
 src/nodeinfo.c     |  2 +-
 src/util/virnuma.c | 65 +++++++++++++++++++++++++++++++-----------------------
 src/util/virnuma.h |  1 +
 3 files changed, 39 insertions(+), 29 deletions(-)

@@ -647,6 +649,13 @@ virNumaGetHugePageInfo(int node,
 * total number of pages in the pool (both free and taken)
 * and count for free pages in the pool.
 *
+ * The @huge_page_sum parameter exists there due to the Linux
s/there//
+ * kernel limitation. The problem is, if there are some huge
+ * pages allocated, they are accounted under the 'MemUsed' field
+ * in the meminfo file instead of being subtracted from the
+ * 'MemTotal'. We must do the subtraction ourselves.
+ * If unsure, pass 0.
+ *
 * If you're interested in just one bit, pass NULL to the other one.
 *
 * As a special case, if @node == -1, overall info is fetched
@@ -657,6 +666,7 @@ virNumaGetHugePageInfo(int node,
 int
 virNumaGetPageInfo(int node,
                    unsigned int page_size,
+                   unsigned long long huge_page_sum,
                    unsigned int *page_avail,
                    unsigned int *page_free)
 {
@@ -666,7 +676,6 @@ virNumaGetPageInfo(int node,
     /* sysconf() returns page size in bytes,
      * the @page_size is however in kibibytes */
     if (page_size == system_page_size / 1024) {
-# if 0
         unsigned long long memsize, memfree;

         /* TODO: come up with better algorithm that takes huge pages into
@@ -679,16 +688,14 @@ virNumaGetPageInfo(int node,
             goto cleanup;
         }

+        /* see description above */
I think you can just move the comment above here.
+        memsize -= huge_page_sum;
+
         if (page_avail)
             *page_avail = memsize / system_page_size;

         if (page_free)
             *page_free = memfree / system_page_size;
ACK

Jan

On 24.06.2014 10:15, Ján Tomko wrote:
On 06/23/2014 03:59 PM, Michal Privoznik wrote:
On the Linux kernel, if huge pages are allocated, the size they cut off from memory is accounted under 'MemUsed' in the meminfo file. However, we want the sum to be subtracted from 'MemTotal'. This patch implements that. After this change, we can enable reporting of the ordinary system pages in the capability XML:
<capabilities>
  <host>
    <uuid>01281cda-f352-cb11-a9db-e905fe22010c</uuid>
    <cpu>
      <arch>x86_64</arch>
      <model>Haswell</model>
      <vendor>Intel</vendor>
      <topology sockets='1' cores='1' threads='1'/>
      <feature/>
      <pages unit='KiB' size='4'/>
      <pages unit='KiB' size='2048'/>
      <pages unit='KiB' size='1048576'/>
    </cpu>
    <power_management/>
    <migration_features/>
    <topology>
      <cells num='4'>
        <cell id='0'>
          <memory unit='KiB'>4048248</memory>
          <pages unit='KiB' size='4'>748382</pages>
          <pages unit='KiB' size='2048'>3</pages>
          <pages unit='KiB' size='1048576'>1</pages>
          <distances/>
          <cpus num='1'>
            <cpu id='0' socket_id='0' core_id='0' siblings='0'/>
          </cpus>
        </cell>
        ...
      </cells>
    </topology>
  </host>
</capabilities>
You can see the beautiful thing about this: if you sum up all the <pages/> you'll get <memory/>.
Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
---
 src/nodeinfo.c     |  2 +-
 src/util/virnuma.c | 65 +++++++++++++++++++++++++++++++-----------------------
 src/util/virnuma.h |  1 +
 3 files changed, 39 insertions(+), 29 deletions(-)

@@ -647,6 +649,13 @@ virNumaGetHugePageInfo(int node,
 * total number of pages in the pool (both free and taken)
 * and count for free pages in the pool.
 *
+ * The @huge_page_sum parameter exists there due to the Linux
s/there//
Fixed.
+ * kernel limitation. The problem is, if there are some huge
+ * pages allocated, they are accounted under the 'MemUsed' field
+ * in the meminfo file instead of being subtracted from the
+ * 'MemTotal'. We must do the subtraction ourselves.
+ * If unsure, pass 0.
+ *
 * If you're interested in just one bit, pass NULL to the other one.
 *
 * As a special case, if @node == -1, overall info is fetched
@@ -657,6 +666,7 @@ virNumaGetHugePageInfo(int node,
 int
 virNumaGetPageInfo(int node,
                    unsigned int page_size,
+                   unsigned long long huge_page_sum,
                    unsigned int *page_avail,
                    unsigned int *page_free)
 {
@@ -666,7 +676,6 @@ virNumaGetPageInfo(int node,
     /* sysconf() returns page size in bytes,
      * the @page_size is however in kibibytes */
     if (page_size == system_page_size / 1024) {
-# if 0
         unsigned long long memsize, memfree;

         /* TODO: come up with better algorithm that takes huge pages into
@@ -679,16 +688,14 @@ virNumaGetPageInfo(int node,
             goto cleanup;
         }

+        /* see description above */
I think you can just move the comment above here.
This one I'd really like in the function description. I mean, the @huge_page_sum is a function argument which certainly deserves a description and the aim of documenting internal functions is to not force other programmers to actually read the function code. So I'm keeping it there.
+        memsize -= huge_page_sum;
+
         if (page_avail)
             *page_avail = memsize / system_page_size;

         if (page_free)
             *page_free = memfree / system_page_size;
ACK
Jan
Michal

On 06/24/2014 11:56 AM, Michal Privoznik wrote:
On 24.06.2014 10:15, Ján Tomko wrote:
On 06/23/2014 03:59 PM, Michal Privoznik wrote:
+ * kernel limitation. The problem is, if there are some huge
+ * pages allocated, they are accounted under the 'MemUsed' field
+ * in the meminfo file instead of being subtracted from the
+ * 'MemTotal'. We must do the subtraction ourselves.
+ * If unsure, pass 0.
+ *
 * If you're interested in just one bit, pass NULL to the other one.
 *
 * As a special case, if @node == -1, overall info is fetched
@@ -657,6 +666,7 @@ virNumaGetHugePageInfo(int node,
 int
 virNumaGetPageInfo(int node,
                    unsigned int page_size,
+                   unsigned long long huge_page_sum,
                    unsigned int *page_avail,
                    unsigned int *page_free)
 {
@@ -666,7 +676,6 @@ virNumaGetPageInfo(int node,
     /* sysconf() returns page size in bytes,
      * the @page_size is however in kibibytes */
     if (page_size == system_page_size / 1024) {
-# if 0
         unsigned long long memsize, memfree;

         /* TODO: come up with better algorithm that takes huge pages into
@@ -679,16 +688,14 @@ virNumaGetPageInfo(int node,
             goto cleanup;
         }

+        /* see description above */
I think you can just move the comment above here.
This one I'd really like in the function description. I mean, the @huge_page_sum is a function argument which certainly deserves a description and the aim of documenting internal functions is to not force other programmers to actually read the function code. So I'm keeping it there.
Oh, I thought it was pointing at the TODO comment:

/* TODO: come up with better algorithm that takes huge pages into
 * account. The problem is huge pages cut off regular memory. */

Anyway, the comment "see description above" seems pointless to me.

Jan

If we are running on a system that is not capable of huge pages (e.g. because the kernel is not configured that way), we still try to open "/sys/kernel/mm/hugepages/", which does not exist. We should be tolerant of this specific use case.

Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
---
 src/util/virnuma.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/util/virnuma.c b/src/util/virnuma.c
index a176852..4901378 100644
--- a/src/util/virnuma.c
+++ b/src/util/virnuma.c
@@ -738,7 +738,7 @@ virNumaGetPages(int node,
     int ret = -1;
     char *path = NULL;
     DIR *dir = NULL;
-    int direrr;
+    int direrr = 0;
     struct dirent *entry;
     unsigned int *tmp_size = NULL, *tmp_avail = NULL, *tmp_free = NULL;
     unsigned int ntmp = 0;
@@ -760,13 +760,17 @@ virNumaGetPages(int node,
         goto cleanup;

     if (!(dir = opendir(path))) {
-        virReportSystemError(errno,
-                             _("unable to open path: %s"),
-                             path);
-        goto cleanup;
+        /* It's okay if the @path doesn't exist. Maybe we are running on
+         * system without huge pages support where the path may not exist. */
+        if (errno != ENOENT) {
+            virReportSystemError(errno,
+                                 _("unable to open path: %s"),
+                                 path);
+            goto cleanup;
+        }
     }

-    while ((direrr = virDirRead(dir, &entry, path)) > 0) {
+    while (dir && (direrr = virDirRead(dir, &entry, path)) > 0) {
         const char *page_name = entry->d_name;
         unsigned int page_size, page_avail = 0, page_free = 0;
         char *end;
--
1.8.5.5
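[Editor's note: the behavioural change is easier to see in isolation. Below is a minimal sketch of the same tolerant-open pattern using plain POSIX calls instead of libvirt's virDirRead() wrapper; the function name is made up for illustration.]

    #include <stdio.h>
    #include <errno.h>
    #include <dirent.h>

    /* Returns 0 on success, -1 on error. A missing directory means the
     * kernel was built without huge page support, so we report success
     * with an empty result instead of failing. */
    static int list_hugepage_pools(const char *path)
    {
        DIR *dir = opendir(path);
        struct dirent *entry;

        if (!dir) {
            if (errno == ENOENT)
                return 0;  /* no huge pages on this system; nothing to list */
            perror(path);  /* any other errno is still a real error */
            return -1;
        }

        while ((entry = readdir(dir)) != NULL) {
            if (entry->d_name[0] != '.')  /* skip "." and ".." */
                printf("%s\n", entry->d_name);
        }

        closedir(dir);
        return 0;
    }

    int main(void)
    {
        return list_hugepage_pools("/sys/kernel/mm/hugepages/") ? 1 : 0;
    }

The patch applies the same idea, and additionally initializes direrr to 0 and guards the read loop with 'dir &&', so the later 'if (direrr < 0)' check stays well defined when the directory was never opened.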

On 06/23/2014 03:59 PM, Michal Privoznik wrote:
If we are running on a system that is not capable of huge pages (e.g. because the kernel is not configured that way), we still try to open "/sys/kernel/mm/hugepages/", which does not exist. We should be tolerant of this specific use case.
Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
---
 src/util/virnuma.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)
ACK Jan

One of the previous commits (e6258a33) tried to build the huge page code only on Linux, since it is indeed Linux-centric. But it failed miserably, as it used 'WITH_LINUX', which is an Automake conditional, not a preprocessor macro. In the sources we need to use __linux__.

Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
---
 src/util/virnuma.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/util/virnuma.c b/src/util/virnuma.c
index 4901378..a0fa31c 100644
--- a/src/util/virnuma.c
+++ b/src/util/virnuma.c
@@ -513,7 +513,7 @@ virNumaGetDistances(int node ATTRIBUTE_UNUSED,

 /* currently all the hugepage stuff below is linux only */
-#if WITH_LINUX
+#if __linux__

 # define HUGEPAGES_NUMA_PREFIX "/sys/devices/system/node/"
 # define HUGEPAGES_SYSTEM_PREFIX "/sys/kernel/mm/hugepages/"
@@ -861,7 +861,7 @@ virNumaGetPages(int node,
 }

-#else /* #if WITH_LINUX */
+#else /* #ifdef __linux__ */
 int
 virNumaGetPageInfo(int node ATTRIBUTE_UNUSED,
                    unsigned int page_size ATTRIBUTE_UNUSED,
@@ -886,4 +886,4 @@ virNumaGetPages(int node ATTRIBUTE_UNUSED,
                    _("page info is not supported on this platform"));
     return -1;
 }
-#endif /* #if WITH_LINUX */
+#endif /* #ifdef __linux__ */
--
1.8.5.5
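[Editor's note: for context on why the original guard compiled but never triggered: per the commit message, WITH_LINUX is an Automake conditional, so it lives in Makefile.am and is never defined for the C preprocessor; an identifier that is undefined at preprocessing time evaluates to 0 in an '#if' expression, so '#if WITH_LINUX' silently disabled the code on every platform. A tiny self-contained illustration of the working guard:]

    #include <stdio.h>

    int main(void)
    {
    #ifdef __linux__
        /* __linux__ is predefined by the compiler for Linux targets. */
        puts("huge page code would be compiled in");
    #else
        puts("page info is not supported on this platform");
    #endif
        return 0;
    }

Note that '#ifdef __linux__' states the intent more clearly than '#if __linux__' (the review below asks for exactly that), although '#if' also happens to work here because the macro is predefined to 1 on Linux.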

On 06/23/14 15:59, Michal Privoznik wrote:
One of the previous commits (e6258a33) tried to build the huge page code only on Linux, since it is indeed Linux-centric. But it failed miserably, as it used 'WITH_LINUX', which is an Automake conditional, not a preprocessor macro. In the sources we need to use __linux__.
Sigh. I shouldn't touch any code today ...
Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
---
 src/util/virnuma.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/util/virnuma.c b/src/util/virnuma.c
index 4901378..a0fa31c 100644
--- a/src/util/virnuma.c
+++ b/src/util/virnuma.c
@@ -513,7 +513,7 @@ virNumaGetDistances(int node ATTRIBUTE_UNUSED,

 /* currently all the hugepage stuff below is linux only */
-#if WITH_LINUX
+#if __linux__
#ifdef
 # define HUGEPAGES_NUMA_PREFIX "/sys/devices/system/node/"
 # define HUGEPAGES_SYSTEM_PREFIX "/sys/kernel/mm/hugepages/"
@@ -861,7 +861,7 @@ virNumaGetPages(int node,
 }

-#else /* #if WITH_LINUX */
+#else /* #ifdef __linux__ */
 int
 virNumaGetPageInfo(int node ATTRIBUTE_UNUSED,
                    unsigned int page_size ATTRIBUTE_UNUSED,
@@ -886,4 +886,4 @@ virNumaGetPages(int node ATTRIBUTE_UNUSED,
                    _("page info is not supported on this platform"));
     return -1;
 }
-#endif /* #if WITH_LINUX */
+#endif /* #ifdef __linux__ */
ACK with the macro fixed.

Peter