[libvirt] [PATCH 0/4] Add API to manipulate huge pages pool

This may seem like a cherry on top of the cake, but once we allow guests to use huge pages we must allow admins to allocate ones. Michal Privoznik (4): Introduce virNodeAllocPages virnuma: Introduce virNumaSetPagePoolSize nodeinfo: Implement nodeAllocPages virsh: Expose virNodeAllocPages daemon/remote.c | 37 ++++++++++++ include/libvirt/libvirt.h.in | 16 ++++++ src/driver.h | 10 ++++ src/libvirt.c | 67 ++++++++++++++++++++++ src/libvirt_private.syms | 2 + src/libvirt_public.syms | 1 + src/lxc/lxc_driver.c | 22 +++++++ src/nodeinfo.c | 29 ++++++++++ src/nodeinfo.h | 7 +++ src/qemu/qemu_driver.c | 22 +++++++ src/remote/remote_driver.c | 47 +++++++++++++++ src/remote/remote_protocol.x | 20 ++++++- src/remote_protocol-structs | 17 ++++++ src/uml/uml_driver.c | 22 +++++++ src/util/virnuma.c | 108 ++++++++++++++++++++++++++++++++++ src/util/virnuma.h | 4 ++ src/vbox/vbox_common.c | 19 ++++++ tools/virsh-host.c | 134 +++++++++++++++++++++++++++++++++++++++++++ tools/virsh.pod | 12 ++++ 19 files changed, 595 insertions(+), 1 deletion(-) -- 1.8.5.5

A long time ago in a galaxy far, far away it has been decided that libvirt will manage not only domains but host as well. And with my latest work on qemu driver supporting huge pages, we miss the cherry on top: an API to allocate huge pages on the run. Currently users are forced to log into the host and adjust the huge pages pool themselves. However, with this API the problem is gone - they can both size up and size down the pool. Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- daemon/remote.c | 37 ++++++++++++++++++++++++ include/libvirt/libvirt.h.in | 16 +++++++++++ src/driver.h | 10 +++++++ src/libvirt.c | 67 ++++++++++++++++++++++++++++++++++++++++++++ src/libvirt_public.syms | 1 + src/remote/remote_driver.c | 47 +++++++++++++++++++++++++++++++ src/remote/remote_protocol.x | 20 ++++++++++++- src/remote_protocol-structs | 17 +++++++++++ 8 files changed, 214 insertions(+), 1 deletion(-) diff --git a/daemon/remote.c b/daemon/remote.c index daa4b60..020772d 100644 --- a/daemon/remote.c +++ b/daemon/remote.c @@ -6471,6 +6471,43 @@ remoteDispatchConnectGetAllDomainStats(virNetServerPtr server ATTRIBUTE_UNUSED, } +static int +remoteDispatchNodeAllocPages(virNetServerPtr server ATTRIBUTE_UNUSED, + virNetServerClientPtr client, + virNetMessagePtr msg ATTRIBUTE_UNUSED, + virNetMessageErrorPtr rerr, + remote_node_alloc_pages_args *args, + remote_node_alloc_pages_ret *ret) +{ + int rv = -1; + int len; + struct daemonClientPrivate *priv = + virNetServerClientGetPrivateData(client); + + if (!priv->conn) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("connection not open")); + goto cleanup; + } + + if ((len = virNodeAllocPages(priv->conn, + args->pageSizes.pageSizes_len, + args->pageSizes.pageSizes_val, + (unsigned long long *) args->pageCounts.pageCounts_val, + args->startCell, + args->cellCount, + args->flags)) < 0) + goto cleanup; + + ret->ret = len; + rv = 0; + + cleanup: + if (rv < 0) + virNetMessageSaveError(rerr); + return rv; +} + + /*----- Helpers. 
-----*/ /* get_nonnull_domain and get_nonnull_network turn an on-wire diff --git a/include/libvirt/libvirt.h.in b/include/libvirt/libvirt.h.in index 724314e..1a47bae 100644 --- a/include/libvirt/libvirt.h.in +++ b/include/libvirt/libvirt.h.in @@ -5505,6 +5505,22 @@ int virNodeGetFreePages(virConnectPtr conn, unsigned int cellcount, unsigned long long *counts, unsigned int flags); + +typedef enum { + VIR_NODE_ALLOC_PAGES_ADD = 0, /* Add @pageCounts to the pages pool. This + can be used only to size up the pool. */ + VIR_NODE_ALLOC_PAGES_SET = (1 << 0), /* Don't add @pageCounts, instead set + passed number of pages. This can be + used to free allocated pages. */ +} virNodeAllocPagesFlags; + +int virNodeAllocPages(virConnectPtr conn, + unsigned int npages, + unsigned int *pageSizes, + unsigned long long *pageCounts, + int startCell, + unsigned int cellCount, + unsigned int flags); /** * virSchedParameterType: * diff --git a/src/driver.h b/src/driver.h index bb748c4..dc62a8e 100644 --- a/src/driver.h +++ b/src/driver.h @@ -1212,6 +1212,15 @@ typedef int virDomainStatsRecordPtr **retStats, unsigned int flags); +typedef int +(*virDrvNodeAllocPages)(virConnectPtr conn, + unsigned int npages, + unsigned int *pageSizes, + unsigned long long *pageCounts, + int startCell, + unsigned int cellCount, + unsigned int flags); + typedef struct _virDriver virDriver; typedef virDriver *virDriverPtr; @@ -1435,6 +1444,7 @@ struct _virDriver { virDrvNodeGetFreePages nodeGetFreePages; virDrvConnectGetDomainCapabilities connectGetDomainCapabilities; virDrvConnectGetAllDomainStats connectGetAllDomainStats; + virDrvNodeAllocPages nodeAllocPages; }; diff --git a/src/libvirt.c b/src/libvirt.c index 7c63825..27445e5 100644 --- a/src/libvirt.c +++ b/src/libvirt.c @@ -21841,3 +21841,70 @@ virDomainStatsRecordListFree(virDomainStatsRecordPtr *stats) VIR_FREE(stats); } + + +/** + * virNodeAllocPages: + * @conn: pointer to the hypervisor connection + * @npages: number of items in the @pageSizes 
array + * @pageSizes: which huge page sizes to allocate + * @pageCounts: how many pages should be allocated + * @startCell: index of first cell to allocate pages on + * @cellCount: number of consecutive cells to allocate pages on + * @flags: extra flags; binary-OR of virConnectGetAllDomainStatsFlags + * + * Sometimes, when trying to start a new domain, it may be + * necessary to reserve some huge pages in the system pool which + * can be then allocated by the domain. This API serves that + * purpose. On its input, @pageSizes and @pageCounts are arrays + * of the same cardinality of @npages. The @pageSizes contains + * page sizes which are to be allocated in the system (the size + * unit is kibibytes), and @pageCounts then contains the number + * of pages to reserve. Depending on @flags specified, the number + * of pages is either added into the pool and the pool is sized + * up (@flags have VIR_NODE_ALLOC_PAGES_ADD set) or the number of + * pages is looked at the new size of pool and the pool can be + * both sized up or down (@flags have VIR_NODE_ALLOC_PAGES_SET + * set). The pages pool can be allocated over several NUMA nodes + * at once, just point at @startCell and tell how many subsequent + * NUMA nodes should be taken in. + * + * Returns: the number of nodes successfully adjusted or -1 in + * case of an error. 
+ */ +int +virNodeAllocPages(virConnectPtr conn, + unsigned int npages, + unsigned int *pageSizes, + unsigned long long *pageCounts, + int startCell, + unsigned int cellCount, + unsigned int flags) +{ + VIR_DEBUG("conn=%p npages=%u pageSizes=%p pageCounts=%p " + "startCell=%d cellCount=%u flagx=%x", + conn, npages, pageSizes, pageCounts, startCell, + cellCount, flags); + + virResetLastError(); + + virCheckConnectReturn(conn, -1); + virCheckNonZeroArgGoto(npages, error); + virCheckNonNullArgGoto(pageSizes, error); + virCheckNonNullArgGoto(pageCounts, error); + + if (conn->driver->nodeAllocPages) { + int ret; + ret = conn->driver->nodeAllocPages(conn, npages, pageSizes, + pageCounts, startCell, + cellCount, flags); + if (ret < 0) + goto error; + return ret; + } + + virReportUnsupportedError(); + error: + virDispatchError(conn); + return -1; +} diff --git a/src/libvirt_public.syms b/src/libvirt_public.syms index e1f013f..89fbc71 100644 --- a/src/libvirt_public.syms +++ b/src/libvirt_public.syms @@ -677,6 +677,7 @@ LIBVIRT_1.2.8 { virDomainListGetStats; virDomainOpenGraphicsFD; virDomainStatsRecordListFree; + virNodeAllocPages; } LIBVIRT_1.2.7; # .... define new API here using predicted next version number .... 
diff --git a/src/remote/remote_driver.c b/src/remote/remote_driver.c index 75a3a7b..e75156d 100644 --- a/src/remote/remote_driver.c +++ b/src/remote/remote_driver.c @@ -7811,6 +7811,52 @@ remoteConnectGetAllDomainStats(virConnectPtr conn, return rv; } + +static int +remoteNodeAllocPages(virConnectPtr conn, + unsigned int npages, + unsigned int *pageSizes, + unsigned long long *pageCounts, + int startCell, + unsigned int cellCount, + unsigned int flags) +{ + int rv = -1; + remote_node_alloc_pages_args args; + remote_node_alloc_pages_ret ret; + struct private_data *priv = conn->privateData; + + remoteDriverLock(priv); + + if (npages > REMOTE_NODE_MAX_CELLS) { + virReportError(VIR_ERR_RPC, + _("too many NUMA cells: %d > %d"), + npages, REMOTE_NODE_MAX_CELLS); + goto done; + } + + args.pageSizes.pageSizes_val = (u_int *) pageSizes; + args.pageSizes.pageSizes_len = npages; + args.pageCounts.pageCounts_val = (uint64_t *) pageCounts; + args.pageCounts.pageCounts_len = npages; + args.startCell = startCell; + args.cellCount = cellCount; + args.flags = flags; + + memset(&ret, 0, sizeof(ret)); + if (call(conn, priv, 0, REMOTE_PROC_NODE_ALLOC_PAGES, + (xdrproc_t) xdr_remote_node_alloc_pages_args, (char *) &args, + (xdrproc_t) xdr_remote_node_alloc_pages_ret, (char *) &ret) == -1) + goto done; + + rv = ret.ret; + + done: + remoteDriverUnlock(priv); + return rv; +} + + /* get_nonnull_domain and get_nonnull_network turn an on-wire * (name, uuid) pair into virDomainPtr or virNetworkPtr object. 
* These can return NULL if underlying memory allocations fail, @@ -8152,6 +8198,7 @@ static virDriver remote_driver = { .nodeGetFreePages = remoteNodeGetFreePages, /* 1.2.6 */ .connectGetDomainCapabilities = remoteConnectGetDomainCapabilities, /* 1.2.7 */ .connectGetAllDomainStats = remoteConnectGetAllDomainStats, /* 1.2.8 */ + .nodeAllocPages = remoteNodeAllocPages, /* 1.2.8 */ }; static virNetworkDriver network_driver = { diff --git a/src/remote/remote_protocol.x b/src/remote/remote_protocol.x index a4ca0c3..7df45b0 100644 --- a/src/remote/remote_protocol.x +++ b/src/remote/remote_protocol.x @@ -3052,6 +3052,18 @@ struct remote_node_get_free_pages_ret { unsigned hyper counts<REMOTE_NODE_MAX_CELLS>; }; +struct remote_node_alloc_pages_args { + unsigned int pageSizes<REMOTE_NODE_MAX_CELLS>; + unsigned hyper pageCounts<REMOTE_NODE_MAX_CELLS>; + int startCell; + unsigned int cellCount; + unsigned int flags; +}; + +struct remote_node_alloc_pages_ret { + int ret; +}; + struct remote_network_dhcp_lease { remote_nonnull_string iface; hyper expirytime; @@ -5472,5 +5484,11 @@ enum remote_procedure { * @generate: both * @acl: domain:block_write */ - REMOTE_PROC_DOMAIN_BLOCK_COPY = 345 + REMOTE_PROC_DOMAIN_BLOCK_COPY = 345, + + /** + * @generate: none + * @acl: connect:write + */ + REMOTE_PROC_NODE_ALLOC_PAGES = 346 }; diff --git a/src/remote_protocol-structs b/src/remote_protocol-structs index e960415..b45811b 100644 --- a/src/remote_protocol-structs +++ b/src/remote_protocol-structs @@ -2510,6 +2510,22 @@ struct remote_node_get_free_pages_ret { uint64_t * counts_val; } counts; }; +struct remote_node_alloc_pages_args { + struct { + u_int pageSizes_len; + u_int * pageSizes_val; + } pageSizes; + struct { + u_int pageCounts_len; + uint64_t * pageCounts_val; + } pageCounts; + int startCell; + u_int cellCount; + u_int flags; +}; +struct remote_node_alloc_pages_ret { + int ret; +}; struct remote_network_dhcp_lease { remote_nonnull_string iface; int64_t expirytime; @@ -2901,4 
+2917,5 @@ enum remote_procedure { REMOTE_PROC_DOMAIN_OPEN_GRAPHICS_FD = 343, REMOTE_PROC_CONNECT_GET_ALL_DOMAIN_STATS = 344, REMOTE_PROC_DOMAIN_BLOCK_COPY = 345, + REMOTE_PROC_NODE_ALLOC_PAGES = 346, }; -- 1.8.5.5

On 09/18/2014 10:24 AM, Michal Privoznik wrote:
A long time ago in a galaxy far, far away it has been decided that libvirt will manage not only domains but host as well. And with my latest work on qemu driver supporting huge pages, we miss the cherry on top: an API to allocate huge pages on the run. Currently users are forced to log into the host and adjust the huge pages pool themselves. However, with this API the problem is gone - they can both size up and size down the pool.
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- daemon/remote.c | 37 ++++++++++++++++++++++++ include/libvirt/libvirt.h.in | 16 +++++++++++ src/driver.h | 10 +++++++ src/libvirt.c | 67 ++++++++++++++++++++++++++++++++++++++++++++ src/libvirt_public.syms | 1 + src/remote/remote_driver.c | 47 +++++++++++++++++++++++++++++++ src/remote/remote_protocol.x | 20 ++++++++++++- src/remote_protocol-structs | 17 +++++++++++ 8 files changed, 214 insertions(+), 1 deletion(-)
diff --git a/include/libvirt/libvirt.h.in b/include/libvirt/libvirt.h.in index 724314e..1a47bae 100644 --- a/include/libvirt/libvirt.h.in +++ b/include/libvirt/libvirt.h.in @@ -5505,6 +5505,22 @@ int virNodeGetFreePages(virConnectPtr conn, unsigned int cellcount, unsigned long long *counts, unsigned int flags); + +typedef enum { + VIR_NODE_ALLOC_PAGES_ADD = 0, /* Add @pageCounts to the pages pool. This + can be used only to size up the pool. */ + VIR_NODE_ALLOC_PAGES_SET = (1 << 0), /* Don't add @pageCounts, instead set + passed number of pages. This can be + used to free allocated pages. */
+} virNodeAllocPagesFlags; + +int virNodeAllocPages(virConnectPtr conn, + unsigned int npages, + unsigned int *pageSizes, + unsigned long long *pageCounts, + int startCell, + unsigned int cellCount, + unsigned int flags); /** * virSchedParameterType: * diff --git a/src/driver.h b/src/driver.h index bb748c4..dc62a8e 100644 --- a/src/driver.h +++ b/src/driver.h @@ -1212,6 +1212,15 @@ typedef int virDomainStatsRecordPtr **retStats, unsigned int flags);
+typedef int +(*virDrvNodeAllocPages)(virConnectPtr conn, + unsigned int npages, + unsigned int *pageSizes, + unsigned long long *pageCounts, + int startCell, + unsigned int cellCount, + unsigned int flags); + typedef struct _virDriver virDriver; typedef virDriver *virDriverPtr;
@@ -1435,6 +1444,7 @@ struct _virDriver { virDrvNodeGetFreePages nodeGetFreePages; virDrvConnectGetDomainCapabilities connectGetDomainCapabilities; virDrvConnectGetAllDomainStats connectGetAllDomainStats; + virDrvNodeAllocPages nodeAllocPages; };
diff --git a/src/libvirt.c b/src/libvirt.c index 7c63825..27445e5 100644 --- a/src/libvirt.c +++ b/src/libvirt.c @@ -21841,3 +21841,70 @@ virDomainStatsRecordListFree(virDomainStatsRecordPtr *stats)
VIR_FREE(stats); } + + +/** + * virNodeAllocPages: + * @conn: pointer to the hypervisor connection + * @npages: number of items in the @pageSizes array + * @pageSizes: which huge page sizes to allocate + * @pageCounts: how many pages should be allocated + * @startCell: index of first cell to allocate pages on + * @cellCount: number of consecutive cells to allocate pages on + * @flags: extra flags; binary-OR of virConnectGetAllDomainStatsFlags
virNodeAllocPagesFlags
+ * + * Sometimes, when trying to start a new domain, it may be + * necessary to reserve some huge pages in the system pool which + * can be then allocated by the domain. This API serves that + * purpose. On its input, @pageSizes and @pageCounts are arrays + * of the same cardinality of @npages. The @pageSizes contains + * page sizes which are to be allocated in the system (the size + * unit is kibibytes), and @pageCounts then contains the number + * of pages to reserve. Depending on @flags specified, the number + * of pages is either added into the pool and the pool is sized + * up (@flags have VIR_NODE_ALLOC_PAGES_ADD set) or the number of + * pages is looked at the new size of pool and the pool can be + * both sized up or down (@flags have VIR_NODE_ALLOC_PAGES_SET + * set). The pages pool can be allocated over several NUMA nodes + * at once, just point at @startCell and tell how many subsequent + * NUMA nodes should be taken in. + * + * Returns: the number of nodes successfully adjusted or -1 in + * case of an error. + */ +int +virNodeAllocPages(virConnectPtr conn, + unsigned int npages, + unsigned int *pageSizes, + unsigned long long *pageCounts, + int startCell,
If -1 has special meaning here, it should be documented above. If not, I'm not sure whether this should be unsigned, or left as signed to match the GetFreePages API.
+ unsigned int cellCount, + unsigned int flags) +{ + VIR_DEBUG("conn=%p npages=%u pageSizes=%p pageCounts=%p " + "startCell=%d cellCount=%u flagx=%x", + conn, npages, pageSizes, pageCounts, startCell, + cellCount, flags); + + virResetLastError(); + + virCheckConnectReturn(conn, -1); + virCheckNonZeroArgGoto(npages, error); + virCheckNonNullArgGoto(pageSizes, error); + virCheckNonNullArgGoto(pageCounts, error); + + if (conn->driver->nodeAllocPages) { + int ret; + ret = conn->driver->nodeAllocPages(conn, npages, pageSizes, + pageCounts, startCell, + cellCount, flags); + if (ret < 0) + goto error; + return ret; + } + + virReportUnsupportedError(); + error: + virDispatchError(conn); + return -1; +} diff --git a/src/libvirt_public.syms b/src/libvirt_public.syms index e1f013f..89fbc71 100644 --- a/src/libvirt_public.syms +++ b/src/libvirt_public.syms @@ -677,6 +677,7 @@ LIBVIRT_1.2.8 { virDomainListGetStats; virDomainOpenGraphicsFD; virDomainStatsRecordListFree; + virNodeAllocPages; } LIBVIRT_1.2.7;
This should be in 1.2.9 (same goes for the comments in the drivers). ACK series with the nits fixed. Jan

On 09/18/2014 02:24 AM, Michal Privoznik wrote:
A long time ago in a galaxy far, far away it has been decided that libvirt will manage not only domains but host as well. And with my latest work on qemu driver supporting huge pages, we miss the cherry on top: an API to allocate huge pages on the run. Currently users are forced to log into the host and adjust the huge pages pool themselves. However, with this API the problem is gone - they can both size up and size down the pool.
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> ---
+/** + * virNodeAllocPages: + * @conn: pointer to the hypervisor connection + * @npages: number of items in the @pageSizes array
I'd mention '@pageSizes and @pageCounts arrays'
+ * @pageSizes: which huge page sizes to allocate + * @pageCounts: how many pages should be allocated + * @startCell: index of first cell to allocate pages on + * @cellCount: number of consecutive cells to allocate pages on + * @flags: extra flags; binary-OR of virConnectGetAllDomainStatsFlags + * + * Sometimes, when trying to start a new domain, it may be + * necessary to reserve some huge pages in the system pool which + * can be then allocated by the domain. This API serves that + * purpose. On its input, @pageSizes and @pageCounts are arrays + * of the same cardinality of @npages. The @pageSizes contains + * page sizes which are to be allocated in the system (the size + * unit is kibibytes), and @pageCounts then contains the number + * of pages to reserve. Depending on @flags specified, the number + * of pages is either added into the pool and the pool is sized + * up (@flags have VIR_NODE_ALLOC_PAGES_ADD set) or the number of
Technically, VIR_NODE_ALLOC_PAGES_ADD is 0, so it can't be set :) Do we even need this named constant, or can we just drop it?
+ * pages is looked at the new size of pool and the pool can be + * both sized up or down (@flags have VIR_NODE_ALLOC_PAGES_SET
The wording is a bit awkward; maybe: If @flags is 0 (VIR_NODE_ALLOC_PAGES_ADD), each pool corresponding to @pageSizes grows by the number of pages specified in the corresponding @pageCounts. If @flags contains VIR_NODE_ALLOC_PAGES_SET, each pool mentioned is resized to the given number of pages.
+ * set). The pages pool can be allocated over several NUMA nodes + * at once, just point at @startCell and tell how many subsequent + * NUMA nodes should be taken in.
Is there shorthand such as @startCell==-1 for allocating across all NUMA nodes? Is there checking that @npages cannot exceed the number of NUMA nodes available starting at @startCell through @cellCount? If I understand correctly, usage is something like (pseudo-code): virNodeAllocPages(conn, 2, (int[]){ [0]=2M, [1]=1G }, (int[]){ [0]=1024, [1]=1 }, 1, 2, 0) which proceeds to add 1024 pages of size 2M, and 1 page of size 1G, to the pools associated both with NUMA node 1 and NUMA node 2 (that is, I just allocated 6G across two nodes).
+ * + * Returns: the number of nodes successfully adjusted or -1 in + * case of an error.
Can this function partially succeed? That is, if I pass @npages==2 and @cellCount==2, will I ever get a non-negative result less than 4? If the result is 2, which allocations failed (are all allocations on the first cell attempted before any on the second, or are all allocations to the first pool size attempted across multiple cells before revisiting cells to allocate in the second pool size)?
+ */ +int +virNodeAllocPages(virConnectPtr conn, + unsigned int npages, + unsigned int *pageSizes, + unsigned long long *pageCounts, + int startCell, + unsigned int cellCount, + unsigned int flags) +{ + VIR_DEBUG("conn=%p npages=%u pageSizes=%p pageCounts=%p " + "startCell=%d cellCount=%u flagx=%x", + conn, npages, pageSizes, pageCounts, startCell, + cellCount, flags); + + virResetLastError(); + + virCheckConnectReturn(conn, -1); + virCheckNonZeroArgGoto(npages, error); + virCheckNonNullArgGoto(pageSizes, error); + virCheckNonNullArgGoto(pageCounts, error); +
Where is the validation that startCell and cellCount make sense? I'd assume we want to ensure cellCount is positive? Or is there a special case of a count of 0 for visiting all NUMA cells without knowing in advance how many there are?
+++ b/src/libvirt_public.syms @@ -677,6 +677,7 @@ LIBVIRT_1.2.8 { virDomainListGetStats; virDomainOpenGraphicsFD; virDomainStatsRecordListFree; + virNodeAllocPages; } LIBVIRT_1.2.7;
Need a new section for 1.2.9. -- Eric Blake eblake redhat com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On 23.09.2014 21:49, Eric Blake wrote:
On 09/18/2014 02:24 AM, Michal Privoznik wrote:
A long time ago in a galaxy far, far away it has been decided that libvirt will manage not only domains but host as well. And with my latest work on qemu driver supporting huge pages, we miss the cherry on top: an API to allocate huge pages on the run. Currently users are forced to log into the host and adjust the huge pages pool themselves. However, with this API the problem is gone - they can both size up and size down the pool.
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> ---
+/** + * virNodeAllocPages: + * @conn: pointer to the hypervisor connection + * @npages: number of items in the @pageSizes array
I'd mention '@pageSizes and @pageCounts arrays'
+ * @pageSizes: which huge page sizes to allocate + * @pageCounts: how many pages should be allocated + * @startCell: index of first cell to allocate pages on + * @cellCount: number of consecutive cells to allocate pages on + * @flags: extra flags; binary-OR of virConnectGetAllDomainStatsFlags + * + * Sometimes, when trying to start a new domain, it may be + * necessary to reserve some huge pages in the system pool which + * can be then allocated by the domain. This API serves that + * purpose. On its input, @pageSizes and @pageCounts are arrays + * of the same cardinality of @npages. The @pageSizes contains + * page sizes which are to be allocated in the system (the size + * unit is kibibytes), and @pageCounts then contains the number + * of pages to reserve. Depending on @flags specified, the number + * of pages is either added into the pool and the pool is sized + * up (@flags have VIR_NODE_ALLOC_PAGES_ADD set) or the number of
Technically, VIR_NODE_ALLOC_PAGES_ADD is 0, so it can't be set :) Do we even need this named constant, or can we just drop it?
Well, I'd like to keep it to explicitly state that the pool is added to. You know: unsigned int flags = 0; if (<condition>) flags |= VIR_NODE_ALLOC_PAGES_SET; else flags |= VIR_NODE_ALLOC_PAGES_ADD; virNodeAllocPages(..., flags); This is just a hint for users, the named value is certainly not needed. I thought of following already pre-existing VIR_DOMAIN_AFFECT_CURRENT.
+ * pages is looked at the new size of pool and the pool can be + * both sized up or down (@flags have VIR_NODE_ALLOC_PAGES_SET
The wording is a bit awkward; maybe:
If @flags is 0 (VIR_NODE_ALLOC_PAGES_ADD), each pool corresponding to @pageSizes grows by the number of pages specified in the corresponding @pageCounts. If @flags contains VIR_NODE_ALLOC_PAGES_SET, each pool mentioned is resized to the given number of pages.
+ * set). The pages pool can be allocated over several NUMA nodes + * at once, just point at @startCell and tell how many subsequent + * NUMA nodes should be taken in.
Is there shorthand such as @startCell==-1 for allocating across all NUMA nodes?
Yes there is. I've updated the documentation.
Is there checking that @npages cannot exceed the number of NUMA nodes available starting at @startCell through @cellCount?
Well, that's a question for 3/4 but er, how to put it. No, not currently, but I'll update 3/4.
If I understand correctly, usage is something like (pseudo-code):
virNodeAllocPages(conn, 2, (int[]){ [0]=2M, [1]=1G }, (int[]){ [0]=1024, [1]=1 }, 1, 2, 0)
which proceeds to add 1024 pages of size 2M, and 1 page of size 1G, to the pools associated both with NUMA node 1 and NUMA node 2 (that is, I just allocated 6G across two nodes).
Exactly!
+ * + * Returns: the number of nodes successfully adjusted or -1 in + * case of an error.
Can this function partially succeed? That is, if I pass @npages==2 and @cellCount==2, will I ever get a non-negative result less than 4? If the result is 2, which allocations failed (are all allocations on the first cell attempted before any on the second, or are all allocations to the first pool size attempted across multiple cells before revisiting cells to allocate in the second pool size)?
Since there's no rollback on partially successful returns (in fact, the majority of our APIs have no fallback at all) I find it just okay to have it like that. Moreover, runtime huge page allocation is still a bit of magic: the first call may fail, while the later attempt may succeed. Then - we already have an API to fetch the pool sizes (virNodeGetFreePages) so if the Alloc() partially fails, users may fetch which pools succeeded and which needs to be adjusted again.
+ */ +int +virNodeAllocPages(virConnectPtr conn, + unsigned int npages, + unsigned int *pageSizes, + unsigned long long *pageCounts, + int startCell, + unsigned int cellCount, + unsigned int flags) +{ + VIR_DEBUG("conn=%p npages=%u pageSizes=%p pageCounts=%p " + "startCell=%d cellCount=%u flagx=%x", + conn, npages, pageSizes, pageCounts, startCell, + cellCount, flags); + + virResetLastError(); + + virCheckConnectReturn(conn, -1); + virCheckNonZeroArgGoto(npages, error); + virCheckNonNullArgGoto(pageSizes, error); + virCheckNonNullArgGoto(pageCounts, error); +
Where is the validation that startCell and cellCount make sense? I'd assume we want to ensure cellCount is positive? Or is there a special case of a count of 0 for visiting all NUMA cells without knowing in advance how many there are?
Oh right. I'll add the check. If we ever want to go with your idea, we can just drop the check.
+++ b/src/libvirt_public.syms @@ -677,6 +677,7 @@ LIBVIRT_1.2.8 { virDomainListGetStats; virDomainOpenGraphicsFD; virDomainStatsRecordListFree; + virNodeAllocPages; } LIBVIRT_1.2.7;
Need a new section for 1.2.9.
I'm still not fully used to our new release style, apparently :) Thanks for catching that. Michal

This internal API can be used to allocate or free some pages in the huge pages pool. Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/libvirt_private.syms | 1 + src/util/virnuma.c | 108 +++++++++++++++++++++++++++++++++++++++++++++++ src/util/virnuma.h | 4 ++ 3 files changed, 113 insertions(+) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 51a692b..aabc11f 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1699,6 +1699,7 @@ virNumaGetPageInfo; virNumaGetPages; virNumaIsAvailable; virNumaNodeIsAvailable; +virNumaSetPagePoolSize; virNumaSetupMemoryPolicy; diff --git a/src/util/virnuma.c b/src/util/virnuma.c index 1a34398..690615f 100644 --- a/src/util/virnuma.c +++ b/src/util/virnuma.c @@ -841,6 +841,102 @@ virNumaGetPages(int node, } +int +virNumaSetPagePoolSize(int node, + unsigned int page_size, + unsigned long long page_count, + bool add) +{ + int ret = -1; + char *nr_path = NULL, *nr_buf = NULL; + char *end; + unsigned long long nr_count; + + if (page_size == sysconf(_SC_PAGESIZE) / 1024) { + /* Special case as kernel handles system pages + * differently to huge pages. 
*/ + virReportError(VIR_ERR_OPERATION_UNSUPPORTED, "%s", + _("system pages pool can't be modified")); + goto cleanup; + } + + if (virNumaGetHugePageInfoPath(&nr_path, node, page_size, "nr_hugepages") < 0) + goto cleanup; + + /* Firstly check, if there's anything for us to do */ + if (virFileReadAll(nr_path, 1024, &nr_buf) < 0) + goto cleanup; + + if (virStrToLong_ull(nr_buf, &end, 10, &nr_count) < 0 || + *end != '\n') { + virReportError(VIR_ERR_OPERATION_FAILED, + _("invalid number '%s' in '%s'"), + nr_buf, nr_path); + goto cleanup; + } + + if (add) { + if (!page_count) { + VIR_DEBUG("Nothing left to do: add = true page_count = 0"); + ret = 0; + goto cleanup; + } + page_count += nr_count; + } else { + if (nr_count == page_count) { + VIR_DEBUG("Nothing left to do: nr_count = page_count = %llu", + page_count); + ret = 0; + goto cleanup; + } + } + + /* Okay, page pool adjustment must be done in two steps. In + * first we write the desired number into nr_hugepages file. + * Kernel then starts to allocate the pages (return from + * write should be postponed until the kernel is finished). + * However, kernel may have not been successful and reserved + * all the pages we wanted. So do the second read to check. + */ + VIR_FREE(nr_buf); + if (virAsprintf(&nr_buf, "%llu", page_count) < 0) + goto cleanup; + + if (virFileWriteStr(nr_path, nr_buf, 0) < 0) { + virReportSystemError(errno, + _("Unable to write to: %s"), nr_path); + goto cleanup; + } + + /* And now do the check. */ + + VIR_FREE(nr_buf); + if (virFileReadAll(nr_path, 1024, &nr_buf) < 0) + goto cleanup; + + if (virStrToLong_ull(nr_buf, &end, 10, &nr_count) < 0 || + *end != '\n') { + virReportError(VIR_ERR_OPERATION_FAILED, + _("invalid number '%s' in '%s'"), + nr_buf, nr_path); + goto cleanup; + } + + if (nr_count != page_count) { + virReportError(VIR_ERR_OPERATION_FAILED, + _("Unable to allocate %llu pages. 
Allocated only %llu"), + page_count, nr_count); + goto cleanup; + } + + ret = 0; + cleanup: + VIR_FREE(nr_buf); + VIR_FREE(nr_path); + return ret; +} + + #else /* #ifdef __linux__ */ int virNumaGetPageInfo(int node ATTRIBUTE_UNUSED, @@ -866,4 +962,16 @@ virNumaGetPages(int node ATTRIBUTE_UNUSED, _("page info is not supported on this platform")); return -1; } + + +int +virNumaSetPagePoolSize(int node ATTRIBUTE_UNUSED, + unsigned int page_size ATTRIBUTE_UNUSED, + unsigned long long page_count ATTRIBUTE_UNUSED, + bool add ATTRIBUTE_UNUSED) +{ + virReportError(VIR_ERR_OPERATION_UNSUPPORTED, "%s", + _("page pool allocation is not supported on this platform")); + return -1; +} #endif /* #ifdef __linux__ */ diff --git a/src/util/virnuma.h b/src/util/virnuma.h index 13ebec6..04b6b8c 100644 --- a/src/util/virnuma.h +++ b/src/util/virnuma.h @@ -59,4 +59,8 @@ int virNumaGetPages(int node, unsigned int **pages_free, size_t *npages) ATTRIBUTE_NONNULL(5); +int virNumaSetPagePoolSize(int node, + unsigned int page_size, + unsigned long long page_count, + bool add); #endif /* __VIR_NUMA_H__ */ -- 1.8.5.5

And add stubs to other drivers like: lxc, qemu, uml and vbox. Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/libvirt_private.syms | 1 + src/lxc/lxc_driver.c | 22 ++++++++++++++++++++++ src/nodeinfo.c | 29 +++++++++++++++++++++++++++++ src/nodeinfo.h | 7 +++++++ src/qemu/qemu_driver.c | 22 ++++++++++++++++++++++ src/uml/uml_driver.c | 22 ++++++++++++++++++++++ src/vbox/vbox_common.c | 19 +++++++++++++++++++ 7 files changed, 122 insertions(+) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index aabc11f..c75da80 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -891,6 +891,7 @@ virLockManagerRelease; # nodeinfo.h +nodeAllocPages; nodeCapsInitNUMA; nodeGetCellsFreeMemory; nodeGetCPUBitmap; diff --git a/src/lxc/lxc_driver.c b/src/lxc/lxc_driver.c index c3cd62c..38763de 100644 --- a/src/lxc/lxc_driver.c +++ b/src/lxc/lxc_driver.c @@ -5685,6 +5685,27 @@ lxcNodeGetFreePages(virConnectPtr conn, } +static int +lxcNodeAllocPages(virConnectPtr conn, + unsigned int npages, + unsigned int *pageSizes, + unsigned long long *pageCounts, + int startCell, + unsigned int cellCount, + unsigned int flags) +{ + bool add = !(flags & VIR_NODE_ALLOC_PAGES_SET); + + virCheckFlags(VIR_NODE_ALLOC_PAGES_SET, -1); + + if (virNodeAllocPagesEnsureACL(conn) < 0) + return -1; + + return nodeAllocPages(npages, pageSizes, pageCounts, + startCell, cellCount, add); +} + + /* Function Tables */ static virDriver lxcDriver = { .no = VIR_DRV_LXC, @@ -5776,6 +5797,7 @@ static virDriver lxcDriver = { .domainReboot = lxcDomainReboot, /* 1.0.1 */ .domainLxcOpenNamespace = lxcDomainLxcOpenNamespace, /* 1.0.2 */ .nodeGetFreePages = lxcNodeGetFreePages, /* 1.2.6 */ + .nodeAllocPages = lxcNodeAllocPages, /* 1.2.8 */ }; static virStateDriver lxcStateDriver = { diff --git a/src/nodeinfo.c b/src/nodeinfo.c index af23b8b..4c92626 100644 --- a/src/nodeinfo.c +++ b/src/nodeinfo.c @@ -2053,3 +2053,32 @@ nodeGetFreePages(unsigned int npages, cleanup: return ret; } + 
+int +nodeAllocPages(unsigned int npages, + unsigned int *pageSizes, + unsigned long long *pageCounts, + int startCell, + unsigned int cellCount, + bool add) +{ + int ret = -1; + int cell; + size_t i, ncounts = 0; + + for (cell = startCell; cell < (int) (startCell + cellCount); cell++) { + for (i = 0; i < npages; i++) { + unsigned int page_size = pageSizes[i]; + unsigned long long page_count = pageCounts[i]; + + if (virNumaSetPagePoolSize(cell, page_size, page_count, add) < 0) + goto cleanup; + + ncounts++; + } + } + + ret = ncounts; + cleanup: + return ret; +} diff --git a/src/nodeinfo.h b/src/nodeinfo.h index 0896c6c..a993c63 100644 --- a/src/nodeinfo.h +++ b/src/nodeinfo.h @@ -63,4 +63,11 @@ int nodeGetFreePages(unsigned int npages, int startCell, unsigned int cellCount, unsigned long long *counts); + +int nodeAllocPages(unsigned int npages, + unsigned int *pageSizes, + unsigned long long *pageCounts, + int startCell, + unsigned int cellCount, + bool add); #endif /* __VIR_NODEINFO_H__*/ diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 5fd89a3..a8f3dca 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -17953,6 +17953,27 @@ qemuConnectGetAllDomainStats(virConnectPtr conn, } +static int +qemuNodeAllocPages(virConnectPtr conn, + unsigned int npages, + unsigned int *pageSizes, + unsigned long long *pageCounts, + int startCell, + unsigned int cellCount, + unsigned int flags) +{ + bool add = !(flags & VIR_NODE_ALLOC_PAGES_SET); + + virCheckFlags(VIR_NODE_ALLOC_PAGES_SET, -1); + + if (virNodeAllocPagesEnsureACL(conn) < 0) + return -1; + + return nodeAllocPages(npages, pageSizes, pageCounts, + startCell, cellCount, add); +} + + static virDriver qemuDriver = { .no = VIR_DRV_QEMU, .name = QEMU_DRIVER_NAME, @@ -18152,6 +18173,7 @@ static virDriver qemuDriver = { .nodeGetFreePages = qemuNodeGetFreePages, /* 1.2.6 */ .connectGetDomainCapabilities = qemuConnectGetDomainCapabilities, /* 1.2.7 */ .connectGetAllDomainStats = 
qemuConnectGetAllDomainStats, /* 1.2.8 */ + .nodeAllocPages = qemuNodeAllocPages, /* 1.2.8 */ }; diff --git a/src/uml/uml_driver.c b/src/uml/uml_driver.c index 12b0ba7..c255c07 100644 --- a/src/uml/uml_driver.c +++ b/src/uml/uml_driver.c @@ -2897,6 +2897,27 @@ umlNodeGetFreePages(virConnectPtr conn, } +static int +umlNodeAllocPages(virConnectPtr conn, + unsigned int npages, + unsigned int *pageSizes, + unsigned long long *pageCounts, + int startCell, + unsigned int cellCount, + unsigned int flags) +{ + bool add = !(flags & VIR_NODE_ALLOC_PAGES_SET); + + virCheckFlags(VIR_NODE_ALLOC_PAGES_SET, -1); + + if (virNodeAllocPagesEnsureACL(conn) < 0) + return -1; + + return nodeAllocPages(npages, pageSizes, pageCounts, + startCell, cellCount, add); +} + + static virDriver umlDriver = { .no = VIR_DRV_UML, .name = "UML", @@ -2959,6 +2980,7 @@ static virDriver umlDriver = { .nodeGetMemoryParameters = umlNodeGetMemoryParameters, /* 0.10.2 */ .nodeSetMemoryParameters = umlNodeSetMemoryParameters, /* 0.10.2 */ .nodeGetFreePages = umlNodeGetFreePages, /* 1.2.6 */ + .nodeAllocPages = umlNodeAllocPages, /* 1.2.8 */ }; static virStateDriver umlStateDriver = { diff --git a/src/vbox/vbox_common.c b/src/vbox/vbox_common.c index 7ff0761..e831255 100644 --- a/src/vbox/vbox_common.c +++ b/src/vbox/vbox_common.c @@ -7462,6 +7462,24 @@ vboxNodeGetFreePages(virConnectPtr conn ATTRIBUTE_UNUSED, return nodeGetFreePages(npages, pages, startCell, cellCount, counts); } +static int +vboxNodeAllocPages(virConnectPtr conn ATTRIBUTE_UNUSED, + unsigned int npages, + unsigned int *pageSizes, + unsigned long long *pageCounts, + int startCell, + unsigned int cellCount, + unsigned int flags) +{ + bool add = !(flags & VIR_NODE_ALLOC_PAGES_SET); + + virCheckFlags(VIR_NODE_ALLOC_PAGES_SET, -1); + + return nodeAllocPages(npages, pageSizes, pageCounts, + startCell, cellCount, add); +} + + /** * Function Tables */ @@ -7533,6 +7551,7 @@ virDriver vboxCommonDriver = { .domainSnapshotDelete = 
vboxDomainSnapshotDelete, /* 0.8.0 */ .connectIsAlive = vboxConnectIsAlive, /* 0.9.8 */ .nodeGetFreePages = vboxNodeGetFreePages, /* 1.2.6 */ + .nodeAllocPages = vboxNodeAllocPages, /* 1.2.8 */ }; static void updateDriver(void) -- 1.8.5.5

The new virsh command is named 'allocpages'. Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- tools/virsh-host.c | 134 +++++++++++++++++++++++++++++++++++++++++++++++++++++ tools/virsh.pod | 12 +++++ 2 files changed, 146 insertions(+) diff --git a/tools/virsh-host.c b/tools/virsh-host.c index 7fc2120..15bfb7a 100644 --- a/tools/virsh-host.c +++ b/tools/virsh-host.c @@ -418,6 +418,134 @@ cmdFreepages(vshControl *ctl, const vshCmd *cmd) /* + * "allocpages" command + */ +static const vshCmdInfo info_allocpages[] = { + {.name = "help", + .data = N_("Manipulate pages pool size") + }, + {.name = "desc", + .data = N_("Allocate or free some pages in the pool for NUMA cell.") + }, + {.name = NULL} +}; +static const vshCmdOptDef opts_allocpages[] = { + {.name = "pagesize", + .type = VSH_OT_INT, + .flags = VSH_OFLAG_REQ, + .help = N_("page size (in kibibytes)") + }, + {.name = "pagecount", + .type = VSH_OT_INT, + .flags = VSH_OFLAG_REQ, + .help = N_("page count") + }, + {.name = "cellno", + .type = VSH_OT_INT, + .help = N_("NUMA cell number") + }, + {.name = "add", + .type = VSH_OT_BOOL, + .help = N_("instead of setting new pool size add pages to it") + }, + {.name = "all", + .type = VSH_OT_BOOL, + .help = N_("set on all NUMA cells") + }, + {.name = NULL} +}; + +static bool +cmdAllocpages(vshControl *ctl, const vshCmd *cmd) +{ + bool ret = false; + bool add = vshCommandOptBool(cmd, "add"); + bool all = vshCommandOptBool(cmd, "all"); + bool cellno = vshCommandOptBool(cmd, "cellno"); + int startCell = -1; + int cellCount = 1; + unsigned int pageSizes[1]; + unsigned long long pageCounts[1]; + unsigned int flags = 0; + char *cap_xml = NULL; + xmlDocPtr xml = NULL; + xmlXPathContextPtr ctxt = NULL; + xmlNodePtr *nodes = NULL; + + VSH_EXCLUSIVE_OPTIONS_VAR(all, cellno); + + if (cellno && vshCommandOptInt(cmd, "cellno", &startCell) < 0) { + vshError(ctl, "%s", _("cell number has to be a number")); + return false; + } + + if (vshCommandOptUInt(cmd, "pagesize", &pageSizes[0]) 
< 0) { + vshError(ctl, "%s", _("pagesize has to be a number")); + return false; + } + + if (vshCommandOptULongLong(cmd, "pagecount", &pageCounts[0]) < 0) { + vshError(ctl, "%s", _("pagecount has to be a number")); + return false; + } + + flags |= add ? VIR_NODE_ALLOC_PAGES_ADD : VIR_NODE_ALLOC_PAGES_SET; + + if (all) { + unsigned long nodes_cnt; + size_t i; + + if (!(cap_xml = virConnectGetCapabilities(ctl->conn))) { + vshError(ctl, "%s", _("unable to get node capabilities")); + goto cleanup; + } + + xml = virXMLParseStringCtxt(cap_xml, _("(capabilities)"), &ctxt); + if (!xml) { + vshError(ctl, "%s", _("unable to get node capabilities")); + goto cleanup; + } + + nodes_cnt = virXPathNodeSet("/capabilities/host/topology/cells/cell", + ctxt, &nodes); + + if (nodes_cnt == -1) { + vshError(ctl, "%s", _("could not get information about " + "NUMA topology")); + goto cleanup; + } + + for (i = 0; i < nodes_cnt; i++) { + unsigned long id; + char *val = virXMLPropString(nodes[i], "id"); + if (virStrToLong_ul(val, NULL, 10, &id)) { + vshError(ctl, "%s", _("conversion from string failed")); + VIR_FREE(val); + goto cleanup; + } + VIR_FREE(val); + + if (virNodeAllocPages(ctl->conn, 1, pageSizes, + pageCounts, id, 1, flags) < 0) + goto cleanup; + } + } else { + if (virNodeAllocPages(ctl->conn, 1, pageSizes, pageCounts, + startCell, cellCount, flags) < 0) + goto cleanup; + } + + ret = true; + cleanup: + xmlXPathFreeContext(ctxt); + xmlFreeDoc(xml); + VIR_FREE(nodes); + VIR_FREE(cap_xml); + return ret; +} + + +/* * "maxvcpus" command */ static const vshCmdInfo info_maxvcpus[] = { @@ -1183,6 +1311,12 @@ cmdNodeMemoryTune(vshControl *ctl, const vshCmd *cmd) } const vshCmdDef hostAndHypervisorCmds[] = { + {.name = "allocpages", + .handler = cmdAllocpages, + .opts = opts_allocpages, + .info = info_allocpages, + .flags = 0 + }, {.name = "capabilities", .handler = cmdCapabilities, .opts = NULL, diff --git a/tools/virsh.pod b/tools/virsh.pod index 9919f92..eae9195 100644 --- 
a/tools/virsh.pod +++ b/tools/virsh.pod @@ -561,6 +561,18 @@ to the NUMA cell you're interested in. I<pagesize> is a scaled integer (see B<NOTES> above). Alternatively, if I<--all> is used, info on each possible combination of NUMA cell and page size is printed out. +=item B<allocpages> [I<--pagesize>] I<pagesize> [I<--pagecount>] I<pagecount> +[[I<--cellno>] I<cellno>] [I<--add>] [I<--all>] + +Change the size of pages pool of I<pagesize> on the host. If +I<--add> is specified, then I<pagecount> pages are added into the +pool. However, if I<--add> wasn't specified, then the +I<pagecount> is taken as the new absolute size of the pool (this +may be used to free some pages and size the pool down). The +I<cellno> modifier can be used to narrow the modification down to +a single host NUMA cell. On the other end of the spectrum lies +I<--all> which executes the modification on all NUMA cells. + =item B<cpu-baseline> I<FILE> [I<--features>] Compute baseline CPU which will be supported by all host CPUs given in <file>. -- 1.8.5.5
participants (3)
-
Eric Blake
-
Ján Tomko
-
Michal Privoznik