[libvirt] [RFCv2 PATCH 0/5] Introduce RDT memory bandwidth allocation support

From: Bing Niu <bing.niu@intel.com> This series is to introduce RDT memory bandwidth allocation support by extending current virresctrl implementation. The Memory Bandwidth Allocation (MBA) feature provides indirect and approximate control over memory bandwidth available per-core. This feature provides a method to control applications which may be over-utilizing bandwidth relative to their priority in environments such as the data-center. The details can be found in Intel's SDM 17.19.7. Kernel supports MBA through resctrl file system same as CAT. Each resctrl group have a MB parameter to control how much memory bandwidth it can utilize in unit of percentage. In this series, MBA is enabled by enhancing existing virresctrl implementation. The policy employed for MBA is similar with CAT: The sum of each MBA group's bandwidth dose not exceed 100%. The enhancement of virresctrl include two parts: Part 1: Add two new structures virResctrlInfoMB and virResctrlAllocMB for collecting host system MBA capability and domain memory bandwidth allocation. Those two structures are the extension of existing virResctrlInfo and virResctrlAlloc. With them, virresctrl framework can support MBA and CAT concurrently. Each virResctrlAlloc represent a resource allocation including CAT, or MBA, or CAT&MBA. The policy of MBA is that: total memory bandwidth of each resctrl group, created by virresctrl, does not exceed to 100%. Part 2: On XML part, add new elements to host capabilities query and domain allocation to support memory bandwidth allocation. ---------------------------------------------------------------------------------------------- For host capabilities XML, new XML format like below example, <host> ..... <memory> <node id='0' cpus='0-19'> <control granularity='10' min ='10' maxAllocs='8'/> </node> </memory> </host> granularity --- memory bandwidth granularity min --- minimum memory bandwidth allowed maxAllocs --- maximum concurrent memory bandwidth allocation allowed. ---------------------------------------------------------------------------------------------- For domain XML, new format as below example <domain type='kvm' id='2'> ...... <cputune> ...... <shares>1024</shares> <memorytune vcpus='0-1'> <node id='0' bandwidth='20'/> </memorytune> </cputune> ...... </domain> id --- node where memory bandwidth allocation will happen bandwidth --- bandwidth allocated in percentage ---------------------------------------------------------------------------------------------- Changelog: v1->v2: Jano's comment: 1. put renaming parts into separated patches. 2. set the initial return value as -1. 3. using full name in structure definition. 4. do not use VIR_CACHE_TYPE_LAST for memory bandwidth allocation formatting. Pavel's comment: 1. add host capabilities XML for memory bandwidth allocation. 2. do not mix use cachetune section in XML for memory bandwidth allocation in domain XML. define a dedicated one for memory bandwidth allocation. Bing Niu (5): util: Rename and packing parts of virresctrl util: Add memory bandwidth support to resctrl conf: rename cachetune to restune conf: Introduce cputune/memorytune to support memory bandwidth allocation conf: add memory bandwidth allocation capability of host src/conf/capabilities.c | 112 +++++++++++ src/conf/capabilities.h | 11 ++ src/conf/domain_conf.c | 304 ++++++++++++++++++++++++++--- src/conf/domain_conf.h | 10 +- src/libvirt_private.syms | 4 +- src/qemu/qemu_process.c | 18 +- src/util/virresctrl.c | 496 ++++++++++++++++++++++++++++++++++++++++++++--- src/util/virresctrl.h | 35 +++- 8 files changed, 918 insertions(+), 72 deletions(-) -- 2.7.4

From: Bing Niu <bing.niu@intel.com> Renaming some functions and packing some CAT logic into functions Signed-off-by: Bing Niu <bing.niu@intel.com> --- src/conf/domain_conf.c | 2 +- src/libvirt_private.syms | 2 +- src/util/virresctrl.c | 93 +++++++++++++++++++++++++++++++----------------- src/util/virresctrl.h | 16 ++++----- 4 files changed, 70 insertions(+), 43 deletions(-) diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 522e0c2..62c412e 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -26882,7 +26882,7 @@ virDomainCachetuneDefFormat(virBufferPtr buf, int ret = -1; virBufferSetChildIndent(&childrenBuf, buf); - virResctrlAllocForeachSize(cachetune->alloc, + virResctrlAllocForeachCache(cachetune->alloc, virDomainCachetuneDefFormatHelper, &childrenBuf); diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index ea24f28..fc17059 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -2627,7 +2627,7 @@ virCacheTypeToString; virResctrlAllocAddPID; virResctrlAllocCreate; virResctrlAllocDeterminePath; -virResctrlAllocForeachSize; +virResctrlAllocForeachCache; virResctrlAllocFormat; virResctrlAllocGetID; virResctrlAllocGetUnused; diff --git a/src/util/virresctrl.c b/src/util/virresctrl.c index e492a63..e62061f 100644 --- a/src/util/virresctrl.c +++ b/src/util/virresctrl.c @@ -316,11 +316,9 @@ virResctrlUnlock(int fd) } -/* virResctrlInfo-related definitions */ static int -virResctrlGetInfo(virResctrlInfoPtr resctrl) +virResctrlGetCacheInfo(virResctrlInfoPtr resctrl, DIR *dirp) { - DIR *dirp = NULL; char *endptr = NULL; char *tmp_str = NULL; int ret = -1; @@ -332,12 +330,6 @@ virResctrlGetInfo(virResctrlInfoPtr resctrl) virResctrlInfoPerLevelPtr i_level = NULL; virResctrlInfoPerTypePtr i_type = NULL; - rv = virDirOpenIfExists(&dirp, SYSFS_RESCTRL_PATH "/info"); - if (rv <= 0) { - ret = rv; - goto cleanup; - } - while ((rv = virDirRead(dirp, &ent, SYSFS_RESCTRL_PATH "/info")) > 0) { VIR_DEBUG("Parsing info type '%s'", ent->d_name); if (ent->d_name[0] != 'L') @@ -443,12 +435,33 @@ virResctrlGetInfo(virResctrlInfoPtr resctrl) ret = 0; cleanup: - VIR_DIR_CLOSE(dirp); VIR_FREE(i_type); return ret; } +/* virResctrlInfo-related definitions */ +static int +virResctrlGetInfo(virResctrlInfoPtr resctrl) +{ + DIR *dirp = NULL; + int ret = -1; + + ret = virDirOpenIfExists(&dirp, SYSFS_RESCTRL_PATH "/info"); + if (ret <= 0) + goto cleanup; + + ret = virResctrlGetCacheInfo(resctrl, dirp); + if (ret < 0) + goto cleanup; + + ret = 0; + cleanup: + VIR_DIR_CLOSE(dirp); + return ret; +} + + virResctrlInfoPtr virResctrlInfoNew(void) { @@ -773,8 +786,8 @@ virResctrlAllocSetSize(virResctrlAllocPtr alloc, int -virResctrlAllocForeachSize(virResctrlAllocPtr alloc, - virResctrlAllocForeachSizeCallback cb, +virResctrlAllocForeachCache(virResctrlAllocPtr alloc, + virResctrlAllocForeachCacheCallback cb, void *opaque) { int ret = 0; @@ -835,17 +848,14 @@ virResctrlAllocGetID(virResctrlAllocPtr alloc) } -char * -virResctrlAllocFormat(virResctrlAllocPtr alloc) +static int +virResctrlAllocFormatCache(virResctrlAllocPtr alloc, virBufferPtr buf) { - virBuffer buf = VIR_BUFFER_INITIALIZER; + int ret = -1; unsigned int level = 0; unsigned int type = 0; unsigned int cache = 0; - if (!alloc) - return NULL; - for (level = 0; level < alloc->nlevels; level++) { virResctrlAllocPerLevelPtr a_level = alloc->levels[level]; @@ -858,7 +868,7 @@ virResctrlAllocFormat(virResctrlAllocPtr alloc) if (!a_type) continue; - virBufferAsprintf(&buf, "L%u%s:", level, virResctrlTypeToString(type)); + virBufferAsprintf(buf, "L%u%s:", level, virResctrlTypeToString(type)); for (cache = 0; cache < a_type->nmasks; cache++) { virBitmapPtr mask = a_type->masks[cache]; @@ -868,20 +878,37 @@ virResctrlAllocFormat(virResctrlAllocPtr alloc) continue; mask_str = virBitmapToString(mask, false, true); - if (!mask_str) { - virBufferFreeAndReset(&buf); - return NULL; - } + if (!mask_str) + return ret; - virBufferAsprintf(&buf, "%u=%s;", cache, mask_str); + virBufferAsprintf(buf, "%u=%s;", cache, mask_str); VIR_FREE(mask_str); } - virBufferTrim(&buf, ";", 1); - virBufferAddChar(&buf, '\n'); + virBufferTrim(buf, ";", 1); + virBufferAddChar(buf, '\n'); } } + ret = 0; + virBufferCheckError(buf); + return ret; +} + + +char * +virResctrlAllocFormat(virResctrlAllocPtr alloc) +{ + virBuffer buf = VIR_BUFFER_INITIALIZER; + + if (!alloc) + return NULL; + + if (virResctrlAllocFormatCache(alloc, &buf) < 0) { + virBufferFreeAndReset(&buf); + return NULL; + } + virBufferCheckError(&buf); return virBufferContentAndReset(&buf); } @@ -939,9 +966,9 @@ virResctrlAllocParseProcessCache(virResctrlInfoPtr resctrl, static int -virResctrlAllocParseProcessLine(virResctrlInfoPtr resctrl, - virResctrlAllocPtr alloc, - char *line) +virResctrlAllocParseCacheLine(virResctrlInfoPtr resctrl, + virResctrlAllocPtr alloc, + char *line) { char **caches = NULL; char *tmp = NULL; @@ -1009,7 +1036,7 @@ virResctrlAllocParse(virResctrlInfoPtr resctrl, lines = virStringSplitCount(schemata, "\n", 0, &nlines); for (i = 0; i < nlines; i++) { - if (virResctrlAllocParseProcessLine(resctrl, alloc, lines[i]) < 0) + if (virResctrlAllocParseCacheLine(resctrl, alloc, lines[i]) < 0) goto cleanup; } @@ -1401,8 +1428,8 @@ virResctrlAllocCopyMasks(virResctrlAllocPtr dst, * transforming `sizes` into `masks`. */ static int -virResctrlAllocMasksAssign(virResctrlInfoPtr resctrl, - virResctrlAllocPtr alloc) +virResctrlAllocAssign(virResctrlInfoPtr resctrl, + virResctrlAllocPtr alloc) { int ret = -1; unsigned int level = 0; @@ -1526,7 +1553,7 @@ virResctrlAllocCreate(virResctrlInfoPtr resctrl, if (lockfd < 0) goto cleanup; - if (virResctrlAllocMasksAssign(resctrl, alloc) < 0) + if (virResctrlAllocAssign(resctrl, alloc) < 0) goto cleanup; alloc_str = virResctrlAllocFormat(alloc); diff --git a/src/util/virresctrl.h b/src/util/virresctrl.h index 9052a2b..26c5f3b 100644 --- a/src/util/virresctrl.h +++ b/src/util/virresctrl.h @@ -67,11 +67,11 @@ virResctrlInfoGetCache(virResctrlInfoPtr resctrl, typedef struct _virResctrlAlloc virResctrlAlloc; typedef virResctrlAlloc *virResctrlAllocPtr; -typedef int virResctrlAllocForeachSizeCallback(unsigned int level, - virCacheType type, - unsigned int cache, - unsigned long long size, - void *opaque); +typedef int virResctrlAllocForeachCacheCallback(unsigned int level, + virCacheType type, + unsigned int cache, + unsigned long long size, + void *opaque); virResctrlAllocPtr virResctrlAllocNew(void); @@ -87,9 +87,9 @@ virResctrlAllocSetSize(virResctrlAllocPtr alloc, unsigned long long size); int -virResctrlAllocForeachSize(virResctrlAllocPtr alloc, - virResctrlAllocForeachSizeCallback cb, - void *opaque); +virResctrlAllocForeachCache(virResctrlAllocPtr alloc, + virResctrlAllocForeachCacheCallback cb, + void *opaque); int virResctrlAllocSetID(virResctrlAllocPtr alloc, -- 2.7.4

On 06/15/2018 05:29 AM, bing.niu@intel.com wrote:
From: Bing Niu <bing.niu@intel.com>
Renaming some functions and packing some CAT logic into functions
Try to do one "thing" per patch - the "and" gives it away... Thus one patch could rename various functions and other one(s) do the "refactor" (not packing) of functions (one per refactor). While it seems a bit odd - it helps keep reviews/changes quick and easy. It's also very useful when doing bisects to have smaller sets of change in order to more easily ascertain what broke something else.
Signed-off-by: Bing Niu <bing.niu@intel.com> --- src/conf/domain_conf.c | 2 +- src/libvirt_private.syms | 2 +- src/util/virresctrl.c | 93 +++++++++++++++++++++++++++++++----------------- src/util/virresctrl.h | 16 ++++----- 4 files changed, 70 insertions(+), 43 deletions(-)
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 522e0c2..62c412e 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -26882,7 +26882,7 @@ virDomainCachetuneDefFormat(virBufferPtr buf, int ret = -1;
virBufferSetChildIndent(&childrenBuf, buf); - virResctrlAllocForeachSize(cachetune->alloc, + virResctrlAllocForeachCache(cachetune->alloc, virDomainCachetuneDefFormatHelper, &childrenBuf);
You added a character to the name and thus will need to adjust the args to have one more space for proper alignment (e.g. 2nd/3rd args need another space to align under "cachetune".
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index ea24f28..fc17059 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -2627,7 +2627,7 @@ virCacheTypeToString; virResctrlAllocAddPID; virResctrlAllocCreate; virResctrlAllocDeterminePath; -virResctrlAllocForeachSize; +virResctrlAllocForeachCache; virResctrlAllocFormat; virResctrlAllocGetID; virResctrlAllocGetUnused; diff --git a/src/util/virresctrl.c b/src/util/virresctrl.c index e492a63..e62061f 100644 --- a/src/util/virresctrl.c +++ b/src/util/virresctrl.c @@ -316,11 +316,9 @@ virResctrlUnlock(int fd) }
-/* virResctrlInfo-related definitions */
You could have kept this here instead of keeping it with the new code.
static int -virResctrlGetInfo(virResctrlInfoPtr resctrl) +virResctrlGetCacheInfo(virResctrlInfoPtr resctrl, DIR *dirp) { - DIR *dirp = NULL; char *endptr = NULL; char *tmp_str = NULL; int ret = -1; @@ -332,12 +330,6 @@ virResctrlGetInfo(virResctrlInfoPtr resctrl) virResctrlInfoPerLevelPtr i_level = NULL; virResctrlInfoPerTypePtr i_type = NULL;
- rv = virDirOpenIfExists(&dirp, SYSFS_RESCTRL_PATH "/info"); - if (rv <= 0) { - ret = rv; - goto cleanup; - } - while ((rv = virDirRead(dirp, &ent, SYSFS_RESCTRL_PATH "/info")) > 0) { VIR_DEBUG("Parsing info type '%s'", ent->d_name); if (ent->d_name[0] != 'L') @@ -443,12 +435,33 @@ virResctrlGetInfo(virResctrlInfoPtr resctrl)
ret = 0; cleanup: - VIR_DIR_CLOSE(dirp); VIR_FREE(i_type); return ret; }
+/* virResctrlInfo-related definitions */ +static int +virResctrlGetInfo(virResctrlInfoPtr resctrl) +{ + DIR *dirp = NULL; + int ret = -1; + + ret = virDirOpenIfExists(&dirp, SYSFS_RESCTRL_PATH "/info"); + if (ret <= 0) + goto cleanup; + + ret = virResctrlGetCacheInfo(resctrl, dirp); + if (ret < 0) + goto cleanup; + + ret = 0; + cleanup: + VIR_DIR_CLOSE(dirp); + return ret; +} + +
The refactor of virResctrlGetInfo should get its own patch...
virResctrlInfoPtr virResctrlInfoNew(void) { @@ -773,8 +786,8 @@ virResctrlAllocSetSize(virResctrlAllocPtr alloc,
int -virResctrlAllocForeachSize(virResctrlAllocPtr alloc, - virResctrlAllocForeachSizeCallback cb, +virResctrlAllocForeachCache(virResctrlAllocPtr alloc, + virResctrlAllocForeachCacheCallback cb, void *opaque)
Again here we need to add space so that the 2nd/3rd args align under virResctrlAllocPtr. This is part of the rename patch.
{ int ret = 0; @@ -835,17 +848,14 @@ virResctrlAllocGetID(virResctrlAllocPtr alloc) }
-char * -virResctrlAllocFormat(virResctrlAllocPtr alloc) +static int +virResctrlAllocFormatCache(virResctrlAllocPtr alloc, virBufferPtr buf)
And here we have a 3rd patch possibility to refactor virResctrlAllocFormat. Also one line per argument e.g.: static int virResctrlAllocFormatCache(virResctrlAllocPtr alloc, virBufferPtr buf)
{ - virBuffer buf = VIR_BUFFER_INITIALIZER; + int ret = -1;
I think this'll be unnecessary as there's only two places to return either -1 or 0.
unsigned int level = 0; unsigned int type = 0; unsigned int cache = 0;
- if (!alloc) - return NULL; - for (level = 0; level < alloc->nlevels; level++) { virResctrlAllocPerLevelPtr a_level = alloc->levels[level];
@@ -858,7 +868,7 @@ virResctrlAllocFormat(virResctrlAllocPtr alloc) if (!a_type) continue;
- virBufferAsprintf(&buf, "L%u%s:", level, virResctrlTypeToString(type)); + virBufferAsprintf(buf, "L%u%s:", level, virResctrlTypeToString(type));
for (cache = 0; cache < a_type->nmasks; cache++) { virBitmapPtr mask = a_type->masks[cache]; @@ -868,20 +878,37 @@ virResctrlAllocFormat(virResctrlAllocPtr alloc) continue;
mask_str = virBitmapToString(mask, false, true); - if (!mask_str) { - virBufferFreeAndReset(&buf); - return NULL; - } + if (!mask_str) + return ret;
Just return -1
- virBufferAsprintf(&buf, "%u=%s;", cache, mask_str); + virBufferAsprintf(buf, "%u=%s;", cache, mask_str); VIR_FREE(mask_str); }
- virBufferTrim(&buf, ";", 1); - virBufferAddChar(&buf, '\n'); + virBufferTrim(buf, ";", 1); + virBufferAddChar(buf, '\n'); } }
+ ret = 0; + virBufferCheckError(buf);
^^^^^^ [1]
+ return ret;
Just return 0
+} + + +char * +virResctrlAllocFormat(virResctrlAllocPtr alloc) +{ + virBuffer buf = VIR_BUFFER_INITIALIZER; + + if (!alloc) + return NULL; + + if (virResctrlAllocFormatCache(alloc, &buf) < 0) { + virBufferFreeAndReset(&buf); + return NULL; + } + virBufferCheckError(&buf);
^^^^^ [1] This is duplicated from the called function - I think you keep it here and remove it from the called function, but I reserve the right to change my mind. It's also 'of note' (and existing) that virBufferCheckError (or actually virBufferCheckErrorInternal) is not a void function. Because you (and other places in libvirt) don't check return status, I get Coverity warnings in my Coverity checker environment. There is of course a bite sized task related to this: https://wiki.libvirt.org/page/BiteSizedTasks#Clean_up_virBufferCheckError.28... but it probably needs to be expanded to combine virBufferCheckError, virBufferContentAndReset, and virBufferFreeAndReset. Which probably means it goes beyond a bite sized task.
return virBufferContentAndReset(&buf); }
The refactor of virResctrlAllocFormat would be a separate patch.
@@ -939,9 +966,9 @@ virResctrlAllocParseProcessCache(virResctrlInfoPtr resctrl,
static int -virResctrlAllocParseProcessLine(virResctrlInfoPtr resctrl, - virResctrlAllocPtr alloc, - char *line) +virResctrlAllocParseCacheLine(virResctrlInfoPtr resctrl, + virResctrlAllocPtr alloc, + char *line)
Here's one for the rename pile...
{ char **caches = NULL; char *tmp = NULL; @@ -1009,7 +1036,7 @@ virResctrlAllocParse(virResctrlInfoPtr resctrl,
lines = virStringSplitCount(schemata, "\n", 0, &nlines); for (i = 0; i < nlines; i++) { - if (virResctrlAllocParseProcessLine(resctrl, alloc, lines[i]) < 0) + if (virResctrlAllocParseCacheLine(resctrl, alloc, lines[i]) < 0) goto cleanup; }
@@ -1401,8 +1428,8 @@ virResctrlAllocCopyMasks(virResctrlAllocPtr dst, * transforming `sizes` into `masks`. */ static int -virResctrlAllocMasksAssign(virResctrlInfoPtr resctrl, - virResctrlAllocPtr alloc) +virResctrlAllocAssign(virResctrlInfoPtr resctrl, + virResctrlAllocPtr alloc)
Another in the rename pile... Although this one becomes more interesting in the next patch. John
{ int ret = -1; unsigned int level = 0; @@ -1526,7 +1553,7 @@ virResctrlAllocCreate(virResctrlInfoPtr resctrl, if (lockfd < 0) goto cleanup;
- if (virResctrlAllocMasksAssign(resctrl, alloc) < 0) + if (virResctrlAllocAssign(resctrl, alloc) < 0) goto cleanup;
alloc_str = virResctrlAllocFormat(alloc);
[...]

On 2018年06月30日 06:36, John Ferlan wrote:
On 06/15/2018 05:29 AM, bing.niu@intel.com wrote:
From: Bing Niu <bing.niu@intel.com>
Renaming some functions and packing some CAT logic into functions
Try to do one "thing" per patch - the "and" gives it away...
Thus one patch could rename various functions and other one(s) do the "refactor" (not packing) of functions (one per refactor).
While it seems a bit odd - it helps keep reviews/changes quick and easy. It's also very useful when doing bisects to have smaller sets of change in order to more easily ascertain what broke something else.OK. I will put renaming part and refactor into individual patches
Signed-off-by: Bing Niu <bing.niu@intel.com> --- src/conf/domain_conf.c | 2 +- src/libvirt_private.syms | 2 +- src/util/virresctrl.c | 93 +++++++++++++++++++++++++++++++----------------- src/util/virresctrl.h | 16 ++++----- 4 files changed, 70 insertions(+), 43 deletions(-)
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 522e0c2..62c412e 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -26882,7 +26882,7 @@ virDomainCachetuneDefFormat(virBufferPtr buf, int ret = -1;
virBufferSetChildIndent(&childrenBuf, buf); - virResctrlAllocForeachSize(cachetune->alloc, + virResctrlAllocForeachCache(cachetune->alloc, virDomainCachetuneDefFormatHelper, &childrenBuf);
You added a character to the name and thus will need to adjust the args to have one more space for proper alignment (e.g. 2nd/3rd args need another space to align under "cachetune".
Yes. I ran syntax-check, unfortunately it didn't throw a error to me. :(
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index ea24f28..fc17059 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -2627,7 +2627,7 @@ virCacheTypeToString; virResctrlAllocAddPID; virResctrlAllocCreate; virResctrlAllocDeterminePath; -virResctrlAllocForeachSize; +virResctrlAllocForeachCache; virResctrlAllocFormat; virResctrlAllocGetID; virResctrlAllocGetUnused; diff --git a/src/util/virresctrl.c b/src/util/virresctrl.c index e492a63..e62061f 100644 --- a/src/util/virresctrl.c +++ b/src/util/virresctrl.c @@ -316,11 +316,9 @@ virResctrlUnlock(int fd) }
-/* virResctrlInfo-related definitions */
You could have kept this here instead of keeping it with the new code.
That is due to I refactor virResctrlGetInfo and add a virResctrlGetCacheInfo before virResctrlGetInfo. Actually I didn't touch this line, however the git format diff as that. :(
static int -virResctrlGetInfo(virResctrlInfoPtr resctrl) +virResctrlGetCacheInfo(virResctrlInfoPtr resctrl, DIR *dirp) { - DIR *dirp = NULL; char *endptr = NULL; char *tmp_str = NULL; int ret = -1; @@ -332,12 +330,6 @@ virResctrlGetInfo(virResctrlInfoPtr resctrl) virResctrlInfoPerLevelPtr i_level = NULL; virResctrlInfoPerTypePtr i_type = NULL;
- rv = virDirOpenIfExists(&dirp, SYSFS_RESCTRL_PATH "/info"); - if (rv <= 0) { - ret = rv; - goto cleanup; - } - while ((rv = virDirRead(dirp, &ent, SYSFS_RESCTRL_PATH "/info")) > 0) { VIR_DEBUG("Parsing info type '%s'", ent->d_name); if (ent->d_name[0] != 'L') @@ -443,12 +435,33 @@ virResctrlGetInfo(virResctrlInfoPtr resctrl)
ret = 0; cleanup: - VIR_DIR_CLOSE(dirp); VIR_FREE(i_type); return ret; }
+/* virResctrlInfo-related definitions */ +static int +virResctrlGetInfo(virResctrlInfoPtr resctrl) +{ + DIR *dirp = NULL; + int ret = -1; + + ret = virDirOpenIfExists(&dirp, SYSFS_RESCTRL_PATH "/info"); + if (ret <= 0) + goto cleanup; + + ret = virResctrlGetCacheInfo(resctrl, dirp); + if (ret < 0) + goto cleanup; + + ret = 0; + cleanup: + VIR_DIR_CLOSE(dirp); + return ret; +} + +
The refactor of virResctrlGetInfo should get its own patch...
Will do in next version.
virResctrlInfoPtr virResctrlInfoNew(void) { @@ -773,8 +786,8 @@ virResctrlAllocSetSize(virResctrlAllocPtr alloc,
int -virResctrlAllocForeachSize(virResctrlAllocPtr alloc, - virResctrlAllocForeachSizeCallback cb, +virResctrlAllocForeachCache(virResctrlAllocPtr alloc, + virResctrlAllocForeachCacheCallback cb, void *opaque)
Again here we need to add space so that the 2nd/3rd args align under virResctrlAllocPtr. This is part of the rename patch.
Will do in next version.
{ int ret = 0; @@ -835,17 +848,14 @@ virResctrlAllocGetID(virResctrlAllocPtr alloc) }
-char * -virResctrlAllocFormat(virResctrlAllocPtr alloc) +static int +virResctrlAllocFormatCache(virResctrlAllocPtr alloc, virBufferPtr buf)
And here we have a 3rd patch possibility to refactor virResctrlAllocFormat. Also one line per argument e.g.:
Will do in next version. Have a curious question about "one line per argument". Usually we separate arguments into multiple lines only if the line length for putting in one line beyonds 80m character. So in libvert's coding convention, we need put one arguments one line regardless of line length? Because above line doesn't exceed 80 characters.
static int virResctrlAllocFormatCache(virResctrlAllocPtr alloc, virBufferPtr buf)
{ - virBuffer buf = VIR_BUFFER_INITIALIZER; + int ret = -1;
I think this'll be unnecessary as there's only two places to return either -1 or 0.
Will fix in next version.
unsigned int level = 0; unsigned int type = 0; unsigned int cache = 0;
- if (!alloc) - return NULL; - for (level = 0; level < alloc->nlevels; level++) { virResctrlAllocPerLevelPtr a_level = alloc->levels[level];
@@ -858,7 +868,7 @@ virResctrlAllocFormat(virResctrlAllocPtr alloc) if (!a_type) continue;
- virBufferAsprintf(&buf, "L%u%s:", level, virResctrlTypeToString(type)); + virBufferAsprintf(buf, "L%u%s:", level, virResctrlTypeToString(type));
for (cache = 0; cache < a_type->nmasks; cache++) { virBitmapPtr mask = a_type->masks[cache]; @@ -868,20 +878,37 @@ virResctrlAllocFormat(virResctrlAllocPtr alloc) continue;
mask_str = virBitmapToString(mask, false, true); - if (!mask_str) { - virBufferFreeAndReset(&buf); - return NULL; - } + if (!mask_str) + return ret;
Just return -1
OK.
- virBufferAsprintf(&buf, "%u=%s;", cache, mask_str); + virBufferAsprintf(buf, "%u=%s;", cache, mask_str); VIR_FREE(mask_str); }
- virBufferTrim(&buf, ";", 1); - virBufferAddChar(&buf, '\n'); + virBufferTrim(buf, ";", 1); + virBufferAddChar(buf, '\n'); } }
+ ret = 0; + virBufferCheckError(buf);
^^^^^^ [1]
Ok. I will remove it from caller and let each callee do virBufferCheckError. And add return value testing parts.
+ return ret;
Just return 0
OK!
+} + + +char * +virResctrlAllocFormat(virResctrlAllocPtr alloc) +{ + virBuffer buf = VIR_BUFFER_INITIALIZER; + + if (!alloc) + return NULL; + + if (virResctrlAllocFormatCache(alloc, &buf) < 0) { + virBufferFreeAndReset(&buf); + return NULL; + } + virBufferCheckError(&buf);
^^^^^ [1] This is duplicated from the called function - I think you keep it here and remove it from the called function, but I reserve the right to change my mind.
It's also 'of note' (and existing) that virBufferCheckError (or actually virBufferCheckErrorInternal) is not a void function. Because you (and other places in libvirt) don't check return status, I get Coverity warnings in my Coverity checker environment. There is of course a bite sized task related to this:
https://wiki.libvirt.org/page/BiteSizedTasks#Clean_up_virBufferCheckError.28...
but it probably needs to be expanded to combine virBufferCheckError, virBufferContentAndReset, and virBufferFreeAndReset. Which probably means it goes beyond a bite sized task.
Thanks for the clarification here. :)
return virBufferContentAndReset(&buf); }
The refactor of virResctrlAllocFormat would be a separate patch.
@@ -939,9 +966,9 @@ virResctrlAllocParseProcessCache(virResctrlInfoPtr resctrl,
static int -virResctrlAllocParseProcessLine(virResctrlInfoPtr resctrl, - virResctrlAllocPtr alloc, - char *line) +virResctrlAllocParseCacheLine(virResctrlInfoPtr resctrl, + virResctrlAllocPtr alloc, + char *line)
Here's one for the rename pile...
OK!
{ char **caches = NULL; char *tmp = NULL; @@ -1009,7 +1036,7 @@ virResctrlAllocParse(virResctrlInfoPtr resctrl,
lines = virStringSplitCount(schemata, "\n", 0, &nlines); for (i = 0; i < nlines; i++) { - if (virResctrlAllocParseProcessLine(resctrl, alloc, lines[i]) < 0) + if (virResctrlAllocParseCacheLine(resctrl, alloc, lines[i]) < 0) goto cleanup; }
@@ -1401,8 +1428,8 @@ virResctrlAllocCopyMasks(virResctrlAllocPtr dst, * transforming `sizes` into `masks`. */ static int -virResctrlAllocMasksAssign(virResctrlInfoPtr resctrl, - virResctrlAllocPtr alloc) +virResctrlAllocAssign(virResctrlInfoPtr resctrl, + virResctrlAllocPtr alloc)
Another in the rename pile... Although this one becomes more interesting in the next patch.
OK! Bing
John
{ int ret = -1; unsigned int level = 0; @@ -1526,7 +1553,7 @@ virResctrlAllocCreate(virResctrlInfoPtr resctrl, if (lockfd < 0) goto cleanup;
- if (virResctrlAllocMasksAssign(resctrl, alloc) < 0) + if (virResctrlAllocAssign(resctrl, alloc) < 0) goto cleanup;
alloc_str = virResctrlAllocFormat(alloc);
[...]

On 07/06/2018 03:00 AM, bing.niu wrote:
On 2018年06月30日 06:36, John Ferlan wrote:
On 06/15/2018 05:29 AM, bing.niu@intel.com wrote:
From: Bing Niu <bing.niu@intel.com>
Renaming some functions and packing some CAT logic into functions
Try to do one "thing" per patch - the "and" gives it away...
Thus one patch could rename various functions and other one(s) do the "refactor" (not packing) of functions (one per refactor).
While it seems a bit odd - it helps keep reviews/changes quick and easy. It's also very useful when doing bisects to have smaller sets of change in order to more easily ascertain what broke something else.OK. I will put renaming part and refactor into individual patches
Signed-off-by: Bing Niu <bing.niu@intel.com> --- src/conf/domain_conf.c | 2 +- src/libvirt_private.syms | 2 +- src/util/virresctrl.c | 93 +++++++++++++++++++++++++++++++----------------- src/util/virresctrl.h | 16 ++++----- 4 files changed, 70 insertions(+), 43 deletions(-)
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 522e0c2..62c412e 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -26882,7 +26882,7 @@ virDomainCachetuneDefFormat(virBufferPtr buf, int ret = -1; virBufferSetChildIndent(&childrenBuf, buf); - virResctrlAllocForeachSize(cachetune->alloc, + virResctrlAllocForeachCache(cachetune->alloc, virDomainCachetuneDefFormatHelper, &childrenBuf);
You added a character to the name and thus will need to adjust the args to have one more space for proper alignment (e.g. 2nd/3rd args need another space to align under "cachetune".
Yes. I ran syntax-check, unfortunately it didn't throw a error to me. :(
Try to leave a couple of blank lines around your responses - easier to find that way!
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index ea24f28..fc17059 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -2627,7 +2627,7 @@ virCacheTypeToString; virResctrlAllocAddPID; virResctrlAllocCreate; virResctrlAllocDeterminePath; -virResctrlAllocForeachSize; +virResctrlAllocForeachCache; virResctrlAllocFormat; virResctrlAllocGetID; virResctrlAllocGetUnused; diff --git a/src/util/virresctrl.c b/src/util/virresctrl.c index e492a63..e62061f 100644 --- a/src/util/virresctrl.c +++ b/src/util/virresctrl.c @@ -316,11 +316,9 @@ virResctrlUnlock(int fd) } -/* virResctrlInfo-related definitions */
You could have kept this here instead of keeping it with the new code.
That is due to I refactor virResctrlGetInfo and add a virResctrlGetCacheInfo before virResctrlGetInfo. Actually I didn't touch this line, however the git format diff as that. :(
Ahhh - makes sense. I've seen git am do strange things in the past. Sometimes I have to run with -3 for diff resolution and that seems to bring out the phenomena more. Cannot remember for this series though.
[...]
-virResctrlAllocForeachSize(virResctrlAllocPtr alloc, - virResctrlAllocForeachSizeCallback cb, +virResctrlAllocForeachCache(virResctrlAllocPtr alloc, + virResctrlAllocForeachCacheCallback cb, void *opaque)
Again here we need to add space so that the 2nd/3rd args align under virResctrlAllocPtr. This is part of the rename patch. Will do in next version.
{ int ret = 0; @@ -835,17 +848,14 @@ virResctrlAllocGetID(virResctrlAllocPtr alloc) } -char * -virResctrlAllocFormat(virResctrlAllocPtr alloc) +static int +virResctrlAllocFormatCache(virResctrlAllocPtr alloc, virBufferPtr buf)
And here we have a 3rd patch possibility to refactor virResctrlAllocFormat. Also one line per argument e.g.: Will do in next version. Have a curious question about "one line per argument". Usually we separate arguments into multiple lines only if the line length for putting in one line beyonds 80m character. So in libvert's coding convention, we need put one arguments one line regardless of line length? Because above line doesn't exceed 80 characters.
Right, understood. Just trying to follow what other patches have requested - cannot recall if it's a "strict policy" on the hacking page though. The formatting bikeshed also can differ depending on reviewers. John [...]

From: Bing Niu <bing.niu@intel.com> Add memory bandwidth allocation support basing on existing virresctrl implementation. Two new structures virResctrlInfoMB and virResctrlAllocMB are introduced. virResctrlInfoMB is used to record host system MBA supporting information, e.g., minimum bandwidth allowed, bandwidth granularity, number of bandwidth controller (same as number of last level cache). virResctrlAllocMB is used for allocating memory bandwidth. Following virResctrlAllocPerType, it also employs a nested sparse array to indicate whether allocation is available for particular llc. Overall, the memory bandwidth allocation follows the same sequence as existing virresctrl cache allocation model. It exposes interfaces for probing host's memory bandwidth capability on initialization time and memory bandwidth allocation on runtime. Signed-off-by: Bing Niu <bing.niu@intel.com> --- src/libvirt_private.syms | 2 + src/util/virresctrl.c | 383 ++++++++++++++++++++++++++++++++++++++++++++++- src/util/virresctrl.h | 14 ++ 3 files changed, 397 insertions(+), 2 deletions(-) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index fc17059..6925062 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -2628,6 +2628,7 @@ virResctrlAllocAddPID; virResctrlAllocCreate; virResctrlAllocDeterminePath; virResctrlAllocForeachCache; +virResctrlAllocForeachMemory; virResctrlAllocFormat; virResctrlAllocGetID; virResctrlAllocGetUnused; @@ -2635,6 +2636,7 @@ virResctrlAllocNew; virResctrlAllocRemove; virResctrlAllocSetID; virResctrlAllocSetSize; +virResctrlSetMemoryBandwidth; virResctrlInfoGetCache; virResctrlInfoNew; diff --git a/src/util/virresctrl.c b/src/util/virresctrl.c index e62061f..de736b0 100644 --- a/src/util/virresctrl.c +++ b/src/util/virresctrl.c @@ -80,18 +80,21 @@ typedef virResctrlInfoPerType *virResctrlInfoPerTypePtr; typedef struct _virResctrlInfoPerLevel virResctrlInfoPerLevel; typedef virResctrlInfoPerLevel *virResctrlInfoPerLevelPtr; +typedef struct _virResctrlInfoMB virResctrlInfoMB; +typedef virResctrlInfoMB *virResctrlInfoMBPtr; + typedef struct _virResctrlAllocPerType virResctrlAllocPerType; typedef virResctrlAllocPerType *virResctrlAllocPerTypePtr; typedef struct _virResctrlAllocPerLevel virResctrlAllocPerLevel; typedef virResctrlAllocPerLevel *virResctrlAllocPerLevelPtr; - +typedef struct _virResctrlAllocMB virResctrlAllocMB; +typedef virResctrlAllocMB *virResctrlAllocMBPtr; /* Class definitions and initializations */ static virClassPtr virResctrlInfoClass; static virClassPtr virResctrlAllocClass; - /* virResctrlInfo */ struct _virResctrlInfoPerType { /* Kernel-provided information */ @@ -116,11 +119,29 @@ struct _virResctrlInfoPerLevel { virResctrlInfoPerTypePtr *types; }; +/* Information about memory bandwidth allocation */ +struct _virResctrlInfoMB { + /* minimum memory bandwidth allowed*/ + unsigned int min_bandwidth; + /* bandwidth granularity */ + unsigned int bandwidth_granularity; + /* Maximum number of simultaneous allocations */ + unsigned int max_allocation; + /* level number of last level cache*/ + unsigned int last_level_cache; + /* max id of last level cache, this is used to track + * how many last level cache available in host system + * */ + unsigned int max_id; +}; + struct _virResctrlInfo { virObject parent; virResctrlInfoPerLevelPtr *levels; size_t nlevels; + + virResctrlInfoMBPtr mb_info; }; @@ -146,6 +167,7 @@ virResctrlInfoDispose(void *obj) VIR_FREE(level); } + VIR_FREE(resctrl->mb_info); VIR_FREE(resctrl->levels); } @@ -202,12 +224,23 @@ struct _virResctrlAllocPerLevel { * VIR_CACHE_TYPE_LAST number of items */ }; +/* + * virResctrlAllocMB represents one memory bandwidth allocation. Since it can have + * multiple last level caches in a NUMA system, it is also represented as a nested + * sparse arrays as virRestrlAllocPerLevel + */ +struct _virResctrlAllocMB { + unsigned int **bandwidth; + size_t nsizes; +}; + struct _virResctrlAlloc { virObject parent; virResctrlAllocPerLevelPtr *levels; size_t nlevels; + virResctrlAllocMBPtr mba; /* The identifier (any unique string for now) */ char *id; /* libvirt-generated path in /sys/fs/resctrl for this particular @@ -251,6 +284,13 @@ virResctrlAllocDispose(void *obj) VIR_FREE(level); } + if (alloc->mba) { + virResctrlAllocMBPtr mba = alloc->mba; + for (i = 0; i < mba->nsizes; i++) + VIR_FREE(mba->bandwidth[i]); + } + + VIR_FREE(alloc->mba); VIR_FREE(alloc->id); VIR_FREE(alloc->path); VIR_FREE(alloc->levels); @@ -440,6 +480,61 @@ virResctrlGetCacheInfo(virResctrlInfoPtr resctrl, DIR *dirp) } +static int +virResctrlGetMemoryBandwidthInfo(virResctrlInfoPtr resctrl) +{ + int ret = -1; + int rv = -1; + virResctrlInfoMBPtr i_mb = NULL; + + /* query memory bandwidth allocation info */ + if (VIR_ALLOC(i_mb) < 0) + goto cleanup; + rv = virFileReadValueUint(&i_mb->bandwidth_granularity, + SYSFS_RESCTRL_PATH "/info/MB/bandwidth_gran"); + if (rv == -2) { + /* The file doesn't exist, so it's unusable for us, + * properly mba unsupported */ + VIR_WARN("The path '" SYSFS_RESCTRL_PATH "/info/MB/bandwidth_gran'" + "does not exist"); + ret = 0; + goto cleanup; + } else if (rv < 0) { + /* Other failures are fatal, so just quit */ + goto cleanup; + } + + rv = virFileReadValueUint(&i_mb->min_bandwidth, + SYSFS_RESCTRL_PATH "/info/MB/min_bandwidth"); + if (rv == -2) { + /* If the previous file exists, so should this one. Hence -2 is + * fatal in this case as well (errors out in next condition) - the + * kernel interface might've changed too much or something else is + * wrong. */ + + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("Cannot get min bandwidth from resctrl cache info")); + } + if (rv < 0) + goto cleanup; + + rv = virFileReadValueUint(&i_mb->max_allocation, + SYSFS_RESCTRL_PATH "/info/MB/num_closids"); + if (rv == -2) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("Cannot get max allocation from resctrl cache info")); + } + if (rv < 0) + goto cleanup; + + VIR_STEAL_PTR(resctrl->mb_info, i_mb); + ret = 0; + cleanup: + VIR_FREE(i_mb); + return ret; +} + + /* virResctrlInfo-related definitions */ static int virResctrlGetInfo(virResctrlInfoPtr resctrl) @@ -451,6 +546,10 @@ virResctrlGetInfo(virResctrlInfoPtr resctrl) if (ret <= 0) goto cleanup; + ret = virResctrlGetMemoryBandwidthInfo(resctrl); + if (ret < 0) + goto cleanup; + ret = virResctrlGetCacheInfo(resctrl, dirp); if (ret < 0) goto cleanup; @@ -492,6 +591,9 @@ virResctrlInfoIsEmpty(virResctrlInfoPtr resctrl) if (!resctrl) return true; + if (resctrl->mb_info) + return false; + for (i = 0; i < resctrl->nlevels; i++) { virResctrlInfoPerLevelPtr i_level = resctrl->levels[i]; @@ -517,12 +619,26 @@ virResctrlInfoGetCache(virResctrlInfoPtr resctrl, { virResctrlInfoPerLevelPtr i_level = NULL; virResctrlInfoPerTypePtr i_type = NULL; + virResctrlInfoMBPtr mb_info = NULL; size_t i = 0; int ret = -1; if (virResctrlInfoIsEmpty(resctrl)) return 0; + /* Let's take the opportunity to update the number of + * last level cache. This number is used to calculate + * free memory bandwidth */ + if (resctrl->mb_info) { + mb_info = resctrl->mb_info; + if (level > mb_info->last_level_cache) { + mb_info->last_level_cache = level; + mb_info->max_id = 0; + } else if (mb_info->last_level_cache == level) { + mb_info->max_id++; + } + } + if (level >= resctrl->nlevels) return 0; @@ -593,6 +709,9 @@ virResctrlAllocIsEmpty(virResctrlAllocPtr alloc) if (!alloc) return true; + if (alloc->mba) + return false; + for (i = 0; i < alloc->nlevels; i++) { virResctrlAllocPerLevelPtr a_level = alloc->levels[i]; @@ -786,6 +905,27 @@ virResctrlAllocSetSize(virResctrlAllocPtr alloc, int +virResctrlAllocForeachMemory(virResctrlAllocPtr alloc, + virResctrlAllocForeachMemoryCallback cb, + void *opaque) +{ + unsigned int id = 0; + + if (!alloc) + return 0; + + if (alloc->mba) { + virResctrlAllocMBPtr mba = alloc->mba; + for (id = 0; id < mba->nsizes; id++) + if (mba->bandwidth[id]) + cb(id, *mba->bandwidth[id], opaque); + } + + return 0; +} + + +int virResctrlAllocForeachCache(virResctrlAllocPtr alloc, virResctrlAllocForeachCacheCallback cb, void *opaque) @@ -848,6 +988,217 @@ virResctrlAllocGetID(virResctrlAllocPtr alloc) } +static void +virResctrlMemoryBandwidthSubstract(virResctrlAllocPtr free, + virResctrlAllocPtr used) +{ + size_t i; + + if (used->mba) { + for (i = 0; i < used->mba->nsizes; i++) { + if (used->mba->bandwidth[i]) + *(free->mba->bandwidth[i]) -= *(used->mba->bandwidth[i]); + } + } +} + + +int +virResctrlSetMemoryBandwidth(virResctrlAllocPtr alloc, + unsigned int id, + unsigned int memory_bandwidth) +{ + virResctrlAllocMBPtr mba = alloc->mba; + + if (!mba) { + if (VIR_ALLOC(mba) < 0) + return -1; + alloc->mba = mba; + } + + if (mba->nsizes <= id && + VIR_EXPAND_N(mba->bandwidth, mba->nsizes, + id - mba->nsizes + 1) < 0) + return -1; + + if (mba->bandwidth[id]) { + virReportError(VIR_ERR_XML_ERROR, + _("Collision Memory Bandwidth on node %d"), + id); + return -1; + } + + if (VIR_ALLOC(mba->bandwidth[id]) < 0) + return -1; + + *(mba->bandwidth[id]) = memory_bandwidth; + return 0; +} + + +static int +virResctrlAllocMemoryBandwidthFormat(virResctrlAllocPtr alloc, virBufferPtr buf) +{ + int id; + + if (!alloc->mba) + return 0; + + virBufferAddLit(buf, "MB:"); + + for (id = 0; id < alloc->mba->nsizes; id++) { + if (alloc->mba->bandwidth[id]) { + virBufferAsprintf(buf, "%u=%u;", id, + *(alloc->mba->bandwidth[id])); + } + } + + virBufferTrim(buf, ";", 1); + virBufferAddChar(buf, '\n'); + virBufferCheckError(buf); + return 0; +} + + +static int +virResctrlAllocMemoryBandwidth(virResctrlInfoPtr resctrl, + virResctrlAllocPtr alloc, virResctrlAllocPtr free) +{ + int id; + int ret = -1; + virResctrlAllocMBPtr mb_alloc = alloc->mba; + virResctrlAllocMBPtr mb_free = free->mba; + virResctrlInfoMBPtr mb_info = resctrl->mb_info; + + if (!mb_alloc) { + ret = 0; + return ret; + } + + if (mb_alloc && !mb_info) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("RDT Memory Bandwidth allocation" + " unsupported")); + return ret; + } + + for (id = 0; id < mb_alloc->nsizes; id ++) { + if (!mb_alloc->bandwidth[id]) + continue; + + if (*(mb_alloc->bandwidth[id]) % mb_info->bandwidth_granularity) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, + _("Memory Bandwidth allocation of size " + "%u is not divisible by granularity %u"), + *(mb_alloc->bandwidth[id]), + mb_info->bandwidth_granularity); + return ret; + } + if (*(mb_alloc->bandwidth[id]) < mb_info->min_bandwidth) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, + _("Memory Bandwidth allocation of size " + "%u is smaller than the minimum " + "allowed allocation %u"), + *(mb_alloc->bandwidth[id]), + mb_info->min_bandwidth); + return ret; + } + if (id > mb_info->max_id) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, + _("bandwidth controller %u not exist," + " max controller id %u"), + id, mb_info->max_id); + return ret; + } + if (*(mb_alloc->bandwidth[id]) > *(mb_free->bandwidth[id])) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, + _("Not enough room for allocation of %u " + "bandwidth for node %u%%, freed %u%%"), + id, *(mb_alloc->bandwidth[id]), + *(mb_free->bandwidth[id])); + return ret; + } + } + ret = 0; + return ret; +} + + +static int +virResctrlAllocParseMemoryBandwidthLine(virResctrlInfoPtr resctrl, + virResctrlAllocPtr alloc, + char *line) +{ + char **mbs = NULL; + char *tmp = NULL; + unsigned int bandwidth; + size_t nmb = 0; + unsigned int id; + size_t i; + int ret = -1; + + /* For no reason there can be spaces */ + virSkipSpaces((const char **) &line); + + if (STRNEQLEN(line, "MB", 2)) + return 0; + + if (!resctrl || !resctrl->mb_info + || !resctrl->mb_info->min_bandwidth + || !resctrl->mb_info->bandwidth_granularity) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("Missing or inconsistent resctrl info for " + "memory bandwidth allocation")); + } + + if (!alloc->mba) { + if (VIR_ALLOC(alloc->mba) < 0) + return -1; + } + + tmp = strchr(line, ':'); + if (!tmp) + return 0; + tmp++; + + mbs = virStringSplitCount(tmp, ";", 0, &nmb); + if (!nmb) + return 0; + + for (i = 0; i < nmb; i++) { + tmp = strchr(mbs[i], '='); + *tmp = '\0'; + tmp++; + + if (virStrToLong_uip(mbs[i], NULL, 10, &id) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("Invalid node id %u "), id); + goto cleanup; + } + if (virStrToLong_uip(tmp, NULL, 10, &bandwidth) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("Invalid bandwidth %u"), bandwidth); + goto cleanup; + } + if (alloc->mba->nsizes <= id && + VIR_EXPAND_N(alloc->mba->bandwidth, alloc->mba->nsizes, + id - alloc->mba->nsizes + 1) < 0) { + goto cleanup; + } + if (!alloc->mba->bandwidth[id]) { + if (VIR_ALLOC(alloc->mba->bandwidth[id]) < 0) + goto cleanup; + } + + *(alloc->mba->bandwidth[id]) = bandwidth; + } + ret = 0; + cleanup: + virStringListFree(mbs); + return ret; +} + + static int virResctrlAllocFormatCache(virResctrlAllocPtr alloc, virBufferPtr buf) { @@ -909,6 +1260,11 @@ virResctrlAllocFormat(virResctrlAllocPtr alloc) return NULL; } + if (virResctrlAllocMemoryBandwidthFormat(alloc, &buf) < 0) { + virBufferFreeAndReset(&buf); + return NULL; + } + virBufferCheckError(&buf); return virBufferContentAndReset(&buf); } @@ -1036,6 +1392,9 @@ virResctrlAllocParse(virResctrlInfoPtr resctrl, lines = virStringSplitCount(schemata, "\n", 0, &nlines); for (i = 0; i < nlines; i++) { + if (virResctrlAllocParseMemoryBandwidthLine(resctrl, alloc, lines[i]) < 0) + goto cleanup; + if (virResctrlAllocParseCacheLine(resctrl, alloc, lines[i]) < 0) goto cleanup; } @@ -1170,6 +1529,22 @@ virResctrlAllocNewFromInfo(virResctrlInfoPtr info) } } + /* set default free memory bandwidth to 100%*/ + if (info->mb_info) { + if (VIR_ALLOC(ret->mba) < 0) + goto error; + + if (VIR_EXPAND_N(ret->mba->bandwidth, ret->mba->nsizes, + info->mb_info->max_id - ret->mba->nsizes + 1) < 0) + goto error; + + for (i = 0; i < ret->mba->nsizes; i++) { + if (VIR_ALLOC(ret->mba->bandwidth[i]) < 0) + goto error; + *(ret->mba->bandwidth[i]) = 100; + } + } + cleanup: virBitmapFree(mask); return ret; @@ -1233,6 +1608,7 @@ virResctrlAllocGetUnused(virResctrlInfoPtr resctrl) goto error; } + virResctrlMemoryBandwidthSubstract(ret, alloc); virResctrlAllocSubtract(ret, alloc); virObjectUnref(alloc); alloc = NULL; @@ -1444,6 +1820,9 @@ virResctrlAllocAssign(virResctrlInfoPtr resctrl, if (!alloc_default) goto cleanup; + if (virResctrlAllocMemoryBandwidth(resctrl, alloc, alloc_free) < 0) + goto cleanup; + if (virResctrlAllocCopyMasks(alloc, alloc_default) < 0) goto cleanup; diff --git a/src/util/virresctrl.h b/src/util/virresctrl.h index 26c5f3b..5e78334 100644 --- a/src/util/virresctrl.h +++ b/src/util/virresctrl.h @@ -73,6 +73,10 @@ typedef int virResctrlAllocForeachCacheCallback(unsigned int level, unsigned long long size, void *opaque); +typedef int virResctrlAllocForeachMemoryCallback(unsigned int id, + unsigned int size, + void *opaque); + virResctrlAllocPtr virResctrlAllocNew(void); @@ -92,6 +96,16 @@ virResctrlAllocForeachCache(virResctrlAllocPtr alloc, void *opaque); int +virResctrlAllocForeachMemory(virResctrlAllocPtr resctrl, + virResctrlAllocForeachMemoryCallback cb, + void *opaque); + +int +virResctrlSetMemoryBandwidth(virResctrlAllocPtr alloc, + unsigned int id, + unsigned int memory_bandwidth); + +int virResctrlAllocSetID(virResctrlAllocPtr alloc, const char *id); const char * -- 2.7.4

On 06/15/2018 05:29 AM, bing.niu@intel.com wrote:
From: Bing Niu <bing.niu@intel.com>
Add memory bandwidth allocation support basing on existing
s/basing/based
virresctrl implementation. Two new structures virResctrlInfoMB ^^ "cache" (IOW: fit it in between)
and virResctrlAllocMB are introduced.
virResctrlInfoMB is used to record host system MBA supporting information, e.g., minimum bandwidth allowed, bandwidth granularity, number of bandwidth controller (same as number of last level cache).
virResctrlAllocMB is used for allocating memory bandwidth. Following virResctrlAllocPerType, it also employs a nested sparse array to indicate whether allocation is available for particular llc.
llc == ? [last level cache as I found later...] Probably could have "split" this patch too in order to add each memory bandwidth and memory bandwidth info separately. Lots of stuff in this patch to keep track of.
Overall, the memory bandwidth allocation follows the same sequence as existing virresctrl cache allocation model. It exposes interfaces for probing host's memory bandwidth capability on initialization time and memory bandwidth allocation on runtime.
Signed-off-by: Bing Niu <bing.niu@intel.com> --- src/libvirt_private.syms | 2 + src/util/virresctrl.c | 383 ++++++++++++++++++++++++++++++++++++++++++++++- src/util/virresctrl.h | 14 ++ 3 files changed, 397 insertions(+), 2 deletions(-)
Rather than using "MB", could we use "MemBW" - I see MB and think megabytes. Maybe it's just me... Also rather than spelling out MemoryBandwidth sometimes, use the same shorthand to be consistent. Makes it easier to search for MemBW related functionality. Although someone else reviewing may have a different opinion - world's full of 'em ;-). Beyond that, after reading subsequent patches it doesn't seem memory tunes are a related to cachetunes other than that both use resctrl API's in order to manipulate data. If domain XML didn't have "cputune/cachetune", then does part of virResctrlAlloc not get used? I assume that'd be the @levels. Can both cachetunes and memtunes be supplied? I assume so, but figured I'd ask.
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index fc17059..6925062 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -2628,6 +2628,7 @@ virResctrlAllocAddPID; virResctrlAllocCreate; virResctrlAllocDeterminePath; virResctrlAllocForeachCache; +virResctrlAllocForeachMemory; virResctrlAllocFormat; virResctrlAllocGetID; virResctrlAllocGetUnused; @@ -2635,6 +2636,7 @@ virResctrlAllocNew; virResctrlAllocRemove; virResctrlAllocSetID; virResctrlAllocSetSize; +virResctrlSetMemoryBandwidth; virResctrlInfoGetCache; virResctrlInfoNew;
syntax-check would have told you this is out of order alphabetically
diff --git a/src/util/virresctrl.c b/src/util/virresctrl.c index e62061f..de736b0 100644 --- a/src/util/virresctrl.c +++ b/src/util/virresctrl.c @@ -80,18 +80,21 @@ typedef virResctrlInfoPerType *virResctrlInfoPerTypePtr; typedef struct _virResctrlInfoPerLevel virResctrlInfoPerLevel; typedef virResctrlInfoPerLevel *virResctrlInfoPerLevelPtr;
+typedef struct _virResctrlInfoMB virResctrlInfoMB; +typedef virResctrlInfoMB *virResctrlInfoMBPtr; + typedef struct _virResctrlAllocPerType virResctrlAllocPerType; typedef virResctrlAllocPerType *virResctrlAllocPerTypePtr;
typedef struct _virResctrlAllocPerLevel virResctrlAllocPerLevel; typedef virResctrlAllocPerLevel *virResctrlAllocPerLevelPtr;
- +typedef struct _virResctrlAllocMB virResctrlAllocMB; +typedef virResctrlAllocMB *virResctrlAllocMBPtr; /* Class definitions and initializations */ static virClassPtr virResctrlInfoClass; static virClassPtr virResctrlAllocClass;
- /* virResctrlInfo */ struct _virResctrlInfoPerType { /* Kernel-provided information */ @@ -116,11 +119,29 @@ struct _virResctrlInfoPerLevel { virResctrlInfoPerTypePtr *types; };
+/* Information about memory bandwidth allocation */ +struct _virResctrlInfoMB { + /* minimum memory bandwidth allowed*/
s/d*/d */
+ unsigned int min_bandwidth; + /* bandwidth granularity */ + unsigned int bandwidth_granularity; + /* Maximum number of simultaneous allocations */ + unsigned int max_allocation; + /* level number of last level cache*/
s/e*/e */
+ unsigned int last_level_cache; + /* max id of last level cache, this is used to track + * how many last level cache available in host system + * */
Non standard comment format ("* */"
+ unsigned int max_id; +}; + struct _virResctrlInfo { virObject parent;
virResctrlInfoPerLevelPtr *levels; size_t nlevels; + + virResctrlInfoMBPtr mb_info;
How about "MemBWInfo" or membw_info
};
@@ -146,6 +167,7 @@ virResctrlInfoDispose(void *obj) VIR_FREE(level); }
+ VIR_FREE(resctrl->mb_info); VIR_FREE(resctrl->levels); }
There's a really detailed description of virResctrlAlloc right about here that could use some of the details about the memory bandwidth included into it, especially since you're changing _virResctrlAlloc
@@ -202,12 +224,23 @@ struct _virResctrlAllocPerLevel { * VIR_CACHE_TYPE_LAST number of items */ };
+/* + * virResctrlAllocMB represents one memory bandwidth allocation. Since it can have + * multiple last level caches in a NUMA system, it is also represented as a nested + * sparse arrays as virRestrlAllocPerLevel + */ +struct _virResctrlAllocMB { + unsigned int **bandwidth;
This should be bandwidths
+ size_t nsizes;
This should be 'nbandwidths' as in the number of **bandwidths.
+}; + struct _virResctrlAlloc { virObject parent;
virResctrlAllocPerLevelPtr *levels; size_t nlevels;
+ virResctrlAllocMBPtr mba;
s/mba/MemBW or MemBWAlloc Also add a blank line for reading separation...
/* The identifier (any unique string for now) */ char *id; /* libvirt-generated path in /sys/fs/resctrl for this particular @@ -251,6 +284,13 @@ virResctrlAllocDispose(void *obj) VIR_FREE(level); }
+ if (alloc->mba) { + virResctrlAllocMBPtr mba = alloc->mba; + for (i = 0; i < mba->nsizes; i++) + VIR_FREE(mba->bandwidth[i]); + } + + VIR_FREE(alloc->mba); VIR_FREE(alloc->id); VIR_FREE(alloc->path); VIR_FREE(alloc->levels); @@ -440,6 +480,61 @@ virResctrlGetCacheInfo(virResctrlInfoPtr resctrl, DIR *dirp) }
+static int +virResctrlGetMemoryBandwidthInfo(virResctrlInfoPtr resctrl) +{ + int ret = -1; + int rv = -1; + virResctrlInfoMBPtr i_mb = NULL; + + /* query memory bandwidth allocation info */ + if (VIR_ALLOC(i_mb) < 0) + goto cleanup; + rv = virFileReadValueUint(&i_mb->bandwidth_granularity, + SYSFS_RESCTRL_PATH "/info/MB/bandwidth_gran"); + if (rv == -2) { + /* The file doesn't exist, so it's unusable for us, + * properly mba unsupported */ + VIR_WARN("The path '" SYSFS_RESCTRL_PATH "/info/MB/bandwidth_gran'" + "does not exist"); + ret = 0; + goto cleanup; + } else if (rv < 0) { + /* Other failures are fatal, so just quit */ + goto cleanup; + } + + rv = virFileReadValueUint(&i_mb->min_bandwidth, + SYSFS_RESCTRL_PATH "/info/MB/min_bandwidth"); + if (rv == -2) { + /* If the previous file exists, so should this one. Hence -2 is + * fatal in this case as well (errors out in next condition) - the + * kernel interface might've changed too much or something else is + * wrong. */ + + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("Cannot get min bandwidth from resctrl cache info")); + } + if (rv < 0) + goto cleanup; + + rv = virFileReadValueUint(&i_mb->max_allocation, + SYSFS_RESCTRL_PATH "/info/MB/num_closids"); + if (rv == -2) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("Cannot get max allocation from resctrl cache info")); + } + if (rv < 0) + goto cleanup; + + VIR_STEAL_PTR(resctrl->mb_info, i_mb); + ret = 0; + cleanup: + VIR_FREE(i_mb); + return ret; +} + + /* virResctrlInfo-related definitions */ static int virResctrlGetInfo(virResctrlInfoPtr resctrl) @@ -451,6 +546,10 @@ virResctrlGetInfo(virResctrlInfoPtr resctrl) if (ret <= 0) goto cleanup;
+ ret = virResctrlGetMemoryBandwidthInfo(resctrl); + if (ret < 0) + goto cleanup; + ret = virResctrlGetCacheInfo(resctrl, dirp); if (ret < 0) goto cleanup; @@ -492,6 +591,9 @@ virResctrlInfoIsEmpty(virResctrlInfoPtr resctrl) if (!resctrl) return true;
+ if (resctrl->mb_info) + return false; + for (i = 0; i < resctrl->nlevels; i++) { virResctrlInfoPerLevelPtr i_level = resctrl->levels[i];
@@ -517,12 +619,26 @@ virResctrlInfoGetCache(virResctrlInfoPtr resctrl, { virResctrlInfoPerLevelPtr i_level = NULL; virResctrlInfoPerTypePtr i_type = NULL; + virResctrlInfoMBPtr mb_info = NULL; size_t i = 0; int ret = -1;
if (virResctrlInfoIsEmpty(resctrl)) return 0;
+ /* Let's take the opportunity to update the number of + * last level cache. This number is used to calculate + * free memory bandwidth */ + if (resctrl->mb_info) { + mb_info = resctrl->mb_info; + if (level > mb_info->last_level_cache) { + mb_info->last_level_cache = level; + mb_info->max_id = 0; + } else if (mb_info->last_level_cache == level) { + mb_info->max_id++; + } + } + if (level >= resctrl->nlevels) return 0;
@@ -593,6 +709,9 @@ virResctrlAllocIsEmpty(virResctrlAllocPtr alloc) if (!alloc) return true;
+ if (alloc->mba) + return false; + for (i = 0; i < alloc->nlevels; i++) { virResctrlAllocPerLevelPtr a_level = alloc->levels[i];
@@ -786,6 +905,27 @@ virResctrlAllocSetSize(virResctrlAllocPtr alloc,
int +virResctrlAllocForeachMemory(virResctrlAllocPtr alloc, + virResctrlAllocForeachMemoryCallback cb, + void *opaque) +{ + unsigned int id = 0;
size_t i; since this is a for loop
+ + if (!alloc) + return 0; + + if (alloc->mba) { + virResctrlAllocMBPtr mba = alloc->mba; + for (id = 0; id < mba->nsizes; id++) + if (mba->bandwidth[id]) + cb(id, *mba->bandwidth[id], opaque); + } + + return 0; +} + + +int virResctrlAllocForeachCache(virResctrlAllocPtr alloc, virResctrlAllocForeachCacheCallback cb, void *opaque) @@ -848,6 +988,217 @@ virResctrlAllocGetID(virResctrlAllocPtr alloc) }
+static void +virResctrlMemoryBandwidthSubstract(virResctrlAllocPtr free, + virResctrlAllocPtr used)
Subtract (and don't forget to fix the alignment for arg2 when you do change the naming)
+{ + size_t i;
Nit: if (!used->MemBW) return; for (...)
+ + if (used->mba) { + for (i = 0; i < used->mba->nsizes; i++) { + if (used->mba->bandwidth[i]) + *(free->mba->bandwidth[i]) -= *(used->mba->bandwidth[i]); + } + } +} + + +int +virResctrlSetMemoryBandwidth(virResctrlAllocPtr alloc, + unsigned int id, + unsigned int memory_bandwidth) +{ + virResctrlAllocMBPtr mba = alloc->mba; + + if (!mba) { + if (VIR_ALLOC(mba) < 0) + return -1; + alloc->mba = mba; + } + + if (mba->nsizes <= id && + VIR_EXPAND_N(mba->bandwidth, mba->nsizes, + id - mba->nsizes + 1) < 0) + return -1; + + if (mba->bandwidth[id]) {
Compare != 0 since it's an unsigned int contained and not a bool or pointer (makes it clearer).
+ virReportError(VIR_ERR_XML_ERROR, + _("Collision Memory Bandwidth on node %d"),
Memory Bandwidth already defined for node %u (it's unsigned...)
+ id); + return -1; + } + + if (VIR_ALLOC(mba->bandwidth[id]) < 0) + return -1; + + *(mba->bandwidth[id]) = memory_bandwidth; + return 0; +} + + +static int +virResctrlAllocMemoryBandwidthFormat(virResctrlAllocPtr alloc, virBufferPtr buf)
One argument per line
+{ + int id;
size_t i; it's a for loop
+ + if (!alloc->mba) + return 0; + + virBufferAddLit(buf, "MB:");
MemBW
+ + for (id = 0; id < alloc->mba->nsizes; id++) { + if (alloc->mba->bandwidth[id]) { + virBufferAsprintf(buf, "%u=%u;", id,
"%zd=%u;" zd is the format used for size_t
+ *(alloc->mba->bandwidth[id])); + } + } + + virBufferTrim(buf, ";", 1); + virBufferAddChar(buf, '\n'); + virBufferCheckError(buf); + return 0; +} + + +static int +virResctrlAllocMemoryBandwidth(virResctrlInfoPtr resctrl, + virResctrlAllocPtr alloc, virResctrlAllocPtr free)
One argument per line
+{ + int id;
size_t i; It's a for loop
+ int ret = -1;
Just return 0 or -1 directly, don't bother this this since you're not going to a cleanup label it really doesn't matter.
+ virResctrlAllocMBPtr mb_alloc = alloc->mba; + virResctrlAllocMBPtr mb_free = free->mba; + virResctrlInfoMBPtr mb_info = resctrl->mb_info; + + if (!mb_alloc) { + ret = 0; + return ret; + }
Just return 0 - don't bother with the ret = 0;
+ + if (mb_alloc && !mb_info) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("RDT Memory Bandwidth allocation" + " unsupported")); + return ret; + } + + for (id = 0; id < mb_alloc->nsizes; id ++) {
s/ ++/++/ (no space on the autoincrement.
+ if (!mb_alloc->bandwidth[id]) + continue; + + if (*(mb_alloc->bandwidth[id]) % mb_info->bandwidth_granularity) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, + _("Memory Bandwidth allocation of size " + "%u is not divisible by granularity %u"), + *(mb_alloc->bandwidth[id]), + mb_info->bandwidth_granularity); + return ret; + } + if (*(mb_alloc->bandwidth[id]) < mb_info->min_bandwidth) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, + _("Memory Bandwidth allocation of size " + "%u is smaller than the minimum " + "allowed allocation %u"), + *(mb_alloc->bandwidth[id]), + mb_info->min_bandwidth); + return ret; + } + if (id > mb_info->max_id) {
Is @id the right comparison here, should it be the current *(mb_alloc->bandwidth[id])" value? @id was the VIR_EXPAND_N of the maximum size. In any case, a very strange comparison.
+ virReportError(VIR_ERR_CONFIG_UNSUPPORTED, + _("bandwidth controller %u not exist," + " max controller id %u"),
s/%u/%zd/ s/exist,/exist, / s/ max/max (IOW: Space on the first line)
+ id, mb_info->max_id); + return ret; + } + if (*(mb_alloc->bandwidth[id]) > *(mb_free->bandwidth[id])) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, + _("Not enough room for allocation of %u "
s/%u/%zd/
+ "bandwidth for node %u%%, freed %u%%"),
The error message is a bit strange - especially that "freed" part. I'm still not 100% clear about all that's going on, but I'm sure to come back to this in some future review.
+ id, *(mb_alloc->bandwidth[id]), + *(mb_free->bandwidth[id])); + return ret; + } + } + ret = 0; + return ret; +} + + +static int +virResctrlAllocParseMemoryBandwidthLine(virResctrlInfoPtr resctrl, + virResctrlAllocPtr alloc, + char *line) +{ + char **mbs = NULL; + char *tmp = NULL; + unsigned int bandwidth; + size_t nmb = 0;
s/nmb/nmbs/ (or numbws if you change @mbs to be mbws
+ unsigned int id; + size_t i; + int ret = -1; + + /* For no reason there can be spaces */ + virSkipSpaces((const char **) &line); + + if (STRNEQLEN(line, "MB", 2))
To follow the above ..."MemBW", 5...
+ return 0; + + if (!resctrl || !resctrl->mb_info + || !resctrl->mb_info->min_bandwidth + || !resctrl->mb_info->bandwidth_granularity) {
Put the || on the previous line
+ virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("Missing or inconsistent resctrl info for " + "memory bandwidth allocation")); + } + + if (!alloc->mba) { + if (VIR_ALLOC(alloc->mba) < 0) + return -1; + }
Hmmm, this is familiar...
+ + tmp = strchr(line, ':'); + if (!tmp) + return 0; + tmp++; + + mbs = virStringSplitCount(tmp, ";", 0, &nmb); + if (!nmb) + return 0;
Since @nmb is a size_t, make the explicit == 0 comparison. Alternatively, use "if (!mbs)"... What happens is Coverity gets grumpy thinking a 0 can be returned with something in @mbs, so it complains. Another way around it is "ret = 0; goto cleanup;" to trick it into thinking it's calling the Free function.
+ + for (i = 0; i < nmb; i++) { + tmp = strchr(mbs[i], '=');
strchr can return NULL, which would cause the following line all sorts of problems. Yes, I know how it was written, but better safe than sorry.
+ *tmp = '\0'; + tmp++; + + if (virStrToLong_uip(mbs[i], NULL, 10, &id) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("Invalid node id %u "), id); + goto cleanup; + } + if (virStrToLong_uip(tmp, NULL, 10, &bandwidth) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("Invalid bandwidth %u"), bandwidth); + goto cleanup; + }
Including the VIR_ALLOC above... here to ...
+ if (alloc->mba->nsizes <= id && + VIR_EXPAND_N(alloc->mba->bandwidth, alloc->mba->nsizes, + id - alloc->mba->nsizes + 1) < 0) { + goto cleanup; + } + if (!alloc->mba->bandwidth[id]) { + if (VIR_ALLOC(alloc->mba->bandwidth[id]) < 0) + goto cleanup; + } + + *(alloc->mba->bandwidth[id]) = bandwidth; + }
...here replicates virResctrlSetMemoryBandwidth (reuse, recycle, don't cause someone to change 2 places because they *will forget* one!)
+ ret = 0; + cleanup: + virStringListFree(mbs); + return ret; +} + + static int virResctrlAllocFormatCache(virResctrlAllocPtr alloc, virBufferPtr buf) { @@ -909,6 +1260,11 @@ virResctrlAllocFormat(virResctrlAllocPtr alloc) return NULL; }
+ if (virResctrlAllocMemoryBandwidthFormat(alloc, &buf) < 0) { + virBufferFreeAndReset(&buf); + return NULL; + } + virBufferCheckError(&buf); return virBufferContentAndReset(&buf); } @@ -1036,6 +1392,9 @@ virResctrlAllocParse(virResctrlInfoPtr resctrl,
lines = virStringSplitCount(schemata, "\n", 0, &nlines); for (i = 0; i < nlines; i++) { + if (virResctrlAllocParseMemoryBandwidthLine(resctrl, alloc, lines[i]) < 0) + goto cleanup; + if (virResctrlAllocParseCacheLine(resctrl, alloc, lines[i]) < 0) goto cleanup; } @@ -1170,6 +1529,22 @@ virResctrlAllocNewFromInfo(virResctrlInfoPtr info) } }
+ /* set default free memory bandwidth to 100%*/
Perhaps this answers my question above about why the %%, but I'm not 100% sure
+ if (info->mb_info) { + if (VIR_ALLOC(ret->mba) < 0) + goto error; + + if (VIR_EXPAND_N(ret->mba->bandwidth, ret->mba->nsizes, + info->mb_info->max_id - ret->mba->nsizes + 1) < 0)
This is a strange calculation especially since ret->mba->nsizes has to be zero because we just did a VIR_ALLOC(ret->mba).
+ goto error; + + for (i = 0; i < ret->mba->nsizes; i++) { + if (VIR_ALLOC(ret->mba->bandwidth[i]) < 0) + goto error; + *(ret->mba->bandwidth[i]) = 100;
I think I'm missing something subtle here. Maybe it's the "alloc" and "free" - not quite sure. Perhaps some relationship that the *Subtract method handles. There's virResctrlSetMemoryBandwidth which restricts the "mba->bandwidth[id]" with that collision message I noted, but perhaps this is just some internal thing/logic that I'll "get" the next time I review a patch series.
+ } + } + cleanup: virBitmapFree(mask); return ret; @@ -1233,6 +1608,7 @@ virResctrlAllocGetUnused(virResctrlInfoPtr resctrl) goto error; }
+ virResctrlMemoryBandwidthSubstract(ret, alloc);
Subtract
virResctrlAllocSubtract(ret, alloc); virObjectUnref(alloc); alloc = NULL; @@ -1444,6 +1820,9 @@ virResctrlAllocAssign(virResctrlInfoPtr resctrl, if (!alloc_default) goto cleanup;
+ if (virResctrlAllocMemoryBandwidth(resctrl, alloc, alloc_free) < 0) + goto cleanup; +
Now that this is added, the description for this function should be updated since it seems to be going beyond just size to mask now. John
if (virResctrlAllocCopyMasks(alloc, alloc_default) < 0) goto cleanup;
diff --git a/src/util/virresctrl.h b/src/util/virresctrl.h index 26c5f3b..5e78334 100644 --- a/src/util/virresctrl.h +++ b/src/util/virresctrl.h @@ -73,6 +73,10 @@ typedef int virResctrlAllocForeachCacheCallback(unsigned int level, unsigned long long size, void *opaque);
+typedef int virResctrlAllocForeachMemoryCallback(unsigned int id, + unsigned int size, + void *opaque); + virResctrlAllocPtr virResctrlAllocNew(void);
@@ -92,6 +96,16 @@ virResctrlAllocForeachCache(virResctrlAllocPtr alloc, void *opaque);
int +virResctrlAllocForeachMemory(virResctrlAllocPtr resctrl, + virResctrlAllocForeachMemoryCallback cb, + void *opaque); + +int +virResctrlSetMemoryBandwidth(virResctrlAllocPtr alloc, + unsigned int id, + unsigned int memory_bandwidth); + +int virResctrlAllocSetID(virResctrlAllocPtr alloc, const char *id); const char *

On 2018年06月30日 06:39, John Ferlan wrote:
On 06/15/2018 05:29 AM, bing.niu@intel.com wrote:
From: Bing Niu <bing.niu@intel.com>
Add memory bandwidth allocation support basing on existing
s/basing/based
Will fix.
virresctrl implementation. Two new structures virResctrlInfoMB ^^ "cache" (IOW: fit it in between)
OK, will change to "Add memory bandwidth allocation support based on existing virresctrl, so that virresctrl can fit cache allocation and memory bandwidth allocation concurrently"
and virResctrlAllocMB are introduced.
virResctrlInfoMB is used to record host system MBA supporting information, e.g., minimum bandwidth allowed, bandwidth granularity, number of bandwidth controller (same as number of last level cache).
virResctrlAllocMB is used for allocating memory bandwidth. Following virResctrlAllocPerType, it also employs a nested sparse array to indicate whether allocation is available for particular llc.
llc == ? [last level cache as I found later...]
Probably could have "split" this patch too in order to add each memory bandwidth and memory bandwidth info separately. Lots of stuff in this patch to keep track of.
Good idea. Will split virResctrlInfoMB and virResctrlAllocMB in next round.
Overall, the memory bandwidth allocation follows the same sequence as existing virresctrl cache allocation model. It exposes interfaces for probing host's memory bandwidth capability on initialization time and memory bandwidth allocation on runtime.
Signed-off-by: Bing Niu <bing.niu@intel.com> --- src/libvirt_private.syms | 2 + src/util/virresctrl.c | 383 ++++++++++++++++++++++++++++++++++++++++++++++- src/util/virresctrl.h | 14 ++ 3 files changed, 397 insertions(+), 2 deletions(-)
Rather than using "MB", could we use "MemBW" - I see MB and think megabytes. Maybe it's just me... Also rather than spelling out MemoryBandwidth sometimes, use the same shorthand to be consistent. Makes it easier to search for MemBW related functionality. Although someone else reviewing may have a different opinion - world's full of 'em ;-).
agree and will change to MemBW.
Beyond that, after reading subsequent patches it doesn't seem memory tunes are a related to cachetunes other than that both use resctrl API's in order to manipulate data.
If domain XML didn't have "cputune/cachetune", then does part of virResctrlAlloc not get used? I assume that'd be the @levels. Can both cachetunes and memtunes be supplied? I assume so, but figured I'd ask.
Since we already discussed in patch3. I will not reply here. :)
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index fc17059..6925062 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -2628,6 +2628,7 @@ virResctrlAllocAddPID; virResctrlAllocCreate; virResctrlAllocDeterminePath; virResctrlAllocForeachCache; +virResctrlAllocForeachMemory; virResctrlAllocFormat; virResctrlAllocGetID; virResctrlAllocGetUnused; @@ -2635,6 +2636,7 @@ virResctrlAllocNew; virResctrlAllocRemove; virResctrlAllocSetID; virResctrlAllocSetSize; +virResctrlSetMemoryBandwidth; virResctrlInfoGetCache; virResctrlInfoNew;
syntax-check would have told you this is out of order alphabetically
Unfortunately syntax-check didn't throw a error to me. And will fix to keep alphabetically.
diff --git a/src/util/virresctrl.c b/src/util/virresctrl.c index e62061f..de736b0 100644 --- a/src/util/virresctrl.c +++ b/src/util/virresctrl.c @@ -80,18 +80,21 @@ typedef virResctrlInfoPerType *virResctrlInfoPerTypePtr; typedef struct _virResctrlInfoPerLevel virResctrlInfoPerLevel; typedef virResctrlInfoPerLevel *virResctrlInfoPerLevelPtr;
+typedef struct _virResctrlInfoMB virResctrlInfoMB; +typedef virResctrlInfoMB *virResctrlInfoMBPtr; + typedef struct _virResctrlAllocPerType virResctrlAllocPerType; typedef virResctrlAllocPerType *virResctrlAllocPerTypePtr;
typedef struct _virResctrlAllocPerLevel virResctrlAllocPerLevel; typedef virResctrlAllocPerLevel *virResctrlAllocPerLevelPtr;
- +typedef struct _virResctrlAllocMB virResctrlAllocMB; +typedef virResctrlAllocMB *virResctrlAllocMBPtr; /* Class definitions and initializations */ static virClassPtr virResctrlInfoClass; static virClassPtr virResctrlAllocClass;
- /* virResctrlInfo */ struct _virResctrlInfoPerType { /* Kernel-provided information */ @@ -116,11 +119,29 @@ struct _virResctrlInfoPerLevel { virResctrlInfoPerTypePtr *types; };
+/* Information about memory bandwidth allocation */ +struct _virResctrlInfoMB { + /* minimum memory bandwidth allowed*/
s/d*/d */
Will fix.
+ unsigned int min_bandwidth; + /* bandwidth granularity */ + unsigned int bandwidth_granularity; + /* Maximum number of simultaneous allocations */ + unsigned int max_allocation; + /* level number of last level cache*/
s/e*/e */
Will fix.
+ unsigned int last_level_cache; + /* max id of last level cache, this is used to track + * how many last level cache available in host system + * */
Non standard comment format ("* */"
Will fix by remove that space.
+ unsigned int max_id; +}; + struct _virResctrlInfo { virObject parent;
virResctrlInfoPerLevelPtr *levels; size_t nlevels; + + virResctrlInfoMBPtr mb_info;
How about "MemBWInfo" or membw_info
OK. will change in next version.
};
@@ -146,6 +167,7 @@ virResctrlInfoDispose(void *obj) VIR_FREE(level); }
+ VIR_FREE(resctrl->mb_info); VIR_FREE(resctrl->levels); }
There's a really detailed description of virResctrlAlloc right about here that could use some of the details about the memory bandwidth included into it, especially since you're changing _virResctrlAlloc
Yes. I just notice that, I will add memory bandwidth part in the description of virResctrlAlloc.
@@ -202,12 +224,23 @@ struct _virResctrlAllocPerLevel { * VIR_CACHE_TYPE_LAST number of items */ };
+/* + * virResctrlAllocMB represents one memory bandwidth allocation. Since it can have + * multiple last level caches in a NUMA system, it is also represented as a nested + * sparse arrays as virRestrlAllocPerLevel + */ +struct _virResctrlAllocMB { + unsigned int **bandwidth;
This should be bandwidths
+ size_t nsizes;
This should be 'nbandwidths' as in the number of **bandwidths. Will fix above two.
+}; + struct _virResctrlAlloc { virObject parent;
virResctrlAllocPerLevelPtr *levels; size_t nlevels;
+ virResctrlAllocMBPtr mba;
s/mba/MemBW or MemBWAlloc
Also add a blank line for reading separation... I prefer to MemBW since there is no *Alloc* in levels. :) Will fix those two.
/* The identifier (any unique string for now) */ char *id; /* libvirt-generated path in /sys/fs/resctrl for this particular @@ -251,6 +284,13 @@ virResctrlAllocDispose(void *obj) VIR_FREE(level); }
+ if (alloc->mba) { + virResctrlAllocMBPtr mba = alloc->mba; + for (i = 0; i < mba->nsizes; i++) + VIR_FREE(mba->bandwidth[i]); + } + + VIR_FREE(alloc->mba); VIR_FREE(alloc->id); VIR_FREE(alloc->path); VIR_FREE(alloc->levels); @@ -440,6 +480,61 @@ virResctrlGetCacheInfo(virResctrlInfoPtr resctrl, DIR *dirp) }
+static int +virResctrlGetMemoryBandwidthInfo(virResctrlInfoPtr resctrl) +{ + int ret = -1; + int rv = -1; + virResctrlInfoMBPtr i_mb = NULL; + + /* query memory bandwidth allocation info */ + if (VIR_ALLOC(i_mb) < 0) + goto cleanup; + rv = virFileReadValueUint(&i_mb->bandwidth_granularity, + SYSFS_RESCTRL_PATH "/info/MB/bandwidth_gran"); + if (rv == -2) { + /* The file doesn't exist, so it's unusable for us, + * properly mba unsupported */ + VIR_WARN("The path '" SYSFS_RESCTRL_PATH "/info/MB/bandwidth_gran'" + "does not exist"); + ret = 0; + goto cleanup; + } else if (rv < 0) { + /* Other failures are fatal, so just quit */ + goto cleanup; + } + + rv = virFileReadValueUint(&i_mb->min_bandwidth, + SYSFS_RESCTRL_PATH "/info/MB/min_bandwidth"); + if (rv == -2) { + /* If the previous file exists, so should this one. Hence -2 is + * fatal in this case as well (errors out in next condition) - the + * kernel interface might've changed too much or something else is + * wrong. */ + + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("Cannot get min bandwidth from resctrl cache info")); + } + if (rv < 0) + goto cleanup; + + rv = virFileReadValueUint(&i_mb->max_allocation, + SYSFS_RESCTRL_PATH "/info/MB/num_closids"); + if (rv == -2) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("Cannot get max allocation from resctrl cache info")); + } + if (rv < 0) + goto cleanup; + + VIR_STEAL_PTR(resctrl->mb_info, i_mb); + ret = 0; + cleanup: + VIR_FREE(i_mb); + return ret; +} + + /* virResctrlInfo-related definitions */ static int virResctrlGetInfo(virResctrlInfoPtr resctrl) @@ -451,6 +546,10 @@ virResctrlGetInfo(virResctrlInfoPtr resctrl) if (ret <= 0) goto cleanup;
+ ret = virResctrlGetMemoryBandwidthInfo(resctrl); + if (ret < 0) + goto cleanup; + ret = virResctrlGetCacheInfo(resctrl, dirp); if (ret < 0) goto cleanup; @@ -492,6 +591,9 @@ virResctrlInfoIsEmpty(virResctrlInfoPtr resctrl) if (!resctrl) return true;
+ if (resctrl->mb_info) + return false; + for (i = 0; i < resctrl->nlevels; i++) { virResctrlInfoPerLevelPtr i_level = resctrl->levels[i];
@@ -517,12 +619,26 @@ virResctrlInfoGetCache(virResctrlInfoPtr resctrl, { virResctrlInfoPerLevelPtr i_level = NULL; virResctrlInfoPerTypePtr i_type = NULL; + virResctrlInfoMBPtr mb_info = NULL; size_t i = 0; int ret = -1;
if (virResctrlInfoIsEmpty(resctrl)) return 0;
+ /* Let's take the opportunity to update the number of + * last level cache. This number is used to calculate + * free memory bandwidth */ + if (resctrl->mb_info) { + mb_info = resctrl->mb_info; + if (level > mb_info->last_level_cache) { + mb_info->last_level_cache = level; + mb_info->max_id = 0; + } else if (mb_info->last_level_cache == level) { + mb_info->max_id++; + } + } + if (level >= resctrl->nlevels) return 0;
@@ -593,6 +709,9 @@ virResctrlAllocIsEmpty(virResctrlAllocPtr alloc) if (!alloc) return true;
+ if (alloc->mba) + return false; + for (i = 0; i < alloc->nlevels; i++) { virResctrlAllocPerLevelPtr a_level = alloc->levels[i];
@@ -786,6 +905,27 @@ virResctrlAllocSetSize(virResctrlAllocPtr alloc,
int +virResctrlAllocForeachMemory(virResctrlAllocPtr alloc, + virResctrlAllocForeachMemoryCallback cb, + void *opaque) +{ + unsigned int id = 0;
size_t i; since this is a for loop OK!
+ + if (!alloc) + return 0; + + if (alloc->mba) { + virResctrlAllocMBPtr mba = alloc->mba; + for (id = 0; id < mba->nsizes; id++) + if (mba->bandwidth[id]) + cb(id, *mba->bandwidth[id], opaque); + } + + return 0; +} + + +int virResctrlAllocForeachCache(virResctrlAllocPtr alloc, virResctrlAllocForeachCacheCallback cb, void *opaque) @@ -848,6 +988,217 @@ virResctrlAllocGetID(virResctrlAllocPtr alloc) }
+static void +virResctrlMemoryBandwidthSubstract(virResctrlAllocPtr free, + virResctrlAllocPtr used)
Subtract (and don't forget to fix the alignment for arg2 when you do change the naming)
+{ + size_t i;
Nit:
if (!used->MemBW) return;
for (...) Okay, Will change above two.
+ + if (used->mba) { + for (i = 0; i < used->mba->nsizes; i++) { + if (used->mba->bandwidth[i]) + *(free->mba->bandwidth[i]) -= *(used->mba->bandwidth[i]); + } + } +} + + +int +virResctrlSetMemoryBandwidth(virResctrlAllocPtr alloc, + unsigned int id, + unsigned int memory_bandwidth) +{ + virResctrlAllocMBPtr mba = alloc->mba; + + if (!mba) { + if (VIR_ALLOC(mba) < 0) + return -1; + alloc->mba = mba; + } + + if (mba->nsizes <= id && + VIR_EXPAND_N(mba->bandwidth, mba->nsizes, + id - mba->nsizes + 1) < 0) + return -1; + + if (mba->bandwidth[id]) {
Compare != 0 since it's an unsigned int contained and not a bool or pointer (makes it clearer). OK.
+ virReportError(VIR_ERR_XML_ERROR, + _("Collision Memory Bandwidth on node %d"),
Memory Bandwidth already defined for node %u
(it's unsigned...) Will fix in next version.
+ id); + return -1; + } + + if (VIR_ALLOC(mba->bandwidth[id]) < 0) + return -1; + + *(mba->bandwidth[id]) = memory_bandwidth; + return 0; +} + + +static int +virResctrlAllocMemoryBandwidthFormat(virResctrlAllocPtr alloc, virBufferPtr buf)
One argument per line again. I am not sure whether this is a "max 80 character in one line" rule in libvirt. :). Anyway I will use one new line for new argument.
+{ + int id;
size_t i; it's a for loop
+ + if (!alloc->mba) + return 0; + + virBufferAddLit(buf, "MB:");
MemBW
+ + for (id = 0; id < alloc->mba->nsizes; id++) { + if (alloc->mba->bandwidth[id]) { + virBufferAsprintf(buf, "%u=%u;", id,
"%zd=%u;"
zd is the format used for size_t
+ *(alloc->mba->bandwidth[id]));
+ } + } + + virBufferTrim(buf, ";", 1); + virBufferAddChar(buf, '\n'); + virBufferCheckError(buf); + return 0; +} + + +static int +virResctrlAllocMemoryBandwidth(virResctrlInfoPtr resctrl, + virResctrlAllocPtr alloc, virResctrlAllocPtr free)
One argument per line OK!
+{ + int id;
size_t i; It's a for loop
+ int ret = -1;
Just return 0 or -1 directly, don't bother this this since you're not going to a cleanup label it really doesn't matter.
+ virResctrlAllocMBPtr mb_alloc = alloc->mba; + virResctrlAllocMBPtr mb_free = free->mba; + virResctrlInfoMBPtr mb_info = resctrl->mb_info; + + if (!mb_alloc) { + ret = 0; + return ret; + }
Just return 0 - don't bother with the ret = 0; Will fix above two.
+ + if (mb_alloc && !mb_info) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("RDT Memory Bandwidth allocation" + " unsupported")); + return ret; + } + + for (id = 0; id < mb_alloc->nsizes; id ++) {
s/ ++/++/
(no space on the autoincrement. My bad. and will fix
+ if (!mb_alloc->bandwidth[id]) + continue; + + if (*(mb_alloc->bandwidth[id]) % mb_info->bandwidth_granularity) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, + _("Memory Bandwidth allocation of size " + "%u is not divisible by granularity %u"), + *(mb_alloc->bandwidth[id]), + mb_info->bandwidth_granularity); + return ret; + } + if (*(mb_alloc->bandwidth[id]) < mb_info->min_bandwidth) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, + _("Memory Bandwidth allocation of size " + "%u is smaller than the minimum " + "allowed allocation %u"), + *(mb_alloc->bandwidth[id]), + mb_info->min_bandwidth); + return ret; + } + if (id > mb_info->max_id) {
Is @id the right comparison here, should it be the current *(mb_alloc->bandwidth[id])" value? @id was the VIR_EXPAND_N of the maximum size. In any case, a very strange comparison. Here id is the node of the current memory bandwidth happen. It should be not larger than the max node number which is populated during traverse host cache hierarchy.
+ virReportError(VIR_ERR_CONFIG_UNSUPPORTED, + _("bandwidth controller %u not exist," + " max controller id %u"),
s/%u/%zd/ s/exist,/exist, / s/ max/max
(IOW: Space on the first line) OK!
+ id, mb_info->max_id); + return ret; + } + if (*(mb_alloc->bandwidth[id]) > *(mb_free->bandwidth[id])) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, + _("Not enough room for allocation of %u "
s/%u/%zd/
+ "bandwidth for node %u%%, freed %u%%"),
The error message is a bit strange - especially that "freed" part. I'm still not 100% clear about all that's going on, but I'm sure to come back to this in some future review. This error message is to tell the current requested memory bandwidth and current free memory bandwidth. If requesting 60% bandwidth, however
Will fix above three. there is a domain already allocating 50% bandwidth. Above error will fire.
+ id, *(mb_alloc->bandwidth[id]), + *(mb_free->bandwidth[id])); + return ret; + } + } + ret = 0; + return ret; +} + + +static int +virResctrlAllocParseMemoryBandwidthLine(virResctrlInfoPtr resctrl, + virResctrlAllocPtr alloc, + char *line) +{ + char **mbs = NULL; + char *tmp = NULL; + unsigned int bandwidth; + size_t nmb = 0;
s/nmb/nmbs/ (or numbws if you change @mbs to be mbws
OK
+ unsigned int id; + size_t i; + int ret = -1; + + /* For no reason there can be spaces */ + virSkipSpaces((const char **) &line); + + if (STRNEQLEN(line, "MB", 2))
To follow the above ..."MemBW", 5...
No, this is to parse the memory bandwidth information of existing resctrl groups. Above "MB" is the format return by RDT kernel module. Not somthing define by us.
+ return 0; + + if (!resctrl || !resctrl->mb_info + || !resctrl->mb_info->min_bandwidth + || !resctrl->mb_info->bandwidth_granularity) {
Put the || on the previous line
OK!
+ virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("Missing or inconsistent resctrl info for " + "memory bandwidth allocation")); + } + + if (!alloc->mba) { + if (VIR_ALLOC(alloc->mba) < 0) + return -1; + }
Hmmm, this is familiar...
+ + tmp = strchr(line, ':'); + if (!tmp) + return 0; + tmp++; + + mbs = virStringSplitCount(tmp, ";", 0, &nmb); + if (!nmb) + return 0;
Since @nmb is a size_t, make the explicit == 0 comparison. Alternatively, use "if (!mbs)"... What happens is Coverity gets grumpy thinking a 0 can be returned with something in @mbs, so it complains. Another way around it is "ret = 0; goto cleanup;" to trick it into thinking it's calling the Free function.
OK I will change to "if (nmb != 0)"
+ + for (i = 0; i < nmb; i++) { + tmp = strchr(mbs[i], '=');
strchr can return NULL, which would cause the following line all sorts of problems. Yes, I know how it was written, but better safe than sorry.
OK I will add a NULL pointer testing here
+ *tmp = '\0'; + tmp++; + + if (virStrToLong_uip(mbs[i], NULL, 10, &id) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("Invalid node id %u "), id); + goto cleanup; + } + if (virStrToLong_uip(tmp, NULL, 10, &bandwidth) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("Invalid bandwidth %u"), bandwidth); + goto cleanup; + }
Including the VIR_ALLOC above... here to ...
+ if (alloc->mba->nsizes <= id && + VIR_EXPAND_N(alloc->mba->bandwidth, alloc->mba->nsizes, + id - alloc->mba->nsizes + 1) < 0) { + goto cleanup; + } + if (!alloc->mba->bandwidth[id]) { + if (VIR_ALLOC(alloc->mba->bandwidth[id]) < 0) + goto cleanup; + } + + *(alloc->mba->bandwidth[id]) = bandwidth; + }
...here replicates virResctrlSetMemoryBandwidth (reuse, recycle, don't cause someone to change 2 places because they *will forget* one!)
Well, this function is used to collect the exsiting resctrl groups allocating information to calculate the free memory bandwidth. This virResctrlAllocPtr will be disposed as soon as we finish calculating or error happen. And virResctrlSetMemoryBandwidth is used by domain_conf set memory bandwidth from XML. Those two virResctrlAllocPtrs are unrelative. So I think it's safe.
+ ret = 0; + cleanup: + virStringListFree(mbs); + return ret; +} + + static int virResctrlAllocFormatCache(virResctrlAllocPtr alloc, virBufferPtr buf) { @@ -909,6 +1260,11 @@ virResctrlAllocFormat(virResctrlAllocPtr alloc) return NULL; }
+ if (virResctrlAllocMemoryBandwidthFormat(alloc, &buf) < 0) { + virBufferFreeAndReset(&buf); + return NULL; + } + virBufferCheckError(&buf); return virBufferContentAndReset(&buf); } @@ -1036,6 +1392,9 @@ virResctrlAllocParse(virResctrlInfoPtr resctrl,
lines = virStringSplitCount(schemata, "\n", 0, &nlines); for (i = 0; i < nlines; i++) { + if (virResctrlAllocParseMemoryBandwidthLine(resctrl, alloc, lines[i]) < 0) + goto cleanup; + if (virResctrlAllocParseCacheLine(resctrl, alloc, lines[i]) < 0) goto cleanup; } @@ -1170,6 +1529,22 @@ virResctrlAllocNewFromInfo(virResctrlInfoPtr info) } }
+ /* set default free memory bandwidth to 100%*/
Perhaps this answers my question above about why the %%, but I'm not 100% sure
+ if (info->mb_info) { + if (VIR_ALLOC(ret->mba) < 0) + goto error; + + if (VIR_EXPAND_N(ret->mba->bandwidth, ret->mba->nsizes, + info->mb_info->max_id - ret->mba->nsizes + 1) < 0)
This is a strange calculation especially since ret->mba->nsizes has to be zero because we just did a VIR_ALLOC(ret->mba).
I should change to if (VIR_EXPAND_N(ret->mba->bandwidth, ret->mba->nsizes, info->mb_info->max_id + 1) < 0) This is used to initialize virResctrlAllocPtr basing on virResctrlInfoPtr.
+ goto error; + + for (i = 0; i < ret->mba->nsizes; i++) { + if (VIR_ALLOC(ret->mba->bandwidth[i]) < 0) + goto error; + *(ret->mba->bandwidth[i]) = 100;
I think I'm missing something subtle here. Maybe it's the "alloc" and "free" - not quite sure. Perhaps some relationship that the *Subtract method handles.
There's virResctrlSetMemoryBandwidth which restricts the "mba->bandwidth[id]" with that collision message I noted, but perhaps this is just some internal thing/logic that I'll "get" the next time I review a patch series.
+ } + } + cleanup: virBitmapFree(mask); return ret; @@ -1233,6 +1608,7 @@ virResctrlAllocGetUnused(virResctrlInfoPtr resctrl) goto error; }
+ virResctrlMemoryBandwidthSubstract(ret, alloc);
Subtract
OK
virResctrlAllocSubtract(ret, alloc); virObjectUnref(alloc); alloc = NULL; @@ -1444,6 +1820,9 @@ virResctrlAllocAssign(virResctrlInfoPtr resctrl, if (!alloc_default) goto cleanup;
+ if (virResctrlAllocMemoryBandwidth(resctrl, alloc, alloc_free) < 0) + goto cleanup; +
Now that this is added, the description for this function should be updated since it seems to be going beyond just size to mask now.
Yep! I will update function description in next version. Bing
John
if (virResctrlAllocCopyMasks(alloc, alloc_default) < 0) goto cleanup;
diff --git a/src/util/virresctrl.h b/src/util/virresctrl.h index 26c5f3b..5e78334 100644 --- a/src/util/virresctrl.h +++ b/src/util/virresctrl.h @@ -73,6 +73,10 @@ typedef int virResctrlAllocForeachCacheCallback(unsigned int level, unsigned long long size, void *opaque);
+typedef int virResctrlAllocForeachMemoryCallback(unsigned int id, + unsigned int size, + void *opaque); + virResctrlAllocPtr virResctrlAllocNew(void);
@@ -92,6 +96,16 @@ virResctrlAllocForeachCache(virResctrlAllocPtr alloc, void *opaque);
int +virResctrlAllocForeachMemory(virResctrlAllocPtr resctrl, + virResctrlAllocForeachMemoryCallback cb, + void *opaque); + +int +virResctrlSetMemoryBandwidth(virResctrlAllocPtr alloc, + unsigned int id, + unsigned int memory_bandwidth); + +int virResctrlAllocSetID(virResctrlAllocPtr alloc, const char *id); const char *

From: Bing Niu <bing.niu@intel.com> resctrl not only supports cache tuning, but also memory bandwidth tuning. Rename cachetune to restune(resource tuning) to reflect that. Signed-off-by: Bing Niu <bing.niu@intel.com> --- src/conf/domain_conf.c | 44 ++++++++++++++++++++++---------------------- src/conf/domain_conf.h | 10 +++++----- src/qemu/qemu_process.c | 18 +++++++++--------- 3 files changed, 36 insertions(+), 36 deletions(-) diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 62c412e..b3543f3 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -2950,14 +2950,14 @@ virDomainLoaderDefFree(virDomainLoaderDefPtr loader) static void -virDomainCachetuneDefFree(virDomainCachetuneDefPtr cachetune) +virDomainRestuneDefFree(virDomainRestuneDefPtr restune) { - if (!cachetune) + if (!restune) return; - virObjectUnref(cachetune->alloc); - virBitmapFree(cachetune->vcpus); - VIR_FREE(cachetune); + virObjectUnref(restune->alloc); + virBitmapFree(restune->vcpus); + VIR_FREE(restune); } @@ -3147,9 +3147,9 @@ void virDomainDefFree(virDomainDefPtr def) virDomainShmemDefFree(def->shmems[i]); VIR_FREE(def->shmems); - for (i = 0; i < def->ncachetunes; i++) - virDomainCachetuneDefFree(def->cachetunes[i]); - VIR_FREE(def->cachetunes); + for (i = 0; i < def->nrestunes; i++) + virDomainRestuneDefFree(def->restunes[i]); + VIR_FREE(def->restunes); VIR_FREE(def->keywrap); @@ -18971,7 +18971,7 @@ virDomainCachetuneDefParse(virDomainDefPtr def, xmlNodePtr *nodes = NULL; virBitmapPtr vcpus = NULL; virResctrlAllocPtr alloc = virResctrlAllocNew(); - virDomainCachetuneDefPtr tmp_cachetune = NULL; + virDomainRestuneDefPtr tmp_restune = NULL; char *tmp = NULL; char *vcpus_str = NULL; char *alloc_id = NULL; @@ -18984,7 +18984,7 @@ virDomainCachetuneDefParse(virDomainDefPtr def, if (!alloc) goto cleanup; - if (VIR_ALLOC(tmp_cachetune) < 0) + if (VIR_ALLOC(tmp_restune) < 0) goto cleanup; vcpus_str = virXMLPropString(node, "vcpus"); @@ -19025,8 +19025,8 @@ virDomainCachetuneDefParse(virDomainDefPtr def, goto cleanup; } - for (i = 0; i < def->ncachetunes; i++) { - if (virBitmapOverlaps(def->cachetunes[i]->vcpus, vcpus)) { + for (i = 0; i < def->nrestunes; i++) { + if (virBitmapOverlaps(def->restunes[i]->vcpus, vcpus)) { virReportError(VIR_ERR_XML_ERROR, "%s", _("Overlapping vcpus in cachetunes")); goto cleanup; @@ -19056,16 +19056,16 @@ virDomainCachetuneDefParse(virDomainDefPtr def, if (virResctrlAllocSetID(alloc, alloc_id) < 0) goto cleanup; - VIR_STEAL_PTR(tmp_cachetune->vcpus, vcpus); - VIR_STEAL_PTR(tmp_cachetune->alloc, alloc); + VIR_STEAL_PTR(tmp_restune->vcpus, vcpus); + VIR_STEAL_PTR(tmp_restune->alloc, alloc); - if (VIR_APPEND_ELEMENT(def->cachetunes, def->ncachetunes, tmp_cachetune) < 0) + if (VIR_APPEND_ELEMENT(def->restunes, def->nrestunes, tmp_restune) < 0) goto cleanup; ret = 0; cleanup: ctxt->node = oldnode; - virDomainCachetuneDefFree(tmp_cachetune); + virDomainRestuneDefFree(tmp_restune); virObjectUnref(alloc); virBitmapFree(vcpus); VIR_FREE(alloc_id); @@ -26874,7 +26874,7 @@ virDomainCachetuneDefFormatHelper(unsigned int level, static int virDomainCachetuneDefFormat(virBufferPtr buf, - virDomainCachetuneDefPtr cachetune, + virDomainRestuneDefPtr restune, unsigned int flags) { virBuffer childrenBuf = VIR_BUFFER_INITIALIZER; @@ -26882,7 +26882,7 @@ virDomainCachetuneDefFormat(virBufferPtr buf, int ret = -1; virBufferSetChildIndent(&childrenBuf, buf); - virResctrlAllocForeachCache(cachetune->alloc, + virResctrlAllocForeachCache(restune->alloc, virDomainCachetuneDefFormatHelper, &childrenBuf); @@ -26895,14 +26895,14 @@ virDomainCachetuneDefFormat(virBufferPtr buf, goto cleanup; } - vcpus = virBitmapFormat(cachetune->vcpus); + vcpus = virBitmapFormat(restune->vcpus); if (!vcpus) goto cleanup; virBufferAsprintf(buf, "<cachetune vcpus='%s'", vcpus); if (!(flags & VIR_DOMAIN_DEF_FORMAT_INACTIVE)) { - const char *alloc_id = virResctrlAllocGetID(cachetune->alloc); + const char *alloc_id = virResctrlAllocGetID(restune->alloc); if (!alloc_id) goto cleanup; @@ -27023,8 +27023,8 @@ virDomainCputuneDefFormat(virBufferPtr buf, def->iothreadids[i]->iothread_id); } - for (i = 0; i < def->ncachetunes; i++) - virDomainCachetuneDefFormat(&childrenBuf, def->cachetunes[i], flags); + for (i = 0; i < def->nrestunes; i++) + virDomainCachetuneDefFormat(&childrenBuf, def->restunes[i], flags); if (virBufferCheckError(&childrenBuf) < 0) return -1; diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index 6344c02..9e71a0b 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -2228,10 +2228,10 @@ struct _virDomainCputune { }; -typedef struct _virDomainCachetuneDef virDomainCachetuneDef; -typedef virDomainCachetuneDef *virDomainCachetuneDefPtr; +typedef struct _virDomainRestuneDef virDomainRestuneDef; +typedef virDomainRestuneDef *virDomainRestuneDefPtr; -struct _virDomainCachetuneDef { +struct _virDomainRestuneDef { virBitmapPtr vcpus; virResctrlAllocPtr alloc; }; @@ -2409,8 +2409,8 @@ struct _virDomainDef { virDomainCputune cputune; - virDomainCachetuneDefPtr *cachetunes; - size_t ncachetunes; + virDomainRestuneDefPtr *restunes; + size_t nrestunes; virDomainNumaPtr numa; virDomainResourceDefPtr resource; diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 93fd6ba..0659c06 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -2447,7 +2447,7 @@ qemuProcessResctrlCreate(virQEMUDriverPtr driver, virCapsPtr caps = NULL; qemuDomainObjPrivatePtr priv = vm->privateData; - if (!vm->def->ncachetunes) + if (!vm->def->nrestunes) return 0; /* Force capability refresh since resctrl info can change @@ -2456,9 +2456,9 @@ qemuProcessResctrlCreate(virQEMUDriverPtr driver, if (!caps) return -1; - for (i = 0; i < vm->def->ncachetunes; i++) { + for (i = 0; i < vm->def->nrestunes; i++) { if (virResctrlAllocCreate(caps->host.resctrl, - vm->def->cachetunes[i]->alloc, + vm->def->restunes[i]->alloc, priv->machineName) < 0) goto cleanup; } @@ -5259,8 +5259,8 @@ qemuProcessSetupVcpu(virDomainObjPtr vm, &vcpu->sched) < 0) return -1; - for (i = 0; i < vm->def->ncachetunes; i++) { - virDomainCachetuneDefPtr ct = vm->def->cachetunes[i]; + for (i = 0; i < vm->def->nrestunes; i++) { + virDomainRestuneDefPtr ct = vm->def->restunes[i]; if (virBitmapIsBitSet(ct->vcpus, vcpuid)) { if (virResctrlAllocAddPID(ct->alloc, vcpupid) < 0) @@ -6955,8 +6955,8 @@ void qemuProcessStop(virQEMUDriverPtr driver, /* Remove resctrl allocation after cgroups are cleaned up which makes it * kind of safer (although removing the allocation should work even with * pids in tasks file */ - for (i = 0; i < vm->def->ncachetunes; i++) - virResctrlAllocRemove(vm->def->cachetunes[i]->alloc); + for (i = 0; i < vm->def->nrestunes; i++) + virResctrlAllocRemove(vm->def->restunes[i]->alloc); qemuProcessRemoveDomainStatus(driver, vm); @@ -7676,8 +7676,8 @@ qemuProcessReconnect(void *opaque) if (qemuConnectAgent(driver, obj) < 0) goto error; - for (i = 0; i < obj->def->ncachetunes; i++) { - if (virResctrlAllocDeterminePath(obj->def->cachetunes[i]->alloc, + for (i = 0; i < obj->def->nrestunes; i++) { + if (virResctrlAllocDeterminePath(obj->def->restunes[i]->alloc, priv->machineName) < 0) goto error; } -- 2.7.4

On 06/15/2018 05:29 AM, bing.niu@intel.com wrote:
From: Bing Niu <bing.niu@intel.com>
resctrl not only supports cache tuning, but also memory bandwidth tuning. Rename cachetune to restune(resource tuning) to reflect that.
Signed-off-by: Bing Niu <bing.niu@intel.com> --- src/conf/domain_conf.c | 44 ++++++++++++++++++++++---------------------- src/conf/domain_conf.h | 10 +++++----- src/qemu/qemu_process.c | 18 +++++++++--------- 3 files changed, 36 insertions(+), 36 deletions(-)
Not so sure I'm liking [R|r]estune[s]... Still @cachetunes is an array of a structure that contains a vCPU bitmap and a virResctrlAllocPtr for the /cputune/cachetunes data. The virResctrlAllocPtr was changed to add a the bandwidth allocation, so does this really have to change? I think the question is - if both cachetunes and memtunes exist in domain XML, then for which does the @vcpus relate to. That is, from the cover there's: <memorytune vcpus='0-1'> <node id='0' bandwidth='20'/> </memorytune> and then there's a <cachetune vcpus='0-3'> <cache id='0' level='3' type='both' size='3' unit='MiB'/> <cache id='1' level='3' type='both' size='3' unit='MiB'/> </cachetune> Now what? I haven't looked at patch 4 yet to even know how this "works". I think what you want to do is create a virDomainMemtuneDef that mimics virDomainCachetuneDef, but I'm not 100% sure. Of course that still leaves me wondering how much of virResctrlAllocPtr then becomes wasted. You chose an integration point, but essentially have separate technologies. I'm trying to make sense of it all and am starting to get lost in the process of doing that. If @bandwidth can not be > 100... Can we duplicate the @id somehow? So how does that free buffer that got 100% and then gets subtracted from really work in this model? I probably need to study the code some more, but it's not clear it requires using the "same" sort of logic that cachetune uses where there could be different types (instructs, data, or both) that would be drawing from the size, right? I think that's how that worked. So what's the corollary for memtunes? You set a range from 0 to 100, right and that's it. OK, too many questions and it's late so I'll stop here... Hopefully I didn't leave any stray thoughts in the review of patch 1 or 2. John
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 62c412e..b3543f3 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -2950,14 +2950,14 @@ virDomainLoaderDefFree(virDomainLoaderDefPtr loader)
static void -virDomainCachetuneDefFree(virDomainCachetuneDefPtr cachetune) +virDomainRestuneDefFree(virDomainRestuneDefPtr restune) { - if (!cachetune) + if (!restune) return;
- virObjectUnref(cachetune->alloc); - virBitmapFree(cachetune->vcpus); - VIR_FREE(cachetune); + virObjectUnref(restune->alloc); + virBitmapFree(restune->vcpus); + VIR_FREE(restune); }
@@ -3147,9 +3147,9 @@ void virDomainDefFree(virDomainDefPtr def) virDomainShmemDefFree(def->shmems[i]); VIR_FREE(def->shmems);
- for (i = 0; i < def->ncachetunes; i++) - virDomainCachetuneDefFree(def->cachetunes[i]); - VIR_FREE(def->cachetunes); + for (i = 0; i < def->nrestunes; i++) + virDomainRestuneDefFree(def->restunes[i]); + VIR_FREE(def->restunes);
VIR_FREE(def->keywrap);
@@ -18971,7 +18971,7 @@ virDomainCachetuneDefParse(virDomainDefPtr def, xmlNodePtr *nodes = NULL; virBitmapPtr vcpus = NULL; virResctrlAllocPtr alloc = virResctrlAllocNew(); - virDomainCachetuneDefPtr tmp_cachetune = NULL; + virDomainRestuneDefPtr tmp_restune = NULL; char *tmp = NULL; char *vcpus_str = NULL; char *alloc_id = NULL; @@ -18984,7 +18984,7 @@ virDomainCachetuneDefParse(virDomainDefPtr def, if (!alloc) goto cleanup;
- if (VIR_ALLOC(tmp_cachetune) < 0) + if (VIR_ALLOC(tmp_restune) < 0) goto cleanup;
vcpus_str = virXMLPropString(node, "vcpus"); @@ -19025,8 +19025,8 @@ virDomainCachetuneDefParse(virDomainDefPtr def, goto cleanup; }
- for (i = 0; i < def->ncachetunes; i++) { - if (virBitmapOverlaps(def->cachetunes[i]->vcpus, vcpus)) { + for (i = 0; i < def->nrestunes; i++) { + if (virBitmapOverlaps(def->restunes[i]->vcpus, vcpus)) { virReportError(VIR_ERR_XML_ERROR, "%s", _("Overlapping vcpus in cachetunes")); goto cleanup; @@ -19056,16 +19056,16 @@ virDomainCachetuneDefParse(virDomainDefPtr def, if (virResctrlAllocSetID(alloc, alloc_id) < 0) goto cleanup;
- VIR_STEAL_PTR(tmp_cachetune->vcpus, vcpus); - VIR_STEAL_PTR(tmp_cachetune->alloc, alloc); + VIR_STEAL_PTR(tmp_restune->vcpus, vcpus); + VIR_STEAL_PTR(tmp_restune->alloc, alloc);
- if (VIR_APPEND_ELEMENT(def->cachetunes, def->ncachetunes, tmp_cachetune) < 0) + if (VIR_APPEND_ELEMENT(def->restunes, def->nrestunes, tmp_restune) < 0) goto cleanup;
ret = 0; cleanup: ctxt->node = oldnode; - virDomainCachetuneDefFree(tmp_cachetune); + virDomainRestuneDefFree(tmp_restune); virObjectUnref(alloc); virBitmapFree(vcpus); VIR_FREE(alloc_id); @@ -26874,7 +26874,7 @@ virDomainCachetuneDefFormatHelper(unsigned int level,
static int virDomainCachetuneDefFormat(virBufferPtr buf, - virDomainCachetuneDefPtr cachetune, + virDomainRestuneDefPtr restune, unsigned int flags) { virBuffer childrenBuf = VIR_BUFFER_INITIALIZER; @@ -26882,7 +26882,7 @@ virDomainCachetuneDefFormat(virBufferPtr buf, int ret = -1;
virBufferSetChildIndent(&childrenBuf, buf); - virResctrlAllocForeachCache(cachetune->alloc, + virResctrlAllocForeachCache(restune->alloc, virDomainCachetuneDefFormatHelper, &childrenBuf);
@@ -26895,14 +26895,14 @@ virDomainCachetuneDefFormat(virBufferPtr buf, goto cleanup; }
- vcpus = virBitmapFormat(cachetune->vcpus); + vcpus = virBitmapFormat(restune->vcpus); if (!vcpus) goto cleanup;
virBufferAsprintf(buf, "<cachetune vcpus='%s'", vcpus);
if (!(flags & VIR_DOMAIN_DEF_FORMAT_INACTIVE)) { - const char *alloc_id = virResctrlAllocGetID(cachetune->alloc); + const char *alloc_id = virResctrlAllocGetID(restune->alloc); if (!alloc_id) goto cleanup;
@@ -27023,8 +27023,8 @@ virDomainCputuneDefFormat(virBufferPtr buf, def->iothreadids[i]->iothread_id); }
- for (i = 0; i < def->ncachetunes; i++) - virDomainCachetuneDefFormat(&childrenBuf, def->cachetunes[i], flags); + for (i = 0; i < def->nrestunes; i++) + virDomainCachetuneDefFormat(&childrenBuf, def->restunes[i], flags);
if (virBufferCheckError(&childrenBuf) < 0) return -1; diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index 6344c02..9e71a0b 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -2228,10 +2228,10 @@ struct _virDomainCputune { };
-typedef struct _virDomainCachetuneDef virDomainCachetuneDef; -typedef virDomainCachetuneDef *virDomainCachetuneDefPtr; +typedef struct _virDomainRestuneDef virDomainRestuneDef; +typedef virDomainRestuneDef *virDomainRestuneDefPtr;
-struct _virDomainCachetuneDef { +struct _virDomainRestuneDef { virBitmapPtr vcpus; virResctrlAllocPtr alloc; }; @@ -2409,8 +2409,8 @@ struct _virDomainDef {
virDomainCputune cputune;
- virDomainCachetuneDefPtr *cachetunes; - size_t ncachetunes; + virDomainRestuneDefPtr *restunes; + size_t nrestunes;
virDomainNumaPtr numa; virDomainResourceDefPtr resource; diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 93fd6ba..0659c06 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -2447,7 +2447,7 @@ qemuProcessResctrlCreate(virQEMUDriverPtr driver, virCapsPtr caps = NULL; qemuDomainObjPrivatePtr priv = vm->privateData;
- if (!vm->def->ncachetunes) + if (!vm->def->nrestunes) return 0;
/* Force capability refresh since resctrl info can change @@ -2456,9 +2456,9 @@ qemuProcessResctrlCreate(virQEMUDriverPtr driver, if (!caps) return -1;
- for (i = 0; i < vm->def->ncachetunes; i++) { + for (i = 0; i < vm->def->nrestunes; i++) { if (virResctrlAllocCreate(caps->host.resctrl, - vm->def->cachetunes[i]->alloc, + vm->def->restunes[i]->alloc, priv->machineName) < 0) goto cleanup; } @@ -5259,8 +5259,8 @@ qemuProcessSetupVcpu(virDomainObjPtr vm, &vcpu->sched) < 0) return -1;
- for (i = 0; i < vm->def->ncachetunes; i++) { - virDomainCachetuneDefPtr ct = vm->def->cachetunes[i]; + for (i = 0; i < vm->def->nrestunes; i++) { + virDomainRestuneDefPtr ct = vm->def->restunes[i];
if (virBitmapIsBitSet(ct->vcpus, vcpuid)) { if (virResctrlAllocAddPID(ct->alloc, vcpupid) < 0) @@ -6955,8 +6955,8 @@ void qemuProcessStop(virQEMUDriverPtr driver, /* Remove resctrl allocation after cgroups are cleaned up which makes it * kind of safer (although removing the allocation should work even with * pids in tasks file */ - for (i = 0; i < vm->def->ncachetunes; i++) - virResctrlAllocRemove(vm->def->cachetunes[i]->alloc); + for (i = 0; i < vm->def->nrestunes; i++) + virResctrlAllocRemove(vm->def->restunes[i]->alloc);
qemuProcessRemoveDomainStatus(driver, vm);
@@ -7676,8 +7676,8 @@ qemuProcessReconnect(void *opaque) if (qemuConnectAgent(driver, obj) < 0) goto error;
- for (i = 0; i < obj->def->ncachetunes; i++) { - if (virResctrlAllocDeterminePath(obj->def->cachetunes[i]->alloc, + for (i = 0; i < obj->def->nrestunes; i++) { + if (virResctrlAllocDeterminePath(obj->def->restunes[i]->alloc, priv->machineName) < 0) goto error; }

Hi John, Thanks for reviewing! Since major questions are from this thread, so I think we can start from this. On 2018年06月30日 06:47, John Ferlan wrote:
On 06/15/2018 05:29 AM, bing.niu@intel.com wrote:
From: Bing Niu <bing.niu@intel.com>
resctrl not only supports cache tuning, but also memory bandwidth tuning. Rename cachetune to restune(resource tuning) to reflect that.
Signed-off-by: Bing Niu <bing.niu@intel.com> --- src/conf/domain_conf.c | 44 ++++++++++++++++++++++---------------------- src/conf/domain_conf.h | 10 +++++----- src/qemu/qemu_process.c | 18 +++++++++--------- 3 files changed, 36 insertions(+), 36 deletions(-)
Not so sure I'm liking [R|r]estune[s]... Still @cachetunes is an array of a structure that contains a vCPU bitmap and a virResctrlAllocPtr for the /cputune/cachetunes data. The virResctrlAllocPtr was changed to add a the bandwidth allocation, so does this really have to change?
The cachetunes from Cache Allocation Technology(CAT) and memorytunes from Memory Bandwidth Allocation(MBA) are both features from Resource Directory Technology. RDT is a collection of resource distribution technology, right now it includes CAT and MBA. Details information can be found 17.18.1 of Intel's sdm. https://software.intel.com/sites/default/files/managed/39/c5/325462-sdm-vol-... The RDT capability and configuration methods is exposed to user land in form of resctrl file system by kernel. The basic programming model for this resctrl fs interface like this: 1. create a folder under /sys/fs/resctrl. And this folder is called one resctrl group. 2. writing user desired CAT and MBA config to the file /sys/fs/resctrl/<YOUR_GROUP_NAME>/schemata to allocate the cache or memory bandwidth. you can set CAT and MBA together or any of them. 3. moving the threads you want to that resctrl group by writing threads' PID to /sys/fs/resctrl/<YOUR_GROUP_NAME>/task file. So that those threads can be assign with the resource allocated in step 2. And those resources are shared by those threads. *NOTE*: A thread only can be put in one resctrl group. This requirement is from how RDT and resctrl works. Each resctrl group has a closid. The resource configuration in above step2 is set to that closid's msr(IA32_L?_QoS) to describe the resource allocation for this closid. And thread use this closid to claim the allocated resource during context switch(scheduled_in()) in kernel by writing the closid to the msr IA32_PQR_ASSOC. With closid wrote in IA32_PQR_ASSOC, RDT hardware knows current running closid and allocated resource basing on this closid's IA32_L?_QoS msr. That why a thread can be put into one restrcl group only. Details description of resctrl can be found: https://github.com/torvalds/linux/blob/v4.17/Documentation/x86/intel_rdt_ui.... :) Basing on above information, my understanding is that virResctrlAllocPtr is not only bind to cachetunes. It should be a backing object to describe a resctrl group. Especially current virresctrl class already works as that. Libvirt will create one resctrl group for each virResctrlAllocPtr by virResctrlAllocCreate() interface. So from components view, the big picture is something like below. <memorytune ^cpus='0-3'> +---------+ | <cachetune vcpus='0-3'> XML | + parser +-----------+ | | +------------------------------+ | | internal resctrl +------v--------------+ backing object | virResctrlAllocPtr | +------+--------------+ | | +------------------------------+ | +--v-------+ | | | schemata | /sys/fs/resctrl | tasks | | . | | . | | | +----------+ Does this make sense? :)
I think the question is - if both cachetunes and memtunes exist in domain XML, then for which does the @vcpus relate to. That is, from the cover there's:
<memorytune vcpus='0-1'> <node id='0' bandwidth='20'/> </memorytune>
and then there's a
<cachetune vcpus='0-3'> <cache id='0' level='3' type='both' size='3' unit='MiB'/> <cache id='1' level='3' type='both' size='3' unit='MiB'/> </cachetune>
Now what? I haven't looked at patch 4 yet to even know how this "works".
I also raised this concern in the first round review. Ideally, if we can have a domain XML like this <resourcetune vcpu='0-3'> <memorytune> <node id='0' bandwidth='20'/> </memorytune> <cachetune> <cache id='0' level='3' type='both' size='3' unit='MiB'/> <cache id='1' level='3' type='both' size='3' unit='MiB'/> </cachetune> <resourcetune> That will be more clear. Above domain XML means: one resctrl group with memory bandwidth 20 @node 0, L3 cache 3MB @ node0 and node 1. Put vcpu '0-3' to this resctrl group. So those resources are shared among vcpus. However, Pavel pointed out that we must keep backward compatible. And we need keep <cachetune vcpus='0-3'>. In RFC v1, I also proposed to put memory bandwidth into cachetunes section like: <cachetune vcpus='0-3'> <cache id='0' level='3' type='both' size='3' unit='MiB'/> <node id='0' bandwidth='20'/> <cachetune vcpus> Maintainer also comments on this "it's not a clear XML definition", describing memory bandwidth inside a cachetune section. That's why we come with a separated memorytune section. In your example, the current implement will stop creating resctrl group and throw an error. vcpu list cannot overlap between memorytune and cachetune, but they can be same. Since vcpu can be put into one resctrl group only. My understanding is that we can set using <resourcetune> to describe cachetune and memorytune together as a long term goal. Since that needs to deprecate cachetune's vcpu list. I am not sure about procedure to remove existing XML elements. Maybe you can point out. For the short term goal, we can use this separated <memorytune vcpus='0-1'> section. How do you think? :)
I think what you want to do is create a virDomainMemtuneDef that mimics virDomainCachetuneDef, but I'm not 100% sure.
Of course that still leaves me wondering how much of virResctrlAllocPtr then becomes wasted. You chose an integration point, but essentially have separate technologies. I'm trying to make sense of it all and am starting to get lost in the process of doing that.
From resctrl group perspective, I don't think extend memory bandwidth information in virResctrlAllocPtr is a waste. I think this is a feature extending. :)
If @bandwidth can not be > 100... Can we duplicate the @id somehow? So
Something like <cachetune vcpus='0-3'> <cache id='0' level='3' type='both' size='3' unit='MiB'/> <node id='0' bandwidth='20'/> </cachetune> or <cachetune vcpus='0-3'> <cache id='0' level='3' type='both' size='3' unit='MiB' bandwidth='20'/> </cachetune> ??? That also works. This is something I did in RFC v1. But there is a comment said it's better not to mix memory bandwidth with cachetune. That may confuse people. :(
how does that free buffer that got 100% and then gets subtracted from really work in this model? I probably need to study the code some more, but it's not clear it requires using the "same" sort of logic that cachetune uses where there could be different types (instructs, data, or both) that would be drawing from the size, right? I think that's how that worked. So what's the corollary for memtunes? You set a range from 0 to 100, right and that's it. OK, too many questions and it's late so I'll stop here... Hopefully I didn't leave any stray thoughts in the review of patch 1 or 2.
The policy we used for memory bandwidth is consistent with the existing cachetune. Current cache allocation logic model forbids cache allocation overlap. This is to prevent resource contention I think. Same strategy on memory allocation part. Since total bandwidth beyond to 100 will lead to bandwidth competition. Existing cache allocation logic achieves this by first populating virResctrlInfoPtr structure. Libvirt will fill the virResctrlInfoPtr structure during traverse host's cache hierarchy. When allocating cache by virResctrlAllocCreate, libvirt will first create a virResctrlAllocPtr basing on information from virResctrlInfoPtr. This virResctrlAllocPtr represents the maximum resource available(I mean no any allocation previously). Then libvirt traverse all resctrl groups (in /sys/fs/resctrl/)created already, subtract the resource they claimed. So the left resource is the resource free. Libvirt compares this free resource with resource requested in virResctrlAllocCreate. If enough free resource available, then new resctrl group will be created. Memory bandwidth part follows the same idea, only the difference is memory bandwidth use percentage number and cache use bitmap. Bing
John
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 62c412e..b3543f3 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -2950,14 +2950,14 @@ virDomainLoaderDefFree(virDomainLoaderDefPtr loader)
static void -virDomainCachetuneDefFree(virDomainCachetuneDefPtr cachetune) +virDomainRestuneDefFree(virDomainRestuneDefPtr restune) { - if (!cachetune) + if (!restune) return;
- virObjectUnref(cachetune->alloc); - virBitmapFree(cachetune->vcpus); - VIR_FREE(cachetune); + virObjectUnref(restune->alloc); + virBitmapFree(restune->vcpus); + VIR_FREE(restune); }
@@ -3147,9 +3147,9 @@ void virDomainDefFree(virDomainDefPtr def) virDomainShmemDefFree(def->shmems[i]); VIR_FREE(def->shmems);
- for (i = 0; i < def->ncachetunes; i++) - virDomainCachetuneDefFree(def->cachetunes[i]); - VIR_FREE(def->cachetunes); + for (i = 0; i < def->nrestunes; i++) + virDomainRestuneDefFree(def->restunes[i]); + VIR_FREE(def->restunes);
VIR_FREE(def->keywrap);
@@ -18971,7 +18971,7 @@ virDomainCachetuneDefParse(virDomainDefPtr def, xmlNodePtr *nodes = NULL; virBitmapPtr vcpus = NULL; virResctrlAllocPtr alloc = virResctrlAllocNew(); - virDomainCachetuneDefPtr tmp_cachetune = NULL; + virDomainRestuneDefPtr tmp_restune = NULL; char *tmp = NULL; char *vcpus_str = NULL; char *alloc_id = NULL; @@ -18984,7 +18984,7 @@ virDomainCachetuneDefParse(virDomainDefPtr def, if (!alloc) goto cleanup;
- if (VIR_ALLOC(tmp_cachetune) < 0) + if (VIR_ALLOC(tmp_restune) < 0) goto cleanup;
vcpus_str = virXMLPropString(node, "vcpus"); @@ -19025,8 +19025,8 @@ virDomainCachetuneDefParse(virDomainDefPtr def, goto cleanup; }
- for (i = 0; i < def->ncachetunes; i++) { - if (virBitmapOverlaps(def->cachetunes[i]->vcpus, vcpus)) { + for (i = 0; i < def->nrestunes; i++) { + if (virBitmapOverlaps(def->restunes[i]->vcpus, vcpus)) { virReportError(VIR_ERR_XML_ERROR, "%s", _("Overlapping vcpus in cachetunes")); goto cleanup; @@ -19056,16 +19056,16 @@ virDomainCachetuneDefParse(virDomainDefPtr def, if (virResctrlAllocSetID(alloc, alloc_id) < 0) goto cleanup;
- VIR_STEAL_PTR(tmp_cachetune->vcpus, vcpus); - VIR_STEAL_PTR(tmp_cachetune->alloc, alloc); + VIR_STEAL_PTR(tmp_restune->vcpus, vcpus); + VIR_STEAL_PTR(tmp_restune->alloc, alloc);
- if (VIR_APPEND_ELEMENT(def->cachetunes, def->ncachetunes, tmp_cachetune) < 0) + if (VIR_APPEND_ELEMENT(def->restunes, def->nrestunes, tmp_restune) < 0) goto cleanup;
ret = 0; cleanup: ctxt->node = oldnode; - virDomainCachetuneDefFree(tmp_cachetune); + virDomainRestuneDefFree(tmp_restune); virObjectUnref(alloc); virBitmapFree(vcpus); VIR_FREE(alloc_id); @@ -26874,7 +26874,7 @@ virDomainCachetuneDefFormatHelper(unsigned int level,
static int virDomainCachetuneDefFormat(virBufferPtr buf, - virDomainCachetuneDefPtr cachetune, + virDomainRestuneDefPtr restune, unsigned int flags) { virBuffer childrenBuf = VIR_BUFFER_INITIALIZER; @@ -26882,7 +26882,7 @@ virDomainCachetuneDefFormat(virBufferPtr buf, int ret = -1;
virBufferSetChildIndent(&childrenBuf, buf); - virResctrlAllocForeachCache(cachetune->alloc, + virResctrlAllocForeachCache(restune->alloc, virDomainCachetuneDefFormatHelper, &childrenBuf);
@@ -26895,14 +26895,14 @@ virDomainCachetuneDefFormat(virBufferPtr buf, goto cleanup; }
- vcpus = virBitmapFormat(cachetune->vcpus); + vcpus = virBitmapFormat(restune->vcpus); if (!vcpus) goto cleanup;
virBufferAsprintf(buf, "<cachetune vcpus='%s'", vcpus);
if (!(flags & VIR_DOMAIN_DEF_FORMAT_INACTIVE)) { - const char *alloc_id = virResctrlAllocGetID(cachetune->alloc); + const char *alloc_id = virResctrlAllocGetID(restune->alloc); if (!alloc_id) goto cleanup;
@@ -27023,8 +27023,8 @@ virDomainCputuneDefFormat(virBufferPtr buf, def->iothreadids[i]->iothread_id); }
- for (i = 0; i < def->ncachetunes; i++) - virDomainCachetuneDefFormat(&childrenBuf, def->cachetunes[i], flags); + for (i = 0; i < def->nrestunes; i++) + virDomainCachetuneDefFormat(&childrenBuf, def->restunes[i], flags);
if (virBufferCheckError(&childrenBuf) < 0) return -1; diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index 6344c02..9e71a0b 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -2228,10 +2228,10 @@ struct _virDomainCputune { };
-typedef struct _virDomainCachetuneDef virDomainCachetuneDef; -typedef virDomainCachetuneDef *virDomainCachetuneDefPtr; +typedef struct _virDomainRestuneDef virDomainRestuneDef; +typedef virDomainRestuneDef *virDomainRestuneDefPtr;
-struct _virDomainCachetuneDef { +struct _virDomainRestuneDef { virBitmapPtr vcpus; virResctrlAllocPtr alloc; }; @@ -2409,8 +2409,8 @@ struct _virDomainDef {
virDomainCputune cputune;
- virDomainCachetuneDefPtr *cachetunes; - size_t ncachetunes; + virDomainRestuneDefPtr *restunes; + size_t nrestunes;
virDomainNumaPtr numa; virDomainResourceDefPtr resource; diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 93fd6ba..0659c06 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -2447,7 +2447,7 @@ qemuProcessResctrlCreate(virQEMUDriverPtr driver, virCapsPtr caps = NULL; qemuDomainObjPrivatePtr priv = vm->privateData;
- if (!vm->def->ncachetunes) + if (!vm->def->nrestunes) return 0;
/* Force capability refresh since resctrl info can change @@ -2456,9 +2456,9 @@ qemuProcessResctrlCreate(virQEMUDriverPtr driver, if (!caps) return -1;
- for (i = 0; i < vm->def->ncachetunes; i++) { + for (i = 0; i < vm->def->nrestunes; i++) { if (virResctrlAllocCreate(caps->host.resctrl, - vm->def->cachetunes[i]->alloc, + vm->def->restunes[i]->alloc, priv->machineName) < 0) goto cleanup; } @@ -5259,8 +5259,8 @@ qemuProcessSetupVcpu(virDomainObjPtr vm, &vcpu->sched) < 0) return -1;
- for (i = 0; i < vm->def->ncachetunes; i++) { - virDomainCachetuneDefPtr ct = vm->def->cachetunes[i]; + for (i = 0; i < vm->def->nrestunes; i++) { + virDomainRestuneDefPtr ct = vm->def->restunes[i];
if (virBitmapIsBitSet(ct->vcpus, vcpuid)) { if (virResctrlAllocAddPID(ct->alloc, vcpupid) < 0) @@ -6955,8 +6955,8 @@ void qemuProcessStop(virQEMUDriverPtr driver, /* Remove resctrl allocation after cgroups are cleaned up which makes it * kind of safer (although removing the allocation should work even with * pids in tasks file */ - for (i = 0; i < vm->def->ncachetunes; i++) - virResctrlAllocRemove(vm->def->cachetunes[i]->alloc); + for (i = 0; i < vm->def->nrestunes; i++) + virResctrlAllocRemove(vm->def->restunes[i]->alloc);
qemuProcessRemoveDomainStatus(driver, vm);
@@ -7676,8 +7676,8 @@ qemuProcessReconnect(void *opaque) if (qemuConnectAgent(driver, obj) < 0) goto error;
- for (i = 0; i < obj->def->ncachetunes; i++) { - if (virResctrlAllocDeterminePath(obj->def->cachetunes[i]->alloc, + for (i = 0; i < obj->def->nrestunes; i++) { + if (virResctrlAllocDeterminePath(obj->def->restunes[i]->alloc, priv->machineName) < 0) goto error; }
-- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list

On 07/03/2018 12:10 AM, bing.niu wrote:
Hi John, Thanks for reviewing! Since major questions are from this thread, so I think we can start from this.
On 2018年06月30日 06:47, John Ferlan wrote:
On 06/15/2018 05:29 AM, bing.niu@intel.com wrote:
From: Bing Niu <bing.niu@intel.com>
resctrl not only supports cache tuning, but also memory bandwidth tuning. Rename cachetune to restune(resource tuning) to reflect that.
Signed-off-by: Bing Niu <bing.niu@intel.com> --- src/conf/domain_conf.c | 44 ++++++++++++++++++++++---------------------- src/conf/domain_conf.h | 10 +++++----- src/qemu/qemu_process.c | 18 +++++++++--------- 3 files changed, 36 insertions(+), 36 deletions(-)
Not so sure I'm liking [R|r]estune[s]... Still @cachetunes is an array of a structure that contains a vCPU bitmap and a virResctrlAllocPtr for the /cputune/cachetunes data. The virResctrlAllocPtr was changed to add a the bandwidth allocation, so does this really have to change?
The cachetunes from Cache Allocation Technology(CAT) and memorytunes from Memory Bandwidth Allocation(MBA) are both features from Resource Directory Technology. RDT is a collection of resource distribution technology, right now it includes CAT and MBA. Details information can be found 17.18.1 of Intel's sdm. https://software.intel.com/sites/default/files/managed/39/c5/325462-sdm-vol-...
tl;dr :-) So some day soon it'll be more complicated and have even more rules?!?!
The RDT capability and configuration methods is exposed to user land in form of resctrl file system by kernel. The basic programming model for this resctrl fs interface like this:
1. create a folder under /sys/fs/resctrl. And this folder is called one resctrl group.
2. writing user desired CAT and MBA config to the file /sys/fs/resctrl/<YOUR_GROUP_NAME>/schemata to allocate the cache or memory bandwidth. you can set CAT and MBA together or any of them.
3. moving the threads you want to that resctrl group by writing threads' PID to /sys/fs/resctrl/<YOUR_GROUP_NAME>/task file. So that those threads can be assign with the resource allocated in step 2. And those resources are shared by those threads.
*NOTE*: A thread only can be put in one resctrl group. This requirement is from how RDT and resctrl works.
IOW: A vCPU or the vCPUs being used by CAT and MBA would need to be the same because in "this world" the group *is* the single vCPU or the range of vCPU's.
Each resctrl group has a closid. The resource configuration in above step2 is set to that closid's msr(IA32_L?_QoS) to describe the resource allocation for this closid. And thread use this closid to claim the allocated resource during context switch(scheduled_in()) in kernel by writing the closid to the msr IA32_PQR_ASSOC. With closid wrote in IA32_PQR_ASSOC, RDT hardware knows current running closid and allocated resource basing on this closid's IA32_L?_QoS msr. That why a thread can be put into one restrcl group only.
Details description of resctrl can be found: https://github.com/torvalds/linux/blob/v4.17/Documentation/x86/intel_rdt_ui....
:)
More lengthy things that I didn't read... Thanks for the pointers. I think keeping track of all the rules and relationships is going to be a real "challenge".
Basing on above information, my understanding is that virResctrlAllocPtr is not only bind to cachetunes. It should be a backing object to describe a resctrl group. Especially current virresctrl class already works as that. Libvirt will create one resctrl group for each virResctrlAllocPtr by virResctrlAllocCreate() interface.
So from components view, the big picture is something like below.
Yay, pictures! Thanks for taking the time.
<memorytune ^cpus='0-3'> +---------+ | <cachetune vcpus='0-3'> XML | + parser +-----------+ | | +------------------------------+ | | internal resctrl +------v--------------+ backing object | virResctrlAllocPtr | +------+--------------+ | | +------------------------------+ | +--v-------+ | | | schemata | /sys/fs/resctrl | tasks | | . | | . | | | +----------+
Does this make sense? :)
Yes and I think further has me believe that the vCPUs in this case are the groups and perhaps becomes the internal "key" for all this. I didn't go look, but have to assume multiple cachetune entries create multiple groups into which the vCPU's for the cachetune entries are placed. So far it's been easy because there's no way to overlap cachetune vCPU values (my assumption, I didn't look at the code). Now we add memtune and dictate that if it uses a vCPU or set of vCPUs that it either matches the existing cachetune entries or is itself unique and doesn't overlap any cachetune. Assume the following: <cachetune vcpus='0-1'> <cachetune vcpus='2-4'> <cachetune vcpus='5-7'> A valid memtune must also be '0-1', '2-4', 5-7', or any new grouping from 8->maxvcpus, such as '8-11'. Conversely, if <memtune vcpus='0-4'> was used, that's invalid. Similarly <memtune vcpus='3'> would be invalid. For the former, it's crossing two boundaries and for the latter it's "part of" some other boundary. Hopefully that's right...
I think the question is - if both cachetunes and memtunes exist in domain XML, then for which does the @vcpus relate to. That is, from the cover there's:
<memorytune vcpus='0-1'> <node id='0' bandwidth='20'/> </memorytune>
and then there's a
<cachetune vcpus='0-3'> <cache id='0' level='3' type='both' size='3' unit='MiB'/> <cache id='1' level='3' type='both' size='3' unit='MiB'/> </cachetune>
Now what? I haven't looked at patch 4 yet to even know how this "works".
I also raised this concern in the first round review. Ideally, if we can have a domain XML like this
<resourcetune vcpu='0-3'> <memorytune> <node id='0' bandwidth='20'/> </memorytune> <cachetune> <cache id='0' level='3' type='both' size='3' unit='MiB'/> <cache id='1' level='3' type='both' size='3' unit='MiB'/> </cachetune> <resourcetune>
That will be more clear. Above domain XML means: one resctrl group with memory bandwidth 20 @node 0, L3 cache 3MB @ node0 and node 1. Put vcpu '0-3' to this resctrl group. So those resources are shared among vcpus. However, Pavel pointed out that we must keep backward compatible. And we need keep <cachetune vcpus='0-3'>.
I think I was right above based on this part... Unfortunately forward thinking about memorytune didn't happen, so what you propose above is what I think "internally" once we've read the <cachetune> and <memorytune> elements we end up with. And yes, it makes sense why you renamed things now and why the vcpus are essentially reused between the two. Not a fan of restunes, RsrcDir is closer, but not a perfect shorter name the ResourceDirectory. Also rather than essentially cut-copy-paste for the vcpus logic into memtunes from cachetunes, that should be extracted into it's own helper.
In RFC v1, I also proposed to put memory bandwidth into cachetunes section like: <cachetune vcpus='0-3'> <cache id='0' level='3' type='both' size='3' unit='MiB'/> <node id='0' bandwidth='20'/> <cachetune vcpus>
Maintainer also comments on this "it's not a clear XML definition", describing memory bandwidth inside a cachetune section. That's why we come with a separated memorytune section.
I didn't go back to the RFC when I reviewed. I tried to have a fresh look. After scanning Pavel's comments now though, I do agree having a <node> inside a <cachetune> to go along with a <cache> element is a bit confusing. But I do now see where the "math" is for the total bandwidth that can be used - it's the per node value. Something that I didn't see during my first pass. Of course I may have realized that had the example in the commit had more than 1 vCPU and 1 node.
In your example, the current implement will stop creating resctrl group and throw an error. vcpu list cannot overlap between memorytune and cachetune, but they can be same. Since vcpu can be put into one resctrl group only.
My understanding is that we can set using <resourcetune> to describe cachetune and memorytune together as a long term goal. Since that needs to deprecate cachetune's vcpu list. I am not sure about procedure to remove existing XML elements. Maybe you can point out. For the short term goal, we can use this separated <memorytune vcpus='0-1'> section. How do you think? :)
There is no deprecation possible. You will always have to support the cachetune as a direct child of cputune. If you "fix" things up such that you introduce some other means to have a cputune/resourcetune/cachetune working before you introduce memtune as a child of resourcetune, then that works. But you'll have to know whether you read cputune/cachetune and then format out in that manner. See how the <auth> element has been handled as a "child" of the disk element originally, but now possible as a child of the source element of the disk. It's possible, but messy.
I think what you want to do is create a virDomainMemtuneDef that mimics virDomainCachetuneDef, but I'm not 100% sure.
Of course that still leaves me wondering how much of virResctrlAllocPtr then becomes wasted. You chose an integration point, but essentially have separate technologies. I'm trying to make sense of it all and am starting to get lost in the process of doing that.
From resctrl group perspective, I don't think extend memory bandwidth information in virResctrlAllocPtr is a waste. I think this is a feature extending. :)
Now that the picture is becoming clearer, I can see your point.
If @bandwidth can not be > 100... Can we duplicate the @id somehow? So
Something like <cachetune vcpus='0-3'> <cache id='0' level='3' type='both' size='3' unit='MiB'/> <node id='0' bandwidth='20'/> </cachetune>
or
<cachetune vcpus='0-3'> <cache id='0' level='3' type='both' size='3' unit='MiB' bandwidth='20'/> </cachetune> ??? That also works. This is something I did in RFC v1. But there is a comment said it's better not to mix memory bandwidth with cachetune. That may confuse people. :(
Right - as noted. I didn't read that originally. I jumped into a review for this series mainly because no one else had started reviewing it.
how does that free buffer that got 100% and then gets subtracted from really work in this model? I probably need to study the code some more, but it's not clear it requires using the "same" sort of logic that cachetune uses where there could be different types (instructs, data, or both) that would be drawing from the size, right? I think that's how that worked. So what's the corollary for memtunes? You set a range from 0 to 100, right and that's it. OK, too many questions and it's late so I'll stop here... Hopefully I didn't leave any stray thoughts in the review of patch 1 or 2.
The policy we used for memory bandwidth is consistent with the existing cachetune. Current cache allocation logic model forbids cache allocation overlap. This is to prevent resource contention I think. Same strategy on memory allocation part. Since total bandwidth beyond to 100 will lead to bandwidth competition.
I think I've explained what I missed above... Essentially your example had 1 vCPU and thus 1 node and so it didn't make sense about the 100%. Of course if there were 3 vCPU's and 3 nodes each having some percentage, yeah then it would have been more obvious. My suggestion, let's try to get through reworking how the cachetunes data is stored. Some sort of common API to "add" a new vCPU's value that can be used by cachetune now and eventually memtune later. The vCPUs being the obvious key rather than cachetune. Fair warning - I have some days out of the office coming up over the next week or so. John
Existing cache allocation logic achieves this by first populating virResctrlInfoPtr structure. Libvirt will fill the virResctrlInfoPtr structure during traverse host's cache hierarchy. When allocating cache by virResctrlAllocCreate, libvirt will first create a virResctrlAllocPtr basing on information from virResctrlInfoPtr. This virResctrlAllocPtr represents the maximum resource available(I mean no any allocation previously). Then libvirt traverse all resctrl groups (in /sys/fs/resctrl/)created already, subtract the resource they claimed. So the left resource is the resource free. Libvirt compares this free resource with resource requested in virResctrlAllocCreate. If enough free resource available, then new resctrl group will be created. Memory bandwidth part follows the same idea, only the difference is memory bandwidth use percentage number and cache use bitmap.
Bing
John
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 62c412e..b3543f3 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -2950,14 +2950,14 @@ virDomainLoaderDefFree(virDomainLoaderDefPtr loader) static void -virDomainCachetuneDefFree(virDomainCachetuneDefPtr cachetune) +virDomainRestuneDefFree(virDomainRestuneDefPtr restune) { - if (!cachetune) + if (!restune) return; - virObjectUnref(cachetune->alloc); - virBitmapFree(cachetune->vcpus); - VIR_FREE(cachetune); + virObjectUnref(restune->alloc); + virBitmapFree(restune->vcpus); + VIR_FREE(restune); } @@ -3147,9 +3147,9 @@ void virDomainDefFree(virDomainDefPtr def) virDomainShmemDefFree(def->shmems[i]); VIR_FREE(def->shmems); - for (i = 0; i < def->ncachetunes; i++) - virDomainCachetuneDefFree(def->cachetunes[i]); - VIR_FREE(def->cachetunes); + for (i = 0; i < def->nrestunes; i++) + virDomainRestuneDefFree(def->restunes[i]); + VIR_FREE(def->restunes); VIR_FREE(def->keywrap); @@ -18971,7 +18971,7 @@ virDomainCachetuneDefParse(virDomainDefPtr def, xmlNodePtr *nodes = NULL; virBitmapPtr vcpus = NULL; virResctrlAllocPtr alloc = virResctrlAllocNew(); - virDomainCachetuneDefPtr tmp_cachetune = NULL; + virDomainRestuneDefPtr tmp_restune = NULL; char *tmp = NULL; char *vcpus_str = NULL; char *alloc_id = NULL; @@ -18984,7 +18984,7 @@ virDomainCachetuneDefParse(virDomainDefPtr def, if (!alloc) goto cleanup; - if (VIR_ALLOC(tmp_cachetune) < 0) + if (VIR_ALLOC(tmp_restune) < 0) goto cleanup; vcpus_str = virXMLPropString(node, "vcpus"); @@ -19025,8 +19025,8 @@ virDomainCachetuneDefParse(virDomainDefPtr def, goto cleanup; } - for (i = 0; i < def->ncachetunes; i++) { - if (virBitmapOverlaps(def->cachetunes[i]->vcpus, vcpus)) { + for (i = 0; i < def->nrestunes; i++) { + if (virBitmapOverlaps(def->restunes[i]->vcpus, vcpus)) { virReportError(VIR_ERR_XML_ERROR, "%s", _("Overlapping vcpus in cachetunes")); goto cleanup; @@ -19056,16 +19056,16 @@ virDomainCachetuneDefParse(virDomainDefPtr def, if (virResctrlAllocSetID(alloc, alloc_id) < 0) goto cleanup; - VIR_STEAL_PTR(tmp_cachetune->vcpus, vcpus); - VIR_STEAL_PTR(tmp_cachetune->alloc, alloc); + VIR_STEAL_PTR(tmp_restune->vcpus, vcpus); + VIR_STEAL_PTR(tmp_restune->alloc, alloc); - if (VIR_APPEND_ELEMENT(def->cachetunes, def->ncachetunes, tmp_cachetune) < 0) + if (VIR_APPEND_ELEMENT(def->restunes, def->nrestunes, tmp_restune) < 0) goto cleanup; ret = 0; cleanup: ctxt->node = oldnode; - virDomainCachetuneDefFree(tmp_cachetune); + virDomainRestuneDefFree(tmp_restune); virObjectUnref(alloc); virBitmapFree(vcpus); VIR_FREE(alloc_id); @@ -26874,7 +26874,7 @@ virDomainCachetuneDefFormatHelper(unsigned int level, static int virDomainCachetuneDefFormat(virBufferPtr buf, - virDomainCachetuneDefPtr cachetune, + virDomainRestuneDefPtr restune, unsigned int flags) { virBuffer childrenBuf = VIR_BUFFER_INITIALIZER; @@ -26882,7 +26882,7 @@ virDomainCachetuneDefFormat(virBufferPtr buf, int ret = -1; virBufferSetChildIndent(&childrenBuf, buf); - virResctrlAllocForeachCache(cachetune->alloc, + virResctrlAllocForeachCache(restune->alloc, virDomainCachetuneDefFormatHelper, &childrenBuf); @@ -26895,14 +26895,14 @@ virDomainCachetuneDefFormat(virBufferPtr buf, goto cleanup; } - vcpus = virBitmapFormat(cachetune->vcpus); + vcpus = virBitmapFormat(restune->vcpus); if (!vcpus) goto cleanup; virBufferAsprintf(buf, "<cachetune vcpus='%s'", vcpus); if (!(flags & VIR_DOMAIN_DEF_FORMAT_INACTIVE)) { - const char *alloc_id = virResctrlAllocGetID(cachetune->alloc); + const char *alloc_id = virResctrlAllocGetID(restune->alloc); if (!alloc_id) goto cleanup; @@ -27023,8 +27023,8 @@ virDomainCputuneDefFormat(virBufferPtr buf, def->iothreadids[i]->iothread_id); } - for (i = 0; i < def->ncachetunes; i++) - virDomainCachetuneDefFormat(&childrenBuf, def->cachetunes[i], flags); + for (i = 0; i < def->nrestunes; i++) + virDomainCachetuneDefFormat(&childrenBuf, def->restunes[i], flags); if (virBufferCheckError(&childrenBuf) < 0) return -1; diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index 6344c02..9e71a0b 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -2228,10 +2228,10 @@ struct _virDomainCputune { }; -typedef struct _virDomainCachetuneDef virDomainCachetuneDef; -typedef virDomainCachetuneDef *virDomainCachetuneDefPtr; +typedef struct _virDomainRestuneDef virDomainRestuneDef; +typedef virDomainRestuneDef *virDomainRestuneDefPtr; -struct _virDomainCachetuneDef { +struct _virDomainRestuneDef { virBitmapPtr vcpus; virResctrlAllocPtr alloc; }; @@ -2409,8 +2409,8 @@ struct _virDomainDef { virDomainCputune cputune; - virDomainCachetuneDefPtr *cachetunes; - size_t ncachetunes; + virDomainRestuneDefPtr *restunes; + size_t nrestunes; virDomainNumaPtr numa; virDomainResourceDefPtr resource; diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 93fd6ba..0659c06 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -2447,7 +2447,7 @@ qemuProcessResctrlCreate(virQEMUDriverPtr driver, virCapsPtr caps = NULL; qemuDomainObjPrivatePtr priv = vm->privateData; - if (!vm->def->ncachetunes) + if (!vm->def->nrestunes) return 0; /* Force capability refresh since resctrl info can change @@ -2456,9 +2456,9 @@ qemuProcessResctrlCreate(virQEMUDriverPtr driver, if (!caps) return -1; - for (i = 0; i < vm->def->ncachetunes; i++) { + for (i = 0; i < vm->def->nrestunes; i++) { if (virResctrlAllocCreate(caps->host.resctrl, - vm->def->cachetunes[i]->alloc, + vm->def->restunes[i]->alloc, priv->machineName) < 0) goto cleanup; } @@ -5259,8 +5259,8 @@ qemuProcessSetupVcpu(virDomainObjPtr vm, &vcpu->sched) < 0) return -1; - for (i = 0; i < vm->def->ncachetunes; i++) { - virDomainCachetuneDefPtr ct = vm->def->cachetunes[i]; + for (i = 0; i < vm->def->nrestunes; i++) { + virDomainRestuneDefPtr ct = vm->def->restunes[i]; if (virBitmapIsBitSet(ct->vcpus, vcpuid)) { if (virResctrlAllocAddPID(ct->alloc, vcpupid) < 0) @@ -6955,8 +6955,8 @@ void qemuProcessStop(virQEMUDriverPtr driver, /* Remove resctrl allocation after cgroups are cleaned up which makes it * kind of safer (although removing the allocation should work even with * pids in tasks file */ - for (i = 0; i < vm->def->ncachetunes; i++) - virResctrlAllocRemove(vm->def->cachetunes[i]->alloc); + for (i = 0; i < vm->def->nrestunes; i++) + virResctrlAllocRemove(vm->def->restunes[i]->alloc); qemuProcessRemoveDomainStatus(driver, vm); @@ -7676,8 +7676,8 @@ qemuProcessReconnect(void *opaque) if (qemuConnectAgent(driver, obj) < 0) goto error; - for (i = 0; i < obj->def->ncachetunes; i++) { - if (virResctrlAllocDeterminePath(obj->def->cachetunes[i]->alloc, + for (i = 0; i < obj->def->nrestunes; i++) { + if (virResctrlAllocDeterminePath(obj->def->restunes[i]->alloc, priv->machineName) < 0) goto error; }
-- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list

On 2018年07月06日 06:40, John Ferlan wrote:
On 07/03/2018 12:10 AM, bing.niu wrote:
Hi John, Thanks for reviewing! Since major questions are from this thread, so I think we can start from this.
On 2018年06月30日 06:47, John Ferlan wrote:
On 06/15/2018 05:29 AM, bing.niu@intel.com wrote:
From: Bing Niu <bing.niu@intel.com>
resctrl not only supports cache tuning, but also memory bandwidth tuning. Rename cachetune to restune(resource tuning) to reflect that.
Signed-off-by: Bing Niu <bing.niu@intel.com> --- src/conf/domain_conf.c | 44 ++++++++++++++++++++++---------------------- src/conf/domain_conf.h | 10 +++++----- src/qemu/qemu_process.c | 18 +++++++++--------- 3 files changed, 36 insertions(+), 36 deletions(-)
Not so sure I'm liking [R|r]estune[s]... Still @cachetunes is an array of a structure that contains a vCPU bitmap and a virResctrlAllocPtr for the /cputune/cachetunes data. The virResctrlAllocPtr was changed to add a the bandwidth allocation, so does this really have to change?
The cachetunes from Cache Allocation Technology(CAT) and memorytunes from Memory Bandwidth Allocation(MBA) are both features from Resource Directory Technology. RDT is a collection of resource distribution technology, right now it includes CAT and MBA. Details information can be found 17.18.1 of Intel's sdm. https://software.intel.com/sites/default/files/managed/39/c5/325462-sdm-vol-...
tl;dr :-)
So some day soon it'll be more complicated and have even more rules?!?!
Hopefully not, and Intel do backward compatible well.:) So overall programing module should be steady.
The RDT capability and configuration methods is exposed to user land in form of resctrl file system by kernel. The basic programming model for this resctrl fs interface like this:
1. create a folder under /sys/fs/resctrl. And this folder is called one resctrl group.
2. writing user desired CAT and MBA config to the file /sys/fs/resctrl/<YOUR_GROUP_NAME>/schemata to allocate the cache or memory bandwidth. you can set CAT and MBA together or any of them.
3. moving the threads you want to that resctrl group by writing threads' PID to /sys/fs/resctrl/<YOUR_GROUP_NAME>/task file. So that those threads can be assign with the resource allocated in step 2. And those resources are shared by those threads.
*NOTE*: A thread only can be put in one resctrl group. This requirement is from how RDT and resctrl works.
IOW: A vCPU or the vCPUs being used by CAT and MBA would need to be the same because in "this world" the group *is* the single vCPU or the range of vCPU's.
Yup! vCPU actually is a thread of QEMU from host view.
Each resctrl group has a closid. The resource configuration in above step2 is set to that closid's msr(IA32_L?_QoS) to describe the resource allocation for this closid. And thread use this closid to claim the allocated resource during context switch(scheduled_in()) in kernel by writing the closid to the msr IA32_PQR_ASSOC. With closid wrote in IA32_PQR_ASSOC, RDT hardware knows current running closid and allocated resource basing on this closid's IA32_L?_QoS msr. That why a thread can be put into one restrcl group only.
Details description of resctrl can be found: https://github.com/torvalds/linux/blob/v4.17/Documentation/x86/intel_rdt_ui....
:)
More lengthy things that I didn't read... Thanks for the pointers. I think keeping track of all the rules and relationships is going to be a real "challenge".
Yes, that is a real "challenge". However, kernel modules usually give a consistent and unified interface. That will help a lot. We should be able to support new features by extending the existing logic.
Basing on above information, my understanding is that virResctrlAllocPtr is not only bind to cachetunes. It should be a backing object to describe a resctrl group. Especially current virresctrl class already works as that. Libvirt will create one resctrl group for each virResctrlAllocPtr by virResctrlAllocCreate() interface.
So from components view, the big picture is something like below.
Yay, pictures! Thanks for taking the time.
<memorytune ^cpus='0-3'> +---------+ | <cachetune vcpus='0-3'> XML | + parser +-----------+ | | +------------------------------+ | | internal resctrl +------v--------------+ backing object | virResctrlAllocPtr | +------+--------------+ | | +------------------------------+ | +--v-------+ | | | schemata | /sys/fs/resctrl | tasks | | . | | . | | | +----------+
Does this make sense? :)
Yes and I think further has me believe that the vCPUs in this case are the groups and perhaps becomes the internal "key" for all this. I didn't go look, but have to assume multiple cachetune entries create multiple groups into which the vCPU's for the cachetune entries are placed. So far it's been easy because there's no way to overlap cachetune vCPU values (my assumption, I didn't look at the code).
Now we add memtune and dictate that if it uses a vCPU or set of vCPUs that it either matches the existing cachetune entries or is itself unique and doesn't overlap any cachetune.
Assume the following:
<cachetune vcpus='0-1'> <cachetune vcpus='2-4'> <cachetune vcpus='5-7'>
A valid memtune must also be '0-1', '2-4', 5-7', or any new grouping from 8->maxvcpus, such as '8-11'.
Conversely, if <memtune vcpus='0-4'> was used, that's invalid. Similarly <memtune vcpus='3'> would be invalid. For the former, it's crossing two boundaries and for the latter it's "part of" some other boundary.
Hopefully that's right...
You are definitely right! That is how we implement in virDomainMemorytuneDefParse() of [RFCv2 PATCH 4/5] like below. + for (i = 0; i < def->nrestunes; i++) { + /* vcpus group has been created, directly use the existing one. + * Just updating memory allocation information of that group + * */ + if (virBitmapEqual(def->restunes[i]->vcpus, vcpus)) { + alloc = def->restunes[i]->alloc; + break; + } + + /* vcpus can not overlapping */ + if (virBitmapOverlaps(def->restunes[i]->vcpus, vcpus)) { + virReportError(VIR_ERR_XML_ERROR, "%s", + _("Overlapping vcpus in memorytunes")); + goto cleanup; + } + } + if (!alloc) { + alloc = virResctrlAllocNew(); + new_alloc = true; + }
I think the question is - if both cachetunes and memtunes exist in domain XML, then for which does the @vcpus relate to. That is, from the cover there's:
<memorytune vcpus='0-1'> <node id='0' bandwidth='20'/> </memorytune>
and then there's a
<cachetune vcpus='0-3'> <cache id='0' level='3' type='both' size='3' unit='MiB'/> <cache id='1' level='3' type='both' size='3' unit='MiB'/> </cachetune>
Now what? I haven't looked at patch 4 yet to even know how this "works".
I also raised this concern in the first round review. Ideally, if we can have a domain XML like this
<resourcetune vcpu='0-3'> <memorytune> <node id='0' bandwidth='20'/> </memorytune> <cachetune> <cache id='0' level='3' type='both' size='3' unit='MiB'/> <cache id='1' level='3' type='both' size='3' unit='MiB'/> </cachetune> <resourcetune>
That will be more clear. Above domain XML means: one resctrl group with memory bandwidth 20 @node 0, L3 cache 3MB @ node0 and node 1. Put vcpu '0-3' to this resctrl group. So those resources are shared among vcpus. However, Pavel pointed out that we must keep backward compatible. And we need keep <cachetune vcpus='0-3'>.
I think I was right above based on this part...
Unfortunately forward thinking about memorytune didn't happen, so what you propose above is what I think "internally" once we've read the <cachetune> and <memorytune> elements we end up with. And yes, it makes sense why you renamed things now and why the vcpus are essentially reused between the two.
Indeed. It's hard to keep space for feature extension without enough information. Introducing a parent section <resourcetune> for the single child <cachetune> without <memorytune> also confuses people.
Not a fan of restunes, RsrcDir is closer, but not a perfect shorter name the ResourceDirectory. Also rather than essentially cut-copy-paste for the vcpus logic into memtunes from cachetunes, that should be extracted into it's own helper.
In RFC v1, I also proposed to put memory bandwidth into cachetunes section like: <cachetune vcpus='0-3'> <cache id='0' level='3' type='both' size='3' unit='MiB'/> <node id='0' bandwidth='20'/> <cachetune vcpus>
Maintainer also comments on this "it's not a clear XML definition", describing memory bandwidth inside a cachetune section. That's why we come with a separated memorytune section.
I didn't go back to the RFC when I reviewed. I tried to have a fresh look. After scanning Pavel's comments now though, I do agree having a <node> inside a <cachetune> to go along with a <cache> element is a bit confusing. But I do now see where the "math" is for the total bandwidth that can be used - it's the per node value. Something that I didn't see during my first pass. Of course I may have realized that had the example in the commit had more than 1 vCPU and 1 node. Great! So we all agree to make <memtune> a separated session. Beside more than 1 vCPU and 1 node, there is also the other possibility:
OK. I will pack vcpus logic into a helper function in next version. put the host supporting threads (QEMU I/O thread and monitor thread) to the other group, So that memory bandwidth can be distributed among vCPUs and host supporting threads. I will put the previous review link in the cover letter for easy reference in future.
In your example, the current implement will stop creating resctrl group and throw an error. vcpu list cannot overlap between memorytune and cachetune, but they can be same. Since vcpu can be put into one resctrl group only.
My understanding is that we can set using <resourcetune> to describe cachetune and memorytune together as a long term goal. Since that needs to deprecate cachetune's vcpu list. I am not sure about procedure to remove existing XML elements. Maybe you can point out. For the short term goal, we can use this separated <memorytune vcpus='0-1'> section. How do you think? :)
There is no deprecation possible. You will always have to support the cachetune as a direct child of cputune. If you "fix" things up such that you introduce some other means to have a cputune/resourcetune/cachetune working before you introduce memtune as a child of resourcetune, then that works. But you'll have to know whether you read cputune/cachetune and then format out in that manner. See how the <auth> element has been handled as a "child" of the disk element originally, but now possible as a child of the source element of the disk. It's possible, but messy.
hmmm..., just learn the <auth> part. That cross reference between disk/auth and source/auth does make code complex and not so straightforward. So better we keep cputune/cachetune and introduce cputune/memtune.
I think what you want to do is create a virDomainMemtuneDef that mimics virDomainCachetuneDef, but I'm not 100% sure.
Of course that still leaves me wondering how much of virResctrlAllocPtr then becomes wasted. You chose an integration point, but essentially have separate technologies. I'm trying to make sense of it all and am starting to get lost in the process of doing that.
From resctrl group perspective, I don't think extend memory bandwidth information in virResctrlAllocPtr is a waste. I think this is a feature extending. :)
Now that the picture is becoming clearer, I can see your point.
Great~ :)
If @bandwidth can not be > 100... Can we duplicate the @id somehow? So
Something like <cachetune vcpus='0-3'> <cache id='0' level='3' type='both' size='3' unit='MiB'/> <node id='0' bandwidth='20'/> </cachetune>
or
<cachetune vcpus='0-3'> <cache id='0' level='3' type='both' size='3' unit='MiB' bandwidth='20'/> </cachetune> ??? That also works. This is something I did in RFC v1. But there is a comment said it's better not to mix memory bandwidth with cachetune. That may confuse people. :(
Right - as noted. I didn't read that originally. I jumped into a review for this series mainly because no one else had started reviewing it.
Thanks for jumping in. I wait too long a time. :( Your comments are absolutely helpful.
how does that free buffer that got 100% and then gets subtracted from really work in this model? I probably need to study the code some more, but it's not clear it requires using the "same" sort of logic that cachetune uses where there could be different types (instructs, data, or both) that would be drawing from the size, right? I think that's how that worked. So what's the corollary for memtunes? You set a range from 0 to 100, right and that's it. OK, too many questions and it's late so I'll stop here... Hopefully I didn't leave any stray thoughts in the review of patch 1 or 2.
The policy we used for memory bandwidth is consistent with the existing cachetune. Current cache allocation logic model forbids cache allocation overlap. This is to prevent resource contention I think. Same strategy on memory allocation part. Since total bandwidth beyond to 100 will lead to bandwidth competition.
I think I've explained what I missed above... Essentially your example had 1 vCPU and thus 1 node and so it didn't make sense about the 100%. Of course if there were 3 vCPU's and 3 nodes each having some percentage, yeah then it would have been more obvious.
My suggestion, let's try to get through reworking how the cachetunes data is stored. Some sort of common API to "add" a new vCPU's value that can be used by cachetune now and eventually memtune later. The vCPUs being the obvious key rather than cachetune.
The current cachetunes data is stored in a pointer based fashion. The advantage of that is NULL pointer means no cachetuen information for particular level/type cache. And vCPU is stored in virDomainCachetuneDef of domain_conf. And overlapping testing happens in domain_conf part.The reason vCPU not part of virresctrl is properly because domain_conf need this information to formate domain's XML to store. As your suggested, I changed this virDomainCachetuneDef --> virDomainRestuneDef in this patch. Since it will hold information both cachetune and memtune. And yes, we can revisit virresctrl to extract common part of cachetune and memtune.
Fair warning - I have some days out of the office coming up over the next week or so.
Thanks for let me know this. I think we align together basing on above discuss. I will reply your comment in the PATH1 PATH2 and PATH4, and revise. Especially docs/schemas/*.rng, you pointed out. At the same time, we can see whether Pavel will give comment also. ;) Bing
John
Existing cache allocation logic achieves this by first populating virResctrlInfoPtr structure. Libvirt will fill the virResctrlInfoPtr structure during traverse host's cache hierarchy. When allocating cache by virResctrlAllocCreate, libvirt will first create a virResctrlAllocPtr basing on information from virResctrlInfoPtr. This virResctrlAllocPtr represents the maximum resource available(I mean no any allocation previously). Then libvirt traverse all resctrl groups (in /sys/fs/resctrl/)created already, subtract the resource they claimed. So the left resource is the resource free. Libvirt compares this free resource with resource requested in virResctrlAllocCreate. If enough free resource available, then new resctrl group will be created. Memory bandwidth part follows the same idea, only the difference is memory bandwidth use percentage number and cache use bitmap.
Bing
John
[.....]

From: Bing Niu <bing.niu@intel.com> Introduce a new section memorytune to support memory bandwidth allocation. This is consistent with existing cachetune . As the example below: <memorytune vcpus='0'> <node id='0' bandwidth='30'/> </memorytune> id --- on which node memory bandwidth to be set bandwidth --- the memory bandwidth percent to set. Signed-off-by: Bing Niu <bing.niu@intel.com> --- src/conf/domain_conf.c | 260 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 260 insertions(+) diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index b3543f3..dbfd69f 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -19076,6 +19076,189 @@ virDomainCachetuneDefParse(virDomainDefPtr def, } +static int +virDomainMemorytuneDefParseMemory(xmlXPathContextPtr ctxt, + xmlNodePtr node, + virResctrlAllocPtr alloc) +{ + xmlNodePtr oldnode = ctxt->node; + unsigned int id; + unsigned int bandwidth; + char *tmp = NULL; + int ret = -1; + + ctxt->node = node; + + tmp = virXMLPropString(node, "id"); + if (!tmp) { + virReportError(VIR_ERR_XML_ERROR, "%s", + _("Missing memorytune attribute 'id'")); + goto cleanup; + } + if (virStrToLong_uip(tmp, NULL, 10, &id) < 0) { + virReportError(VIR_ERR_XML_ERROR, + _("Invalid memorytune attribute 'id' value '%s'"), + tmp); + goto cleanup; + } + VIR_FREE(tmp); + + tmp = virXMLPropString(node, "bandwidth"); + if (!tmp) { + virReportError(VIR_ERR_XML_ERROR, "%s", + _("Missing memorytune attribute 'bandwidth'")); + goto cleanup; + } + if (virStrToLong_uip(tmp, NULL, 10, &bandwidth) < 0) { + virReportError(VIR_ERR_XML_ERROR, + _("Invalid memorytune attribute 'bandwidth' value '%s'"), + tmp); + goto cleanup; + } + VIR_FREE(tmp); + if (virResctrlSetMemoryBandwidth(alloc, id, bandwidth) < 0) + goto cleanup; + + ret = 0; + cleanup: + ctxt->node = oldnode; + VIR_FREE(tmp); + return ret; +} + + +static int +virDomainMemorytuneDefParse(virDomainDefPtr def, + xmlXPathContextPtr ctxt, + xmlNodePtr node, + unsigned int flags) +{ + xmlNodePtr oldnode = ctxt->node; + xmlNodePtr *nodes = NULL; + virBitmapPtr vcpus = NULL; + virResctrlAllocPtr alloc = NULL; + virDomainRestuneDefPtr tmp_cachetune = NULL; + char *tmp = NULL; + char *vcpus_str = NULL; + char *alloc_id = NULL; + ssize_t i = 0; + int n; + int ret = -1; + bool new_alloc = false; + + ctxt->node = node; + + if (VIR_ALLOC(tmp_cachetune) < 0) + goto cleanup; + + vcpus_str = virXMLPropString(node, "vcpus"); + if (!vcpus_str) { + virReportError(VIR_ERR_XML_ERROR, "%s", + _("Missing memorytune attribute 'vcpus'")); + goto cleanup; + } + if (virBitmapParse(vcpus_str, &vcpus, VIR_DOMAIN_CPUMASK_LEN) < 0) { + virReportError(VIR_ERR_XML_ERROR, + _("Invalid memorytune attribute 'vcpus' value '%s'"), + vcpus_str); + goto cleanup; + } + + /* We need to limit the bitmap to number of vCPUs. If there's nothing left, + * then we can just clean up and return 0 immediately */ + virBitmapShrink(vcpus, def->maxvcpus); + + if (virBitmapIsAllClear(vcpus)) { + ret = 0; + goto cleanup; + } + + if ((n = virXPathNodeSet("./node", ctxt, &nodes)) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("Cannot extract memory nodes under memorytune")); + goto cleanup; + } + for (i = 0; i < def->nrestunes; i++) { + /* vcpus group has been created, directly use the existing one. + * Just updating memory allocation information of that group + * */ + if (virBitmapEqual(def->restunes[i]->vcpus, vcpus)) { + alloc = def->restunes[i]->alloc; + break; + } + + /* vcpus can not overlapping */ + if (virBitmapOverlaps(def->restunes[i]->vcpus, vcpus)) { + virReportError(VIR_ERR_XML_ERROR, "%s", + _("Overlapping vcpus in memorytunes")); + goto cleanup; + } + } + + if (!alloc) { + alloc = virResctrlAllocNew(); + new_alloc = true; + } + + for (i = 0; i < n; i++) { + if (virDomainMemorytuneDefParseMemory(ctxt, nodes[i], alloc) < 0) + goto cleanup; + } + /* + * If this is a new allocation, format ID and append to restune, otherwise + * just update the existing alloc information + * */ + if (new_alloc) { + if (virResctrlAllocIsEmpty(alloc)) { + ret = 0; + goto cleanup; + } + + + /* We need to format it back because we need to be consistent in the naming + * even when users specify some "sub-optimal" string there. */ + VIR_FREE(vcpus_str); + vcpus_str = virBitmapFormat(vcpus); + if (!vcpus_str) + goto cleanup; + + if (!(flags & VIR_DOMAIN_DEF_PARSE_INACTIVE)) + alloc_id = virXMLPropString(node, "id"); + + if (!alloc_id) { + /* The number of allocations is limited and the directory structure is flat, + * not hierarchical, so we need to have all same allocations in one + * directory, so it's nice to have it named appropriately. For now it's + * 'vcpus_...' but it's designed in order for it to be changeable in the + * future (it's part of the status XML). */ + if (virAsprintf(&alloc_id, "vcpus_%s", vcpus_str) < 0) + goto cleanup; + } + + if (virResctrlAllocSetID(alloc, alloc_id) < 0) + goto cleanup; + + VIR_STEAL_PTR(tmp_cachetune->vcpus, vcpus); + VIR_STEAL_PTR(tmp_cachetune->alloc, alloc); + + if (VIR_APPEND_ELEMENT(def->restunes, def->nrestunes, tmp_cachetune) < 0) + goto cleanup; + } + ret = 0; + cleanup: + ctxt->node = oldnode; + virDomainRestuneDefFree(tmp_cachetune); + if (new_alloc) + virObjectUnref(alloc); + virBitmapFree(vcpus); + VIR_FREE(alloc_id); + VIR_FREE(vcpus_str); + VIR_FREE(nodes); + VIR_FREE(tmp); + return ret; +} + + static virDomainDefPtr virDomainDefParseXML(xmlDocPtr xml, xmlNodePtr root, @@ -19671,6 +19854,18 @@ virDomainDefParseXML(xmlDocPtr xml, } VIR_FREE(nodes); + if ((n = virXPathNodeSet("./cputune/memorytune", ctxt, &nodes)) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("cannot extract memorytune nodes")); + goto error; + } + + for (i = 0; i < n; i++) { + if (virDomainMemorytuneDefParse(def, ctxt, nodes[i], flags) < 0) + goto error; + } + VIR_FREE(nodes); + if (virCPUDefParseXML(ctxt, "./cpu[1]", VIR_CPU_TYPE_GUEST, &def->cpu) < 0) goto error; @@ -26922,6 +27117,68 @@ virDomainCachetuneDefFormat(virBufferPtr buf, static int +virDomainMemorytuneDefFormatHelper(unsigned int id, + unsigned int bandwidth, + void *opaque) +{ + virBufferPtr buf = opaque; + + virBufferAsprintf(buf, + "<node id='%u' bandwidth='%u'/>\n", + id, bandwidth); + return 0; +} + + +static int +virDomainMemorytuneDefFormat(virBufferPtr buf, + virDomainRestuneDefPtr restune, + unsigned int flags) +{ + virBuffer childrenBuf = VIR_BUFFER_INITIALIZER; + char *vcpus = NULL; + int ret = -1; + + virBufferSetChildIndent(&childrenBuf, buf); + virResctrlAllocForeachMemory(restune->alloc, + virDomainMemorytuneDefFormatHelper, + &childrenBuf); + + + if (virBufferCheckError(&childrenBuf) < 0) + goto cleanup; + + if (!virBufferUse(&childrenBuf)) { + ret = 0; + goto cleanup; + } + + vcpus = virBitmapFormat(restune->vcpus); + if (!vcpus) + goto cleanup; + + virBufferAsprintf(buf, "<memorytune vcpus='%s'", vcpus); + + if (!(flags & VIR_DOMAIN_DEF_FORMAT_INACTIVE)) { + const char *alloc_id = virResctrlAllocGetID(restune->alloc); + if (!alloc_id) + goto cleanup; + + virBufferAsprintf(buf, " id='%s'", alloc_id); + } + virBufferAddLit(buf, ">\n"); + + virBufferAddBuffer(buf, &childrenBuf); + virBufferAddLit(buf, "</memorytune>\n"); + + ret = 0; + cleanup: + virBufferFreeAndReset(&childrenBuf); + VIR_FREE(vcpus); + return ret; +} + +static int virDomainCputuneDefFormat(virBufferPtr buf, virDomainDefPtr def, unsigned int flags) @@ -27026,6 +27283,9 @@ virDomainCputuneDefFormat(virBufferPtr buf, for (i = 0; i < def->nrestunes; i++) virDomainCachetuneDefFormat(&childrenBuf, def->restunes[i], flags); + for (i = 0; i < def->nrestunes; i++) + virDomainMemorytuneDefFormat(&childrenBuf, def->restunes[i], flags); + if (virBufferCheckError(&childrenBuf) < 0) return -1; -- 2.7.4

On 06/15/2018 05:29 AM, bing.niu@intel.com wrote:
From: Bing Niu <bing.niu@intel.com>
Introduce a new section memorytune to support memory bandwidth allocation. This is consistent with existing cachetune . As the example below:
<memorytune vcpus='0'> <node id='0' bandwidth='30'/> </memorytune>
The formatting above needed to be cleaned up.
id --- on which node memory bandwidth to be set bandwidth --- the memory bandwidth percent to set.
Signed-off-by: Bing Niu <bing.niu@intel.com> --- src/conf/domain_conf.c | 260 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 260 insertions(+)
This would seem to be missing docs/schemas/*.rng changes, docs/formatdomain.html.in changes, tests/*xml2xmltest.c changes and the associated data input/output files, etc. I only scanned what's below - I have too many questions and concerns from the previous patches to get too in depth especially since this is missing so much stuff. I'm not going to look at patch 5. John
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index b3543f3..dbfd69f 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -19076,6 +19076,189 @@ virDomainCachetuneDefParse(virDomainDefPtr def, }
+static int +virDomainMemorytuneDefParseMemory(xmlXPathContextPtr ctxt, + xmlNodePtr node, + virResctrlAllocPtr alloc) +{ + xmlNodePtr oldnode = ctxt->node; + unsigned int id; + unsigned int bandwidth; + char *tmp = NULL; + int ret = -1; + + ctxt->node = node; + + tmp = virXMLPropString(node, "id"); + if (!tmp) { + virReportError(VIR_ERR_XML_ERROR, "%s", + _("Missing memorytune attribute 'id'")); + goto cleanup; + } + if (virStrToLong_uip(tmp, NULL, 10, &id) < 0) { + virReportError(VIR_ERR_XML_ERROR, + _("Invalid memorytune attribute 'id' value '%s'"), + tmp); + goto cleanup; + } + VIR_FREE(tmp); + + tmp = virXMLPropString(node, "bandwidth"); + if (!tmp) { + virReportError(VIR_ERR_XML_ERROR, "%s", + _("Missing memorytune attribute 'bandwidth'")); + goto cleanup; + } + if (virStrToLong_uip(tmp, NULL, 10, &bandwidth) < 0) { + virReportError(VIR_ERR_XML_ERROR, + _("Invalid memorytune attribute 'bandwidth' value '%s'"), + tmp); + goto cleanup; + } + VIR_FREE(tmp); + if (virResctrlSetMemoryBandwidth(alloc, id, bandwidth) < 0) + goto cleanup; + + ret = 0; + cleanup: + ctxt->node = oldnode; + VIR_FREE(tmp); + return ret; +} + + +static int +virDomainMemorytuneDefParse(virDomainDefPtr def, + xmlXPathContextPtr ctxt, + xmlNodePtr node, + unsigned int flags) +{ + xmlNodePtr oldnode = ctxt->node; + xmlNodePtr *nodes = NULL; + virBitmapPtr vcpus = NULL; + virResctrlAllocPtr alloc = NULL; + virDomainRestuneDefPtr tmp_cachetune = NULL; + char *tmp = NULL; + char *vcpus_str = NULL; + char *alloc_id = NULL; + ssize_t i = 0; + int n; + int ret = -1; + bool new_alloc = false; + + ctxt->node = node; + + if (VIR_ALLOC(tmp_cachetune) < 0) + goto cleanup; + + vcpus_str = virXMLPropString(node, "vcpus"); + if (!vcpus_str) { + virReportError(VIR_ERR_XML_ERROR, "%s", + _("Missing memorytune attribute 'vcpus'")); + goto cleanup; + } + if (virBitmapParse(vcpus_str, &vcpus, VIR_DOMAIN_CPUMASK_LEN) < 0) { + virReportError(VIR_ERR_XML_ERROR, + _("Invalid memorytune attribute 'vcpus' value '%s'"), + vcpus_str); + goto cleanup; + } + + /* We need to limit the bitmap to number of vCPUs. If there's nothing left, + * then we can just clean up and return 0 immediately */ + virBitmapShrink(vcpus, def->maxvcpus); + + if (virBitmapIsAllClear(vcpus)) { + ret = 0; + goto cleanup; + } + + if ((n = virXPathNodeSet("./node", ctxt, &nodes)) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("Cannot extract memory nodes under memorytune")); + goto cleanup; + } + for (i = 0; i < def->nrestunes; i++) { + /* vcpus group has been created, directly use the existing one. + * Just updating memory allocation information of that group + * */ + if (virBitmapEqual(def->restunes[i]->vcpus, vcpus)) { + alloc = def->restunes[i]->alloc; + break; + } + + /* vcpus can not overlapping */ + if (virBitmapOverlaps(def->restunes[i]->vcpus, vcpus)) { + virReportError(VIR_ERR_XML_ERROR, "%s", + _("Overlapping vcpus in memorytunes")); + goto cleanup; + } + } + + if (!alloc) { + alloc = virResctrlAllocNew(); + new_alloc = true; + } + + for (i = 0; i < n; i++) { + if (virDomainMemorytuneDefParseMemory(ctxt, nodes[i], alloc) < 0) + goto cleanup; + } + /* + * If this is a new allocation, format ID and append to restune, otherwise + * just update the existing alloc information + * */ + if (new_alloc) { + if (virResctrlAllocIsEmpty(alloc)) { + ret = 0; + goto cleanup; + } + + + /* We need to format it back because we need to be consistent in the naming + * even when users specify some "sub-optimal" string there. */ + VIR_FREE(vcpus_str); + vcpus_str = virBitmapFormat(vcpus); + if (!vcpus_str) + goto cleanup; + + if (!(flags & VIR_DOMAIN_DEF_PARSE_INACTIVE)) + alloc_id = virXMLPropString(node, "id"); + + if (!alloc_id) { + /* The number of allocations is limited and the directory structure is flat, + * not hierarchical, so we need to have all same allocations in one + * directory, so it's nice to have it named appropriately. For now it's + * 'vcpus_...' but it's designed in order for it to be changeable in the + * future (it's part of the status XML). */ + if (virAsprintf(&alloc_id, "vcpus_%s", vcpus_str) < 0) + goto cleanup; + } + + if (virResctrlAllocSetID(alloc, alloc_id) < 0) + goto cleanup; + + VIR_STEAL_PTR(tmp_cachetune->vcpus, vcpus); + VIR_STEAL_PTR(tmp_cachetune->alloc, alloc); + + if (VIR_APPEND_ELEMENT(def->restunes, def->nrestunes, tmp_cachetune) < 0) + goto cleanup; + } + ret = 0; + cleanup: + ctxt->node = oldnode; + virDomainRestuneDefFree(tmp_cachetune); + if (new_alloc) + virObjectUnref(alloc); + virBitmapFree(vcpus); + VIR_FREE(alloc_id); + VIR_FREE(vcpus_str); + VIR_FREE(nodes); + VIR_FREE(tmp); + return ret; +} + + static virDomainDefPtr virDomainDefParseXML(xmlDocPtr xml, xmlNodePtr root, @@ -19671,6 +19854,18 @@ virDomainDefParseXML(xmlDocPtr xml, } VIR_FREE(nodes);
+ if ((n = virXPathNodeSet("./cputune/memorytune", ctxt, &nodes)) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("cannot extract memorytune nodes")); + goto error; + } + + for (i = 0; i < n; i++) { + if (virDomainMemorytuneDefParse(def, ctxt, nodes[i], flags) < 0) + goto error; + } + VIR_FREE(nodes); + if (virCPUDefParseXML(ctxt, "./cpu[1]", VIR_CPU_TYPE_GUEST, &def->cpu) < 0) goto error;
@@ -26922,6 +27117,68 @@ virDomainCachetuneDefFormat(virBufferPtr buf,
static int +virDomainMemorytuneDefFormatHelper(unsigned int id, + unsigned int bandwidth, + void *opaque) +{ + virBufferPtr buf = opaque; + + virBufferAsprintf(buf, + "<node id='%u' bandwidth='%u'/>\n", + id, bandwidth); + return 0; +} + + +static int +virDomainMemorytuneDefFormat(virBufferPtr buf, + virDomainRestuneDefPtr restune, + unsigned int flags) +{ + virBuffer childrenBuf = VIR_BUFFER_INITIALIZER; + char *vcpus = NULL; + int ret = -1; + + virBufferSetChildIndent(&childrenBuf, buf); + virResctrlAllocForeachMemory(restune->alloc, + virDomainMemorytuneDefFormatHelper, + &childrenBuf); + + + if (virBufferCheckError(&childrenBuf) < 0) + goto cleanup; + + if (!virBufferUse(&childrenBuf)) { + ret = 0; + goto cleanup; + } + + vcpus = virBitmapFormat(restune->vcpus); + if (!vcpus) + goto cleanup; + + virBufferAsprintf(buf, "<memorytune vcpus='%s'", vcpus); + + if (!(flags & VIR_DOMAIN_DEF_FORMAT_INACTIVE)) { + const char *alloc_id = virResctrlAllocGetID(restune->alloc); + if (!alloc_id) + goto cleanup; + + virBufferAsprintf(buf, " id='%s'", alloc_id); + } + virBufferAddLit(buf, ">\n"); + + virBufferAddBuffer(buf, &childrenBuf); + virBufferAddLit(buf, "</memorytune>\n"); + + ret = 0; + cleanup: + virBufferFreeAndReset(&childrenBuf); + VIR_FREE(vcpus); + return ret; +} + +static int virDomainCputuneDefFormat(virBufferPtr buf, virDomainDefPtr def, unsigned int flags) @@ -27026,6 +27283,9 @@ virDomainCputuneDefFormat(virBufferPtr buf, for (i = 0; i < def->nrestunes; i++) virDomainCachetuneDefFormat(&childrenBuf, def->restunes[i], flags);
+ for (i = 0; i < def->nrestunes; i++) + virDomainMemorytuneDefFormat(&childrenBuf, def->restunes[i], flags); + if (virBufferCheckError(&childrenBuf) < 0) return -1;

On 2018年06月30日 06:47, John Ferlan wrote:
On 06/15/2018 05:29 AM, bing.niu@intel.com wrote:
From: Bing Niu <bing.niu@intel.com>
Introduce a new section memorytune to support memory bandwidth allocation. This is consistent with existing cachetune . As the example below:
<memorytune vcpus='0'> <node id='0' bandwidth='30'/> </memorytune>
The formatting above needed to be cleaned up.
OK!
id --- on which node memory bandwidth to be set bandwidth --- the memory bandwidth percent to set.
Signed-off-by: Bing Niu <bing.niu@intel.com> --- src/conf/domain_conf.c | 260 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 260 insertions(+)
This would seem to be missing docs/schemas/*.rng changes, docs/formatdomain.html.in changes, tests/*xml2xmltest.c changes and the associated data input/output files, etc.
I will add docs/schemas/*.rng for patch4 and patch5. Bing
I only scanned what's below - I have too many questions and concerns from the previous patches to get too in depth especially since this is missing so much stuff. I'm not going to look at patch 5.
John
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index b3543f3..dbfd69f 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -19076,6 +19076,189 @@ virDomainCachetuneDefParse(virDomainDefPtr def, }
+static int +virDomainMemorytuneDefParseMemory(xmlXPathContextPtr ctxt, + xmlNodePtr node, + virResctrlAllocPtr alloc) +{ + xmlNodePtr oldnode = ctxt->node; + unsigned int id; + unsigned int bandwidth; + char *tmp = NULL; + int ret = -1; + + ctxt->node = node; + + tmp = virXMLPropString(node, "id"); + if (!tmp) { + virReportError(VIR_ERR_XML_ERROR, "%s", + _("Missing memorytune attribute 'id'")); + goto cleanup; + } + if (virStrToLong_uip(tmp, NULL, 10, &id) < 0) { + virReportError(VIR_ERR_XML_ERROR, + _("Invalid memorytune attribute 'id' value '%s'"), + tmp); + goto cleanup; + } + VIR_FREE(tmp); + + tmp = virXMLPropString(node, "bandwidth"); + if (!tmp) { + virReportError(VIR_ERR_XML_ERROR, "%s", + _("Missing memorytune attribute 'bandwidth'")); + goto cleanup; + } + if (virStrToLong_uip(tmp, NULL, 10, &bandwidth) < 0) { + virReportError(VIR_ERR_XML_ERROR, + _("Invalid memorytune attribute 'bandwidth' value '%s'"), + tmp); + goto cleanup; + } + VIR_FREE(tmp); + if (virResctrlSetMemoryBandwidth(alloc, id, bandwidth) < 0) + goto cleanup; + + ret = 0; + cleanup: + ctxt->node = oldnode; + VIR_FREE(tmp); + return ret; +} + + +static int +virDomainMemorytuneDefParse(virDomainDefPtr def, + xmlXPathContextPtr ctxt, + xmlNodePtr node, + unsigned int flags) +{ + xmlNodePtr oldnode = ctxt->node; + xmlNodePtr *nodes = NULL; + virBitmapPtr vcpus = NULL; + virResctrlAllocPtr alloc = NULL; + virDomainRestuneDefPtr tmp_cachetune = NULL; + char *tmp = NULL; + char *vcpus_str = NULL; + char *alloc_id = NULL; + ssize_t i = 0; + int n; + int ret = -1; + bool new_alloc = false; + + ctxt->node = node; + + if (VIR_ALLOC(tmp_cachetune) < 0) + goto cleanup; + + vcpus_str = virXMLPropString(node, "vcpus"); + if (!vcpus_str) { + virReportError(VIR_ERR_XML_ERROR, "%s", + _("Missing memorytune attribute 'vcpus'")); + goto cleanup; + } + if (virBitmapParse(vcpus_str, &vcpus, VIR_DOMAIN_CPUMASK_LEN) < 0) { + virReportError(VIR_ERR_XML_ERROR, + _("Invalid memorytune attribute 'vcpus' value '%s'"), + vcpus_str); + goto cleanup; + } + + /* We need to limit the bitmap to number of vCPUs. If there's nothing left, + * then we can just clean up and return 0 immediately */ + virBitmapShrink(vcpus, def->maxvcpus); + + if (virBitmapIsAllClear(vcpus)) { + ret = 0; + goto cleanup; + } + + if ((n = virXPathNodeSet("./node", ctxt, &nodes)) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("Cannot extract memory nodes under memorytune")); + goto cleanup; + } + for (i = 0; i < def->nrestunes; i++) { + /* vcpus group has been created, directly use the existing one. + * Just updating memory allocation information of that group + * */ + if (virBitmapEqual(def->restunes[i]->vcpus, vcpus)) { + alloc = def->restunes[i]->alloc; + break; + } + + /* vcpus can not overlapping */ + if (virBitmapOverlaps(def->restunes[i]->vcpus, vcpus)) { + virReportError(VIR_ERR_XML_ERROR, "%s", + _("Overlapping vcpus in memorytunes")); + goto cleanup; + } + } + + if (!alloc) { + alloc = virResctrlAllocNew(); + new_alloc = true; + } + + for (i = 0; i < n; i++) { + if (virDomainMemorytuneDefParseMemory(ctxt, nodes[i], alloc) < 0) + goto cleanup; + } + /* + * If this is a new allocation, format ID and append to restune, otherwise + * just update the existing alloc information + * */ + if (new_alloc) { + if (virResctrlAllocIsEmpty(alloc)) { + ret = 0; + goto cleanup; + } + + + /* We need to format it back because we need to be consistent in the naming + * even when users specify some "sub-optimal" string there. */ + VIR_FREE(vcpus_str); + vcpus_str = virBitmapFormat(vcpus); + if (!vcpus_str) + goto cleanup; + + if (!(flags & VIR_DOMAIN_DEF_PARSE_INACTIVE)) + alloc_id = virXMLPropString(node, "id"); + + if (!alloc_id) { + /* The number of allocations is limited and the directory structure is flat, + * not hierarchical, so we need to have all same allocations in one + * directory, so it's nice to have it named appropriately. For now it's + * 'vcpus_...' but it's designed in order for it to be changeable in the + * future (it's part of the status XML). */ + if (virAsprintf(&alloc_id, "vcpus_%s", vcpus_str) < 0) + goto cleanup; + } + + if (virResctrlAllocSetID(alloc, alloc_id) < 0) + goto cleanup; + + VIR_STEAL_PTR(tmp_cachetune->vcpus, vcpus); + VIR_STEAL_PTR(tmp_cachetune->alloc, alloc); + + if (VIR_APPEND_ELEMENT(def->restunes, def->nrestunes, tmp_cachetune) < 0) + goto cleanup; + } + ret = 0; + cleanup: + ctxt->node = oldnode; + virDomainRestuneDefFree(tmp_cachetune); + if (new_alloc) + virObjectUnref(alloc); + virBitmapFree(vcpus); + VIR_FREE(alloc_id); + VIR_FREE(vcpus_str); + VIR_FREE(nodes); + VIR_FREE(tmp); + return ret; +} + + static virDomainDefPtr virDomainDefParseXML(xmlDocPtr xml, xmlNodePtr root, @@ -19671,6 +19854,18 @@ virDomainDefParseXML(xmlDocPtr xml, } VIR_FREE(nodes);
+ if ((n = virXPathNodeSet("./cputune/memorytune", ctxt, &nodes)) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("cannot extract memorytune nodes")); + goto error; + } + + for (i = 0; i < n; i++) { + if (virDomainMemorytuneDefParse(def, ctxt, nodes[i], flags) < 0) + goto error; + } + VIR_FREE(nodes); + if (virCPUDefParseXML(ctxt, "./cpu[1]", VIR_CPU_TYPE_GUEST, &def->cpu) < 0) goto error;
@@ -26922,6 +27117,68 @@ virDomainCachetuneDefFormat(virBufferPtr buf,
static int +virDomainMemorytuneDefFormatHelper(unsigned int id, + unsigned int bandwidth, + void *opaque) +{ + virBufferPtr buf = opaque; + + virBufferAsprintf(buf, + "<node id='%u' bandwidth='%u'/>\n", + id, bandwidth); + return 0; +} + + +static int +virDomainMemorytuneDefFormat(virBufferPtr buf, + virDomainRestuneDefPtr restune, + unsigned int flags) +{ + virBuffer childrenBuf = VIR_BUFFER_INITIALIZER; + char *vcpus = NULL; + int ret = -1; + + virBufferSetChildIndent(&childrenBuf, buf); + virResctrlAllocForeachMemory(restune->alloc, + virDomainMemorytuneDefFormatHelper, + &childrenBuf); + + + if (virBufferCheckError(&childrenBuf) < 0) + goto cleanup; + + if (!virBufferUse(&childrenBuf)) { + ret = 0; + goto cleanup; + } + + vcpus = virBitmapFormat(restune->vcpus); + if (!vcpus) + goto cleanup; + + virBufferAsprintf(buf, "<memorytune vcpus='%s'", vcpus); + + if (!(flags & VIR_DOMAIN_DEF_FORMAT_INACTIVE)) { + const char *alloc_id = virResctrlAllocGetID(restune->alloc); + if (!alloc_id) + goto cleanup; + + virBufferAsprintf(buf, " id='%s'", alloc_id); + } + virBufferAddLit(buf, ">\n"); + + virBufferAddBuffer(buf, &childrenBuf); + virBufferAddLit(buf, "</memorytune>\n"); + + ret = 0; + cleanup: + virBufferFreeAndReset(&childrenBuf); + VIR_FREE(vcpus); + return ret; +} + +static int virDomainCputuneDefFormat(virBufferPtr buf, virDomainDefPtr def, unsigned int flags) @@ -27026,6 +27283,9 @@ virDomainCputuneDefFormat(virBufferPtr buf, for (i = 0; i < def->nrestunes; i++) virDomainCachetuneDefFormat(&childrenBuf, def->restunes[i], flags);
+ for (i = 0; i < def->nrestunes; i++) + virDomainMemorytuneDefFormat(&childrenBuf, def->restunes[i], flags); + if (virBufferCheckError(&childrenBuf) < 0) return -1;

From: Bing Niu <bing.niu@intel.com> Add new XML section to report host's memory bandwidth allocation capability. The format as below example: <host> ..... <memory> <node id='0' cpus='0-19'> <control granularity='10' min ='10' maxAllocs='8'/> </node> </memory> </host> granularity ---- granularity of memory bandwidth, unit percentage. min ---- minimum memory bandwidth allowed, unit percentage. maxAllocs ---- maximum memory bandwidth allocation group supported. Signed-off-by: Bing Niu <bing.niu@intel.com> --- src/conf/capabilities.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++ src/conf/capabilities.h | 11 +++++ src/util/virresctrl.c | 20 +++++++++ src/util/virresctrl.h | 15 +++++++ 4 files changed, 154 insertions(+) diff --git a/src/conf/capabilities.c b/src/conf/capabilities.c index 7a810ef..3bba153 100644 --- a/src/conf/capabilities.c +++ b/src/conf/capabilities.c @@ -198,6 +198,16 @@ virCapabilitiesFreeNUMAInfo(virCapsPtr caps) } static void +virCapsHostMemoryNodeFree(virCapsHostMemoryNodePtr ptr) +{ + if (!ptr) + return; + + virBitmapFree(ptr->cpus); + VIR_FREE(ptr); +} + +static void virCapabilitiesClearSecModel(virCapsHostSecModelPtr secmodel) { size_t i; @@ -239,6 +249,11 @@ virCapsDispose(void *object) virCapsHostCacheBankFree(caps->host.caches[i]); VIR_FREE(caps->host.caches); + for (i = 0; i < caps->host.nnodes; i++) + virCapsHostMemoryNodeFree(caps->host.nodes[i]); + VIR_FREE(caps->host.nodes); + + VIR_FREE(caps->host.netprefix); VIR_FREE(caps->host.pagesSize); virCPUDefFree(caps->host.cpu); @@ -957,6 +972,58 @@ virCapabilitiesFormatCaches(virBufferPtr buf, return 0; } +static int +virCapabilitiesFormatMemory(virBufferPtr buf, + size_t nnodes, + virCapsHostMemoryNodePtr *nodes) +{ + size_t i = 0; + virBuffer controlBuf = VIR_BUFFER_INITIALIZER; + + if (!nnodes) + return 0; + + virBufferAddLit(buf, "<memory>\n"); + virBufferAdjustIndent(buf, 2); + + for (i = 0; i < nnodes; i++) { + virCapsHostMemoryNodePtr node = nodes[i]; + virResctrlInfoPerNodePtr control = &node->control; + char *cpus_str = virBitmapFormat(node->cpus); + + if (!cpus_str) + return -1; + + virBufferAsprintf(buf, + "<node id='%u' cpus='%s'", + node->id, cpus_str); + VIR_FREE(cpus_str); + + virBufferSetChildIndent(&controlBuf, buf); + virBufferAsprintf(&controlBuf, + "<control granularity='%u' min ='%u'" + " maxAllocs='%u'/>\n", + control->granularity, control->min, + control->max_allocation); + + if (virBufferCheckError(&controlBuf) < 0) + return -1; + + if (virBufferUse(&controlBuf)) { + virBufferAddLit(buf, ">\n"); + virBufferAddBuffer(buf, &controlBuf); + virBufferAddLit(buf, "</node>\n"); + } else { + virBufferAddLit(buf, "/>\n"); + } + } + + virBufferAdjustIndent(buf, -2); + virBufferAddLit(buf, "</memory>\n"); + + return 0; +} + /** * virCapabilitiesFormatXML: * @caps: capabilities to format @@ -1060,6 +1127,10 @@ virCapabilitiesFormatXML(virCapsPtr caps) caps->host.caches) < 0) goto error; + if (virCapabilitiesFormatMemory(&buf, caps->host.nnodes, + caps->host.nodes) < 0) + goto error; + for (i = 0; i < caps->host.nsecModels; i++) { virBufferAddLit(&buf, "<secmodel>\n"); virBufferAdjustIndent(&buf, 2); @@ -1602,6 +1673,40 @@ virCapabilitiesInitResctrl(virCapsPtr caps) } +static int +virCapabilitiesInitResctrlMemory(virCapsPtr caps) +{ + virCapsHostMemoryNodePtr node = NULL; + size_t i = 0; + int ret = -1; + + for (i = 0; i < caps->host.ncaches; i++) { + virCapsHostCacheBankPtr bank = caps->host.caches[i]; + if (VIR_ALLOC(node) < 0) + goto cleanup; + + if (virResctrlInfoGetMemory(caps->host.resctrl, bank->level, &node->control) > 0) { + node->id = bank->id; + if (!(node->cpus = virBitmapNewCopy(bank->cpus))) + goto cleanup; + + if (VIR_APPEND_ELEMENT(caps->host.nodes, + caps->host.nnodes, + node) < 0) { + goto cleanup; + } + } + virCapsHostMemoryNodeFree(node); + node = NULL; + } + + ret = 0; + cleanup: + virCapsHostMemoryNodeFree(node); + return ret; +} + + int virCapabilitiesInitCaches(virCapsPtr caps) { @@ -1731,6 +1836,9 @@ virCapabilitiesInitCaches(virCapsPtr caps) qsort(caps->host.caches, caps->host.ncaches, sizeof(*caps->host.caches), virCapsHostCacheBankSorter); + if (virCapabilitiesInitResctrlMemory(caps) < 0) + goto cleanup; + ret = 0; cleanup: VIR_FREE(type); diff --git a/src/conf/capabilities.h b/src/conf/capabilities.h index fe1b9ea..73e165e 100644 --- a/src/conf/capabilities.h +++ b/src/conf/capabilities.h @@ -151,6 +151,14 @@ struct _virCapsHostCacheBank { virResctrlInfoPerCachePtr *controls; }; +typedef struct _virCapsHostMemoryNode virCapsHostMemoryNode; +typedef virCapsHostMemoryNode *virCapsHostMemoryNodePtr; +struct _virCapsHostMemoryNode { + unsigned int id; + virBitmapPtr cpus; /* All CPUs that belong to this node*/ + virResctrlInfoPerNode control; +}; + typedef struct _virCapsHost virCapsHost; typedef virCapsHost *virCapsHostPtr; struct _virCapsHost { @@ -175,6 +183,9 @@ struct _virCapsHost { size_t ncaches; virCapsHostCacheBankPtr *caches; + size_t nnodes; + virCapsHostMemoryNodePtr *nodes; + size_t nsecModels; virCapsHostSecModelPtr secModels; diff --git a/src/util/virresctrl.c b/src/util/virresctrl.c index de736b0..17f0a58 100644 --- a/src/util/virresctrl.c +++ b/src/util/virresctrl.c @@ -611,6 +611,26 @@ virResctrlInfoIsEmpty(virResctrlInfoPtr resctrl) int +virResctrlInfoGetMemory(virResctrlInfoPtr resctrl, + unsigned int level, + virResctrlInfoPerNodePtr control) +{ + virResctrlInfoMBPtr mb_info = resctrl->mb_info; + + if (!mb_info) + return 0; + + if (mb_info->last_level_cache != level) + return 0; + + control->granularity = mb_info->bandwidth_granularity; + control->min = mb_info->min_bandwidth; + control->max_allocation = mb_info->max_allocation; + return 1; +} + + +int virResctrlInfoGetCache(virResctrlInfoPtr resctrl, unsigned int level, unsigned long long size, diff --git a/src/util/virresctrl.h b/src/util/virresctrl.h index 5e78334..01fcade 100644 --- a/src/util/virresctrl.h +++ b/src/util/virresctrl.h @@ -50,6 +50,17 @@ struct _virResctrlInfoPerCache { unsigned int max_allocation; }; +typedef struct _virResctrlInfoPerNode virResctrlInfoPerNode; +typedef virResctrlInfoPerNode *virResctrlInfoPerNodePtr; +struct _virResctrlInfoPerNode { + /* Smallest possible increase of the allocation bandwidth in percentage */ + unsigned int granularity; + /* Minimal allocatable bandwidth in percentage */ + unsigned int min; + /* Maximum number of simultaneous allocations */ + unsigned int max_allocation; +}; + typedef struct _virResctrlInfo virResctrlInfo; typedef virResctrlInfo *virResctrlInfoPtr; @@ -63,6 +74,10 @@ virResctrlInfoGetCache(virResctrlInfoPtr resctrl, size_t *ncontrols, virResctrlInfoPerCachePtr **controls); +int +virResctrlInfoGetMemory(virResctrlInfoPtr resctrl, + unsigned int level, + virResctrlInfoPerNodePtr control); /* Alloc-related things */ typedef struct _virResctrlAlloc virResctrlAlloc; typedef virResctrlAlloc *virResctrlAllocPtr; -- 2.7.4
participants (3)
-
bing.niu
-
bing.niu@intel.com
-
John Ferlan