[libvirt] [resend v2 0/7] Support cache tune in libvirt

This series patches are for supportting CAT featues, which also called cache tune in libvirt. First to expose cache information which could be tuned in capabilites XML. Then add new domain xml element support to add cacahe bank which will apply on this libvirt domain. This series patches add a util file `resctrl.c/h`, an interface to talk with linux kernel's sys fs. There are still some TODOs such as expose new public interface to get free cache information. Some discussion about this feature support can be found from: https://www.redhat.com/archives/libvir-list/2017-January/msg00644.html Eli Qiao (7): Resctrl: Add some utils functions Resctrl: expose cache information to capabilities Resctrl: Add new xml element to support cache tune Resctrl: Add private interface to set cachebanks Qemu: Set cache banks Resctrl: enable l3code/l3data Resctrl: Make sure l3data/l3code are pairs docs/schemas/domaincommon.rng | 41 ++ include/libvirt/virterror.h | 1 + src/Makefile.am | 1 + src/conf/capabilities.c | 56 +++ src/conf/capabilities.h | 23 + src/conf/domain_conf.c | 154 +++++++ src/conf/domain_conf.h | 18 + src/libvirt_private.syms | 10 + src/qemu/qemu_capabilities.c | 68 +++ src/qemu/qemu_driver.c | 4 + src/qemu/qemu_process.c | 11 + src/util/virerror.c | 1 + src/util/virresctrl.c | 1019 +++++++++++++++++++++++++++++++++++++++++ src/util/virresctrl.h | 135 ++++++ 14 files changed, 1542 insertions(+) create mode 100644 src/util/virresctrl.c create mode 100644 src/util/virresctrl.h -- 1.9.1

This patch adds some utils struct and functions to expose resctrl information. virResCtrlAvailable: If resctrl interface exist on host virResCtrlGet: get specify type resource contral information virResCtrlInit: initialize resctrl struct from the host's sys fs. ResCtrlAll[]: an array to maintain resource control information. Signed-off-by: Eli Qiao <liyong.qiao@intel.com> --- include/libvirt/virterror.h | 1 + src/Makefile.am | 1 + src/libvirt_private.syms | 4 + src/util/virerror.c | 1 + src/util/virresctrl.c | 330 ++++++++++++++++++++++++++++++++++++++++++++ src/util/virresctrl.h | 89 ++++++++++++ 6 files changed, 426 insertions(+) create mode 100644 src/util/virresctrl.c create mode 100644 src/util/virresctrl.h diff --git a/include/libvirt/virterror.h b/include/libvirt/virterror.h index 2efee8f..3dd2d08 100644 --- a/include/libvirt/virterror.h +++ b/include/libvirt/virterror.h @@ -132,6 +132,7 @@ typedef enum { VIR_FROM_PERF = 65, /* Error from perf */ VIR_FROM_LIBSSH = 66, /* Error from libssh connection transport */ + VIR_FROM_RESCTRL = 67, /* Error from resource control */ # ifdef VIR_ENUM_SENTINELS VIR_ERR_DOMAIN_LAST diff --git a/src/Makefile.am b/src/Makefile.am index 2f32d41..b626f29 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -162,6 +162,7 @@ UTIL_SOURCES = \ util/virprocess.c util/virprocess.h \ util/virqemu.c util/virqemu.h \ util/virrandom.h util/virrandom.c \ + util/virresctrl.h util/virresctrl.c \ util/virrotatingfile.h util/virrotatingfile.c \ util/virscsi.c util/virscsi.h \ util/virscsivhost.c util/virscsivhost.h \ diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 8e994c7..743e5ac 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -2313,6 +2313,10 @@ virRandomGenerateWWN; virRandomInt; +# util/virresctrl.h +virResCtrlAvailable; +virResCtrlInit; + # util/virrotatingfile.h virRotatingFileReaderConsume; virRotatingFileReaderFree; diff --git a/src/util/virerror.c b/src/util/virerror.c index ef17fb5..93dfd4f 100644 --- a/src/util/virerror.c +++ b/src/util/virerror.c @@ -139,6 +139,7 @@ VIR_ENUM_IMPL(virErrorDomain, VIR_ERR_DOMAIN_LAST, "Perf", /* 65 */ "Libssh transport layer", + "Rescouce Control", ) diff --git a/src/util/virresctrl.c b/src/util/virresctrl.c new file mode 100644 index 0000000..63bc808 --- /dev/null +++ b/src/util/virresctrl.c @@ -0,0 +1,330 @@ +/* + * virresctrl.c: methods for managing resource contral + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * <http://www.gnu.org/licenses/>. + * + * Authors: + * Eli Qiao <liyong.qiao@intel.com> + */ +#include <config.h> + +#include <sys/ioctl.h> +#if defined HAVE_SYS_SYSCALL_H +# include <sys/syscall.h> +#endif +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> + +#include "virresctrl.h" +#include "viralloc.h" +#include "virerror.h" +#include "virfile.h" +#include "virhostcpu.h" +#include "virlog.h" +#include "virstring.h" +#include "nodeinfo.h" + +VIR_LOG_INIT("util.resctrl"); + +#define VIR_FROM_THIS VIR_FROM_RESCTRL + +static unsigned int host_id = 0; + +static virResCtrl ResCtrlAll[] = { + { + .name = "L3", + .cache_level = "l3", + }, + { + .name = "L3DATA", + .cache_level = "l3", + }, + { + .name = "L3CODE", + .cache_level = "l3", + }, + { + .name = "L2", + .cache_level = "l2", + }, +}; + +static int virResCtrlGetInfoStr(const int type, const char *item, char **str) +{ + int ret = 0; + char *tmp; + char *path; + + if (asprintf(&path, "%s/%s/%s", RESCTRL_INFO_DIR, ResCtrlAll[type].name, item) < 0) + return -1; + if (virFileReadAll(path, 10, str) < 0) { + ret = -1; + goto cleanup; + } + + if ((tmp = strchr(*str, '\n'))) { + *tmp = '\0'; + } + +cleanup: + VIR_FREE(path); + return ret; +} + + +static int virResCtrlGetCPUValue(const char* path, char** value) +{ + int ret = -1; + char* tmp; + + if(virFileReadAll(path, 10, value) < 0) { + goto cleanup; + } + if ((tmp = strchr(*value, '\n'))) { + *tmp = '\0'; + } + ret = 0; +cleanup: + return ret; +} + +static int virResctrlGetCPUSocketID(const size_t cpu, int* socket_id) +{ + int ret = -1; + char* physical_package_path = NULL; + char* physical_package = NULL; + if (virAsprintf(&physical_package_path, + "%s/cpu/cpu%zu/topology/physical_package_id", + SYSFS_SYSTEM_PATH, cpu) < 0) { + return -1; + } + + if(virResCtrlGetCPUValue(physical_package_path, + &physical_package) < 0) + goto cleanup; + + if (virStrToLong_i(physical_package, NULL, 0, socket_id) < 0) + goto cleanup; + + ret = 0; +cleanup: + VIR_FREE(physical_package); + VIR_FREE(physical_package_path); + return ret; +} + +static int virResCtrlGetCPUCache(const size_t cpu, int type, int *cache) +{ + int ret = -1; + char* cache_dir = NULL; + char* cache_str = NULL; + char* tmp; + int carry = -1; + + if (virAsprintf(&cache_dir, + "%s/cpu/cpu%zu/cache/index%d/size", + SYSFS_SYSTEM_PATH, cpu, type) < 0) + return -1; + + if(virResCtrlGetCPUValue(cache_dir, &cache_str) < 0) + goto cleanup; + + tmp = cache_str; + + while (*tmp != '\0') + tmp++; + if (*(tmp - 1) == 'K') { + *(tmp - 1) = '\0'; + carry = 1; + } + else if (*(tmp - 1) == 'M') { + *(tmp - 1) = '\0'; + carry = 1024; + } + + if (virStrToLong_i(cache_str, NULL, 0, cache) < 0) + goto cleanup; + + *cache = (*cache) * carry; + + if (*cache < 0) + goto cleanup; + + ret = 0; +cleanup: + VIR_FREE(cache_dir); + VIR_FREE(cache_str); + return ret; +} + +/* Fill all cache bank informations */ +static virResCacheBankPtr virResCtrlGetCacheBanks(int type, int* n_sockets) +{ + int npresent_cpus; + int index = -1; + virResCacheBankPtr bank; + + *n_sockets = 1; + if ((npresent_cpus = virHostCPUGetCount()) < 0) + return NULL; + + if (type == RDT_RESOURCE_L3 + || type == RDT_RESOURCE_L3DATA + || type == RDT_RESOURCE_L3CODE) + index = 3; + else if (type == RDT_RESOURCE_L2) { + index = 2; + } + + if (index == -1) + return NULL; + + if(VIR_ALLOC_N(bank, *n_sockets) < 0) + { + *n_sockets = 0; + return NULL; + } + + for( size_t i = 0; i < npresent_cpus ; i ++) { + int s_id; + int cache_size; + + if (virResctrlGetCPUSocketID(i, &s_id) < 0) { + goto error; + } + + if(s_id > (*n_sockets - 1)) { + size_t cur = *n_sockets; + size_t exp = s_id - (*n_sockets) + 1; + if(VIR_EXPAND_N(bank, cur, exp) < 0) { + goto error; + } + } + *n_sockets = s_id + 1; + if (bank[s_id].cpu_mask == NULL) { + if (!(bank[s_id].cpu_mask = virBitmapNew(npresent_cpus))) + goto error; + } + + ignore_value(virBitmapSetBit(bank[s_id].cpu_mask, i)); + + if (bank[s_id].cache_size == 0) { + if (virResCtrlGetCPUCache(i, index, &cache_size) < 0) { + goto error; + } + bank[s_id].cache_size = cache_size; + bank[s_id].cache_min = cache_size / ResCtrlAll[type].cbm_len; + } + } + return bank; + +error: + *n_sockets = 0; + VIR_FREE(bank); + return NULL; +} + +static int virResCtrlGetConfig(int type) +{ + int ret; + int i; + char *str; + + /* Read min_cbm_bits from resctrl. + eg: /sys/fs/resctrl/info/L3/num_closids + */ + if ((ret = virResCtrlGetInfoStr(type, "num_closids", &str)) < 0) { + return ret; + } + if (virStrToLong_i(str, NULL, 10, &ResCtrlAll[type].num_closid) < 0) { + return -1; + } + VIR_FREE(str); + + /* Read min_cbm_bits from resctrl. + eg: /sys/fs/resctrl/info/L3/cbm_mask + */ + if ((ret = virResCtrlGetInfoStr(type, "min_cbm_bits", &str)) < 0) { + return ret; + } + if (virStrToLong_i(str, NULL, 10, &ResCtrlAll[type].min_cbm_bits) < 0) { + return -1; + } + VIR_FREE(str); + + /* Read cbm_mask string from resctrl. + eg: /sys/fs/resctrl/info/L3/cbm_mask + */ + if ((ret = virResCtrlGetInfoStr(type, "cbm_mask", &str)) < 0) { + return ret; + } + + /* Calculate cbm length from the default cbm_mask. */ + ResCtrlAll[type].cbm_len = strlen(str) * 4; + VIR_FREE(str); + + /* Get all cache bank informations */ + ResCtrlAll[type].cache_banks = virResCtrlGetCacheBanks(type, + &(ResCtrlAll[type].num_banks)); + + if(ResCtrlAll[type].cache_banks == NULL) + return -1; + + for( i = 0; i < ResCtrlAll[type].num_banks; i++) + { + /*L3CODE and L3DATA shares same L3 resource, so they should + * have same host_id. */ + if (type == RDT_RESOURCE_L3CODE) { + ResCtrlAll[type].cache_banks[i].host_id = ResCtrlAll[RDT_RESOURCE_L3DATA].cache_banks[i].host_id; + } + else { + ResCtrlAll[type].cache_banks[i].host_id = host_id++; + } + } + + ResCtrlAll[type].enabled = true; + + return ret; +} + +int virResCtrlInit(void) { + int i = 0; + char *tmp; + int rc = 0; + + for(i = 0; i < RDT_NUM_RESOURCES; i++) { + if ((rc = asprintf(&tmp, "%s/%s", RESCTRL_INFO_DIR, ResCtrlAll[i].name)) < 0) { + continue; + } + if (virFileExists(tmp)) { + if ((rc = virResCtrlGetConfig(i)) < 0 ) + VIR_WARN("Ignor error while get config for %d", i); + } + + VIR_FREE(tmp); + } + return rc; +} + +bool virResCtrlAvailable(void) { + if (!virFileExists(RESCTRL_INFO_DIR)) + return false; + return true; +} + +virResCtrlPtr virResCtrlGet(int type) { + return &ResCtrlAll[type]; +} diff --git a/src/util/virresctrl.h b/src/util/virresctrl.h new file mode 100644 index 0000000..f713e66 --- /dev/null +++ b/src/util/virresctrl.h @@ -0,0 +1,89 @@ +/* + * * virrscctrl.h: methods for managing rscctrl + * * + * * Copyright (C) 2016 Intel, Inc. + * * + * * This library is free software; you can redistribute it and/or + * * modify it under the terms of the GNU Lesser General Public + * * License as published by the Free Software Foundation; either + * * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * <http://www.gnu.org/licenses/>. + * + * Authors: + * Eli Qiao <liyong.qiao@intel.com> + */ + +#ifndef __VIR_RESCTRL_H__ +# define __VIR_RESCTRL_H__ + +# include "virutil.h" +# include "virbitmap.h" + +#define RESCTRL_DIR "/sys/fs/resctrl" +#define RESCTRL_INFO_DIR "/sys/fs/resctrl/info" +#define SYSFS_SYSTEM_PATH "/sys/devices/system" + +#define MAX_CPU_SOCKET_NUM 8 +#define MAX_CBM_BIT_LEN 32 +#define MAX_SCHEMATA_LEN 1024 +#define MAX_FILE_LEN ( 10 * 1024 * 1024) + +enum { + RDT_RESOURCE_L3, + RDT_RESOURCE_L3DATA, + RDT_RESOURCE_L3CODE, + RDT_RESOURCE_L2, + /* Must be the last */ + RDT_NUM_RESOURCES, +}; + + +typedef struct _virResCacheBank virResCacheBank; +typedef virResCacheBank *virResCacheBankPtr; +struct _virResCacheBank { + unsigned int host_id; + unsigned long long cache_size; + unsigned long long cache_left; + unsigned long long cache_min; + virBitmapPtr cpu_mask; +}; + +/** + * struct rdt_resource - attributes of an RDT resource + * @enabled: Is this feature enabled on this machine + * @name: Name to use in "schemata" file + * @num_closid: Number of CLOSIDs available + * @max_cbm: Largest Cache Bit Mask allowed + * @min_cbm_bits: Minimum number of consecutive bits to be set + * in a cache bit mask + * @cache_level: Which cache level defines scope of this domain + * @num_banks: Number of cache bank on this machine. + * @cache_banks: Array of cache bank + */ +typedef struct _virResCtrl virResCtrl; +typedef virResCtrl *virResCtrlPtr; +struct _virResCtrl { + bool enabled; + const char *name; + int num_closid; + int cbm_len; + int min_cbm_bits; + const char* cache_level; + int num_banks; + virResCacheBankPtr cache_banks; +}; + + +bool virResCtrlAvailable(void); +int virResCtrlInit(void); +virResCtrlPtr virResCtrlGet(int); + +#endif -- 1.9.1

On Mon, Feb 06, 2017 at 10:23:36AM +0800, Eli Qiao wrote:
This patch adds some utils struct and functions to expose resctrl information.
virResCtrlAvailable: If resctrl interface exist on host virResCtrlGet: get specify type resource contral information virResCtrlInit: initialize resctrl struct from the host's sys fs. ResCtrlAll[]: an array to maintain resource control information.
Signed-off-by: Eli Qiao <liyong.qiao@intel.com> --- include/libvirt/virterror.h | 1 + src/Makefile.am | 1 + src/libvirt_private.syms | 4 + src/util/virerror.c | 1 + src/util/virresctrl.c | 330 ++++++++++++++++++++++++++++++++++++++++++++ src/util/virresctrl.h | 89 ++++++++++++ 6 files changed, 426 insertions(+) create mode 100644 src/util/virresctrl.c create mode 100644 src/util/virresctrl.h
diff --git a/src/util/virresctrl.c b/src/util/virresctrl.c new file mode 100644 index 0000000..63bc808 --- /dev/null +++ b/src/util/virresctrl.c @@ -0,0 +1,330 @@ +/* + * virresctrl.c: methods for managing resource contral + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * <http://www.gnu.org/licenses/>. + * + * Authors: + * Eli Qiao <liyong.qiao@intel.com> + */ +#include <config.h> + +#include <sys/ioctl.h> +#if defined HAVE_SYS_SYSCALL_H +# include <sys/syscall.h> +#endif +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> + +#include "virresctrl.h" +#include "viralloc.h" +#include "virerror.h" +#include "virfile.h" +#include "virhostcpu.h" +#include "virlog.h" +#include "virstring.h" +#include "nodeinfo.h" + +VIR_LOG_INIT("util.resctrl"); + +#define VIR_FROM_THIS VIR_FROM_RESCTRL + +static unsigned int host_id = 0; + +static virResCtrl ResCtrlAll[] = {
Lowercase for global variable names please.
+ { + .name = "L3", + .cache_level = "l3", + }, + { + .name = "L3DATA", + .cache_level = "l3", + }, + { + .name = "L3CODE", + .cache_level = "l3", + }, + { + .name = "L2", + .cache_level = "l2", + }, +}; + +static int virResCtrlGetInfoStr(const int type, const char *item, char **str) +{ + int ret = 0; + char *tmp; + char *path; + + if (asprintf(&path, "%s/%s/%s", RESCTRL_INFO_DIR, ResCtrlAll[type].name, item) < 0) + return -1;
Use of asprintf is forbidden in libvirt - use virAsprintf. Please make sure to run 'make check' and 'make syntax-check' on each patch to catch this kind of policy error. There's quite a few other style rules missed in these patches - i won't list them all since 'make syntax-check' can tell you.
+ if (virFileReadAll(path, 10, str) < 0) { + ret = -1; + goto cleanup; + } + + if ((tmp = strchr(*str, '\n'))) { + *tmp = '\0'; + } + +cleanup: + VIR_FREE(path); + return ret; +} + + +static int virResCtrlGetCPUValue(const char* path, char** value) +{ + int ret = -1; + char* tmp;
The '*' should be next to the variable name, not the type name. Multiple other cases follow
+ + if(virFileReadAll(path, 10, value) < 0) { + goto cleanup; + } + if ((tmp = strchr(*value, '\n'))) { + *tmp = '\0'; + } + ret = 0; +cleanup: + return ret; +} +
+int virResCtrlInit(void) { + int i = 0; + char *tmp; + int rc = 0; + + for(i = 0; i < RDT_NUM_RESOURCES; i++) { + if ((rc = asprintf(&tmp, "%s/%s", RESCTRL_INFO_DIR, ResCtrlAll[i].name)) < 0) { + continue;
Silently ignoring OOM is not good - please return a proper error - using virAsprintf would do so.
+ } + if (virFileExists(tmp)) { + if ((rc = virResCtrlGetConfig(i)) < 0 ) + VIR_WARN("Ignor error while get config for %d", i);
Again don't ignore errors like this - this should be fatal.
+ } + + VIR_FREE(tmp); + } + return rc; +} + +bool virResCtrlAvailable(void) { + if (!virFileExists(RESCTRL_INFO_DIR)) + return false; + return true; +} + +virResCtrlPtr virResCtrlGet(int type) { + return &ResCtrlAll[type]; +}
diff --git a/src/util/virresctrl.h b/src/util/virresctrl.h new file mode 100644 index 0000000..f713e66 --- /dev/null +++ b/src/util/virresctrl.h @@ -0,0 +1,89 @@
+ +#ifndef __VIR_RESCTRL_H__ +# define __VIR_RESCTRL_H__ + +# include "virutil.h" +# include "virbitmap.h" + +#define RESCTRL_DIR "/sys/fs/resctrl" +#define RESCTRL_INFO_DIR "/sys/fs/resctrl/info" +#define SYSFS_SYSTEM_PATH "/sys/devices/system" + +#define MAX_CPU_SOCKET_NUM 8 +#define MAX_CBM_BIT_LEN 32 +#define MAX_SCHEMATA_LEN 1024 +#define MAX_FILE_LEN ( 10 * 1024 * 1024)
Doesn't seem like any of this needs to be in the header file
+ +enum { + RDT_RESOURCE_L3, + RDT_RESOURCE_L3DATA, + RDT_RESOURCE_L3CODE, + RDT_RESOURCE_L2, + /* Must be the last */ + RDT_NUM_RESOURCES,
Use a VIR_ prefix on these constants. Also the libvirt naming convention is to use RDT_RESOURCE_LAST as the last element.
+}; + + +typedef struct _virResCacheBank virResCacheBank; +typedef virResCacheBank *virResCacheBankPtr; +struct _virResCacheBank { + unsigned int host_id; + unsigned long long cache_size; + unsigned long long cache_left; + unsigned long long cache_min; + virBitmapPtr cpu_mask; +};
+typedef struct _virResCtrl virResCtrl; +typedef virResCtrl *virResCtrlPtr; +struct _virResCtrl { + bool enabled; + const char *name; + int num_closid; + int cbm_len; + int min_cbm_bits; + const char* cache_level; + int num_banks; + virResCacheBankPtr cache_banks; +};
Can any of these fields change at runtime, or are they all immutable once populated.
+bool virResCtrlAvailable(void); +int virResCtrlInit(void); +virResCtrlPtr virResCtrlGet(int);
API docs for these would be nice, especially describing whether virResCtrlPtr returned from this method is immutable or not. Since libvirt is multi-threaded this is important to know. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://entangle-photo.org -o- http://search.cpan.org/~danberr/ :|

-- Eli Qiao Sent with Sparrow (http://www.sparrowmailapp.com/?sig) On Tuesday, 7 February 2017 at 12:11 AM, Daniel P. Berrange wrote:
On Mon, Feb 06, 2017 at 10:23:36AM +0800, Eli Qiao wrote:
This patch adds some utils struct and functions to expose resctrl information.
virResCtrlAvailable: If resctrl interface exist on host virResCtrlGet: get specify type resource contral information virResCtrlInit: initialize resctrl struct from the host's sys fs. ResCtrlAll[]: an array to maintain resource control information.
Signed-off-by: Eli Qiao <liyong.qiao@intel.com (mailto:liyong.qiao@intel.com)> --- include/libvirt/virterror.h | 1 + src/Makefile.am (http://Makefile.am) | 1 + src/libvirt_private.syms | 4 + src/util/virerror.c | 1 + src/util/virresctrl.c | 330 ++++++++++++++++++++++++++++++++++++++++++++ src/util/virresctrl.h | 89 ++++++++++++ 6 files changed, 426 insertions(+) create mode 100644 src/util/virresctrl.c create mode 100644 src/util/virresctrl.h
diff --git a/src/util/virresctrl.c b/src/util/virresctrl.c new file mode 100644 index 0000000..63bc808 --- /dev/null +++ b/src/util/virresctrl.c @@ -0,0 +1,330 @@ +/* + * virresctrl.c: methods for managing resource contral + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * <http://www.gnu.org/licenses/>. + * + * Authors: + * Eli Qiao <liyong.qiao@intel.com (mailto:liyong.qiao@intel.com)> + */ +#include <config.h> + +#include <sys/ioctl.h> +#if defined HAVE_SYS_SYSCALL_H +# include <sys/syscall.h> +#endif +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> + +#include "virresctrl.h" +#include "viralloc.h" +#include "virerror.h" +#include "virfile.h" +#include "virhostcpu.h" +#include "virlog.h" +#include "virstring.h" +#include "nodeinfo.h" + +VIR_LOG_INIT("util.resctrl"); + +#define VIR_FROM_THIS VIR_FROM_RESCTRL + +static unsigned int host_id = 0; + +static virResCtrl ResCtrlAll[] = {
Lowercase for global variable names please.
+ { + .name = "L3", + .cache_level = "l3", + }, + { + .name = "L3DATA", + .cache_level = "l3", + }, + { + .name = "L3CODE", + .cache_level = "l3", + }, + { + .name = "L2", + .cache_level = "l2", + }, +}; + +static int virResCtrlGetInfoStr(const int type, const char *item, char **str) +{ + int ret = 0; + char *tmp; + char *path; + + if (asprintf(&path, "%s/%s/%s", RESCTRL_INFO_DIR, ResCtrlAll[type].name, item) < 0) + return -1;
Use of asprintf is forbidden in libvirt - use virAsprintf.
Please make sure to run 'make check' and 'make syntax-check' on each patch to catch this kind of policy error. There's quite a few other style rules missed in these patches - i won't list them all since 'make syntax-check' can tell you.
okay, thanks Daniel.
+ if (virFileReadAll(path, 10, str) < 0) { + ret = -1; + goto cleanup; + } + + if ((tmp = strchr(*str, '\n'))) { + *tmp = '\0'; + } + +cleanup: + VIR_FREE(path); + return ret; +} + + +static int virResCtrlGetCPUValue(const char* path, char** value) +{ + int ret = -1; + char* tmp;
The '*' should be next to the variable name, not the type name. Multiple other cases follow
okay
+ + if(virFileReadAll(path, 10, value) < 0) { + goto cleanup; + } + if ((tmp = strchr(*value, '\n'))) { + *tmp = '\0'; + } + ret = 0; +cleanup: + return ret; +} +
+int virResCtrlInit(void) { + int i = 0; + char *tmp; + int rc = 0; + + for(i = 0; i < RDT_NUM_RESOURCES; i++) { + if ((rc = asprintf(&tmp, "%s/%s", RESCTRL_INFO_DIR, ResCtrlAll[i].name)) < 0) { + continue;
Silently ignoring OOM is not good - please return a proper error - using virAsprintf would do so.
okay.
+ } + if (virFileExists(tmp)) { + if ((rc = virResCtrlGetConfig(i)) < 0 ) + VIR_WARN("Ignor error while get config for %d", i);
Again don't ignore errors like this - this should be fatal.
sure
+ } + + VIR_FREE(tmp); + } + return rc; +} + +bool virResCtrlAvailable(void) { + if (!virFileExists(RESCTRL_INFO_DIR)) + return false; + return true; +} + +virResCtrlPtr virResCtrlGet(int type) { + return &ResCtrlAll[type]; +}
diff --git a/src/util/virresctrl.h b/src/util/virresctrl.h new file mode 100644 index 0000000..f713e66 --- /dev/null +++ b/src/util/virresctrl.h @@ -0,0 +1,89 @@
+ +#ifndef __VIR_RESCTRL_H__ +# define __VIR_RESCTRL_H__ + +# include "virutil.h" +# include "virbitmap.h" + +#define RESCTRL_DIR "/sys/fs/resctrl" +#define RESCTRL_INFO_DIR "/sys/fs/resctrl/info" +#define SYSFS_SYSTEM_PATH "/sys/devices/system" + +#define MAX_CPU_SOCKET_NUM 8 +#define MAX_CBM_BIT_LEN 32 +#define MAX_SCHEMATA_LEN 1024 +#define MAX_FILE_LEN ( 10 * 1024 * 1024)
Doesn't seem like any of this needs to be in the header file okay, will move to c file.
+ +enum { + RDT_RESOURCE_L3, + RDT_RESOURCE_L3DATA, + RDT_RESOURCE_L3CODE, + RDT_RESOURCE_L2, + /* Must be the last */ + RDT_NUM_RESOURCES,
Use a VIR_ prefix on these constants. Also the libvirt naming convention is to use RDT_RESOURCE_LAST as the last element.
okay
+}; + + +typedef struct _virResCacheBank virResCacheBank; +typedef virResCacheBank *virResCacheBankPtr; +struct _virResCacheBank { + unsigned int host_id; + unsigned long long cache_size; + unsigned long long cache_left; + unsigned long long cache_min; + virBitmapPtr cpu_mask; +};
+typedef struct _virResCtrl virResCtrl; +typedef virResCtrl *virResCtrlPtr; +struct _virResCtrl { + bool enabled; + const char *name; + int num_closid; + int cbm_len; + int min_cbm_bits; + const char* cache_level; + int num_banks; + virResCacheBankPtr cache_banks; +};
Can any of these fields change at runtime, or are they all immutable once populated.
Only cache_banks will be change at runtime, such as cache_left on each socket will be updated if libvirt allocates cache to domains.
+bool virResCtrlAvailable(void); +int virResCtrlInit(void); +virResCtrlPtr virResCtrlGet(int);
API docs for these would be nice, especially describing whether virResCtrlPtr returned from this method is immutable or not. Since libvirt is multi-threaded this is important to know.
Okay, I will fill all API docs for the public APIs
Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://entangle-photo.org -o- http://search.cpan.org/~danberr/ :|

On Tue, Feb 07, 2017 at 02:43:17PM +0800, Eli Qiao wrote:
On Tuesday, 7 February 2017 at 12:11 AM, Daniel P. Berrange wrote:
On Mon, Feb 06, 2017 at 10:23:36AM +0800, Eli Qiao wrote:
This patch adds some utils struct and functions to expose resctrl information.
virResCtrlAvailable: If resctrl interface exist on host virResCtrlGet: get specify type resource contral information virResCtrlInit: initialize resctrl struct from the host's sys fs. ResCtrlAll[]: an array to maintain resource control information.
Signed-off-by: Eli Qiao <liyong.qiao@intel.com (mailto:liyong.qiao@intel.com)> --- +}; + + +typedef struct _virResCacheBank virResCacheBank; +typedef virResCacheBank *virResCacheBankPtr; +struct _virResCacheBank { + unsigned int host_id; + unsigned long long cache_size; + unsigned long long cache_left; + unsigned long long cache_min; + virBitmapPtr cpu_mask; +};
+typedef struct _virResCtrl virResCtrl; +typedef virResCtrl *virResCtrlPtr; +struct _virResCtrl { + bool enabled; + const char *name; + int num_closid; + int cbm_len; + int min_cbm_bits; + const char* cache_level; + int num_banks; + virResCacheBankPtr cache_banks; +};
Can any of these fields change at runtime, or are they all immutable once populated.
Only cache_banks will be change at runtime, such as cache_left on each socket will be updated if libvirt allocates cache to domains.
Ok, so the API is returning a direct point to the virResCtrlPtr struct and there's no locking here. So if cache_banks struct contents changes while some caller is reading the virResCtrlPtr struct we have race problems. I'd suggest that best option is probably to take the mutable cache_left field *out* of the struct. That lets the callers treat the entire struct as immutable. Then add a separate API to explicitly query the cache_left value - only that api needs to do locking to return a consistent point-in-time view of the cache_left value. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://entangle-photo.org -o- http://search.cpan.org/~danberr/ :|

This patch expose cache information to host's capabilites xml. For l3 cache allocation <cache> <bank id='0' type='l3' size='56320' unit='KiB' cpus='0-21,44-65'> <control min='2816' reserved='2816' unit='KiB' scope='L3'/> </bank> <bank id='1' type='l3' size='56320' unit='KiB' cpus='22-43,66-87'> <control min='2816' reserved='2816' unit='KiB' scope='L3'/> </bank> </cache> For l3 cache allocation supported cdp(seperate data/code): <cache> <bank id='0' type='l3' size='56320' unit='KiB' cpus='0-21,44-65'> <control min='2816' reserved='2816' unit='KiB' scope='L3DATA'/> <control min='2816' reserved='2816' unit='KiB' scope='L3CODE'/> </bank> <bank id='1' type='l3' size='56320' unit='KiB' cpus='22-43,66-87'> <control min='2816' reserved='2816' unit='KiB' scope='L3DATA'/> <control min='2816' reserved='2816' unit='KiB' scope='L3CODE'/> </bank> </cache> RFC on mailing list. https://www.redhat.com/archives/libvir-list/2017-January/msg00644.html Signed-off-by: Eli Qiao <liyong.qiao@intel.com> --- src/conf/capabilities.c | 56 ++++++++++++++++++++++++++++++++++++ src/conf/capabilities.h | 23 +++++++++++++++ src/libvirt_private.syms | 1 + src/qemu/qemu_capabilities.c | 68 ++++++++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_driver.c | 4 +++ 5 files changed, 152 insertions(+) diff --git a/src/conf/capabilities.c b/src/conf/capabilities.c index 9ab343b..290c25f 100644 --- a/src/conf/capabilities.c +++ b/src/conf/capabilities.c @@ -198,6 +198,18 @@ virCapabilitiesClearSecModel(virCapsHostSecModelPtr secmodel) } static void +virCapabilitiesClearCacheBank(virCapsHostCacheBankPtr cachebank) +{ + size_t i; + for (i = 0; i < cachebank->ncontrol; i++) { + VIR_FREE(cachebank->control[i].scope); + } + VIR_FREE(cachebank->type); + VIR_FREE(cachebank->cpus); +} + + +static void virCapabilitiesDispose(void *object) { virCapsPtr caps = object; @@ -221,6 +233,10 @@ virCapabilitiesDispose(void *object) virCapabilitiesClearSecModel(&caps->host.secModels[i]); VIR_FREE(caps->host.secModels); + for (i = 0; i < caps->host.ncachebank; i++) + virCapabilitiesClearCacheBank(caps->host.cachebank[i]); + VIR_FREE(caps->host.cachebank); + VIR_FREE(caps->host.netprefix); VIR_FREE(caps->host.pagesSize); virCPUDefFree(caps->host.cpu); @@ -844,6 +860,41 @@ virCapabilitiesFormatNUMATopology(virBufferPtr buf, return 0; } +static int +virCapabilitiesFormatCache(virBufferPtr buf, + size_t ncachebank, + virCapsHostCacheBankPtr *cachebank) +{ + size_t i; + size_t j; + + virBufferAddLit(buf, "<cache>\n"); + virBufferAdjustIndent(buf, 2); + + for( i = 0 ; i < ncachebank; i++) { + virBufferAsprintf(buf, + "<bank id='%u' type='%s' size='%llu' unit='KiB' cpus='%s'>\n", + cachebank[i]->id, + cachebank[i]->type, + cachebank[i]->size, + cachebank[i]->cpus); + + virBufferAdjustIndent(buf, 2); + for( j = 0; j < cachebank[i]->ncontrol; j ++) { + virBufferAsprintf(buf, + "<control min='%llu' reserved='%llu' unit='KiB' scope='%s'/>\n", + cachebank[i]->control[j].min, + cachebank[i]->control[j].reserved, + cachebank[i]->control[j].scope); + } + virBufferAdjustIndent(buf, -2); + virBufferAddLit(buf, "</bank>\n"); + } + virBufferAdjustIndent(buf, -2); + virBufferAddLit(buf, "</cache>\n"); + return 0; +} + /** * virCapabilitiesFormatXML: * @caps: capabilities to format @@ -931,6 +982,11 @@ virCapabilitiesFormatXML(virCapsPtr caps) virBufferAddLit(&buf, "</migration_features>\n"); } + if (caps->host.ncachebank && + virCapabilitiesFormatCache(&buf, caps->host.ncachebank, + caps->host.cachebank) < 0) + return NULL; + if (caps->host.netprefix) virBufferAsprintf(&buf, "<netprefix>%s</netprefix>\n", caps->host.netprefix); diff --git a/src/conf/capabilities.h b/src/conf/capabilities.h index cfdc34a..b446de5 100644 --- a/src/conf/capabilities.h +++ b/src/conf/capabilities.h @@ -138,6 +138,25 @@ struct _virCapsHostSecModel { virCapsHostSecModelLabelPtr labels; }; +typedef struct _virCapsHostCacheControl virCapsHostCacheControl; +typedef virCapsHostCacheControl *virCapsHostCacheControlPtr; +struct _virCapsHostCacheControl { + unsigned long long min; + unsigned long long reserved; + char* scope; +}; + +typedef struct _virCapsHostCacheBank virCapsHostCacheBank; +typedef virCapsHostCacheBank *virCapsHostCacheBankPtr; +struct _virCapsHostCacheBank { + unsigned int id; + char* type; + char* cpus; + unsigned long long size; + size_t ncontrol; + virCapsHostCacheControlPtr control; +}; + typedef struct _virCapsHost virCapsHost; typedef virCapsHost *virCapsHostPtr; struct _virCapsHost { @@ -160,6 +179,10 @@ struct _virCapsHost { size_t nsecModels; virCapsHostSecModelPtr secModels; + size_t ncachebank; + size_t ncachebank_max; + virCapsHostCacheBankPtr *cachebank; + char *netprefix; virCPUDefPtr cpu; int nPagesSize; /* size of pagesSize array */ diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 743e5ac..d93b775 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -2316,6 +2316,7 @@ virRandomInt; # util/virresctrl.h virResCtrlAvailable; virResCtrlInit; +virResCtrlGet; # util/virrotatingfile.h virRotatingFileReaderConsume; diff --git a/src/qemu/qemu_capabilities.c b/src/qemu/qemu_capabilities.c index 3247d25..23f416d 100644 --- a/src/qemu/qemu_capabilities.c +++ b/src/qemu/qemu_capabilities.c @@ -45,6 +45,7 @@ #include "qemu_domain.h" #define __QEMU_CAPSRIV_H_ALLOW__ #include "qemu_capspriv.h" +#include "virresctrl.h" #include <fcntl.h> #include <sys/stat.h> @@ -1098,7 +1099,71 @@ virQEMUCapsInitCPU(virCapsPtr caps, goto cleanup; } +static int +virQEMUCapsInitCache(virCapsPtr caps) +{ + int i, j; + virResCtrlPtr resctrl; + virCapsHostCacheBankPtr bank; + + for (i = 0; i < RDT_NUM_RESOURCES; i ++) + { + /* L3DATA and L3CODE share L3 resources */ + if ( i == RDT_RESOURCE_L3CODE ) + continue; + resctrl = virResCtrlGet(i); + + if(resctrl->enabled) { + for( j = 0; j < resctrl->num_banks; j++) + { + if(VIR_RESIZE_N(caps->host.cachebank, caps->host.ncachebank_max, + caps->host.ncachebank, 1) < 0) + return -1; + + if(VIR_ALLOC(bank) < 0) + return -1; + + bank->id = resctrl->cache_banks[j].host_id; + if(VIR_STRDUP(bank->type, resctrl->cache_level) < 0) + goto err; + if(VIR_STRDUP(bank->cpus, virBitmapFormat(resctrl->cache_banks[j].cpu_mask)) < 0) + goto err; + bank->size = resctrl->cache_banks[j].cache_size; + /*L3DATA and L3CODE shares L3 cache resources, so fill them to the control element*/ + if ( i == RDT_RESOURCE_L3DATA ) { + if(VIR_EXPAND_N(bank->control, bank->ncontrol, 2) < 0) + goto err; + + bank->control[0].min = virResCtrlGet(RDT_RESOURCE_L3DATA)->cache_banks[j].cache_min; + bank->control[0].reserved = bank->control[0].min * (virResCtrlGet(RDT_RESOURCE_L3DATA)->min_cbm_bits); + if(VIR_STRDUP(bank->control[0].scope, + virResCtrlGet(RDT_RESOURCE_L3DATA)->name) < 0) + goto err; + + bank->control[1].min = virResCtrlGet(RDT_RESOURCE_L3CODE)->cache_banks[j].cache_min; + bank->control[1].reserved = bank->control[1].min * (virResCtrlGet(RDT_RESOURCE_L3CODE)->min_cbm_bits); + if(VIR_STRDUP(bank->control[1].scope, + virResCtrlGet(RDT_RESOURCE_L3CODE)->name) < 0) + goto err; + } + else { + if(VIR_EXPAND_N(bank->control, bank->ncontrol, 1) < 0) + goto err; + bank->control[0].min = resctrl->cache_banks[j].cache_min; + bank->control[0].reserved = bank->control[0].min * resctrl->min_cbm_bits; + if(VIR_STRDUP(bank->control[0].scope, resctrl->name) < 0) + goto err; + } + caps->host.cachebank[caps->host.ncachebank++] = bank; + } + } + } + return 0; +err: + VIR_FREE(bank); + return -1; +} static int virQEMUCapsInitPages(virCapsPtr caps) { @@ -1144,6 +1209,9 @@ virCapsPtr virQEMUCapsInit(virQEMUCapsCachePtr cache) if (virQEMUCapsInitCPU(caps, hostarch) < 0) VIR_WARN("Failed to get host CPU"); + if (virQEMUCapsInitCache(caps) < 0) + VIR_WARN("Failed to get host cache"); + /* Add the power management features of the host */ if (virNodeSuspendGetTargetMask(&caps->host.powerMgmt) < 0) VIR_WARN("Failed to get host power management capabilities"); diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 37ccfdf..520b74d 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -105,6 +105,7 @@ #include "vircgroup.h" #include "virperf.h" #include "virnuma.h" +#include "virresctrl.h" #include "dirname.h" #include "network/bridge_driver.h" @@ -849,6 +850,9 @@ qemuStateInitialize(bool privileged, run_gid = cfg->group; } + if(virResCtrlAvailable() && virResCtrlInit() < 0) + VIR_WARN("Faild to initialize resource control."); + qemu_driver->qemuCapsCache = virQEMUCapsCacheNew(cfg->libDir, cfg->cacheDir, run_uid, -- 1.9.1

On Mon, Feb 06, 2017 at 10:23:37AM +0800, Eli Qiao wrote:
This patch expose cache information to host's capabilites xml.
For l3 cache allocation <cache> <bank id='0' type='l3' size='56320' unit='KiB' cpus='0-21,44-65'> <control min='2816' reserved='2816' unit='KiB' scope='L3'/> </bank> <bank id='1' type='l3' size='56320' unit='KiB' cpus='22-43,66-87'> <control min='2816' reserved='2816' unit='KiB' scope='L3'/> </bank> </cache>
For l3 cache allocation supported cdp(seperate data/code): <cache> <bank id='0' type='l3' size='56320' unit='KiB' cpus='0-21,44-65'> <control min='2816' reserved='2816' unit='KiB' scope='L3DATA'/> <control min='2816' reserved='2816' unit='KiB' scope='L3CODE'/> </bank> <bank id='1' type='l3' size='56320' unit='KiB' cpus='22-43,66-87'> <control min='2816' reserved='2816' unit='KiB' scope='L3DATA'/> <control min='2816' reserved='2816' unit='KiB' scope='L3CODE'/> </bank> </cache>
RFC on mailing list. https://www.redhat.com/archives/libvir-list/2017-January/msg00644.html
Signed-off-by: Eli Qiao <liyong.qiao@intel.com> --- src/conf/capabilities.c | 56 ++++++++++++++++++++++++++++++++++++ src/conf/capabilities.h | 23 +++++++++++++++ src/libvirt_private.syms | 1 + src/qemu/qemu_capabilities.c | 68 ++++++++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_driver.c | 4 +++ 5 files changed, 152 insertions(+)
diff --git a/src/qemu/qemu_capabilities.c b/src/qemu/qemu_capabilities.c index 3247d25..23f416d 100644 --- a/src/qemu/qemu_capabilities.c +++ b/src/qemu/qemu_capabilities.c @@ -45,6 +45,7 @@ #include "qemu_domain.h" #define __QEMU_CAPSRIV_H_ALLOW__ #include "qemu_capspriv.h" +#include "virresctrl.h"
#include <fcntl.h> #include <sys/stat.h> @@ -1098,7 +1099,71 @@ virQEMUCapsInitCPU(virCapsPtr caps, goto cleanup; }
+static int +virQEMUCapsInitCache(virCapsPtr caps) +{ + int i, j; + virResCtrlPtr resctrl; + virCapsHostCacheBankPtr bank; + + for (i = 0; i < RDT_NUM_RESOURCES; i ++) + { + /* L3DATA and L3CODE share L3 resources */ + if ( i == RDT_RESOURCE_L3CODE ) + continue;
+ resctrl = virResCtrlGet(i); + + if(resctrl->enabled) { + for( j = 0; j < resctrl->num_banks; j++) + { + if(VIR_RESIZE_N(caps->host.cachebank, caps->host.ncachebank_max, + caps->host.ncachebank, 1) < 0) + return -1; + + if(VIR_ALLOC(bank) < 0) + return -1; + + bank->id = resctrl->cache_banks[j].host_id; + if(VIR_STRDUP(bank->type, resctrl->cache_level) < 0) + goto err; + if(VIR_STRDUP(bank->cpus, virBitmapFormat(resctrl->cache_banks[j].cpu_mask)) < 0) + goto err; + bank->size = resctrl->cache_banks[j].cache_size; + /*L3DATA and L3CODE shares L3 cache resources, so fill them to the control element*/ + if ( i == RDT_RESOURCE_L3DATA ) { + if(VIR_EXPAND_N(bank->control, bank->ncontrol, 2) < 0) + goto err; + + bank->control[0].min = virResCtrlGet(RDT_RESOURCE_L3DATA)->cache_banks[j].cache_min; + bank->control[0].reserved = bank->control[0].min * (virResCtrlGet(RDT_RESOURCE_L3DATA)->min_cbm_bits); + if(VIR_STRDUP(bank->control[0].scope, + virResCtrlGet(RDT_RESOURCE_L3DATA)->name) < 0) + goto err; + + bank->control[1].min = virResCtrlGet(RDT_RESOURCE_L3CODE)->cache_banks[j].cache_min; + bank->control[1].reserved = bank->control[1].min * (virResCtrlGet(RDT_RESOURCE_L3CODE)->min_cbm_bits); + if(VIR_STRDUP(bank->control[1].scope, + virResCtrlGet(RDT_RESOURCE_L3CODE)->name) < 0) + goto err; + } + else { + if(VIR_EXPAND_N(bank->control, bank->ncontrol, 1) < 0) + goto err; + bank->control[0].min = resctrl->cache_banks[j].cache_min; + bank->control[0].reserved = bank->control[0].min * resctrl->min_cbm_bits; + if(VIR_STRDUP(bank->control[0].scope, resctrl->name) < 0) + goto err; + } + caps->host.cachebank[caps->host.ncachebank++] = bank; + } + } + } + return 0; +err: + VIR_FREE(bank); + return -1; +}
I don't think this code should be in the QEMU driver - better to have it in nodeinfo.c so it is common to all drivers. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://entangle-photo.org -o- http://search.cpan.org/~danberr/ :|

-- Eli Qiao Sent with Sparrow (http://www.sparrowmailapp.com/?sig) On Tuesday, 7 February 2017 at 12:02 AM, Daniel P. Berrange wrote:
On Mon, Feb 06, 2017 at 10:23:37AM +0800, Eli Qiao wrote:
This patch expose cache information to host's capabilites xml.
For l3 cache allocation <cache> <bank id='0' type='l3' size='56320' unit='KiB' cpus='0-21,44-65'> <control min='2816' reserved='2816' unit='KiB' scope='L3'/> </bank> <bank id='1' type='l3' size='56320' unit='KiB' cpus='22-43,66-87'> <control min='2816' reserved='2816' unit='KiB' scope='L3'/> </bank> </cache>
For l3 cache allocation supported cdp(seperate data/code): <cache> <bank id='0' type='l3' size='56320' unit='KiB' cpus='0-21,44-65'> <control min='2816' reserved='2816' unit='KiB' scope='L3DATA'/> <control min='2816' reserved='2816' unit='KiB' scope='L3CODE'/> </bank> <bank id='1' type='l3' size='56320' unit='KiB' cpus='22-43,66-87'> <control min='2816' reserved='2816' unit='KiB' scope='L3DATA'/> <control min='2816' reserved='2816' unit='KiB' scope='L3CODE'/> </bank> </cache>
RFC on mailing list. https://www.redhat.com/archives/libvir-list/2017-January/msg00644.html
Signed-off-by: Eli Qiao <liyong.qiao@intel.com (mailto:liyong.qiao@intel.com)> --- src/conf/capabilities.c | 56 ++++++++++++++++++++++++++++++++++++ src/conf/capabilities.h | 23 +++++++++++++++ src/libvirt_private.syms | 1 + src/qemu/qemu_capabilities.c | 68 ++++++++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_driver.c | 4 +++ 5 files changed, 152 insertions(+)
diff --git a/src/qemu/qemu_capabilities.c b/src/qemu/qemu_capabilities.c index 3247d25..23f416d 100644 --- a/src/qemu/qemu_capabilities.c +++ b/src/qemu/qemu_capabilities.c @@ -45,6 +45,7 @@ #include "qemu_domain.h" #define __QEMU_CAPSRIV_H_ALLOW__ #include "qemu_capspriv.h" +#include "virresctrl.h"
#include <fcntl.h> #include <sys/stat.h> @@ -1098,7 +1099,71 @@ virQEMUCapsInitCPU(virCapsPtr caps, goto cleanup; }
+static int +virQEMUCapsInitCache(virCapsPtr caps) +{ + int i, j; + virResCtrlPtr resctrl; + virCapsHostCacheBankPtr bank; + + for (i = 0; i < RDT_NUM_RESOURCES; i ++) + { + /* L3DATA and L3CODE share L3 resources */ + if ( i == RDT_RESOURCE_L3CODE ) + continue;
+ resctrl = virResCtrlGet(i); + + if(resctrl->enabled) { + for( j = 0; j < resctrl->num_banks; j++) + { + if(VIR_RESIZE_N(caps->host.cachebank, caps->host.ncachebank_max, + caps->host.ncachebank, 1) < 0) + return -1; + + if(VIR_ALLOC(bank) < 0) + return -1; + + bank->id = resctrl->cache_banks[j].host_id; + if(VIR_STRDUP(bank->type, resctrl->cache_level) < 0) + goto err; + if(VIR_STRDUP(bank->cpus, virBitmapFormat(resctrl->cache_banks[j].cpu_mask)) < 0) + goto err; + bank->size = resctrl->cache_banks[j].cache_size; + /*L3DATA and L3CODE shares L3 cache resources, so fill them to the control element*/ + if ( i == RDT_RESOURCE_L3DATA ) { + if(VIR_EXPAND_N(bank->control, bank->ncontrol, 2) < 0) + goto err; + + bank->control[0].min = virResCtrlGet(RDT_RESOURCE_L3DATA)->cache_banks[j].cache_min; + bank->control[0].reserved = bank->control[0].min * (virResCtrlGet(RDT_RESOURCE_L3DATA)->min_cbm_bits); + if(VIR_STRDUP(bank->control[0].scope, + virResCtrlGet(RDT_RESOURCE_L3DATA)->name) < 0) + goto err; + + bank->control[1].min = virResCtrlGet(RDT_RESOURCE_L3CODE)->cache_banks[j].cache_min; + bank->control[1].reserved = bank->control[1].min * (virResCtrlGet(RDT_RESOURCE_L3CODE)->min_cbm_bits); + if(VIR_STRDUP(bank->control[1].scope, + virResCtrlGet(RDT_RESOURCE_L3CODE)->name) < 0) + goto err; + } + else { + if(VIR_EXPAND_N(bank->control, bank->ncontrol, 1) < 0) + goto err; + bank->control[0].min = resctrl->cache_banks[j].cache_min; + bank->control[0].reserved = bank->control[0].min * resctrl->min_cbm_bits; + if(VIR_STRDUP(bank->control[0].scope, resctrl->name) < 0) + goto err; + } + caps->host.cachebank[caps->host.ncachebank++] = bank; + } + } + } + return 0; +err: + VIR_FREE(bank); + return -1; +}
I don't think this code should be in the QEMU driver - better to have it in nodeinfo.c so it is common to all drivers.
Sure, I will move it to nodeinfo.c
Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://entangle-photo.org -o- http://search.cpan.org/~danberr/ :|

This patch adds new xml element to support cache tune as: <cputune> ... <cachetune id='1' host_id='0' type='l3' size='2816' unit='KiB'/> ... </cputune> id: any non-minus number host_id: reference of the host's cache banks id, it's from capabilities type: cache bank type size: should be multiples of the min_size of the bank on host. --- docs/schemas/domaincommon.rng | 41 +++++++++++++ src/conf/domain_conf.c | 134 ++++++++++++++++++++++++++++++++++++++++++ src/conf/domain_conf.h | 18 ++++++ src/libvirt_private.syms | 3 + src/util/virresctrl.c | 25 ++++++++ src/util/virresctrl.h | 3 +- 6 files changed, 223 insertions(+), 1 deletion(-) diff --git a/docs/schemas/domaincommon.rng b/docs/schemas/domaincommon.rng index cc6e0d0..d0f9e54 100644 --- a/docs/schemas/domaincommon.rng +++ b/docs/schemas/domaincommon.rng @@ -795,6 +795,27 @@ </attribute> </element> </zeroOrMore> + <zeroOrMore> + <element name="cachetune"> + <attribute name="id"> + <ref name="cacheid"/> + </attribute> + <attribute name="host_id"> + <ref name="hostid"/> + </attribute> + <attribute name="type"> + <ref name="cachetype"/> + </attribute> + <attribute name="size"> + <ref name="unsignedInt"/> + </attribute> + <optional> + <attribute name="unit"> + <ref name="cacheunit"/> + </attribute> + </optional> + </element> + </zeroOrMore> <optional> <element name="emulatorpin"> <attribute name="cpuset"> @@ -5451,6 +5472,26 @@ <param name="minInclusive">-1</param> </data> </define> + <define name="cacheid"> + <data type="unsignedShort"> + <param name="pattern">[0-9]+</param> + </data> + </define> + <define name="hostid"> + <data type="unsignedShort"> + <param name="pattern">[0-9]+</param> + </data> + </define> + <define name="cachetype"> + <data type="string"> + <param name="pattern">(l3)</param> + </data> + </define> + <define name="cacheunit"> + <data type="string"> + <param name="pattern">KiB</param> + </data> + </define> <!-- weight currently is in range [100, 1000] --> <define name="weight"> <data type="unsignedInt"> diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index c06b128..0304b36 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -56,6 +56,7 @@ #include "virstring.h" #include "virnetdev.h" #include "virhostdev.h" +#include "virresctrl.h" #define VIR_FROM_THIS VIR_FROM_DOMAIN @@ -15604,6 +15605,116 @@ virDomainVcpuPinDefParseXML(virDomainDefPtr def, return ret; } +/* Parse the XML definition for cachetune + * and a cachetune has the form + * <cachetune id='0' host_id='0' type='l3' size='1024' unit='KiB'/> + */ +static int +virDomainCacheTuneDefParseXML(virDomainDefPtr def, + int n, + xmlNodePtr* nodes) +{ + char* tmp = NULL; + int i; + size_t j; + int type = -1 ; + virDomainCacheBankPtr bank = NULL; + virResCtrlPtr resctrl; + + if (VIR_ALLOC_N(bank, n) < 0) + goto cleanup; + + for (i = 0; i < n; i++) { + if (!(tmp = virXMLPropString(nodes[i], "id"))) { + virReportError(VIR_ERR_XML_ERROR, "%s", _("missing id in cache tune")); + goto cleanup; + } + if (virStrToLong_uip(tmp, NULL, 10, &(bank[i].id)) < 0) { + virReportError(VIR_ERR_XML_ERROR, + _("invalid setting for cache id '%s'"), tmp); + goto cleanup; + } + + VIR_FREE(tmp); + if (!(tmp = virXMLPropString(nodes[i], "host_id"))) { + virReportError(VIR_ERR_XML_ERROR, "%s", _("missing host id in cache tune")); + goto cleanup; + } + if (virStrToLong_uip(tmp, NULL, 10, &(bank[i].host_id)) < 0) { + virReportError(VIR_ERR_XML_ERROR, + _("invalid setting for cache host id '%s'"), tmp); + goto cleanup; + } + VIR_FREE(tmp); + + if (!(tmp = virXMLPropString(nodes[i], "size"))) { + virReportError(VIR_ERR_XML_ERROR, "%s", _("missing size in cache tune")); + goto cleanup; + } + if (virStrToLong_ull(tmp, NULL, 10, &(bank[i].size)) < 0) { + virReportError(VIR_ERR_XML_ERROR, + _("invalid setting for cache size '%s'"), tmp); + goto cleanup; + } + VIR_FREE(tmp); + + if (!(tmp = virXMLPropString(nodes[i], "type"))) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("missing cache type")); + goto cleanup; + } + + if ((type = virResCtrlTypeFromString(tmp)) < 0) { + virReportError(VIR_ERR_XML_ERROR, + _("'unsupported cache type '%s'"), tmp); + goto cleanup; + } + + resctrl = virResCtrlGet(type); + + if(resctrl == NULL || !resctrl->enabled) { + virReportError(VIR_ERR_XML_ERROR, + _("'host doesn't enabled cache type '%s'"), tmp); + goto cleanup; + } + + bool found_host_id = false; + /* Loop for banks to search host_id */ + for(j = 0; j < resctrl->num_banks; j++) { + if (resctrl->cache_banks[j].host_id == bank[i].host_id) { + found_host_id = true; + break; + } + } + + if (! found_host_id) { + virReportError(VIR_ERR_XML_ERROR, + _("'cache bank's host id %u not found on the host"), + bank[i].host_id); + goto cleanup; + } + + if (bank[i].size == 0 || + bank[i].size % resctrl->cache_banks[j].cache_min != 0) { + virReportError(VIR_ERR_XML_ERROR, + _("'the size shoud be multiplies of '%llu'"), + resctrl->cache_banks[j].cache_min); + goto cleanup; + } + + if (VIR_STRDUP(bank[i].type, tmp) < 0) + goto cleanup; + def->cachetune.cache_banks = bank; + } + + def->cachetune.n_banks = n; + return 0; + +cleanup: + VIR_FREE(bank); + VIR_FREE(tmp); + return -1; +} /* Parse the XML definition for a iothreadpin * and an iothreadspin has the form @@ -16882,6 +16993,13 @@ virDomainDefParseXML(xmlDocPtr xml, } VIR_FREE(nodes); + if ((n = virXPathNodeSet("./cputune/cachetune", ctxt, &nodes)) < 0) + goto error; + + if (virDomainCacheTuneDefParseXML(def, n ,nodes) < 0) + goto error; + VIR_FREE(nodes); + if ((n = virXPathNodeSet("./cputune/emulatorpin", ctxt, &nodes)) < 0) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("cannot extract emulatorpin nodes")); @@ -23398,6 +23516,20 @@ virDomainSchedulerFormat(virBufferPtr buf, } +static void +virDomainCacheTuneDefFormat(virBufferPtr buf, + virDomainCachetunePtr cache) +{ + size_t i; + for(i = 0; i < cache->n_banks; i ++) { + virBufferAsprintf(buf, "<cachetune id='%u' host_id='%u' " + "type='%s' size='%llu' unit='KiB'/>\n", + cache->cache_banks[i].id, + cache->cache_banks[i].host_id, + cache->cache_banks[i].type, + cache->cache_banks[i].size); + } +} static int virDomainCputuneDefFormat(virBufferPtr buf, @@ -23461,6 +23593,8 @@ virDomainCputuneDefFormat(virBufferPtr buf, VIR_FREE(cpumask); } + virDomainCacheTuneDefFormat(&childrenBuf, &def->cachetune); + if (def->cputune.emulatorpin) { char *cpumask; virBufferAddLit(&childrenBuf, "<emulatorpin "); diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index 507ace8..49e01ac 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -2132,6 +2132,22 @@ struct _virDomainMemtune { unsigned long long swap_hard_limit; /* in kibibytes, limit at off_t bytes */ }; +typedef struct _virDomainCacheBank virDomainCacheBank; +typedef virDomainCacheBank *virDomainCacheBankPtr; +struct _virDomainCacheBank { + unsigned int id; + unsigned int host_id; + unsigned long long size; + char* type; +}; + +typedef struct _virDomainCachetune virDomainCachetune; +typedef virDomainCachetune *virDomainCachetunePtr; +struct _virDomainCachetune { + size_t n_banks; + virDomainCacheBankPtr cache_banks; +}; + typedef struct _virDomainPowerManagement virDomainPowerManagement; typedef virDomainPowerManagement *virDomainPowerManagementPtr; @@ -2196,6 +2212,8 @@ struct _virDomainDef { virDomainCputune cputune; + virDomainCachetune cachetune; + virDomainNumaPtr numa; virDomainResourceDefPtr resource; virDomainIdMapDef idmap; diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index d93b775..08a0bc4 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -2317,6 +2317,9 @@ virRandomInt; virResCtrlAvailable; virResCtrlInit; virResCtrlGet; +virResCtrlTypeFromString; +virResCtrlTypeToString; + # util/virrotatingfile.h virRotatingFileReaderConsume; diff --git a/src/util/virresctrl.c b/src/util/virresctrl.c index 63bc808..76d4ab8 100644 --- a/src/util/virresctrl.c +++ b/src/util/virresctrl.c @@ -41,6 +41,31 @@ VIR_LOG_INIT("util.resctrl"); #define VIR_FROM_THIS VIR_FROM_RESCTRL +VIR_ENUM_IMPL(virResCtrl, RDT_NUM_RESOURCES, + "l3", "l3data", "l3code", "l2"); + +#define CONSTRUCT_RESCTRL_PATH(domain_name, item_name) \ +do { \ + if (NULL == domain_name) { \ + if (asprintf(&path, "%s/%s", RESCTRL_DIR, item_name) < 0)\ + return -1; \ + } \ + else { \ + if (asprintf(&path, "%s/%s/%s", RESCTRL_DIR, \ + domain_name, \ + item_name) < 0) \ + return -1; \ + } \ +} while(0) + +#define VIR_RESCTRL_ENABLED(type) \ + ResCtrlAll[type].enabled + +#define VIR_RESCTRL_GET_SCHEMATA(count) ((1 << count) - 1) + +#define VIR_RESCTRL_SET_SCHEMATA(p, type, pos, val) \ + p->schematas[type]->schemata_items[pos] = val + static unsigned int host_id = 0; static virResCtrl ResCtrlAll[] = { diff --git a/src/util/virresctrl.h b/src/util/virresctrl.h index f713e66..3cc41da 100644 --- a/src/util/virresctrl.h +++ b/src/util/virresctrl.h @@ -42,9 +42,10 @@ enum { RDT_RESOURCE_L3CODE, RDT_RESOURCE_L2, /* Must be the last */ - RDT_NUM_RESOURCES, + RDT_NUM_RESOURCES }; +VIR_ENUM_DECL(virResCtrl); typedef struct _virResCacheBank virResCacheBank; typedef virResCacheBank *virResCacheBankPtr; -- 1.9.1

virResCtrlSetCacheBanks: Set cache banks of a libvirt domain. It will create new resource domain under `/sys/fs/resctrl` and fill the schemata according the cache banks configration. virResCtrlUpdate: Update the schemata after libvirt domain destroy. Signed-off-by: Eli Qiao <liyong.qiao@intel.com> --- src/libvirt_private.syms | 2 + src/util/virresctrl.c | 644 ++++++++++++++++++++++++++++++++++++++++++++++- src/util/virresctrl.h | 47 +++- 3 files changed, 691 insertions(+), 2 deletions(-) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 08a0bc4..2b3278a 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -2319,6 +2319,8 @@ virResCtrlInit; virResCtrlGet; virResCtrlTypeFromString; virResCtrlTypeToString; +virResCtrlSetCacheBanks; +virResCtrlUpdate; # util/virrotatingfile.h diff --git a/src/util/virresctrl.c b/src/util/virresctrl.c index 76d4ab8..002b539 100644 --- a/src/util/virresctrl.c +++ b/src/util/virresctrl.c @@ -68,6 +68,11 @@ do { \ static unsigned int host_id = 0; +/* Global static struct to be maintained which is a interface */ +static virResCtrlDomain DomainAll; + +/* Global static struct array to be maintained which indicate + * resource status on a host */ static virResCtrl ResCtrlAll[] = { { .name = "L3", @@ -87,6 +92,64 @@ static virResCtrl ResCtrlAll[] = { }, }; +/* + * How many bits is set in schemata + * eg: + * virResCtrlBitsNum(1011) = 2 */ +static int virResCtrlBitsContinuesNum(unsigned schemata) { + int ret = 0; + for (int i = 0; i < MAX_CBM_BIT_LEN; i ++) { + if ((schemata & 0x1) == 0x1) + ret++; + else + break; + schemata = schemata >> 1; + if (schemata == 0) break; + } + return ret; +} + +static int virResCtrlGetStr(const char *domain_name, const char *item_name, char **ret) +{ + char *path; + int rc = 0; + + CONSTRUCT_RESCTRL_PATH(domain_name, item_name); + + if (virFileReadAll(path, MAX_FILE_LEN, ret) < 0) { + rc = -1; + goto cleanup; + } + +cleanup: + VIR_FREE(path); + return rc; +} + +static int virResCtrlGetTasks(const char *domain_name, char **pids) +{ + return virResCtrlGetStr(domain_name, "tasks", pids); +} + +static int virResCtrlGetSchemata(const int type, const char *name, char **schemata) +{ + int rc; + char *tmp, *end; + char *buf; + + if ((rc = virResCtrlGetStr(name, "schemata", &buf)) < 0) + return rc; + + tmp = strstr(buf, ResCtrlAll[type].name); + end = strchr(tmp, '\n'); + *end = '\0'; + if(VIR_STRDUP(*schemata, tmp) < 0) + rc = -1; + + VIR_FREE(buf); + return rc; +} + static int virResCtrlGetInfoStr(const int type, const char *item, char **str) { int ret = 0; @@ -109,6 +172,70 @@ cleanup: return ret; } +/* Return pointer of and ncount of schemata*/ +static virResSchemataPtr virParseSchemata(const char* schemata_str, int* ncount) +{ + const char *p, *q; + int pos; + int ischemata; + virResSchemataPtr schemata; + virResSchemataItemPtr schemataitems, tmpitem; + unsigned int socket_no = 0; + char *tmp; + + if(VIR_ALLOC(schemata) < 0) + goto cleanup; + + p = q = schemata_str; + pos = strchr(schemata_str, ':') - p; + + /* calculate cpu socket count */ + *ncount = 1; + while((q = strchr(p, ';')) != 0) { + p = q + 1; + (*ncount)++; + } + + /* allocat an arrry to store schemata for each socket*/ + if(VIR_ALLOC_N_QUIET(tmpitem, *ncount) < 0) + goto cleanup; + + schemataitems = tmpitem; + + p = q = schemata_str + pos + 1; + + while(*p != '\0'){ + if (*p == '='){ + q = p + 1; + + tmpitem->socket_no = socket_no++; + + while(*p != ';' && *p != '\0') p++; + + if (VIR_STRNDUP(tmp, q, p-q) < 0) + goto cleanup; + + if (virStrToLong_i(tmp, NULL, 16, &ischemata) < 0) + goto cleanup; + + VIR_FREE(tmp); + tmp = NULL; + tmpitem->schemata = ischemata; + tmpitem ++; + schemata->n_schemata_items +=1; + } + p++; + } + + schemata->schemata_items = schemataitems; + return schemata; + +cleanup: + VIR_FREE(schemata); + VIR_FREE(tmpitem); + return NULL; +} + static int virResCtrlGetCPUValue(const char* path, char** value) { @@ -252,6 +379,7 @@ static virResCacheBankPtr virResCtrlGetCacheBanks(int type, int* n_sockets) } bank[s_id].cache_size = cache_size; bank[s_id].cache_min = cache_size / ResCtrlAll[type].cbm_len; + bank[s_id].cache_left = cache_size - (bank[s_id].cache_min * ResCtrlAll[type].min_cbm_bits); } } return bank; @@ -268,7 +396,7 @@ static int virResCtrlGetConfig(int type) int i; char *str; - /* Read min_cbm_bits from resctrl. + /* Read num_closids from resctrl. eg: /sys/fs/resctrl/info/L3/num_closids */ if ((ret = virResCtrlGetInfoStr(type, "num_closids", &str)) < 0) { @@ -325,6 +453,515 @@ static int virResCtrlGetConfig(int type) return ret; } +/* Remove the Domain from sysfs, this should only success no pids in tasks + * of a partition. + */ +static +int virRscctrlRemoveDomain(const char *name) +{ + char *path = NULL; + int rc = 0; + + if ((rc = asprintf(&path, "%s/%s", RESCTRL_DIR, name)) < 0) + return rc; + rc = rmdir(path); + VIR_FREE(path); + return rc; +} + +/* assemble schemata string*/ +static +char* virResCtrlAssembleSchemata(virResSchemataPtr schemata, int type) +{ + virBuffer buf = VIR_BUFFER_INITIALIZER; + + virBufferAsprintf(&buf, "%s:%u=%x", ResCtrlAll[type].name, + schemata->schemata_items[0].socket_no, + schemata->schemata_items[0].schemata); + + for(int i = 1; i < schemata->n_schemata_items; i++) { + virBufferAsprintf(&buf, ";%u=%x", + schemata->schemata_items[i].socket_no, + schemata->schemata_items[i].schemata); + } + + return virBufferContentAndReset(&buf); +} + +/* Refresh default domains' schemata + */ +static +int virResCtrlRefreshSchemata(void) +{ + int i; + int j; + int k; + unsigned int tmp_schemata; + unsigned int default_schemata; + + virResDomainPtr header, p; + + header = DomainAll.domains; + + for (i = 0; i < RDT_NUM_RESOURCES; i++) { + if (VIR_RESCTRL_ENABLED(i)) { + for(j = 0; j < header->schematas[i]->n_schemata_items; j ++) { + p = header->next; + default_schemata = VIR_RESCTRL_GET_SCHEMATA(ResCtrlAll[i].cbm_len); + tmp_schemata = 0; + /* NOTEs: if only header domain, the schemata will be set to default one*/ + for (k = 1; k < DomainAll.num_domains; k++) { + tmp_schemata |= p->schematas[i]->schemata_items[j].schemata; + p = p->next; + } + /* sys fs doens't let us use 0 */ + int min_bits = VIR_RESCTRL_GET_SCHEMATA(ResCtrlAll[i].min_cbm_bits); + if((tmp_schemata & min_bits) == min_bits) + tmp_schemata -= min_bits; + + default_schemata ^= tmp_schemata; + + int bitsnum = virResCtrlBitsContinuesNum(default_schemata); + // calcuate header's schemata + // NOTES: resctrl sysfs only allow us to set a continues schemata + header->schematas[i]->schemata_items[j].schemata = VIR_RESCTRL_GET_SCHEMATA(bitsnum); + ResCtrlAll[i].cache_banks[j].cache_left = + (bitsnum - ResCtrlAll[i].min_cbm_bits) * ResCtrlAll[i].cache_banks[j].cache_min; + } + } + } + + return 0; + +} + +/* Refresh all domains', remove the domains which has no task ids. + * This will be used after VM pause, restart, destroy etc. + */ +static int +virResCtrlRefresh(void) +{ + int i; + char* tasks; + unsigned int origin_count = DomainAll.num_domains; + virResDomainPtr p, pre, del=NULL; + pre = DomainAll.domains; + p = pre->next; + + for (i = 1; i < origin_count; i++) { + if(virResCtrlGetTasks(p->name, &tasks) < 0) { + VIR_WARN("Failed to get tasks from %s", p->name); + pre = p; + p = p->next; + } + if(virStringIsEmpty(tasks)) { + pre->next = p->next; + if(p->next != NULL) + p->next->pre = pre; + + del = p; + p = p->next; + if(virRscctrlRemoveDomain(del->name) < 0) + VIR_WARN("Failed to remove partition %s", p->name); + + VIR_DEBUG("Remove partition %s", del->name); + + VIR_FREE(del->name); + VIR_FREE(del->tasks); + VIR_FREE(del); + del = NULL; + + DomainAll.num_domains -=1; + } else { + pre = p; + p = p->next; + } + VIR_FREE(tasks); + + } + + return virResCtrlRefreshSchemata(); +} + +/* Get a domain ptr by domain's name*/ +static +virResDomainPtr virResCtrlGetDomain(const char* name) { + int i; + virResDomainPtr p = DomainAll.domains; + for(i = 0; i < DomainAll.num_domains; i++) + { + if((p->name) && (strcmp(name, p->name) == 0)) { + return p; + } + p = p->next; + } + return NULL; +} + +static int +virResCtrlAddTask(virResDomainPtr dom, pid_t pid) +{ + virBuffer buf = VIR_BUFFER_INITIALIZER; + if(dom->tasks == NULL) { + virBufferAsprintf(&buf, "%lld\n", (long long)pid); + } else { + virBufferAsprintf(&buf, "%s%lld\n", dom->tasks, (long long)pid); + VIR_FREE(dom->tasks); + } + dom->tasks = virBufferContentAndReset(&buf); + return 0; +} + +static int +virResCtrlWrite(const char* name, const char* item, const char* content) +{ + char* path; + int writefd; + int rc = -1; + + CONSTRUCT_RESCTRL_PATH(name, item); + + if (!virFileExists(path)) + goto cleanup; + + if ((writefd = open(path, O_WRONLY | O_APPEND, S_IRUSR | S_IWUSR)) < 0) + goto cleanup; + + if (safewrite(writefd, content, strlen(content)) < 0) + goto cleanup; + + rc = 0; +cleanup: + VIR_FREE(path); + VIR_FORCE_CLOSE(writefd); + return rc; +} + +/* if name == NULL we load default schemata */ +static +virResDomainPtr virResCtrlLoadDomain(const char* name) +{ + char* schematas; + char* tasks = NULL; + virResDomainPtr p; + + if(VIR_ALLOC(p) < 0) + goto cleanup; + + if(name != NULL && virResCtrlGetTasks(name, &tasks) < 0) + goto cleanup; + + for(int i = 0; i < RDT_NUM_RESOURCES; i++) { + if(VIR_RESCTRL_ENABLED(i)) { + if (virResCtrlGetSchemata(i, name, &schematas) < 0) + goto cleanup; + p->schematas[i] = virParseSchemata(schematas, &(p->n_sockets)); + VIR_FREE(schematas); + } + } + + p->tasks = tasks; + + if((name != NULL) && (VIR_STRDUP(p->name, name)) < 0) + goto cleanup; + + return p; + +cleanup: + VIR_FREE(p); + return NULL; +} + +static +virResDomainPtr virResCtrlCreateDomain(const char* name) +{ + char *path; + mode_t mode = 0755; + virResDomainPtr p; + if (asprintf(&path, "%s/%s", RESCTRL_DIR, name) < 0) + return NULL; + if (virDirCreate(path, mode, 0, 0, 0) < 0) + goto cleanup; + + if((p = virResCtrlLoadDomain(name)) == NULL) + return p; + + /* sys fs doens't let us use 0. + * reset schemata to min_bits*/ + for(int i = 0; i < RDT_NUM_RESOURCES; i++) { + if(VIR_RESCTRL_ENABLED(i)) { + int min_bits = VIR_RESCTRL_GET_SCHEMATA(ResCtrlAll[i].min_cbm_bits); + for(int j = 0; j < p->n_sockets; j++) + p->schematas[i]->schemata_items[j].schemata = min_bits; + } + } + + VIR_FREE(path); + return p; + +cleanup: + VIR_FREE(path); + return NULL; +} + +static +int virResCtrlDestroyDomain(virResDomainPtr p) +{ + char *path; + if (asprintf(&path, "%s/%s", RESCTRL_DIR, p->name) < 0) + return -1; + rmdir(path); + + VIR_FREE(p->name); + p->name = NULL; + VIR_FREE(p->tasks); + VIR_FREE(p); + p = NULL; + return 0; +} + +/* flush domains's information to sysfs*/ +static int +virResCtrlFlushDomainToSysfs(virResDomainPtr dom) +{ + int i; + char* schemata; + char* tmp; + int rc = -1; + virBuffer buf = VIR_BUFFER_INITIALIZER; + + for(i = 0; i < RDT_NUM_RESOURCES; i++) { + if(VIR_RESCTRL_ENABLED(i)) { + tmp = virResCtrlAssembleSchemata(dom->schematas[i], i); + virBufferAsprintf(&buf, "%s\n", tmp); + VIR_FREE(tmp); + } + } + + schemata = virBufferContentAndReset(&buf); + + if(virResCtrlWrite(dom->name, "schemata", schemata) < 0) + goto cleanup; + if(!virStringIsEmpty(dom->tasks) + && virResCtrlWrite(dom->name, "tasks", dom->tasks) < 0) + goto cleanup; + rc = 0; + +cleanup: + VIR_FREE(schemata); + return rc; +} + +static virResDomainPtr virResCtrlGetAllDomains(unsigned int *len) +{ + struct dirent *ent; + DIR *dp = NULL; + int direrr; + + *len = 0; + virResDomainPtr header, tmp, tmp_pre; + header = tmp = tmp_pre = NULL; + if (virDirOpenQuiet(&dp, RESCTRL_DIR) < 0) { + if (errno == ENOENT) + return NULL; + VIR_ERROR(_("Unable to open %s (%d)"), RESCTRL_DIR, errno); + goto cleanup; + } + + header = virResCtrlLoadDomain(NULL); + if (header == NULL) + goto cleanup; + + header->next = NULL; + + *len = 1; + + while ((direrr = virDirRead(dp, &ent, NULL)) > 0) { + if ((ent->d_type != DT_DIR) || STREQ(ent->d_name, "info")) + continue; + + tmp = virResCtrlLoadDomain(ent->d_name); + if (tmp == NULL) + goto cleanup; + + tmp->next = NULL; + + if(header->next == NULL) + header->next = tmp; + + if(tmp_pre == NULL) + tmp->pre = header; + else { + tmp->pre = tmp_pre; + tmp_pre->next = tmp; + } + + tmp_pre = tmp; + (*len) ++; + } + return header; + +cleanup: + VIR_DIR_CLOSE(dp); + tmp_pre = tmp = header; + while(tmp) { + tmp_pre = tmp; + tmp = tmp->next; + VIR_FREE(tmp_pre); + } + return NULL; +} + +static int +virResCtrlAppendDomain(virResDomainPtr dom) +{ + virResDomainPtr p = DomainAll.domains; + while(p->next != NULL) p=p->next; + p->next = dom; + dom->pre = p; + DomainAll.num_domains +=1; + return 0; +} + +static int +virResCtrlGetSocketIdByHostID(int type, unsigned int hostid) +{ + int i; + for( i = 0; i < ResCtrlAll[type].num_banks; i++) { + if(ResCtrlAll[type].cache_banks[i].host_id == hostid) + return i; + } + return -1; +} + +static int +virResCtrlCalculateSchemata(int type, + int sid, + unsigned hostid, + unsigned long long size) +{ + int i; + int count; + virResDomainPtr p; + unsigned int tmp_schemata; + unsigned int schemata_sum = 0; + + if(ResCtrlAll[type].cache_banks[sid].cache_left < size) { + VIR_ERROR("Note enough cache left on bank %u", hostid); + return -1; + } + if ((count = size / ResCtrlAll[type].cache_banks[sid].cache_min) <= 0) { + VIR_ERROR("Error cache size %llu", size); + return -1; + } + + tmp_schemata = VIR_RESCTRL_GET_SCHEMATA(count); + + p = DomainAll.domains; + p = p->next; + for (i = 1; i < DomainAll.num_domains; i ++) { + schemata_sum |= p->schematas[type]->schemata_items[sid].schemata; + p = p->next; + } + + tmp_schemata = tmp_schemata << (ResCtrlAll[type].cbm_len - count); + + while ((tmp_schemata & schemata_sum) != 0) + tmp_schemata = tmp_schemata >> 1; + return tmp_schemata; +} + +int virResCtrlSetCacheBanks(virDomainCachetunePtr cachetune, + unsigned char* uuid, pid_t pid) +{ + size_t i; + char name[VIR_UUID_STRING_BUFLEN]; + virResDomainPtr p; + int type; + int sid; + int schemata; + + virUUIDFormat(uuid, name); + + for(i = 0 ; i < cachetune->n_banks; i++) { + VIR_DEBUG("cache_banks %u, %u, %llu, %s", + cachetune->cache_banks[i].id, + cachetune->cache_banks[i].host_id, + cachetune->cache_banks[i].size, + cachetune->cache_banks[i].type); + } + + if (cachetune->n_banks < 1) + return 0; + + p = virResCtrlGetDomain(name); + if (p == NULL) { + VIR_DEBUG("no domain name %s found, create new one!", name); + p = virResCtrlCreateDomain(name); + } + + if (p != NULL) { + for(i = 0 ; i < cachetune->n_banks; i++) { + if ((type = virResCtrlTypeFromString( + cachetune->cache_banks[i].type)) < 0 ) { + VIR_WARN("Ignore unknown cache type %s.", + cachetune->cache_banks[i].type); + continue; + } + + if((sid = virResCtrlGetSocketIdByHostID( + type, cachetune->cache_banks[i].host_id)) < 0) { + VIR_WARN("Can not find cache bank host id %u.", + cachetune->cache_banks[i].host_id); + continue; + } + + if((schemata = virResCtrlCalculateSchemata( + type, sid, cachetune->cache_banks[i].host_id, + cachetune->cache_banks[i].size)) < 0) { + VIR_WARN("Failed to set schemata for cache bank id %u", + cachetune->cache_banks[i].id); + continue; + } + + p->schematas[type]->schemata_items[sid].schemata = schemata; + } + + virResCtrlAddTask(p, pid); + + if(virResCtrlFlushDomainToSysfs(p) < 0) { + VIR_WARN("failed to flush domain %s to sysfs", name); + virResCtrlDestroyDomain(p); + return -1; + } + virResCtrlAppendDomain(p); + } else { + VIR_ERROR("Failed to create a domain in sysfs"); + return -1; + } + + virResCtrlRefresh(); + /* after refresh, flush header's schemata changes to sys fs */ + if(virResCtrlFlushDomainToSysfs(DomainAll.domains) < 0) + VIR_WARN("failed to flush domain to sysfs"); + + return 0; +} + +/* Should be called after pid disappeared, we recalculate + * schemata of default and flush it to sys fs. + */ +int virResCtrlUpdate(void) { + int rc; + if ((rc = virResCtrlRefresh()) < 0) + VIR_WARN("failed to refresh rescontrol"); + + if ((rc = virResCtrlFlushDomainToSysfs(DomainAll.domains)) < 0) + VIR_WARN("failed to flush domain to sysfs"); + + return rc; +} + int virResCtrlInit(void) { int i = 0; char *tmp; @@ -341,6 +978,11 @@ int virResCtrlInit(void) { VIR_FREE(tmp); } + + DomainAll.domains = virResCtrlGetAllDomains(&(DomainAll.num_domains)); + + if((rc = virResCtrlRefresh()) < 0) + VIR_WARN("failed to refresh resource control"); return rc; } diff --git a/src/util/virresctrl.h b/src/util/virresctrl.h index 3cc41da..11f43d8 100644 --- a/src/util/virresctrl.h +++ b/src/util/virresctrl.h @@ -26,6 +26,7 @@ # include "virutil.h" # include "virbitmap.h" +# include "domain_conf.h" #define RESCTRL_DIR "/sys/fs/resctrl" #define RESCTRL_INFO_DIR "/sys/fs/resctrl/info" @@ -82,9 +83,53 @@ struct _virResCtrl { virResCacheBankPtr cache_banks; }; +/** + * a virResSchemata represents a schemata object under a resource control + * domain. + */ +typedef struct _virResSchemataItem virResSchemataItem; +typedef virResSchemataItem *virResSchemataItemPtr; +struct _virResSchemataItem { + unsigned int socket_no; + unsigned schemata; +}; + +typedef struct _virResSchemata virResSchemata; +typedef virResSchemata *virResSchemataPtr; +struct _virResSchemata { + unsigned int n_schemata_items; + virResSchemataItemPtr schemata_items; +}; + +/** + * a virResDomain represents a resource control domain. It's a double linked + * list. + */ + +typedef struct _virResDomain virResDomain; +typedef virResDomain *virResDomainPtr; + +struct _virResDomain { + char* name; + virResSchemataPtr schematas[RDT_NUM_RESOURCES]; + char* tasks; + int n_sockets; + virResDomainPtr pre; + virResDomainPtr next; +}; + +/* All resource control domains on this host*/ + +typedef struct _virResCtrlDomain virResCtrlDomain; +typedef virResCtrlDomain *virResCtrlDomainPtr; +struct _virResCtrlDomain { + unsigned int num_domains; + virResDomainPtr domains; +}; bool virResCtrlAvailable(void); int virResCtrlInit(void); virResCtrlPtr virResCtrlGet(int); - +int virResCtrlSetCacheBanks(virDomainCachetunePtr, unsigned char*, pid_t); +int virResCtrlUpdate(void); #endif -- 1.9.1

On Mon, Feb 06, 2017 at 10:23:39AM +0800, Eli Qiao wrote:
virResCtrlSetCacheBanks: Set cache banks of a libvirt domain. It will create new resource domain under `/sys/fs/resctrl` and fill the schemata according the cache banks configration.
virResCtrlUpdate: Update the schemata after libvirt domain destroy.
Signed-off-by: Eli Qiao <liyong.qiao@intel.com> --- src/libvirt_private.syms | 2 + src/util/virresctrl.c | 644 ++++++++++++++++++++++++++++++++++++++++++++++- src/util/virresctrl.h | 47 +++- 3 files changed, 691 insertions(+), 2 deletions(-)
diff --git a/src/util/virresctrl.h b/src/util/virresctrl.h index 3cc41da..11f43d8 100644 --- a/src/util/virresctrl.h +++ b/src/util/virresctrl.h @@ -26,6 +26,7 @@
# include "virutil.h" # include "virbitmap.h" +# include "domain_conf.h"
#define RESCTRL_DIR "/sys/fs/resctrl" #define RESCTRL_INFO_DIR "/sys/fs/resctrl/info" @@ -82,9 +83,53 @@ struct _virResCtrl { virResCacheBankPtr cache_banks; };
+/** + * a virResSchemata represents a schemata object under a resource control + * domain. + */ +typedef struct _virResSchemataItem virResSchemataItem; +typedef virResSchemataItem *virResSchemataItemPtr; +struct _virResSchemataItem { + unsigned int socket_no; + unsigned schemata; +}; + +typedef struct _virResSchemata virResSchemata; +typedef virResSchemata *virResSchemataPtr; +struct _virResSchemata { + unsigned int n_schemata_items; + virResSchemataItemPtr schemata_items; +}; + +/** + * a virResDomain represents a resource control domain. It's a double linked + * list. + */ + +typedef struct _virResDomain virResDomain; +typedef virResDomain *virResDomainPtr; + +struct _virResDomain { + char* name; + virResSchemataPtr schematas[RDT_NUM_RESOURCES]; + char* tasks; + int n_sockets; + virResDomainPtr pre; + virResDomainPtr next; +}; + +/* All resource control domains on this host*/ + +typedef struct _virResCtrlDomain virResCtrlDomain; +typedef virResCtrlDomain *virResCtrlDomainPtr; +struct _virResCtrlDomain { + unsigned int num_domains; + virResDomainPtr domains; +};
I don't think any of these need to be in the header file - they're all private structs used only by the .c file. The biggest issue I see here is a complete lack of any kind of mutex locking. Libvirt is multi-threaded, so you must expect to have virResCtrlSetCacheBanks() and virResCtrlUpdate called concurrently and thus use mutexes to ensure safety. With the data structure design of using linked list of virResDomain though, doing good locking is going to be hard. It'll force you to have a single global mutex across the whole set of data structures which will really harm concurrency for VM startup. Really each virResDomain needs to be completely standalone, with its own dedicate mutex. A global mutex for virResCtrlDomain can be acquired whle you lookup the virResDomain object to use. Thereafter the global mutex should be released and only the mutex for the specific domain used.
bool virResCtrlAvailable(void); int virResCtrlInit(void); virResCtrlPtr virResCtrlGet(int); - +int virResCtrlSetCacheBanks(virDomainCachetunePtr, unsigned char*, pid_t); +int virResCtrlUpdate(void);
This API design doesn't really make locking very easy - since you are not passing any info into the virResCtrlUpdate() method you've created the need to do global rescans when updating. IMHO virResCtrlSetCacheBanks needs to return an object that represents the config for that particular VM. This can be passed back in, when the guest is shutdown to release resources once again, avoiding any global scans. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://entangle-photo.org -o- http://search.cpan.org/~danberr/ :|

-- Eli Qiao Sent with Sparrow (http://www.sparrowmailapp.com/?sig) On Tuesday, 7 February 2017 at 12:17 AM, Daniel P. Berrange wrote:
On Mon, Feb 06, 2017 at 10:23:39AM +0800, Eli Qiao wrote:
virResCtrlSetCacheBanks: Set cache banks of a libvirt domain. It will create new resource domain under `/sys/fs/resctrl` and fill the schemata according the cache banks configration.
virResCtrlUpdate: Update the schemata after libvirt domain destroy.
Signed-off-by: Eli Qiao <liyong.qiao@intel.com (mailto:liyong.qiao@intel.com)> --- src/libvirt_private.syms | 2 + src/util/virresctrl.c | 644 ++++++++++++++++++++++++++++++++++++++++++++++- src/util/virresctrl.h | 47 +++- 3 files changed, 691 insertions(+), 2 deletions(-)
diff --git a/src/util/virresctrl.h b/src/util/virresctrl.h index 3cc41da..11f43d8 100644 --- a/src/util/virresctrl.h +++ b/src/util/virresctrl.h @@ -26,6 +26,7 @@
# include "virutil.h" # include "virbitmap.h" +# include "domain_conf.h"
#define RESCTRL_DIR "/sys/fs/resctrl" #define RESCTRL_INFO_DIR "/sys/fs/resctrl/info" @@ -82,9 +83,53 @@ struct _virResCtrl { virResCacheBankPtr cache_banks; };
+/** + * a virResSchemata represents a schemata object under a resource control + * domain. + */ +typedef struct _virResSchemataItem virResSchemataItem; +typedef virResSchemataItem *virResSchemataItemPtr; +struct _virResSchemataItem { + unsigned int socket_no; + unsigned schemata; +}; + +typedef struct _virResSchemata virResSchemata; +typedef virResSchemata *virResSchemataPtr; +struct _virResSchemata { + unsigned int n_schemata_items; + virResSchemataItemPtr schemata_items; +}; + +/** + * a virResDomain represents a resource control domain. It's a double linked + * list. + */ + +typedef struct _virResDomain virResDomain; +typedef virResDomain *virResDomainPtr; + +struct _virResDomain { + char* name; + virResSchemataPtr schematas[RDT_NUM_RESOURCES]; + char* tasks; + int n_sockets; + virResDomainPtr pre; + virResDomainPtr next; +}; + +/* All resource control domains on this host*/ + +typedef struct _virResCtrlDomain virResCtrlDomain; +typedef virResCtrlDomain *virResCtrlDomainPtr; +struct _virResCtrlDomain { + unsigned int num_domains; + virResDomainPtr domains; +};
I don't think any of these need to be in the header file - they're all private structs used only by the .c file.
Yep.
The biggest issue I see here is a complete lack of any kind of mutex locking. Libvirt is multi-threaded, so you must expect to have virResCtrlSetCacheBanks() and virResCtrlUpdate called concurrently and thus use mutexes to ensure safety.
okay.
With the data structure design of using linked list of virResDomain though, doing good locking is going to be hard. It'll force you to have a single global mutex across the whole set of data structures which will really harm concurrency for VM startup.
Really each virResDomain needs to be completely standalone, with its own dedicate mutex. A global mutex for virResCtrlDomain can be acquired whle you lookup the virResDomain object to use. Thereafter the global mutex should be released and only the mutex for the specific domain used.
oh, yes, but lock is really a hard thing, I need to be careful to avoid dead lock.
bool virResCtrlAvailable(void); int virResCtrlInit(void); virResCtrlPtr virResCtrlGet(int); - +int virResCtrlSetCacheBanks(virDomainCachetunePtr, unsigned char*, pid_t); +int virResCtrlUpdate(void);
This API design doesn't really make locking very easy - since you are not passing any info into the virResCtrlUpdate() method you've created the need to do global rescans when updating.
yes, what if I use a global mutex variable in virresctrl.c?
IMHO virResCtrlSetCacheBanks needs to return an object that represents the config for that particular VM. This can be passed back in, when the guest is shutdown to release resources once again, avoiding any global scans.
Hmm.. what object should virResCtrlSetCacheBanks return? schemata setting? I expect that when the VM reboot, we recalculate from cachetune(in xml setting) again, that because we are not sure if the host can offer the VM for enough cache when it restart again.
Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://entangle-photo.org -o- http://search.cpan.org/~danberr/ :|

On Tue, Feb 07, 2017 at 02:43:04PM +0800, Eli Qiao wrote:
On Tuesday, 7 February 2017 at 12:17 AM, Daniel P. Berrange wrote:
On Mon, Feb 06, 2017 at 10:23:39AM +0800, Eli Qiao wrote:
virResCtrlSetCacheBanks: Set cache banks of a libvirt domain. It will create new resource domain under `/sys/fs/resctrl` and fill the schemata according the cache banks configration.
virResCtrlUpdate: Update the schemata after libvirt domain destroy.
Signed-off-by: Eli Qiao <liyong.qiao@intel.com (mailto:liyong.qiao@intel.com)> --- src/libvirt_private.syms | 2 + src/util/virresctrl.c | 644 ++++++++++++++++++++++++++++++++++++++++++++++- src/util/virresctrl.h | 47 +++- 3 files changed, 691 insertions(+), 2 deletions(-)
diff --git a/src/util/virresctrl.h b/src/util/virresctrl.h index 3cc41da..11f43d8 100644 --- a/src/util/virresctrl.h +++ b/src/util/virresctrl.h @@ -26,6 +26,7 @@
# include "virutil.h" # include "virbitmap.h" +# include "domain_conf.h"
#define RESCTRL_DIR "/sys/fs/resctrl" #define RESCTRL_INFO_DIR "/sys/fs/resctrl/info" @@ -82,9 +83,53 @@ struct _virResCtrl { virResCacheBankPtr cache_banks; };
+/** + * a virResSchemata represents a schemata object under a resource control + * domain. + */ +typedef struct _virResSchemataItem virResSchemataItem; +typedef virResSchemataItem *virResSchemataItemPtr; +struct _virResSchemataItem { + unsigned int socket_no; + unsigned schemata; +}; + +typedef struct _virResSchemata virResSchemata; +typedef virResSchemata *virResSchemataPtr; +struct _virResSchemata { + unsigned int n_schemata_items; + virResSchemataItemPtr schemata_items; +}; + +/** + * a virResDomain represents a resource control domain. It's a double linked + * list. + */ + +typedef struct _virResDomain virResDomain; +typedef virResDomain *virResDomainPtr; + +struct _virResDomain { + char* name; + virResSchemataPtr schematas[RDT_NUM_RESOURCES]; + char* tasks; + int n_sockets; + virResDomainPtr pre; + virResDomainPtr next; +}; + +/* All resource control domains on this host*/ + +typedef struct _virResCtrlDomain virResCtrlDomain; +typedef virResCtrlDomain *virResCtrlDomainPtr; +struct _virResCtrlDomain { + unsigned int num_domains; + virResDomainPtr domains; +};
I don't think any of these need to be in the header file - they're all private structs used only by the .c file.
Yep.
The biggest issue I see here is a complete lack of any kind of mutex locking. Libvirt is multi-threaded, so you must expect to have virResCtrlSetCacheBanks() and virResCtrlUpdate called concurrently and thus use mutexes to ensure safety.
okay.
With the data structure design of using linked list of virResDomain though, doing good locking is going to be hard. It'll force you to have a single global mutex across the whole set of data structures which will really harm concurrency for VM startup.
Really each virResDomain needs to be completely standalone, with its own dedicate mutex. A global mutex for virResCtrlDomain can be acquired whle you lookup the virResDomain object to use. Thereafter the global mutex should be released and only the mutex for the specific domain used.
oh, yes, but lock is really a hard thing, I need to be careful to avoid dead lock.
bool virResCtrlAvailable(void); int virResCtrlInit(void); virResCtrlPtr virResCtrlGet(int); - +int virResCtrlSetCacheBanks(virDomainCachetunePtr, unsigned char*, pid_t); +int virResCtrlUpdate(void);
This API design doesn't really make locking very easy - since you are not passing any info into the virResCtrlUpdate() method you've created the need to do global rescans when updating.
yes, what if I use a global mutex variable in virresctrl.c?
I'd like to see finer grained locking if possible. We try really hard to make guest startup be highly parallizeable, so any global locks that will be required by all VMs starting hurts that.
IMHO virResCtrlSetCacheBanks needs to return an object that represents the config for that particular VM. This can be passed back in, when the guest is shutdown to release resources once again, avoiding any global scans.
Hmm.. what object should virResCtrlSetCacheBanks return? schemata setting?
Well it might not need to return an object neccessarily. Perhaps you can just pass the VM PID into the method when your shutdown instead, and have a hash table keyed off PID to lookup the data structure needed to cleanup.
I expect that when the VM reboot, we recalculate from cachetune(in xml setting) again, that because we are not sure if the host can offer the VM for enough cache when it restart again.
You shouldn't need to care about reboot as a concept in these APIs. From the QEMU driver POV, a cold reboot will just be done as a stop followed by start. So these low level cache APIs just need to cope with start+stop. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://entangle-photo.org -o- http://search.cpan.org/~danberr/ :|

-- Eli Qiao Sent with Sparrow (http://www.sparrowmailapp.com/?sig) On Tuesday, 7 February 2017 at 6:15 PM, Daniel P. Berrange wrote:
On Tue, Feb 07, 2017 at 02:43:04PM +0800, Eli Qiao wrote:
On Tuesday, 7 February 2017 at 12:17 AM, Daniel P. Berrange wrote:
On Mon, Feb 06, 2017 at 10:23:39AM +0800, Eli Qiao wrote:
virResCtrlSetCacheBanks: Set cache banks of a libvirt domain. It will create new resource domain under `/sys/fs/resctrl` and fill the schemata according the cache banks configration.
virResCtrlUpdate: Update the schemata after libvirt domain destroy.
Signed-off-by: Eli Qiao <liyong.qiao@intel.com (mailto:liyong.qiao@intel.com)> --- src/libvirt_private.syms | 2 + src/util/virresctrl.c | 644 ++++++++++++++++++++++++++++++++++++++++++++++- src/util/virresctrl.h | 47 +++- 3 files changed, 691 insertions(+), 2 deletions(-)
diff --git a/src/util/virresctrl.h b/src/util/virresctrl.h index 3cc41da..11f43d8 100644 --- a/src/util/virresctrl.h +++ b/src/util/virresctrl.h @@ -26,6 +26,7 @@
# include "virutil.h" # include "virbitmap.h" +# include "domain_conf.h"
#define RESCTRL_DIR "/sys/fs/resctrl" #define RESCTRL_INFO_DIR "/sys/fs/resctrl/info" @@ -82,9 +83,53 @@ struct _virResCtrl { virResCacheBankPtr cache_banks; };
+/** + * a virResSchemata represents a schemata object under a resource control + * domain. + */ +typedef struct _virResSchemataItem virResSchemataItem; +typedef virResSchemataItem *virResSchemataItemPtr; +struct _virResSchemataItem { + unsigned int socket_no; + unsigned schemata; +}; + +typedef struct _virResSchemata virResSchemata; +typedef virResSchemata *virResSchemataPtr; +struct _virResSchemata { + unsigned int n_schemata_items; + virResSchemataItemPtr schemata_items; +}; + +/** + * a virResDomain represents a resource control domain. It's a double linked + * list. + */ + +typedef struct _virResDomain virResDomain; +typedef virResDomain *virResDomainPtr; + +struct _virResDomain { + char* name; + virResSchemataPtr schematas[RDT_NUM_RESOURCES]; + char* tasks; + int n_sockets; + virResDomainPtr pre; + virResDomainPtr next; +}; + +/* All resource control domains on this host*/ + +typedef struct _virResCtrlDomain virResCtrlDomain; +typedef virResCtrlDomain *virResCtrlDomainPtr; +struct _virResCtrlDomain { + unsigned int num_domains; + virResDomainPtr domains; +};
I don't think any of these need to be in the header file - they're all private structs used only by the .c file.
Yep.
The biggest issue I see here is a complete lack of any kind of mutex locking. Libvirt is multi-threaded, so you must expect to have virResCtrlSetCacheBanks() and virResCtrlUpdate called concurrently and thus use mutexes to ensure safety.
okay.
With the data structure design of using linked list of virResDomain though, doing good locking is going to be hard. It'll force you to have a single global mutex across the whole set of data structures which will really harm concurrency for VM startup.
Really each virResDomain needs to be completely standalone, with its own dedicate mutex. A global mutex for virResCtrlDomain can be acquired whle you lookup the virResDomain object to use. Thereafter the global mutex should be released and only the mutex for the specific domain used.
oh, yes, but lock is really a hard thing, I need to be careful to avoid dead lock.
bool virResCtrlAvailable(void); int virResCtrlInit(void); virResCtrlPtr virResCtrlGet(int); - +int virResCtrlSetCacheBanks(virDomainCachetunePtr, unsigned char*, pid_t); +int virResCtrlUpdate(void);
This API design doesn't really make locking very easy - since you are not passing any info into the virResCtrlUpdate() method you've created the need to do global rescans when updating.
yes, what if I use a global mutex variable in virresctrl.c?
I'd like to see finer grained locking if possible. We try really hard to make guest startup be highly parallizeable, so any global locks that will be required by all VMs starting hurts that.
hi Daniel, thanks for your reply, hmm.. I know what’s your mean, but just have no idea how to add a finer mutex/locking when you boot a VM and require cache allocation, we need to get the cache information on host, which is a global value, every VM should share it and modify it after the allocation, then flush changes to /sys/fs/resctrl. which is to say there should be one operation on cache allocation at one time. the process may be like: 1) start a VM1 2) grain locking/mutex, get cache left information on host ——— 1) start VM2 3) calculate the schemata of this VM and flush it to /sys/fs/resctrl ——— 2) wait for locking 4) update global cache left information 3) wait for locking 5) release lock/mutex, 4) grain locking/mutex, get cache left information on host 6) VM1 started 5) calculate the schemata of this VM and flush it to /sys/fs/resctrl 6) update .. 7) release locking.. VM2 should wait after VM1 flush schemata to /sys/fs/resctrl, or it will cause racing...
IMHO virResCtrlSetCacheBanks needs to return an object that represents the config for that particular VM. This can be passed back in, when the guest is shutdown to release resources once again, avoiding any global scans.
Hmm.. what object should virResCtrlSetCacheBanks return? schemata setting?
Well it might not need to return an object neccessarily. Perhaps you can just pass the VM PID into the method when your shutdown instead, and have a hash table keyed off PID to lookup the data structure needed to cleanup.
not only cleanup the struct, but still need to calculate the default schemata of host (release resources to host)
I expect that when the VM reboot, we recalculate from cachetune(in xml setting) again, that because we are not sure if the host can offer the VM for enough cache when it restart again.
You shouldn't need to care about reboot as a concept in these APIs. From the QEMU driver POV, a cold reboot will just be done as a stop followed by start. So these low level cache APIs just need to cope with start+stop.
Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://entangle-photo.org -o- http://search.cpan.org/~danberr/ :|

Set cache banks while booting a new domain. Signed-off-by: Eli Qiao <liyong.qiao@intel.com> --- src/qemu/qemu_process.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 184440d..f29630f 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -76,6 +76,7 @@ #include "configmake.h" #include "nwfilter_conf.h" #include "netdev_bandwidth_conf.h" +#include "virresctrl.h" #define VIR_FROM_THIS VIR_FROM_QEMU @@ -5714,6 +5715,12 @@ qemuProcessLaunch(virConnectPtr conn, qemuProcessAutoDestroyAdd(driver, vm, conn) < 0) goto cleanup; + VIR_DEBUG("Cache allocation"); + + if (virResCtrlAvailable() && + virResCtrlSetCacheBanks(&(vm->def->cachetune), vm->def->uuid, vm->pid) < 0) + goto cleanup; + ret = 0; cleanup: @@ -6216,6 +6223,10 @@ void qemuProcessStop(virQEMUDriverPtr driver, virPerfFree(priv->perf); priv->perf = NULL; + if(virResCtrlAvailable() && virResCtrlUpdate() < 0) + VIR_WARN("Failed to update resource control for %s", + vm->def->name); + qemuProcessRemoveDomainStatus(driver, vm); /* Remove VNC and Spice ports from port reservation bitmap, but only if -- 1.9.1

Enable l3code/l3data while doing cache tune. l3code/l3data should use a continus cbm in their seperated schemata and the cache size are shared between them, so we need to deal them differently with l3 cache. This should enable cdp feature while mounting /sys/fs/resctrl, eg: mount -t resctrl resctrl -o cdp /sys/fs/resctrl Signed-off-by: Eli Qiao <liyong.qiao@intel.com> --- docs/schemas/domaincommon.rng | 2 +- src/util/virresctrl.c | 24 +++++++++++++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/docs/schemas/domaincommon.rng b/docs/schemas/domaincommon.rng index d0f9e54..70ce20c 100644 --- a/docs/schemas/domaincommon.rng +++ b/docs/schemas/domaincommon.rng @@ -5484,7 +5484,7 @@ </define> <define name="cachetype"> <data type="string"> - <param name="pattern">(l3)</param> + <param name="pattern">(l3|l3code|l3data)</param> </data> </define> <define name="cacheunit"> diff --git a/src/util/virresctrl.c b/src/util/virresctrl.c index 002b539..812bbc7 100644 --- a/src/util/virresctrl.c +++ b/src/util/virresctrl.c @@ -498,6 +498,7 @@ int virResCtrlRefreshSchemata(void) int k; unsigned int tmp_schemata; unsigned int default_schemata; + int pair_type = 0; virResDomainPtr header, p; @@ -505,6 +506,12 @@ int virResCtrlRefreshSchemata(void) for (i = 0; i < RDT_NUM_RESOURCES; i++) { if (VIR_RESCTRL_ENABLED(i)) { + + if (i == RDT_RESOURCE_L3DATA) + pair_type = RDT_RESOURCE_L3CODE; + if (i == RDT_RESOURCE_L3CODE) + pair_type = RDT_RESOURCE_L3DATA; + for(j = 0; j < header->schematas[i]->n_schemata_items; j ++) { p = header->next; default_schemata = VIR_RESCTRL_GET_SCHEMATA(ResCtrlAll[i].cbm_len); @@ -512,6 +519,8 @@ int virResCtrlRefreshSchemata(void) /* NOTEs: if only header domain, the schemata will be set to default one*/ for (k = 1; k < DomainAll.num_domains; k++) { tmp_schemata |= p->schematas[i]->schemata_items[j].schemata; + if(pair_type > 0) + tmp_schemata |= p->schematas[pair_type]->schemata_items[j].schemata; p = p->next; } /* sys fs doens't let us use 0 */ @@ -845,6 +854,7 @@ virResCtrlCalculateSchemata(int type, virResDomainPtr p; unsigned int tmp_schemata; unsigned int schemata_sum = 0; + int pair_type = 0; if(ResCtrlAll[type].cache_banks[sid].cache_left < size) { VIR_ERROR("Note enough cache left on bank %u", hostid); @@ -859,8 +869,19 @@ virResCtrlCalculateSchemata(int type, p = DomainAll.domains; p = p->next; + + /* for type is l3code and l3data, we need to deal them specially*/ + + if (type == RDT_RESOURCE_L3DATA) + pair_type = RDT_RESOURCE_L3CODE; + + if (type == RDT_RESOURCE_L3CODE) + pair_type = RDT_RESOURCE_L3DATA; + for (i = 1; i < DomainAll.num_domains; i ++) { schemata_sum |= p->schematas[type]->schemata_items[sid].schemata; + if(pair_type > 0) + schemata_sum |= p->schematas[pair_type]->schemata_items[sid].schemata; p = p->next; } @@ -901,6 +922,8 @@ int virResCtrlSetCacheBanks(virDomainCachetunePtr cachetune, } if (p != NULL) { + virResCtrlAppendDomain(p); + for(i = 0 ; i < cachetune->n_banks; i++) { if ((type = virResCtrlTypeFromString( cachetune->cache_banks[i].type)) < 0 ) { @@ -934,7 +957,6 @@ int virResCtrlSetCacheBanks(virDomainCachetunePtr cachetune, virResCtrlDestroyDomain(p); return -1; } - virResCtrlAppendDomain(p); } else { VIR_ERROR("Failed to create a domain in sysfs"); return -1; -- 1.9.1

l3data and l3code type of cache banks should be configured pairs. Signed-off-by: Eli Qiao <liyong.qiao@intel.com> --- src/conf/domain_conf.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 0304b36..e3434e7 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -15620,9 +15620,13 @@ virDomainCacheTuneDefParseXML(virDomainDefPtr def, int type = -1 ; virDomainCacheBankPtr bank = NULL; virResCtrlPtr resctrl; + /* A Array to make sure l3code and l3data are pairs*/ + int* sem = NULL; if (VIR_ALLOC_N(bank, n) < 0) goto cleanup; + if (VIR_ALLOC_N(sem, MAX_CPU_SOCKET_NUM) < 0) + goto cleanup; for (i = 0; i < n; i++) { if (!(tmp = virXMLPropString(nodes[i], "id"))) { @@ -15670,6 +15674,12 @@ virDomainCacheTuneDefParseXML(virDomainDefPtr def, goto cleanup; } + /* RDT_RESOURCE_L3DATA and RDT_RESOURCE_L3CODE should be pair */ + if (type == RDT_RESOURCE_L3DATA) + sem[bank[i].host_id] ++; + else if (type == RDT_RESOURCE_L3CODE) + sem[bank[i].host_id] --; + resctrl = virResCtrlGet(type); if(resctrl == NULL || !resctrl->enabled) { @@ -15707,12 +15717,22 @@ virDomainCacheTuneDefParseXML(virDomainDefPtr def, def->cachetune.cache_banks = bank; } + for(i = 0 ; i < MAX_CPU_SOCKET_NUM; i ++) { + if(sem[i] != 0) { + virReportError(VIR_ERR_XML_ERROR, + _("'l3code and l3data shoud be show up pairs on bank %d'"), + i); + goto cleanup; + } + } + def->cachetune.n_banks = n; return 0; cleanup: VIR_FREE(bank); VIR_FREE(tmp); + VIR_FREE(sem); return -1; } -- 1.9.1

On Mon, Feb 06, 2017 at 10:23:35AM +0800, Eli Qiao wrote:
This series patches are for supportting CAT featues, which also called cache tune in libvirt.
First to expose cache information which could be tuned in capabilites XML. Then add new domain xml element support to add cacahe bank which will apply on this libvirt domain.
This series patches add a util file `resctrl.c/h`, an interface to talk with linux kernel's sys fs.
There are still some TODOs such as expose new public interface to get free cache information.
Some discussion about this feature support can be found from: https://www.redhat.com/archives/libvir-list/2017-January/msg00644.html
Two comments: 1) Please perform appropriate filesystem locking when accessing resctrlfs, as described at: http://www.spinics.net/lists/linux-tip-commits/msg36754.html 2) <cachetune id='10' host_id='1' type='l3' size='3072' unit='KiB'/> [b4c270b5-e0f9-4106-a446-69032872ed7d]# cat tasks 8654 [b4c270b5-e0f9-4106-a446-69032872ed7d]# pstree -p | grep qemu |-qemu-kvm(8654)-+-{qemu-kvm}(8688) | |-{qemu-kvm}(8692) | `-{qemu-kvm}(8693) Should add individual vcpus to the "tasks" file, not the main QEMU process. The NFV usecase requires exclusive CAT allocation for the vcpu which runs the sensitive workload. Perhaps: <cachetune id='10' host_id='1' type='l3' size='3072' unit='KiB'/> Adds all vcpus that are pinned to the socket which cachebank with host_id=1. <cachetune id='10' host_id='1' type='l3' size='3072' unit='KiB' vcpus=2,3/> Adds PID of vcpus 2 and 3 to resctrl directory created for this allocation.

On Mon, Feb 06, 2017 at 01:33:09PM -0200, Marcelo Tosatti wrote:
On Mon, Feb 06, 2017 at 10:23:35AM +0800, Eli Qiao wrote:
This series patches are for supportting CAT featues, which also called cache tune in libvirt.
First to expose cache information which could be tuned in capabilites XML. Then add new domain xml element support to add cacahe bank which will apply on this libvirt domain.
This series patches add a util file `resctrl.c/h`, an interface to talk with linux kernel's sys fs.
There are still some TODOs such as expose new public interface to get free cache information.
Some discussion about this feature support can be found from: https://www.redhat.com/archives/libvir-list/2017-January/msg00644.html
Two comments:
1) Please perform appropriate filesystem locking when accessing resctrlfs, as described at:
http://www.spinics.net/lists/linux-tip-commits/msg36754.html
2)
<cachetune id='10' host_id='1' type='l3' size='3072' unit='KiB'/>
[b4c270b5-e0f9-4106-a446-69032872ed7d]# cat tasks 8654 [b4c270b5-e0f9-4106-a446-69032872ed7d]# pstree -p | grep qemu |-qemu-kvm(8654)-+-{qemu-kvm}(8688) | |-{qemu-kvm}(8692) | `-{qemu-kvm}(8693)
Should add individual vcpus to the "tasks" file, not the main QEMU process.
The NFV usecase requires exclusive CAT allocation for the vcpu which runs the sensitive workload.
Perhaps:
<cachetune id='10' host_id='1' type='l3' size='3072' unit='KiB'/>
Adds all vcpus that are pinned to the socket which cachebank with host_id=1.
<cachetune id='10' host_id='1' type='l3' size='3072' unit='KiB' vcpus=2,3/>
Adds PID of vcpus 2 and 3 to resctrl directory created for this allocation.
3) CDP / non-CDP convertion. In case the size determination has been performed with non-CDP, to emulate such allocation on a CDP host, it would be good to allow both code and data allocations to share the CBM space: <cachetune id='10' host_id='1' type='l3data' size='3072' unit='KiB'/> <cachetune id='10' host_id='1' type='l3code' size='3072' unit='KiB'/> Perhaps if using the same ID? Other than that, testing looks good.

-- Eli Qiao Sent with Sparrow (http://www.sparrowmailapp.com/?sig) On Tuesday, 7 February 2017 at 3:03 AM, Marcelo Tosatti wrote:
On Mon, Feb 06, 2017 at 01:33:09PM -0200, Marcelo Tosatti wrote:
On Mon, Feb 06, 2017 at 10:23:35AM +0800, Eli Qiao wrote:
This series patches are for supportting CAT featues, which also called cache tune in libvirt.
First to expose cache information which could be tuned in capabilites XML. Then add new domain xml element support to add cacahe bank which will apply on this libvirt domain.
This series patches add a util file `resctrl.c/h`, an interface to talk with linux kernel's sys fs.
There are still some TODOs such as expose new public interface to get free cache information.
Some discussion about this feature support can be found from: https://www.redhat.com/archives/libvir-list/2017-January/msg00644.html
Two comments:
1) Please perform appropriate filesystem locking when accessing resctrlfs, as described at:
http://www.spinics.net/lists/linux-tip-commits/msg36754.html
Sure.
2)
<cachetune id='10' host_id='1' type='l3' size='3072' unit='KiB'/>
[b4c270b5-e0f9-4106-a446-69032872ed7d]# cat tasks 8654 [b4c270b5-e0f9-4106-a446-69032872ed7d]# pstree -p | grep qemu |-qemu-kvm(8654)-+-{qemu-kvm}(8688) | |-{qemu-kvm}(8692) | `-{qemu-kvm}(8693)
Should add individual vcpus to the "tasks" file, not the main QEMU process.
The NFV usecase requires exclusive CAT allocation for the vcpu which runs the sensitive workload.
Perhaps:
<cachetune id='10' host_id='1' type='l3' size='3072' unit='KiB'/>
Adds all vcpus that are pinned to the socket which cachebank with host_id=1.
<cachetune id='10' host_id='1' type='l3' size='3072' unit='KiB' vcpus=2,3/>
Adds PID of vcpus 2 and 3 to resctrl directory created for this allocation.
Hmm.. in this case, we need to figure out what’s the pid of vcpus=2 and vcpu=3 and added them to the resctrl directory. currently, I create a resctrl directory(called resctrl domain) for a VM so just put all task ids to it. this is my though: let say the vm has vcpus=0 1 2 3, and you want to let 0, 1 benefit cache on host_id=0, and 2, 3 on host_id=1 you will do: 1) pin vcpus 0, 1 on the cpus of socket 0 pin vcpus 2, 3 on the cpus of socket 1 this can be done in vcputune 2) define cache tune like this: <cachetune id='0' host_id=‘0' type='l3' size='3072' unit='KiB'/> <cachetune id='1' host_id='1' type='l3' size='3072' unit='KiB'/> in libvirt: we create a resctrl directory naming with the VM’s uuid and set schemata for each socket 0, and socket 1, put all qemu tasks ids into tasks file, this will work fine. please be note that in a resctrl directory, we can define schemata for each socket id separately.
3) CDP / non-CDP convertion.
In case the size determination has been performed with non-CDP, to emulate such allocation on a CDP host, it would be good to allow both code and data allocations to share the CBM space:
IOM, I don’t think it’s good to have this. in libvirt capabilities xml, the application will get to know if the host support cdp or not.
<cachetune id='10' host_id='1' type='l3data' size='3072' unit='KiB'/> <cachetune id='10' host_id='1' type='l3code' size='3072' unit='KiB'/>
Perhaps if using the same ID?
I am open to hear about what other’s say,
Other than that, testing looks good.
Thanks for the testing.

On Tue, Feb 07, 2017 at 02:43:13PM +0800, Eli Qiao wrote:
3) CDP / non-CDP convertion.
In case the size determination has been performed with non-CDP, to emulate such allocation on a CDP host, it would be good to allow both code and data allocations to share the CBM space:
IOM, I don’t think it’s good to have this. in libvirt capabilities xml, the application will get to know if the host support cdp or not.
Yep, as long as the capabilities XML provide info about the different cache banks, IMHO it is better to have the application be explicit about what they want for CDP & non-CDP scenarios. Let the higher level mgmt apps above libvirt apply specific policies if they desire Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://entangle-photo.org -o- http://search.cpan.org/~danberr/ :|

On Tue, Feb 07, 2017 at 10:17:37AM +0000, Daniel P. Berrange wrote:
On Tue, Feb 07, 2017 at 02:43:13PM +0800, Eli Qiao wrote:
3) CDP / non-CDP convertion.
In case the size determination has been performed with non-CDP, to emulate such allocation on a CDP host, it would be good to allow both code and data allocations to share the CBM space:
IOM, I don’t think it’s good to have this. in libvirt capabilities xml, the application will get to know if the host support cdp or not.
Yep, as long as the capabilities XML provide info about the different cache banks, IMHO it is better to have the application be explicit about what they want for CDP & non-CDP scenarios. Let the higher level mgmt apps above libvirt apply specific policies if they desire
There is no policy being applied here. What i mean is the following: 1) User determines that the type of the CAT allocation necessary for his application is one which shares cache and data, that is non-CDP (either because he didnt have a CDP machine at the time, or because he had a CDP machine but sharing data and code cache turns out to be efficient for the application). Say that this measured size is M. 2) A host with CDP enabled resctrl is used for this VM. How to create a CAT allocation with shared code and data? Have to write to the schemata file of the VM the following: L3data:1=0x00ff;... L3code:1=0x00ff;... (that is, the data and code CBM masks use the same bits). 3) How is the user going to achieve that with this patchset today? AFAIK, he can't. What he can do is the following: <cachetune id='1' host_id='0' type='l3code' size='M' unit='KiB'/> <cachetune id='2' host_id='0' type='l3data' size='M' unit='KiB'/> But this will allocate the following schemata file: L3data:1=0xff00;... L3code:1=0x00ff;... Which is not what is wanted. Therefore the suggestion to _share the cbm bit space_ in case a similar "cachetune id" is used. (maybe have a different syntax, i don't care, as long as the user can create code / data CAT allocations that share the CBM space). Does that make sense now?

On Tue, Feb 07, 2017 at 02:43:13PM +0800, Eli Qiao wrote:
-- Eli Qiao Sent with Sparrow (http://www.sparrowmailapp.com/?sig)
On Tuesday, 7 February 2017 at 3:03 AM, Marcelo Tosatti wrote:
On Mon, Feb 06, 2017 at 01:33:09PM -0200, Marcelo Tosatti wrote:
On Mon, Feb 06, 2017 at 10:23:35AM +0800, Eli Qiao wrote:
This series patches are for supportting CAT featues, which also called cache tune in libvirt.
First to expose cache information which could be tuned in capabilites XML. Then add new domain xml element support to add cacahe bank which will apply on this libvirt domain.
This series patches add a util file `resctrl.c/h`, an interface to talk with linux kernel's sys fs.
There are still some TODOs such as expose new public interface to get free cache information.
Some discussion about this feature support can be found from: https://www.redhat.com/archives/libvir-list/2017-January/msg00644.html
Two comments:
1) Please perform appropriate filesystem locking when accessing resctrlfs, as described at:
http://www.spinics.net/lists/linux-tip-commits/msg36754.html
Sure.
2)
<cachetune id='10' host_id='1' type='l3' size='3072' unit='KiB'/>
[b4c270b5-e0f9-4106-a446-69032872ed7d]# cat tasks 8654 [b4c270b5-e0f9-4106-a446-69032872ed7d]# pstree -p | grep qemu |-qemu-kvm(8654)-+-{qemu-kvm}(8688) | |-{qemu-kvm}(8692) | `-{qemu-kvm}(8693)
Should add individual vcpus to the "tasks" file, not the main QEMU process.
The NFV usecase requires exclusive CAT allocation for the vcpu which runs the sensitive workload.
Perhaps:
<cachetune id='10' host_id='1' type='l3' size='3072' unit='KiB'/>
Adds all vcpus that are pinned to the socket which cachebank with host_id=1.
<cachetune id='10' host_id='1' type='l3' size='3072' unit='KiB' vcpus=2,3/>
Adds PID of vcpus 2 and 3 to resctrl directory created for this allocation.
Hmm.. in this case, we need to figure out what’s the pid of vcpus=2 and vcpu=3 and added them to the resctrl directory.
Yes and only the pids of vcpus=2 and vcpus=3, not of any other vcpus.
currently, I create a resctrl directory(called resctrl domain) for a VM so just put all task ids to it.
this is my though:
let say the vm has vcpus=0 1 2 3, and you want to let 0, 1 benefit cache on host_id=0, and 2, 3 on host_id=1
you will do:
1) pin vcpus 0, 1 on the cpus of socket 0 pin vcpus 2, 3 on the cpus of socket 1 this can be done in vcputune
2) define cache tune like this: <cachetune id='0' host_id=‘0' type='l3' size='3072' unit='KiB'/> <cachetune id='1' host_id='1' type='l3' size='3072' unit='KiB'/>
in libvirt: we create a resctrl directory naming with the VM’s uuid and set schemata for each socket 0, and socket 1, put all qemu tasks ids into tasks file, this will work fine.
No, please don't do this.
please be note that in a resctrl directory, we can define schemata for each socket id separately.
Please do not put vcpus automatically into the reservations. Its necessary to have certain vcpus in a reservation and some not. For example: 2 vcpu guest, vcpu0 pinned to socket 0 cpu0, vcpu1 pinned to socket 0 cpu1. <cachetune id='0' host_id=‘0' type='l3' size='3072' unit='KiB'/> We want _only_ vcpu1 to be part of this reservation, and not vcpu0 (want vcpu0 to use the default group, ie. schemata file at /sys/fs/resctrl/schemata So please have the ability to add vcpus to the XML syntax: <cachetune id='0' host_id=‘0' type='l3' size='3072' unit='KiB' vcpus='1'/> or <cachetune id='0' host_id=‘0' type='l3' size='3072' unit='KiB' vcpus='2,3'/> This also allows different sizes to be specified.
3) CDP / non-CDP convertion.
In case the size determination has been performed with non-CDP, to emulate such allocation on a CDP host, it would be good to allow both code and data allocations to share the CBM space:
IOM, I don’t think it’s good to have this. in libvirt capabilities xml, the application will get to know if the host support cdp or not.
<cachetune id='10' host_id='1' type='l3data' size='3072' unit='KiB'/> <cachetune id='10' host_id='1' type='l3code' size='3072' unit='KiB'/>
Perhaps if using the same ID?
I am open to hear about what other’s say,
Other than that, testing looks good.
Thanks for the testing.

-- Eli Qiao Sent with Sparrow (http://www.sparrowmailapp.com/?sig) On Tuesday, 7 February 2017 at 7:56 PM, Marcelo Tosatti wrote:
On Tue, Feb 07, 2017 at 02:43:13PM +0800, Eli Qiao wrote:
-- Eli Qiao Sent with Sparrow (http://www.sparrowmailapp.com/?sig)
On Tuesday, 7 February 2017 at 3:03 AM, Marcelo Tosatti wrote:
On Mon, Feb 06, 2017 at 01:33:09PM -0200, Marcelo Tosatti wrote:
On Mon, Feb 06, 2017 at 10:23:35AM +0800, Eli Qiao wrote:
This series patches are for supportting CAT featues, which also called cache tune in libvirt.
First to expose cache information which could be tuned in capabilites XML. Then add new domain xml element support to add cacahe bank which will apply on this libvirt domain.
This series patches add a util file `resctrl.c/h`, an interface to talk with linux kernel's sys fs.
There are still some TODOs such as expose new public interface to get free cache information.
Some discussion about this feature support can be found from: https://www.redhat.com/archives/libvir-list/2017-January/msg00644.html
Two comments:
1) Please perform appropriate filesystem locking when accessing resctrlfs, as described at:
http://www.spinics.net/lists/linux-tip-commits/msg36754.html
Sure.
2)
<cachetune id='10' host_id='1' type='l3' size='3072' unit='KiB'/>
[b4c270b5-e0f9-4106-a446-69032872ed7d]# cat tasks 8654 [b4c270b5-e0f9-4106-a446-69032872ed7d]# pstree -p | grep qemu |-qemu-kvm(8654)-+-{qemu-kvm}(8688) | |-{qemu-kvm}(8692) | `-{qemu-kvm}(8693)
Should add individual vcpus to the "tasks" file, not the main QEMU process.
The NFV usecase requires exclusive CAT allocation for the vcpu which runs the sensitive workload.
Perhaps:
<cachetune id='10' host_id='1' type='l3' size='3072' unit='KiB'/>
Adds all vcpus that are pinned to the socket which cachebank with host_id=1.
<cachetune id='10' host_id='1' type='l3' size='3072' unit='KiB' vcpus=2,3/>
Adds PID of vcpus 2 and 3 to resctrl directory created for this allocation.
Hmm.. in this case, we need to figure out what’s the pid of vcpus=2 and vcpu=3 and added them to the resctrl directory.
Yes and only the pids of vcpus=2 and vcpus=3, not of any other vcpus.
currently, I create a resctrl directory(called resctrl domain) for a VM so just put all task ids to it.
this is my though:
let say the vm has vcpus=0 1 2 3, and you want to let 0, 1 benefit cache on host_id=0, and 2, 3 on host_id=1
you will do:
1) pin vcpus 0, 1 on the cpus of socket 0 pin vcpus 2, 3 on the cpus of socket 1 this can be done in vcputune
2) define cache tune like this: <cachetune id='0' host_id=‘0' type='l3' size='3072' unit='KiB'/> <cachetune id='1' host_id='1' type='l3' size='3072' unit='KiB'/>
in libvirt: we create a resctrl directory naming with the VM’s uuid and set schemata for each socket 0, and socket 1, put all qemu tasks ids into tasks file, this will work fine.
No, please don't do this.
please be note that in a resctrl directory, we can define schemata for each socket id separately.
Please do not put vcpus automatically into the reservations. Its necessary to have certain vcpus in a reservation and some not.
For example: 2 vcpu guest, vcpu0 pinned to socket 0 cpu0, vcpu1 pinned to socket 0 cpu1.
<cachetune id='0' host_id=‘0' type='l3' size='3072' unit='KiB'/>
We want _only_ vcpu1 to be part of this reservation, and not vcpu0 (want vcpu0 to use the default group, ie. schemata file at
/sys/fs/resctrl/schemata
So please have the ability to add vcpus to the XML syntax:
<cachetune id='0' host_id=‘0' type='l3' size='3072' unit='KiB' vcpus='1'/>
or
<cachetune id='0' host_id=‘0' type='l3' size='3072' unit='KiB' vcpus='2,3'/>
Yep, get your mean, make sense. if there’s exist method can get vcpus’ pid?
This also allows different sizes to be specified.
3) CDP / non-CDP convertion.
In case the size determination has been performed with non-CDP, to emulate such allocation on a CDP host, it would be good to allow both code and data allocations to share the CBM space:
IOM, I don’t think it’s good to have this. in libvirt capabilities xml, the application will get to know if the host support cdp or not.
<cachetune id='10' host_id='1' type='l3data' size='3072' unit='KiB'/> <cachetune id='10' host_id='1' type='l3code' size='3072' unit='KiB'/>
Perhaps if using the same ID?
I am open to hear about what other’s say,
Other than that, testing looks good.
Thanks for the testing.

On Wed, Feb 08, 2017 at 10:19:07AM +0800, Eli Qiao wrote:
-- Eli Qiao Sent with Sparrow (http://www.sparrowmailapp.com/?sig)
On Tuesday, 7 February 2017 at 7:56 PM, Marcelo Tosatti wrote:
On Tue, Feb 07, 2017 at 02:43:13PM +0800, Eli Qiao wrote:
-- Eli Qiao Sent with Sparrow (http://www.sparrowmailapp.com/?sig)
On Tuesday, 7 February 2017 at 3:03 AM, Marcelo Tosatti wrote:
On Mon, Feb 06, 2017 at 01:33:09PM -0200, Marcelo Tosatti wrote:
On Mon, Feb 06, 2017 at 10:23:35AM +0800, Eli Qiao wrote:
This series patches are for supportting CAT featues, which also called cache tune in libvirt.
First to expose cache information which could be tuned in capabilites XML. Then add new domain xml element support to add cacahe bank which will apply on this libvirt domain.
This series patches add a util file `resctrl.c/h`, an interface to talk with linux kernel's sys fs.
There are still some TODOs such as expose new public interface to get free cache information.
Some discussion about this feature support can be found from: https://www.redhat.com/archives/libvir-list/2017-January/msg00644.html
Two comments:
1) Please perform appropriate filesystem locking when accessing resctrlfs, as described at:
http://www.spinics.net/lists/linux-tip-commits/msg36754.html
Sure.
2)
<cachetune id='10' host_id='1' type='l3' size='3072' unit='KiB'/>
[b4c270b5-e0f9-4106-a446-69032872ed7d]# cat tasks 8654 [b4c270b5-e0f9-4106-a446-69032872ed7d]# pstree -p | grep qemu |-qemu-kvm(8654)-+-{qemu-kvm}(8688) | |-{qemu-kvm}(8692) | `-{qemu-kvm}(8693)
Should add individual vcpus to the "tasks" file, not the main QEMU process.
The NFV usecase requires exclusive CAT allocation for the vcpu which runs the sensitive workload.
Perhaps:
<cachetune id='10' host_id='1' type='l3' size='3072' unit='KiB'/>
Adds all vcpus that are pinned to the socket which cachebank with host_id=1.
<cachetune id='10' host_id='1' type='l3' size='3072' unit='KiB' vcpus=2,3/>
Adds PID of vcpus 2 and 3 to resctrl directory created for this allocation.
Hmm.. in this case, we need to figure out what’s the pid of vcpus=2 and vcpu=3 and added them to the resctrl directory.
Yes and only the pids of vcpus=2 and vcpus=3, not of any other vcpus.
currently, I create a resctrl directory(called resctrl domain) for a VM so just put all task ids to it.
this is my though:
let say the vm has vcpus=0 1 2 3, and you want to let 0, 1 benefit cache on host_id=0, and 2, 3 on host_id=1
you will do:
1) pin vcpus 0, 1 on the cpus of socket 0 pin vcpus 2, 3 on the cpus of socket 1 this can be done in vcputune
2) define cache tune like this: <cachetune id='0' host_id=‘0' type='l3' size='3072' unit='KiB'/> <cachetune id='1' host_id='1' type='l3' size='3072' unit='KiB'/>
in libvirt: we create a resctrl directory naming with the VM’s uuid and set schemata for each socket 0, and socket 1, put all qemu tasks ids into tasks file, this will work fine.
No, please don't do this.
please be note that in a resctrl directory, we can define schemata for each socket id separately.
Please do not put vcpus automatically into the reservations. Its necessary to have certain vcpus in a reservation and some not.
For example: 2 vcpu guest, vcpu0 pinned to socket 0 cpu0, vcpu1 pinned to socket 0 cpu1.
<cachetune id='0' host_id=‘0' type='l3' size='3072' unit='KiB'/>
We want _only_ vcpu1 to be part of this reservation, and not vcpu0 (want vcpu0 to use the default group, ie. schemata file at
/sys/fs/resctrl/schemata
So please have the ability to add vcpus to the XML syntax:
<cachetune id='0' host_id=‘0' type='l3' size='3072' unit='KiB' vcpus='1'/>
or
<cachetune id='0' host_id=‘0' type='l3' size='3072' unit='KiB' vcpus='2,3'/>
Yep, get your mean, make sense.
if there’s exist method can get vcpus’ pid?
qemu_process.c: pid_t vcpupid = qemuDomainGetVcpuPid(vm, vcpuid);
This also allows different sizes to be specified.
3) CDP / non-CDP convertion.
In case the size determination has been performed with non-CDP, to emulate such allocation on a CDP host, it would be good to allow both code and data allocations to share the CBM space:
IOM, I don’t think it’s good to have this. in libvirt capabilities xml, the application will get to know if the host support cdp or not.
<cachetune id='10' host_id='1' type='l3data' size='3072' unit='KiB'/> <cachetune id='10' host_id='1' type='l3code' size='3072' unit='KiB'/>
Perhaps if using the same ID?
I am open to hear about what other’s say,
Other than that, testing looks good.
Thanks for the testing.

On Mon, Feb 06, 2017 at 10:23:35AM +0800, Eli Qiao wrote:
This series patches are for supportting CAT featues, which also called cache tune in libvirt.
First to expose cache information which could be tuned in capabilites XML. Then add new domain xml element support to add cacahe bank which will apply on this libvirt domain.
This series patches add a util file `resctrl.c/h`, an interface to talk with linux kernel's sys fs.
There are still some TODOs such as expose new public interface to get free cache information.
Some discussion about this feature support can be found from: https://www.redhat.com/archives/libvir-list/2017-January/msg00644.html
Eli Qiao (7): Resctrl: Add some utils functions Resctrl: expose cache information to capabilities Resctrl: Add new xml element to support cache tune Resctrl: Add private interface to set cachebanks Qemu: Set cache banks Resctrl: enable l3code/l3data Resctrl: Make sure l3data/l3code are pairs
docs/schemas/domaincommon.rng | 41 ++ include/libvirt/virterror.h | 1 + src/Makefile.am | 1 + src/conf/capabilities.c | 56 +++ src/conf/capabilities.h | 23 + src/conf/domain_conf.c | 154 +++++++ src/conf/domain_conf.h | 18 + src/libvirt_private.syms | 10 + src/qemu/qemu_capabilities.c | 68 +++ src/qemu/qemu_driver.c | 4 + src/qemu/qemu_process.c | 11 + src/util/virerror.c | 1 + src/util/virresctrl.c | 1019 +++++++++++++++++++++++++++++++++++++++++ src/util/virresctrl.h | 135 ++++++ 14 files changed, 1542 insertions(+) create mode 100644 src/util/virresctrl.c create mode 100644 src/util/virresctrl.h
I see that Daniel didn't mention this, so just on a side note, we tend to add documentation in the same patch as the parsing/formatting code, so that it's consistent. Also there could be at least xml2xml test case added.
-- 1.9.1
-- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
participants (5)
-
Daniel P. Berrange
-
Eli Qiao
-
Eli Qiao
-
Marcelo Tosatti
-
Martin Kletzander