[libvirt] [PATCH 1/3] cpu_map.xml: add cmt feature to x86

Some Intel processor families (e.g. the Intel Xeon processor E5 v3 family) introduced CMT (Cache Monitoring Technology) to measure the usage of cache by applications running on the platform. This patch add it into x86 part of cpu_map.xml. Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> --- .gnulib | 2 +- src/cpu/cpu_map.xml | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.gnulib b/.gnulib index f39477d..106a386 160000 --- a/.gnulib +++ b/.gnulib @@ -1 +1 @@ -Subproject commit f39477dba778e99392948dd3dd19ec0d46aee932 +Subproject commit 106a3866d01f9dd57ab4f10dbeb0d5a8db73a9f7 diff --git a/src/cpu/cpu_map.xml b/src/cpu/cpu_map.xml index b9e95cf..14ccbd8 100644 --- a/src/cpu/cpu_map.xml +++ b/src/cpu/cpu_map.xml @@ -317,6 +317,9 @@ <feature name='rtm'> <cpuid function='0x00000007' ebx='0x00000800'/> </feature> + <feature name='cmt'> + <cpuid function='0x00000007' ebx='0x00001000'/> + </feature> <feature name='rdseed'> <cpuid function='0x00000007' ebx='0x00040000'/> </feature> -- 1.9.1

One RFC in https://www.redhat.com/archives/libvir-list/2015-June/msg01509.html CMT (Cache Monitoring Technology) can be used to measure the usage of cache by VM running on the host. This patch will extend the bulk stats API (virDomainListGetStats) to add this field. Applications based on libvirt can use this API to achieve cache usage of VM. Because CMT implementation in Linux kernel is based on perf mechanism, this patch will enable perf event for CMT when VM is created and disable it when VM is destroyed. Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> --- include/libvirt/libvirt-domain.h | 1 + src/qemu/qemu_domain.h | 3 ++ src/qemu/qemu_driver.c | 48 ++++++++++++++++++++++ src/qemu/qemu_process.c | 86 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 138 insertions(+) diff --git a/include/libvirt/libvirt-domain.h b/include/libvirt/libvirt-domain.h index e8202cf..fb5e1f4 100644 --- a/include/libvirt/libvirt-domain.h +++ b/include/libvirt/libvirt-domain.h @@ -1764,6 +1764,7 @@ typedef enum { VIR_DOMAIN_STATS_VCPU = (1 << 3), /* return domain virtual CPU info */ VIR_DOMAIN_STATS_INTERFACE = (1 << 4), /* return domain interfaces info */ VIR_DOMAIN_STATS_BLOCK = (1 << 5), /* return domain block info */ + VIR_DOMAIN_STATS_CACHE = (1 << 6), /* return domain block info */ } virDomainStatsTypes; typedef enum { diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index 54e1e7b..31bce33 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -196,6 +196,9 @@ struct _qemuDomainObjPrivate { bool hookRun; /* true if there was a hook run over this domain */ + int cmt_fd; /* perf handler for CMT */ + + /* Bitmaps below hold data from the auto NUMA feature */ virBitmapPtr autoNodeset; virBitmapPtr autoCpuset; diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 4cfae03..8c678c9 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -19320,6 +19320,53 @@ qemuDomainGetStatsBlock(virQEMUDriverPtr driver, #undef 
QEMU_ADD_COUNT_PARAM +static int +qemuDomainGetStatsCache(virQEMUDriverPtr driver ATTRIBUTE_UNUSED, + virDomainObjPtr dom, + virDomainStatsRecordPtr record, + int *maxparams, + unsigned int privflags ATTRIBUTE_UNUSED) +{ + qemuDomainObjPrivatePtr priv = dom->privateData; + FILE *fd; + unsigned long long cache = 0; + int scaling_factor = 0; + + if (priv->cmt_fd <= 0) + return -1; + + if (read(priv->cmt_fd, &cache, sizeof(uint64_t)) < 0) { + virReportSystemError(errno, "%s", + _("Unable to read cache data")); + return -1; + } + + fd = fopen("/sys/devices/intel_cqm/events/llc_occupancy.scale", "r"); + if (!fd) { + virReportSystemError(errno, "%s", + _("Unable to open CMT scale file")); + return -1; + } + if (fscanf(fd, "%d", &scaling_factor) != 1) { + virReportSystemError(errno, "%s", + _("Unable to read CMT scale file")); + VIR_FORCE_FCLOSE(fd); + return -1; + } + VIR_FORCE_FCLOSE(fd); + + cache *= scaling_factor; + + if (virTypedParamsAddULLong(&record->params, + &record->nparams, + maxparams, + "cache.current", + cache) < 0) + return -1; + + return 0; +} + typedef int (*qemuDomainGetStatsFunc)(virQEMUDriverPtr driver, virDomainObjPtr dom, @@ -19340,6 +19387,7 @@ static struct qemuDomainGetStatsWorker qemuDomainGetStatsWorkers[] = { { qemuDomainGetStatsVcpu, VIR_DOMAIN_STATS_VCPU, false }, { qemuDomainGetStatsInterface, VIR_DOMAIN_STATS_INTERFACE, false }, { qemuDomainGetStatsBlock, VIR_DOMAIN_STATS_BLOCK, true }, + { qemuDomainGetStatsCache, VIR_DOMAIN_STATS_CACHE, false }, { NULL, 0, false } }; diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index ba84182..00b889d 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -25,8 +25,11 @@ #include <unistd.h> #include <signal.h> #include <sys/stat.h> +#include <sys/ioctl.h> +#include <sys/syscall.h> #if defined(__linux__) # include <linux/capability.h> +# include <linux/perf_event.h> #elif defined(__FreeBSD__) # include <sys/param.h> # include <sys/cpuset.h> @@ -4274,6 +4277,75 @@ 
qemuLogOperation(virDomainObjPtr vm, goto cleanup; } +/* + * Enable CMT(Cache Monitoring Technology) to measure the usage of + * cache by VM running on the node. + * + * Because the hypervisor implement CMT support basedon perf mechanism, + * we should enable perf event for CMT. The function 'sys_erf_event_open' + * is perf syscall wrapper. + */ +#ifdef __linux__ +static long sys_perf_event_open(struct perf_event_attr *hw_event, + pid_t pid, int cpu, int group_fd, + unsigned long flags) +{ + return syscall(__NR_perf_event_open, hw_event, pid, cpu, + group_fd, flags); +} +static int qemuCmtEnable(virDomainObjPtr vm) +{ + qemuDomainObjPrivatePtr priv = vm->privateData; + struct perf_event_attr cmt_attr; + int event_type; + FILE *fp; + + fp = fopen("/sys/devices/intel_cqm/type", "r"); + if (!fp) { + virReportSystemError(errno, "%s", + _("CMT is not available on this host")); + return -1; + } + if (fscanf(fp, "%d", &event_type) != 1) { + virReportSystemError(errno, "%s", + _("Unable to read event type file.")); + VIR_FORCE_FCLOSE(fp); + return -1; + } + VIR_FORCE_FCLOSE(fp); + + memset(&cmt_attr, 0, sizeof(struct perf_event_attr)); + cmt_attr.size = sizeof(struct perf_event_attr); + cmt_attr.type = event_type; + cmt_attr.config = 1; + cmt_attr.inherit = 1; + cmt_attr.disabled = 1; + cmt_attr.enable_on_exec = 0; + + priv->cmt_fd = sys_perf_event_open(&cmt_attr, vm->pid, -1, -1, 0); + if (priv->cmt_fd < 0) { + virReportSystemError(errno, + _("Unable to open perf type=%d for pid=%d"), + event_type, vm->pid); + return -1; + } + + if (ioctl(priv->cmt_fd, PERF_EVENT_IOC_ENABLE) < 0) { + virReportSystemError(errno, "%s", + _("Unable to enable perf event for CMT")); + return -1; + } + + return 0; +} +#else +static int qemuCmtEnable(virDomainObjPtr vm) +{ + virReportUnsupportedError(); + return -1; +} +#endif + int qemuProcessStart(virConnectPtr conn, virQEMUDriverPtr driver, virDomainObjPtr vm, @@ -4954,6 +5026,11 @@ int qemuProcessStart(virConnectPtr conn, if 
(virDomainSaveStatus(driver->xmlopt, cfg->stateDir, vm) < 0) goto cleanup; + VIR_DEBUG("Setting CMT perf counter"); + if (qemuCmtEnable(vm) < 0) + virReportSystemError(errno, "%s", + _("CMT is not available on this host")); + /* finally we can call the 'started' hook script if any */ if (virHookPresent(VIR_HOOK_DRIVER_QEMU)) { char *xml = qemuDomainDefFormatXML(driver, vm->def, 0); @@ -5122,6 +5199,15 @@ void qemuProcessStop(virQEMUDriverPtr driver, virPortAllocatorRelease(driver->migrationPorts, priv->nbdPort); priv->nbdPort = 0; + /* Disable CMT */ + if (priv->cmt_fd > 0) { + if (ioctl(priv->cmt_fd, PERF_EVENT_IOC_DISABLE) < 0) { + virReportSystemError(errno, "%s", + _("Unable to disable perf event for CMT")); + } + VIR_FORCE_CLOSE(priv->cmt_fd); + } + if (priv->agent) { qemuAgentClose(priv->agent); priv->agent = NULL; -- 1.9.1

On Sun, Jul 5, 2015 at 5:13 PM, Qiaowei Ren <qiaowei.ren@intel.com> wrote:
One RFC in https://www.redhat.com/archives/libvir-list/2015-June/msg01509.html
CMT (Cache Monitoring Technology) can be used to measure the usage of cache by VM running on the host. This patch will extend the bulk stats API (virDomainListGetStats) to add this field. Applications based on libvirt can use this API to achieve cache usage of VM. Because CMT implementation in Linux kernel is based on perf mechanism, this patch will enable perf event for CMT when VM is created and disable it when VM is destroyed.
Hi Ren, One query wrt this implementation. I see you make a perf ioctl to gather CMT stats each time the stats API is invoked. If the CMT stats are exposed by a hardware counter, then this implies logging on a per-cpu (or per-socket ???) basis. This also implies that the value read will vary as the CPU (or socket) on which it is being called changes. Now, with this background, if we need real-world stats on a VM, we need this perf ioctl executed on all CPUs/ sockets on which the VM ran. Also, once done, we will need to aggregate results from each of these sources. In this implementation, I am missing this -- there seems no control over which physical CPU the libvirt worker thread will run and collect the perf data from. Data collected from this implementation might not accurately model the system state. I _think_ libvirt currently has no way of directing a worker thread to collect stats from a given CPU -- if we do, I would be happy to learn about it :) Regards, Prerna

On Jul 6, 2015 14:49, Prerna wrote:
On Sun, Jul 5, 2015 at 5:13 PM, Qiaowei Ren <qiaowei.ren@intel.com <mailto:qiaowei.ren@intel.com> > wrote:
One RFC in https://www.redhat.com/archives/libvir-list/2015-June/msg01509.html
CMT (Cache Monitoring Technology) can be used to measure the usage of cache by VM running on the host. This patch will extend the bulk stats API (virDomainListGetStats) to add this field. Applications based on libvirt can use this API to achieve cache usage of VM. Because CMT implementation in Linux kernel is based on perf mechanism, this patch will enable perf event for CMT when VM is created and disable it when VM is destroyed.
Hi Ren,
One query wrt this implementation. I see you make a perf ioctl to gather CMT stats each time the stats API is invoked.
If the CMT stats are exposed by a hardware counter, then this implies logging on a per-cpu (or per-socket ???) basis.
This also implies that the value read will vary as the CPU (or socket) on which it is being called changes.
Now, with this background, if we need real-world stats on a VM, we need this perf ioctl executed on all CPUs/ sockets on which the VM ran. Also, once done, we will need to aggregate results from each of these sources.
In this implementation, I am missing this -- there seems no control over which physical CPU the libvirt worker thread will run and collect the perf data from. Data collected from this implementation might not accurately model the system state.
I _think_ libvirt currently has no way of directing a worker thread to collect stats from a given CPU -- if we do, I would be happy to learn about it :)
Prerna, thanks for your reply. I checked the CMT implementation in the kernel, and noticed that the series implements a new ->count() for the pmu driver which can aggregate the results from each cpu if the perf type is PERF_TYPE_INTEL_CQM. The following is the link for the patch: https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=bfe1fcd2688f557a6b6a88f59ea7619228728bd7 So I guess that this patch just needs to set the right perf type and "cpu=-1". Do you think this is ok? Thanks, Qiaowei

On Jul 7, 2015 15:51, Ren, Qiaowei wrote:
On Jul 6, 2015 14:49, Prerna wrote:
On Sun, Jul 5, 2015 at 5:13 PM, Qiaowei Ren <qiaowei.ren@intel.com <mailto:qiaowei.ren@intel.com> > wrote:
One RFC in https://www.redhat.com/archives/libvir-list/2015-June/msg01509.html
CMT (Cache Monitoring Technology) can be used to measure the usage of cache by VM running on the host. This patch will extend the bulk stats API (virDomainListGetStats) to add this field. Applications based on libvirt can use this API to achieve cache usage of VM. Because CMT implementation in Linux kernel is based on perf mechanism, this patch will enable perf event for CMT when VM is created and disable it when VM is destroyed.
Hi Ren,
One query wrt this implementation. I see you make a perf ioctl to gather CMT stats each time the stats API is invoked.
If the CMT stats are exposed by a hardware counter, then this implies logging on a per-cpu (or per-socket ???) basis.
This also implies that the value read will vary as the CPU (or socket) on which it is being called changes.
Now, with this background, if we need real-world stats on a VM, we need this perf ioctl executed on all CPUs/ sockets on which the VM ran. Also, once done, we will need to aggregate results from each of these sources.
In this implementation, I am missing this -- there seems no control over which physical CPU the libvirt worker thread will run and collect the perf data from. Data collected from this implementation might not accurately model the system state.
I _think_ libvirt currently has no way of directing a worker thread to collect stats from a given CPU -- if we do, I would be happy to learn about it :)
Prerna, thanks for your reply. I checked the CMT implementation in the kernel, and noticed that the series implements a new ->count() for the pmu driver which can aggregate the results from each cpu if the perf type is PERF_TYPE_INTEL_CQM. The following is the link for the patch: https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=bfe1fcd2688f557a6b6a88f59ea7619228728bd7
So I guess that this patch just needs to set the right perf type and "cpu=-1". Do you think this is ok?
Hi Prerna, Do you have more comments on this patch series? I would be glad to update my implementation. ^-^ Qiaowei

Hi Ren, Thank you for clarifying. I really do not have any more comments on the patch. Regards, Prerna On Thu, Jul 9, 2015 at 12:27 PM, Ren, Qiaowei <qiaowei.ren@intel.com> wrote:
On Jul 7, 2015 15:51, Ren, Qiaowei wrote:
On Jul 6, 2015 14:49, Prerna wrote:
On Sun, Jul 5, 2015 at 5:13 PM, Qiaowei Ren <qiaowei.ren@intel.com <mailto:qiaowei.ren@intel.com> > wrote:
One RFC in
https://www.redhat.com/archives/libvir-list/2015-June/msg01509.html
CMT (Cache Monitoring Technology) can be used to measure the usage of cache by VM running on the host. This patch will extend the bulk stats API (virDomainListGetStats) to add this field. Applications based on libvirt can use this API to achieve cache usage of VM. Because CMT implementation in Linux kernel is based on perf mechanism, this patch will enable perf event for CMT when VM is created and disable it when VM is destroyed.
Hi Ren,
One query wrt this implementation. I see you make a perf ioctl to gather CMT stats each time the stats API is invoked.
If the CMT stats are exposed by a hardware counter, then this implies logging on a per-cpu (or per-socket ???) basis.
This also implies that the value read will vary as the CPU (or socket) on which it is being called changes.
Now, with this background, if we need real-world stats on a VM, we need this perf ioctl executed on all CPUs/ sockets on which the VM ran. Also, once done, we will need to aggregate results from each of these sources.
In this implementation, I am missing this -- there seems no control over which physical CPU the libvirt worker thread will run and collect the perf data from. Data collected from this implementation might not accurately model the system state.
I _think_ libvirt currently has no way of directing a worker thread to collect stats from a given CPU -- if we do, I would be happy to learn about it :)
Prerna, thanks for your reply. I checked the CMT implementation in kernel, and noticed that the series implement new ->count() of pmu driver which can aggregate the results from each cpu if perf type is PERF_TYPE_INTEL_CQM . The following is the link for the patch:
https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=bfe1fcd2688f557a6b6a88f59ea7619228728bd7
So I guess that this patch just needs to set the right perf type and "cpu=-1". Do you think this is ok?
Hi Prerna,
Do you have more comments on this patch series? I would be glad to update my implementation. ^-^
Qiaowei

On Jul 13, 2015 09:58, Prerna wrote:
Hi Ren,
Thank you for clarifying. I really do not have any more comments on the patch.
Ok. Thanks for your reply. So do you know how to get more feedback from the libvirt maintainers, or how to contact them? Right now I cannot open libvirt.org and don't know of anything but this mailing list. ^-^ Thanks, Qiaowei

On Jul 7, 2015 15:51, Ren, Qiaowei wrote:
On Jul 6, 2015 14:49, Prerna wrote:
On Sun, Jul 5, 2015 at 5:13 PM, Qiaowei Ren <qiaowei.ren@intel.com <mailto:qiaowei.ren@intel.com> > wrote:
One RFC in https://www.redhat.com/archives/libvir-list/2015-June/msg01509.html
CMT (Cache Monitoring Technology) can be used to measure the usage of cache by VM running on the host. This patch will extend the bulk stats API (virDomainListGetStats) to add this field. Applications based on libvirt can use this API to achieve cache usage of VM. Because CMT implementation in Linux kernel is based on perf mechanism, this patch will enable perf event for CMT when VM is created and disable it when VM is destroyed.
Hi Ren,
One query wrt this implementation. I see you make a perf ioctl to gather CMT stats each time the stats API is invoked.
If the CMT stats are exposed by a hardware counter, then this implies logging on a per-cpu (or per-socket ???) basis.
This also implies that the value read will vary as the CPU (or socket) on which it is being called changes.
Now, with this background, if we need real-world stats on a VM, we need this perf ioctl executed on all CPUs/ sockets on which the VM ran. Also, once done, we will need to aggregate results from each of these sources.
In this implementation, I am missing this -- there seems no control over which physical CPU the libvirt worker thread will run and collect the perf data from. Data collected from this implementation might not accurately model the system state.
I _think_ libvirt currently has no way of directing a worker thread to collect stats from a given CPU -- if we do, I would be happy to learn about it :)
Prerna, thanks for your reply. I checked the CMT implementation in the kernel, and noticed that the series implements a new ->count() for the pmu driver which can aggregate the results from each cpu if the perf type is PERF_TYPE_INTEL_CQM. The following is the link for the patch: https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=bfe1fcd2688f557a6b6a88f59ea7619228728bd7
So I guess that this patch just needs to set the right perf type and "cpu=-1". Do you think this is ok?
Thanks, Qiaowei
Could anyone help review this patch series? I would be glad to get more comments. ^-^ Thanks, Qiaowei

On Jul 7, 2015 15:51, Ren, Qiaowei wrote:
On Jul 6, 2015 14:49, Prerna wrote:
On Sun, Jul 5, 2015 at 5:13 PM, Qiaowei Ren <qiaowei.ren@intel.com <mailto:qiaowei.ren@intel.com> > wrote:
One RFC in https://www.redhat.com/archives/libvir-list/2015-June/msg01509.html
CMT (Cache Monitoring Technology) can be used to measure the usage of cache by VM running on the host. This patch will extend the bulk stats API (virDomainListGetStats) to add this field. Applications based on libvirt can use this API to achieve cache usage of VM. Because CMT implementation in Linux kernel is based on perf mechanism, this patch will enable perf event for CMT when VM is created and disable it when VM is destroyed.
Hi Ren,
One query wrt this implementation. I see you make a perf ioctl to gather CMT stats each time the stats API is invoked.
If the CMT stats are exposed by a hardware counter, then this implies logging on a per-cpu (or per-socket ???) basis.
This also implies that the value read will vary as the CPU (or socket) on which it is being called changes.
Now, with this background, if we need real-world stats on a VM, we need this perf ioctl executed on all CPUs/ sockets on which the VM ran. Also, once done, we will need to aggregate results from each of these sources.
In this implementation, I am missing this -- there seems no control over which physical CPU the libvirt worker thread will run and collect the perf data from. Data collected from this implementation might not accurately model the system state.
I _think_ libvirt currently has no way of directing a worker thread to collect stats from a given CPU -- if we do, I would be happy to learn about it :)
Prerna, thanks for your reply. I checked the CMT implementation in the kernel, and noticed that the series implements a new ->count() for the pmu driver which can aggregate the results from each cpu if the perf type is PERF_TYPE_INTEL_CQM. The following is the link for the patch: https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=bfe1fcd2688f557a6b6a88f59ea7619228728bd7
So I guess that this patch just needs to set the right perf type and "cpu=-1". Do you think this is ok?
Peter, according to your feedback about my RFC, I updated our implementation and submitted this patch series. Could you help review them? Thanks, Qiaowei

On Sun, Jul 05, 2015 at 07:43:43PM +0800, Qiaowei Ren wrote:
One RFC in https://www.redhat.com/archives/libvir-list/2015-June/msg01509.html
CMT (Cache Monitoring Technology) can be used to measure the usage of cache by VM running on the host. This patch will extend the bulk stats API (virDomainListGetStats) to add this field. Applications based on libvirt can use this API to achieve cache usage of VM. Because CMT implementation in Linux kernel is based on perf mechanism, this patch will enable perf event for CMT when VM is created and disable it when VM is destroyed.
Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 4cfae03..8c678c9 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -19320,6 +19320,53 @@ qemuDomainGetStatsBlock(virQEMUDriverPtr driver,
#undef QEMU_ADD_COUNT_PARAM
+static int +qemuDomainGetStatsCache(virQEMUDriverPtr driver ATTRIBUTE_UNUSED, + virDomainObjPtr dom, + virDomainStatsRecordPtr record, + int *maxparams, + unsigned int privflags ATTRIBUTE_UNUSED)
So this is a method that is used to collect per-domain information
+{ + qemuDomainObjPrivatePtr priv = dom->privateData; + FILE *fd; + unsigned long long cache = 0; + int scaling_factor = 0; + + if (priv->cmt_fd <= 0) + return -1; + + if (read(priv->cmt_fd, &cache, sizeof(uint64_t)) < 0) { + virReportSystemError(errno, "%s", + _("Unable to read cache data")); + return -1; + } + + fd = fopen("/sys/devices/intel_cqm/events/llc_occupancy.scale", "r"); + if (!fd) { + virReportSystemError(errno, "%s", + _("Unable to open CMT scale file")); + return -1; + } + if (fscanf(fd, "%d", &scaling_factor) != 1) { + virReportSystemError(errno, "%s", + _("Unable to read CMT scale file")); + VIR_FORCE_FCLOSE(fd); + return -1; + } + VIR_FORCE_FCLOSE(fd);
But this data you are reading is global to the entire host.
+ + cache *= scaling_factor; + + if (virTypedParamsAddULLong(&record->params, + &record->nparams, + maxparams, + "cache.current", + cache) < 0) + return -1; + + return 0; +} +
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index ba84182..00b889d 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c
+/* + * Enable CMT(Cache Monitoring Technology) to measure the usage of + * cache by VM running on the node. + * + * Because the hypervisor implement CMT support basedon perf mechanism, + * we should enable perf event for CMT. The function 'sys_erf_event_open' + * is perf syscall wrapper. + */ +#ifdef __linux__ +static long sys_perf_event_open(struct perf_event_attr *hw_event, + pid_t pid, int cpu, int group_fd, + unsigned long flags) +{ + return syscall(__NR_perf_event_open, hw_event, pid, cpu, + group_fd, flags); +} +static int qemuCmtEnable(virDomainObjPtr vm) +{ + qemuDomainObjPrivatePtr priv = vm->privateData; + struct perf_event_attr cmt_attr; + int event_type; + FILE *fp; + + fp = fopen("/sys/devices/intel_cqm/type", "r"); + if (!fp) { + virReportSystemError(errno, "%s", + _("CMT is not available on this host")); + return -1; + } + if (fscanf(fp, "%d", &event_type) != 1) { + virReportSystemError(errno, "%s", + _("Unable to read event type file.")); + VIR_FORCE_FCLOSE(fp); + return -1; + } + VIR_FORCE_FCLOSE(fp); + + memset(&cmt_attr, 0, sizeof(struct perf_event_attr)); + cmt_attr.size = sizeof(struct perf_event_attr); + cmt_attr.type = event_type; + cmt_attr.config = 1; + cmt_attr.inherit = 1; + cmt_attr.disabled = 1; + cmt_attr.enable_on_exec = 0; + + priv->cmt_fd = sys_perf_event_open(&cmt_attr, vm->pid, -1, -1, 0); + if (priv->cmt_fd < 0) { + virReportSystemError(errno, + _("Unable to open perf type=%d for pid=%d"), + event_type, vm->pid); + return -1; + } + + if (ioctl(priv->cmt_fd, PERF_EVENT_IOC_ENABLE) < 0) { + virReportSystemError(errno, "%s", + _("Unable to enable perf event for CMT")); + return -1; + } + + return 0; +} +#else +static int qemuCmtEnable(virDomainObjPtr vm) +{ + virReportUnsupportedError(); + return -1; +} +#endif + int qemuProcessStart(virConnectPtr conn, virQEMUDriverPtr driver, virDomainObjPtr vm, @@ -4954,6 +5026,11 @@ int qemuProcessStart(virConnectPtr conn, if (virDomainSaveStatus(driver->xmlopt, cfg->stateDir, vm) < 0) goto 
cleanup;
+ VIR_DEBUG("Setting CMT perf counter"); + if (qemuCmtEnable(vm) < 0) + virReportSystemError(errno, "%s", + _("CMT is not available on this host")); + /* finally we can call the 'started' hook script if any */ if (virHookPresent(VIR_HOOK_DRIVER_QEMU)) { char *xml = qemuDomainDefFormatXML(driver, vm->def, 0); @@ -5122,6 +5199,15 @@ void qemuProcessStop(virQEMUDriverPtr driver, virPortAllocatorRelease(driver->migrationPorts, priv->nbdPort); priv->nbdPort = 0;
+ /* Disable CMT */ + if (priv->cmt_fd > 0) {
You can't rely on keeping an open file descriptor for the guest because libvirtd may be restarted.
+ if (ioctl(priv->cmt_fd, PERF_EVENT_IOC_DISABLE) < 0) { + virReportSystemError(errno, "%s", + _("Unable to disable perf event for CMT")); + } + VIR_FORCE_CLOSE(priv->cmt_fd); + } + if (priv->agent) { qemuAgentClose(priv->agent); priv->agent = NULL;
Conceptually I think this approach to implementation is flawed. While you are turning on/off the perf events for each QEMU process, the data collection does not distinguish data from each QEMU process - the data reported is host wide. So this really doesn't make much sense IMHO. I'm also wondering whether this is really going to be sufficiently useful on its own. CPUs have countless other performance counters that I would imagine apps/admins will want to read in order to analyse QEMU performance, beyond this new CMT feature. The domain stats API won't really scale up to dealing with arbitrary perf event counter reporting so I'm not much of a fan of just special casing CMT in this way. IOW, if we want to support host performance analysis in libvirt, then we probably want to design an set of APIs specifically for this purpose, but I could well see us saying that this is out of scope for libvirt and apps shoud just use the linux perf interfaces directly. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

-----Original Message----- From: Daniel P. Berrange [mailto:berrange@redhat.com] Sent: Monday, July 20, 2015 5:32 PM To: Ren, Qiaowei Cc: libvir-list@redhat.com Subject: Re: [libvirt] [PATCH 2/3] Qemu: add CMT support
On Sun, Jul 05, 2015 at 07:43:43PM +0800, Qiaowei Ren wrote:
One RFC in https://www.redhat.com/archives/libvir-list/2015-June/msg01509.html
CMT (Cache Monitoring Technology) can be used to measure the usage of cache by VM running on the host. This patch will extend the bulk stats API (virDomainListGetStats) to add this field. Applications based on libvirt can use this API to achieve cache usage of VM. Because CMT implementation in Linux kernel is based on perf mechanism, this patch will enable perf event for CMT when VM is created and disable it when VM is destroyed.
Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 4cfae03..8c678c9 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -19320,6 +19320,53 @@ qemuDomainGetStatsBlock(virQEMUDriverPtr driver,
#undef QEMU_ADD_COUNT_PARAM
+static int +qemuDomainGetStatsCache(virQEMUDriverPtr driver ATTRIBUTE_UNUSED, + virDomainObjPtr dom, + virDomainStatsRecordPtr record, + int *maxparams, + unsigned int privflags ATTRIBUTE_UNUSED)
So this is a method that is used to collect per-domain information
+{ + qemuDomainObjPrivatePtr priv = dom->privateData; + FILE *fd; + unsigned long long cache = 0; + int scaling_factor = 0; + + if (priv->cmt_fd <= 0) + return -1; + + if (read(priv->cmt_fd, &cache, sizeof(uint64_t)) < 0) { + virReportSystemError(errno, "%s", + _("Unable to read cache data")); + return -1; + } + + fd = fopen("/sys/devices/intel_cqm/events/llc_occupancy.scale", "r"); + if (!fd) { + virReportSystemError(errno, "%s", + _("Unable to open CMT scale file")); + return -1; + } + if (fscanf(fd, "%d", &scaling_factor) != 1) { + virReportSystemError(errno, "%s", + _("Unable to read CMT scale file")); + VIR_FORCE_FCLOSE(fd); + return -1; + } + VIR_FORCE_FCLOSE(fd);
But this data you are reading is global to the entire host.
In fact this data is per-domain only. When the perf syscall is called to enable the perf event for a domain, the pid of that domain is passed.
+ + cache *= scaling_factor; + + if (virTypedParamsAddULLong(&record->params, + &record->nparams, + maxparams, + "cache.current", + cache) < 0) + return -1; + + return 0; +} +
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index ba84182..00b889d 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c
+/* + * Enable CMT(Cache Monitoring Technology) to measure the usage of + * cache by VM running on the node. + * + * Because the hypervisor implement CMT support based on perf +mechanism, + * we should enable perf event for CMT. The function 'sys_perf_event_open' + * is perf syscall wrapper. + */ +#ifdef __linux__ +static long sys_perf_event_open(struct perf_event_attr *hw_event, + pid_t pid, int cpu, int group_fd, + unsigned long flags) { + return syscall(__NR_perf_event_open, hw_event, pid, cpu, + group_fd, flags); } static int +qemuCmtEnable(virDomainObjPtr vm) { + qemuDomainObjPrivatePtr priv = vm->privateData; + struct perf_event_attr cmt_attr; + int event_type; + FILE *fp; + + fp = fopen("/sys/devices/intel_cqm/type", "r"); + if (!fp) { + virReportSystemError(errno, "%s", + _("CMT is not available on this host")); + return -1; + } + if (fscanf(fp, "%d", &event_type) != 1) { + virReportSystemError(errno, "%s", + _("Unable to read event type file.")); + VIR_FORCE_FCLOSE(fp); + return -1; + } + VIR_FORCE_FCLOSE(fp); + + memset(&cmt_attr, 0, sizeof(struct perf_event_attr)); + cmt_attr.size = sizeof(struct perf_event_attr); + cmt_attr.type = event_type; + cmt_attr.config = 1; + cmt_attr.inherit = 1; + cmt_attr.disabled = 1; + cmt_attr.enable_on_exec = 0; + + priv->cmt_fd = sys_perf_event_open(&cmt_attr, vm->pid, -1, -1, 0); + if (priv->cmt_fd < 0) { + virReportSystemError(errno, + _("Unable to open perf type=%d for pid=%d"), + event_type, vm->pid); + return -1; + } + + if (ioctl(priv->cmt_fd, PERF_EVENT_IOC_ENABLE) < 0) { + virReportSystemError(errno, "%s", + _("Unable to enable perf event for CMT")); + return -1; + } + + return 0; +} +#else +static int qemuCmtEnable(virDomainObjPtr vm) { + virReportUnsupportedError(); + return -1; +} +#endif + int qemuProcessStart(virConnectPtr conn, virQEMUDriverPtr driver, virDomainObjPtr vm, @@ -4954,6 +5026,11 @@ int qemuProcessStart(virConnectPtr conn, if (virDomainSaveStatus(driver->xmlopt, cfg->stateDir, vm) < 0) goto
cleanup;
+ VIR_DEBUG("Setting CMT perf counter"); + if (qemuCmtEnable(vm) < 0) + virReportSystemError(errno, "%s", + _("CMT is not available on this host")); + /* finally we can call the 'started' hook script if any */ if (virHookPresent(VIR_HOOK_DRIVER_QEMU)) { char *xml = qemuDomainDefFormatXML(driver, vm->def, 0); @@ -5122,6 +5199,15 @@ void qemuProcessStop(virQEMUDriverPtr driver, virPortAllocatorRelease(driver->migrationPorts, priv->nbdPort); priv->nbdPort = 0;
+ /* Disable CMT */ + if (priv->cmt_fd > 0) {
You can't rely on keeping an open file descriptor for the guest because libvirtd may be restarted.
Sorry, I don't really get the meaning of this. You mean that when libvirtd is restarted, those resources which the domain opened should be closed, right?
+ if (ioctl(priv->cmt_fd, PERF_EVENT_IOC_DISABLE) < 0) { + virReportSystemError(errno, "%s", + _("Unable to disable perf event for CMT")); + } + VIR_FORCE_CLOSE(priv->cmt_fd); + } + if (priv->agent) { qemuAgentClose(priv->agent); priv->agent = NULL;
Conceptually I think this approach to implementation is flawed. While you are turning on/off the perf events for each QEMU process, the data collection does not distinguish data from each QEMU process - the data reported is host wide. So this really doesn't make much sense IMHO.
As mentioned above, the data reported is only for domain.
I'm also wondering whether this is really going to be sufficiently useful on its own. CPUs have countless other performance counters that I would imagine apps/admins will want to read in order to analyse QEMU performance, beyond this new CMT feature. The domain stats API won't really scale up to dealing with arbitrary perf event counter reporting so I'm not much of a fan of just special casing CMT in this way.
IOW, if we want to support host performance analysis in libvirt, then we probably want to design a set of APIs specifically for this purpose, but I could well see us saying that this is out of scope for libvirt and apps should just use the linux perf interfaces directly.
Yes. I can get what you mean. Maybe libvirt doesn't have to be responsible for supporting host performance. But I guess cache usage should be important for each domain, if those apps based on libvirt can obtain this information they will be able to better check and confirm the domain works normally, like the stats of cpu/memory/block/... which have been supported in libvirt now. Do you think so? In fact, CMT support in kernel is not initially based on perf, and it used cgroup to report cache usage. But cgroup implementation is rejected by kernel community, and finally the implementation based on perf was merged. Thanks, Qiaowei

On Mon, Jul 20, 2015 at 01:50:54PM +0000, Ren, Qiaowei wrote:
-----Original Message----- From: Daniel P. Berrange [mailto:berrange@redhat.com] Sent: Monday, July 20, 2015 5:32 PM To: Ren, Qiaowei Cc: libvir-list@redhat.com Subject: Re: [libvirt] [PATCH 2/3] Qemu: add CMT support
On Sun, Jul 05, 2015 at 07:43:43PM +0800, Qiaowei Ren wrote:
One RFC in https://www.redhat.com/archives/libvir-list/2015-June/msg01509.html
CMT (Cache Monitoring Technology) can be used to measure the usage of cache by VM running on the host. This patch will extend the bulk stats API (virDomainListGetStats) to add this field. Applications based on libvirt can use this API to achieve cache usage of VM. Because CMT implementation in Linux kernel is based on perf mechanism, this patch will enable perf event for CMT when VM is created and disable it when VM is destroyed.
Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 4cfae03..8c678c9 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -19320,6 +19320,53 @@ qemuDomainGetStatsBlock(virQEMUDriverPtr driver,
#undef QEMU_ADD_COUNT_PARAM
+static int +qemuDomainGetStatsCache(virQEMUDriverPtr driver ATTRIBUTE_UNUSED, + virDomainObjPtr dom, + virDomainStatsRecordPtr record, + int *maxparams, + unsigned int privflags ATTRIBUTE_UNUSED)
So this is a method that is used to collect per-domain information
+{ + qemuDomainObjPrivatePtr priv = dom->privateData; + FILE *fd; + unsigned long long cache = 0; + int scaling_factor = 0; + + if (priv->cmt_fd <= 0) + return -1; + + if (read(priv->cmt_fd, &cache, sizeof(uint64_t)) < 0) { + virReportSystemError(errno, "%s", + _("Unable to read cache data")); + return -1; + } + + fd = fopen("/sys/devices/intel_cqm/events/llc_occupancy.scale", "r"); + if (!fd) { + virReportSystemError(errno, "%s", + _("Unable to open CMT scale file")); + return -1; + } + if (fscanf(fd, "%d", &scaling_factor) != 1) { + virReportSystemError(errno, "%s", + _("Unable to read CMT scale file")); + VIR_FORCE_FCLOSE(fd); + return -1; + } + VIR_FORCE_FCLOSE(fd);
But this data you are reading is global to the entire host.
In fact this data is per-domain only. When the perf syscall is called to enable the perf event for a domain, the pid of that domain is passed.
Ah, I see - you rely on the open file descriptor to be associated with the VM pid.
-5122,6 +5199,15 @@ void qemuProcessStop(virQEMUDriverPtr driver, virPortAllocatorRelease(driver->migrationPorts, priv->nbdPort); priv->nbdPort = 0;
+ /* Disable CMT */ + if (priv->cmt_fd > 0) {
You can't rely on keeping an open file descriptor for the guest because libvirtd may be restarted.
Sorry, I don't really get the meaning of this. You mean that when libvirtd is restarted, those resources which the domain opened should be closed, right?
No, when libvirtd is restarted, the domains must all continue running without loss of state. You open the FD when starting the guest, then libvirtd is restarted, now someone wants to query the perf data. The perf FD will not be open anymore because libvirtd was restarted. At least you'd need to re-open the file descriptor when libvirtd starts up again, for any running guest. I'm not really convinced we want to keep the perf file descriptors open for all the domains for the entire time they are running. Should really only open them when we actually want to read the collected data.
+ if (ioctl(priv->cmt_fd, PERF_EVENT_IOC_DISABLE) < 0) { + virReportSystemError(errno, "%s", + _("Unable to disable perf event for CMT")); + } + VIR_FORCE_CLOSE(priv->cmt_fd); + } + if (priv->agent) { qemuAgentClose(priv->agent); priv->agent = NULL;
Conceptually I think this approach to implementation is flawed. While you are turning on/off the perf events for each QEMU process, the data collection does not distinguish data from each QEMU process - the data reported is host wide. So this really doesn't make much sense IMHO.
As mentioned above, the data reported is only for domain.
I'm also wondering whether this is really going to be sufficiently useful on its own. CPUs have countless other performance counters that I would imagine apps/admins will want to read in order to analyse QEMU performance, beyond this new CMT feature. The domain stats API won't really scale up to dealing with arbitrary perf event counter reporting so I'm not much of a fan of just special casing CMT in this way.
IOW, if we want to support host performance analysis in libvirt, then we probably want to design a set of APIs specifically for this purpose, but I could well see us saying that this is out of scope for libvirt and apps should just use the linux perf interfaces directly.
Yes. I can get what you mean. Maybe libvirt doesn't have to be responsible for supporting host performance.
But I guess cache usage should be important for each domain, if those apps based on libvirt can obtain this information they will be able to better check and confirm the domain works normally, like the stats of cpu/memory/block/... which have been supported in libvirt now. Do you think so?
I'm not saying cache usage is unimportant. There are quite a lot of other hardware event counters in modern CPUs though, so I'm asking why we should add just this new special intel event, and not any of the other existing performance counters that are useful in diagnosing performance issues. Also, I'm thinking that QEMU has many different threads - VCPU threads, I/O threads, emulator threads and so on. I could see users want to have distinct profiling for these different functional areas of QEMU too instead of just whole-QEMU granularity. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On Jul 20, 2015 22:34, Daniel P. Berrange wrote:
On Mon, Jul 20, 2015 at 01:50:54PM +0000, Ren, Qiaowei wrote:
Daniel P. Berrange wrote on Jul 20, 2015 17:32:
On Sun, Jul 05, 2015 at 07:43:43PM +0800, Qiaowei Ren wrote:
One RFC in https://www.redhat.com/archives/libvir-list/2015-June/msg01509.htm l
CMT (Cache Monitoring Technology) can be used to measure the usage of cache by VM running on the host. This patch will extend the bulk stats API (virDomainListGetStats) to add this field. Applications based on libvirt can use this API to achieve cache usage of VM. Because CMT implementation in Linux kernel is based on perf mechanism, this patch will enable perf event for CMT when VM is created and disable it when VM is destroyed.
Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 4cfae03..8c678c9 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -19320,6 +19320,53 @@ qemuDomainGetStatsBlock(virQEMUDriverPtr driver,
#undef QEMU_ADD_COUNT_PARAM +static int +qemuDomainGetStatsCache(virQEMUDriverPtr driver ATTRIBUTE_UNUSED, + virDomainObjPtr dom, + virDomainStatsRecordPtr record, + int *maxparams, + unsigned int privflags ATTRIBUTE_UNUSED)
So this is a method that is used to collect per-domain information
+{ + qemuDomainObjPrivatePtr priv = dom->privateData; + FILE *fd; + unsigned long long cache = 0; + int scaling_factor = 0; + + if (priv->cmt_fd <= 0) + return -1; + + if (read(priv->cmt_fd, &cache, sizeof(uint64_t)) < 0) { + virReportSystemError(errno, "%s", + _("Unable to read cache data")); + return -1; + } + + fd = fopen("/sys/devices/intel_cqm/events/llc_occupancy.scale", "r"); + if (!fd) { + virReportSystemError(errno, "%s", + _("Unable to open CMT scale file")); + return -1; + } + if (fscanf(fd, "%d", &scaling_factor) != 1) { + virReportSystemError(errno, "%s", + _("Unable to read CMT scale file")); + VIR_FORCE_FCLOSE(fd); + return -1; + } + VIR_FORCE_FCLOSE(fd);
But this data you are reading is global to the entire host.
In fact this data is per-domain only. When the perf syscall is called to enable the perf event for a domain, the pid of that domain is passed.
Ah, I see - you rely on the open file descriptor to be associated with the VM pid.
-5122,6 +5199,15 @@ void qemuProcessStop(virQEMUDriverPtr driver, virPortAllocatorRelease(driver->migrationPorts, priv->nbdPort); priv->nbdPort = 0; + /* Disable CMT */ + if (priv->cmt_fd > 0) {
You can't rely on keeping an open file descriptor for the guest because libvirtd may be restarted.
Sorry, I don't really get the meaning of this. You mean that when libvirtd is restarted, those resources which the domain opened should be closed, right?
No, when libvirtd is restarted, the domains must all continue running without loss of state. You open the FD when starting the guest, then libvirtd is restarted, now someone wants to query the perf data. The perf FD will not be open anymore because libvirtd was restarted. At least you'd need to re-open the file descriptor when libvirtd starts up again, for any running guest. I'm not really convinced we want to keep the perf file descriptors open for all the domains for the entire time they are running. Should really only open them when we actually want to read the collected data.
Got it! Should open/disable them when read the data.
+ if (ioctl(priv->cmt_fd, PERF_EVENT_IOC_DISABLE) < 0) { + virReportSystemError(errno, "%s", + _("Unable to disable perf event for CMT")); + } + VIR_FORCE_CLOSE(priv->cmt_fd); + } + if (priv->agent) { qemuAgentClose(priv->agent); priv->agent = NULL;
Conceptually I think this approach to implementation is flawed. While you are turning on/off the perf events for each QEMU process, the data collection does not distinguish data from each QEMU process - the data reported is host wide. So this really doesn't make much sense IMHO.
As mentioned above, the data reported is only for domain.
I'm also wondering whether this is really going to be sufficiently useful on its own. CPUs have countless other performance counters that I would imagine apps/admins will want to read in order to analyse QEMU performance, beyond this new CMT feature. The domain stats API won't really scale up to dealing with arbitrary perf event counter reporting so I'm not much of a fan of just special casing CMT in this way.
IOW, if we want to support host performance analysis in libvirt, then we probably want to design a set of APIs specifically for this purpose, but I could well see us saying that this is out of scope for libvirt and apps should just use the linux perf interfaces directly.
Yes. I can get what you mean. Maybe libvirt doesn't have to be responsible for supporting host performance.
But I guess cache usage should be important for each domain, if those apps based on libvirt can obtain this information they will be able to better check and confirm the domain works normally, like the stats of cpu/memory/block/... which have been supported in libvirt now. Do you think so?
I'm not saying cache usage is unimportant. There are quite a lot of other hardware event counters in modern CPUs though, so I'm asking why we should add just this new special intel event, and not any of the other existing performance counters that are useful in diagnosing performance issues.
Ah, I guess you mean that the best way is providing a set of utility methods for perf operations, like cgroup support. Then we can use these interfaces to support a lot of necessary perf counters. If we have to do so, maybe I can try to firstly implement such methods and then you can help review them.
Also, I'm thinking that QEMU has many different threads - VCPU threads, I/O threads, emulator threads and so on. I could see users want to have distinct profiling for these different functional areas of QEMU too instead of just whole- QEMU granularity.
Yes. I believe such features should be useful for some users. I am currently working on adding some features like CMT into OpenStack, I only know profiling for VCPU/ IO / emulator threads should not be necessary for OpenStack. ^-^ Thanks, Qiaowei

-----Original Message----- From: Ren, Qiaowei Sent: Tuesday, July 21, 2015 4:00 PM To: Daniel P. Berrange Cc: libvir-list@redhat.com Subject: RE: [libvirt] [PATCH 2/3] Qemu: add CMT support
On Jul 20, 2015 22:34, Daniel P. Berrange wrote:
On Mon, Jul 20, 2015 at 01:50:54PM +0000, Ren, Qiaowei wrote:
Daniel P. Berrange wrote on Jul 20, 2015 17:32:
On Sun, Jul 05, 2015 at 07:43:43PM +0800, Qiaowei Ren wrote:
One RFC in https://www.redhat.com/archives/libvir-list/2015-June/msg01509.htm l
CMT (Cache Monitoring Technology) can be used to measure the usage of cache by VM running on the host. This patch will extend the bulk stats API (virDomainListGetStats) to add this field. Applications based on libvirt can use this API to achieve cache usage of VM. Because CMT implementation in Linux kernel is based on perf mechanism, this patch will enable perf event for CMT when VM is created and disable it when VM is destroyed.
Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 4cfae03..8c678c9 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -19320,6 +19320,53 @@ qemuDomainGetStatsBlock(virQEMUDriverPtr driver,
#undef QEMU_ADD_COUNT_PARAM +static int +qemuDomainGetStatsCache(virQEMUDriverPtr driver ATTRIBUTE_UNUSED, + virDomainObjPtr dom, + virDomainStatsRecordPtr record, + int *maxparams, + unsigned int privflags ATTRIBUTE_UNUSED)
So this is a method that is used to collect per-domain information
+{ + qemuDomainObjPrivatePtr priv = dom->privateData; + FILE *fd; + unsigned long long cache = 0; + int scaling_factor = 0; + + if (priv->cmt_fd <= 0) + return -1; + + if (read(priv->cmt_fd, &cache, sizeof(uint64_t)) < 0) { + virReportSystemError(errno, "%s", + _("Unable to read cache data")); + return -1; + } + + fd = fopen("/sys/devices/intel_cqm/events/llc_occupancy.scale", "r"); + if (!fd) { + virReportSystemError(errno, "%s", + _("Unable to open CMT scale file")); + return -1; + } + if (fscanf(fd, "%d", &scaling_factor) != 1) { + virReportSystemError(errno, "%s", + _("Unable to read CMT scale file")); + VIR_FORCE_FCLOSE(fd); + return -1; + } + VIR_FORCE_FCLOSE(fd);
But this data you are reading is global to the entire host.
In fact this data is per-domain only. When the perf syscall is called to enable the perf event for a domain, the pid of that domain is passed.
Ah, I see - you rely on the open file descriptor to be associated with the VM pid.
-5122,6 +5199,15 @@ void qemuProcessStop(virQEMUDriverPtr driver, virPortAllocatorRelease(driver->migrationPorts, priv->nbdPort); priv->nbdPort = 0; + /* Disable CMT */ + if (priv->cmt_fd > 0) {
You can't rely on keeping an open file descriptor for the guest because libvirtd may be restarted.
Sorry, I don't really get the meaning of this. You mean that when libvirtd is restarted, those resources which the domain opened should be closed, right?
No, when libvirtd is restarted, the domains must all continue running without loss of state. You open the FD when starting the guest, then libvirtd is restarted, now someone wants to query the perf data. The perf FD will not be open anymore because libvirtd was restarted. At least you'd need to re-open the file descriptor when libvirtd starts up again, for any running guest. I'm not really convinced we want to keep the perf file descriptors open for all the domains for the entire time they are running. Should really only open them when we actually want to read the collected data.
Got it! Should open/disable them when read the data.
+ if (ioctl(priv->cmt_fd, PERF_EVENT_IOC_DISABLE) < 0) { + virReportSystemError(errno, "%s", + _("Unable to disable perf event for CMT")); + } + VIR_FORCE_CLOSE(priv->cmt_fd); + } + if (priv->agent) { qemuAgentClose(priv->agent); priv->agent = NULL;
Conceptually I think this approach to implementation is flawed. While you are turning on/off the perf events for each QEMU process, the data collection does not distinguish data from each QEMU process - the data reported is host wide. So this really doesn't make much sense
IMHO.
As mentioned above, the data reported is only for domain.
I'm also wondering whether this is really going to be sufficiently useful on its own. CPUs have countless other performance counters that I would imagine apps/admins will want to read in order to analyse QEMU performance, beyond this new CMT feature. The domain stats API won't really scale up to dealing with arbitrary perf event counter reporting so I'm not much of a fan of just special casing CMT in this way.
IOW, if we want to support host performance analysis in libvirt, then we probably want to design a set of APIs specifically for this purpose, but I could well see us saying that this is out of scope for libvirt and apps should just use the linux perf interfaces directly.
Yes. I can get what you mean. Maybe libvirt doesn't have to be responsible for supporting host performance.
But I guess cache usage should be important for each domain, if those apps based on libvirt can obtain this information they will be able to better check and confirm the domain works normally, like the stats of cpu/memory/block/... which have been supported in libvirt now. Do you think so?
I'm not saying cache usage is unimportant. There are quite a lot of other hardware event counters in modern CPUs though, so I'm asking why we should add just this new special intel event, and not any of the other existing performance counters that are useful in diagnosing performance issues.
Ah, I guess you mean that the best way is providing a set of utility methods for perf operations, like cgroup support. Then we can use these interfaces to support a lot of necessary perf counters.
If we have to do so, maybe I can try to firstly implement such methods and then you can help review them.
Also, I'm thinking that QEMU has many different threads - VCPU threads, I/O threads, emulator threads and so on. I could see users want to have distinct profiling for these different functional areas of QEMU too instead of just whole- QEMU granularity.
Yes. I believe such features should be useful for some users. I am currently working on adding some features like CMT into OpenStack, I only know profiling for VCPU/ IO / emulator threads should not be necessary for OpenStack. ^-^
Hi Daniel, what do you think about it now? If we have to add CMT support in Nova, do you think what is the best way? Can we directly use linux perf interface in OpenStack? Thanks, Qiaowei

-----Original Message----- From: libvir-list-bounces@redhat.com [mailto:libvir-list-bounces@redhat.com] On Behalf Of Ren, Qiaowei Sent: Monday, August 10, 2015 9:06 AM To: 'Daniel P. Berrange' Cc: 'libvir-list@redhat.com' Subject: Re: [libvirt] [PATCH 2/3] Qemu: add CMT support
-----Original Message----- From: Ren, Qiaowei Sent: Tuesday, July 21, 2015 4:00 PM To: Daniel P. Berrange Cc: libvir-list@redhat.com Subject: RE: [libvirt] [PATCH 2/3] Qemu: add CMT support
On Mon, Jul 20, 2015 at 01:50:54PM +0000, Ren, Qiaowei wrote:
Daniel P. Berrange wrote on Jul 20, 2015 17:32:
On Sun, Jul 05, 2015 at 07:43:43PM +0800, Qiaowei Ren wrote:
One RFC in https://www.redhat.com/archives/libvir-list/2015-June/msg01509.ht m l
CMT (Cache Monitoring Technology) can be used to measure the usage of cache by VM running on the host. This patch will extend the bulk stats API (virDomainListGetStats) to add this field. Applications based on libvirt can use this API to achieve cache usage of VM. Because CMT implementation in Linux kernel is based on perf mechanism, this patch will enable perf event for CMT when VM is created and disable it when VM is destroyed.
Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 4cfae03..8c678c9 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -19320,6 +19320,53 @@ qemuDomainGetStatsBlock(virQEMUDriverPtr driver,
#undef QEMU_ADD_COUNT_PARAM +static int +qemuDomainGetStatsCache(virQEMUDriverPtr driver ATTRIBUTE_UNUSED, + virDomainObjPtr dom, + virDomainStatsRecordPtr record, + int *maxparams, + unsigned int privflags ATTRIBUTE_UNUSED)
So this is a method that is used to collect per-domain information
+{ + qemuDomainObjPrivatePtr priv = dom->privateData; + FILE *fd; + unsigned long long cache = 0; + int scaling_factor = 0; + + if (priv->cmt_fd <= 0) + return -1; + + if (read(priv->cmt_fd, &cache, sizeof(uint64_t)) < 0) { + virReportSystemError(errno, "%s", + _("Unable to read cache data")); + return -1; + } + + fd = fopen("/sys/devices/intel_cqm/events/llc_occupancy.scale", "r"); + if (!fd) { + virReportSystemError(errno, "%s", + _("Unable to open CMT scale file")); + return -1; + } + if (fscanf(fd, "%d", &scaling_factor) != 1) { + virReportSystemError(errno, "%s", + _("Unable to read CMT scale file")); + VIR_FORCE_FCLOSE(fd); + return -1; + } + VIR_FORCE_FCLOSE(fd);
But this data you are reading is global to the entire host.
In fact this data is per-domain only. When the perf syscall is called to enable the perf event for a domain, the pid of that domain is passed.
Ah, I see - you rely on the open file descriptor to be associated with the VM
On Jul 20, 2015 22:34, Daniel P. Berrange wrote: pid.
-5122,6 +5199,15 @@ void qemuProcessStop(virQEMUDriverPtr driver, virPortAllocatorRelease(driver->migrationPorts, priv->nbdPort); priv->nbdPort = 0; + /* Disable CMT */ + if (priv->cmt_fd > 0) {
You can't rely on keeping an open file descriptor for the guest because libvirtd may be restarted.
Sorry, I don't really get the meaning of this. You mean that when libvirtd is restarted, those resources which the domain opened should be closed, right?
No, when libvirtd is restarted, the domains must all continue running without loss of state. You open the FD when starting the guest, then libvirtd is restarted, now someone wants to query the perf data. The perf FD will not be open anymore because libvirtd was restarted. At least you'd need to re-open the file descriptor when libvirtd starts up again, for any running guest. I'm not really convinced we want to keep the perf file descriptors open for all the domains for the entire time they are running. Should really only open them
when we actually want to read the collected data.
Got it! Should open/disable them when read the data.
+ if (ioctl(priv->cmt_fd, PERF_EVENT_IOC_DISABLE) < 0) { + virReportSystemError(errno, "%s", + _("Unable to disable perf event for CMT")); + } + VIR_FORCE_CLOSE(priv->cmt_fd); + } + if (priv->agent) { qemuAgentClose(priv->agent); priv->agent = NULL;
Conceptually I think this approach to implementation is flawed. While you are turning on/off the perf events for each QEMU process, the data collection does not distinguish data from each QEMU process - the data reported is host wide. So this really doesn't make much sense
IMHO.
As mentioned above, the data reported is only for domain.
I'm also wondering whether this is really going to be sufficiently useful on its own. CPUs have countless other performance counters that I would imagine apps/admins will want to read in order to analyse QEMU performance, beyond this new CMT feature. The domain stats API won't really scale up to dealing with arbitrary perf event counter reporting so I'm not much of a fan of just special casing CMT in this way.
IOW, if we want to support host performance analysis in libvirt, then we probably want to design a set of APIs specifically for this purpose, but I could well see us saying that this is out of scope for libvirt and apps should just use the linux perf interfaces directly.
Yes. I can get what you mean. Maybe libvirt doesn't have to be responsible for supporting host performance.
But I guess cache usage should be important for each domain, if those apps based on libvirt can obtain this information they will be able to better check and confirm the domain works normally, like the stats of cpu/memory/block/... which have been supported in libvirt now. Do you think so?
I'm not saying cache usage is unimportant. There are quite a lot of other hardware event counters in modern CPUs though, so I'm asking why we should add just this new special intel event, and not any of the other existing performance counters that are useful in diagnosing performance issues.
Ah, I guess you mean that the best way is providing a set of utility methods for perf operations, like cgroup support. Then we can use these interfaces to support a lot of necessary perf counters.
If we have to do so, maybe I can try to firstly implement such methods and then you can help review them.
Also, I'm thinking that QEMU has many different threads - VCPU threads, I/O threads, emulator threads and so on. I could see users want to have distinct profiling for these different functional areas of QEMU too instead of just whole- QEMU granularity.
Yes. I believe such features should be useful for some users. I am currently working on adding some features like CMT into OpenStack, I only know profiling for VCPU/ IO / emulator threads should not be necessary for OpenStack. ^-^
Hi Daniel,
what do you think about it now? If we have to add CMT support in Nova, do you think what is the best way? Can we directly use linux perf interface in OpenStack?
Hi Daniel, If we firstly add utility methods for perf operation into libvirt like cgroup, and then implement CMT feature based on this, do you think it is OK? Thanks, Qiaowei

This patch update domstats command to support CMT feature based on extended bulk stats API virDomainListGetStats. Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> --- tools/virsh-domain-monitor.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/virsh-domain-monitor.c b/tools/virsh-domain-monitor.c index 1d4dc25..28f7bf8 100644 --- a/tools/virsh-domain-monitor.c +++ b/tools/virsh-domain-monitor.c @@ -2013,6 +2013,10 @@ static const vshCmdOptDef opts_domstats[] = { .type = VSH_OT_BOOL, .help = N_("report domain block device statistics"), }, + {.name = "cache", + .type = VSH_OT_BOOL, + .help = N_("report domain cache statistics"), + }, {.name = "list-active", .type = VSH_OT_BOOL, .help = N_("list only active domains"), @@ -2123,6 +2127,9 @@ cmdDomstats(vshControl *ctl, const vshCmd *cmd) if (vshCommandOptBool(cmd, "block")) stats |= VIR_DOMAIN_STATS_BLOCK; + if (vshCommandOptBool(cmd, "cache")) + stats |= VIR_DOMAIN_STATS_CACHE; + if (vshCommandOptBool(cmd, "list-active")) flags |= VIR_CONNECT_GET_ALL_DOMAINS_STATS_ACTIVE; -- 1.9.1
participants (4)
-
Daniel P. Berrange
-
Prerna
-
Qiaowei Ren
-
Ren, Qiaowei