-----Original Message-----
From: Jiri Denemark [mailto:jdenemar@redhat.com]
Sent: Thursday, October 29, 2015 5:52 PM
To: Ren, Qiaowei
Cc: libvir-list(a)redhat.com; Feng, Shaohe
Subject: Re: [libvirt] [PATCH 2/3] Qemu: add CMT support
On Thu, Oct 29, 2015 at 14:02:29 +0800, Qiaowei Ren wrote:
> One RFC in
>
https://www.redhat.com/archives/libvir-list/2015-June/msg01509.html
>
> CMT (Cache Monitoring Technology) can be used to measure the usage of
> cache by VM running on the host. This patch will extend the bulk stats
> API (virDomainListGetStats) to add this field. Applications based on
> libvirt can use this API to achieve cache usage of VM. Because CMT
> implementation in Linux kernel is based on perf mechanism, this patch
> will enable perf event for CMT when VM is created and disable it when
> VM is destroyed.
>
> Signed-off-by: Qiaowei Ren <qiaowei.ren(a)intel.com>
> ---
> include/libvirt/libvirt-domain.h | 1 +
> src/qemu/qemu_domain.h | 3 ++
> src/qemu/qemu_driver.c | 48 ++++++++++++++++++++++
> src/qemu/qemu_process.c | 86
++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 138 insertions(+)
>
> diff --git a/include/libvirt/libvirt-domain.h
> b/include/libvirt/libvirt-domain.h
> index e8202cf..fb5e1f4 100644
> --- a/include/libvirt/libvirt-domain.h
> +++ b/include/libvirt/libvirt-domain.h
> @@ -1764,6 +1764,7 @@ typedef enum {
> VIR_DOMAIN_STATS_VCPU = (1 << 3), /* return domain virtual CPU info */
> VIR_DOMAIN_STATS_INTERFACE = (1 << 4), /* return domain interfaces
info */
> VIR_DOMAIN_STATS_BLOCK = (1 << 5), /* return domain block info */
> + VIR_DOMAIN_STATS_CACHE = (1 << 6), /* return domain block info */
This flag is not documented anywhere and the comment is completely wrong.
> } virDomainStatsTypes;
>
> typedef enum {
> diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index
> 54e1e7b..31bce33 100644
> --- a/src/qemu/qemu_domain.h
> +++ b/src/qemu/qemu_domain.h
> @@ -196,6 +196,9 @@ struct _qemuDomainObjPrivate {
>
> bool hookRun; /* true if there was a hook run over this domain
> */
>
> + int cmt_fd; /* perf handler for CMT */
So you declare cmt_fd as int...
> +
> +
Why two empty lines?
> /* Bitmaps below hold data from the auto NUMA feature */
> virBitmapPtr autoNodeset;
> virBitmapPtr autoCpuset;
> diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index
> 4cfae03..8c678c9 100644
> --- a/src/qemu/qemu_driver.c
> +++ b/src/qemu/qemu_driver.c
> @@ -19320,6 +19320,53 @@ qemuDomainGetStatsBlock(virQEMUDriverPtr
> driver,
>
> #undef QEMU_ADD_COUNT_PARAM
>
> +static int
> +qemuDomainGetStatsCache(virQEMUDriverPtr driver ATTRIBUTE_UNUSED,
> + virDomainObjPtr dom,
> + virDomainStatsRecordPtr record,
> + int *maxparams,
> + unsigned int privflags ATTRIBUTE_UNUSED) {
> + qemuDomainObjPrivatePtr priv = dom->privateData;
> + FILE *fd;
> + unsigned long long cache = 0;
> + int scaling_factor = 0;
> +
> + if (priv->cmt_fd <= 0)
Shouldn't this only check for cmt_fd < 0?
> + return -1;
> +
> + if (read(priv->cmt_fd, &cache, sizeof(uint64_t)) < 0) {
Shouldn't cache be declared as uint64_t then?
> + virReportSystemError(errno, "%s",
> + _("Unable to read cache data"));
> + return -1;
> + }
> +
> + fd = fopen("/sys/devices/intel_cqm/events/llc_occupancy.scale",
"r");
> + if (!fd) {
> + virReportSystemError(errno, "%s",
> + _("Unable to open CMT scale file"));
> + return -1;
> + }
> + if (fscanf(fd, "%d", &scaling_factor) != 1) {
> + virReportSystemError(errno, "%s",
> + _("Unable to read CMT scale file"));
> + VIR_FORCE_FCLOSE(fd);
> + return -1;
> + }
> + VIR_FORCE_FCLOSE(fd);
> +
> + cache *= scaling_factor;
> +
> + if (virTypedParamsAddULLong(&record->params,
> + &record->nparams,
> + maxparams,
> + "cache.current",
> + cache) < 0)
Any documentation of this new statistics is missing. The commit message
doesn't really help understanding it either.
> + return -1;
> +
> + return 0;
> +}
> +
> typedef int
> (*qemuDomainGetStatsFunc)(virQEMUDriverPtr driver,
> virDomainObjPtr dom, @@ -19340,6 +19387,7
> @@ static struct qemuDomainGetStatsWorker qemuDomainGetStatsWorkers[]
= {
> { qemuDomainGetStatsVcpu, VIR_DOMAIN_STATS_VCPU, false },
> { qemuDomainGetStatsInterface, VIR_DOMAIN_STATS_INTERFACE, false },
> { qemuDomainGetStatsBlock, VIR_DOMAIN_STATS_BLOCK, true },
> + { qemuDomainGetStatsCache, VIR_DOMAIN_STATS_CACHE, false },
> { NULL, 0, false }
> };
>
> diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index
> ba84182..00b889d 100644
> --- a/src/qemu/qemu_process.c
> +++ b/src/qemu/qemu_process.c
> @@ -25,8 +25,11 @@
> #include <unistd.h>
> #include <signal.h>
> #include <sys/stat.h>
> +#include <sys/ioctl.h>
> +#include <sys/syscall.h>
> #if defined(__linux__)
> # include <linux/capability.h>
> +# include <linux/perf_event.h>
What if there's no perf_event.h header file?
> #elif defined(__FreeBSD__)
> # include <sys/param.h>
> # include <sys/cpuset.h>
> @@ -4274,6 +4277,75 @@ qemuLogOperation(virDomainObjPtr vm,
> goto cleanup;
> }
>
> +/*
> + * Enable CMT(Cache Monitoring Technology) to measure the usage of
> + * cache by VM running on the node.
> + *
> + * Because the hypervisor implement CMT support basedon perf
> +mechanism,
> + * we should enable perf event for CMT. The function 'sys_erf_event_open'
> + * is perf syscall wrapper.
> + */
> +#ifdef __linux__
> +static long sys_perf_event_open(struct perf_event_attr *hw_event,
> + pid_t pid, int cpu, int group_fd,
One parameter per line is easier to read.
> + unsigned long flags) {
> + return syscall(__NR_perf_event_open, hw_event, pid, cpu,
> + group_fd, flags);
Wrong indentation.
> +}
> +static int qemuCmtEnable(virDomainObjPtr vm) {
> + qemuDomainObjPrivatePtr priv = vm->privateData;
> + struct perf_event_attr cmt_attr;
> + int event_type;
> + FILE *fp;
> +
> + fp = fopen("/sys/devices/intel_cqm/type", "r");
> + if (!fp) {
> + virReportSystemError(errno, "%s",
> + _("CMT is not available on this host"));
> + return -1;
> + }
> + if (fscanf(fp, "%d", &event_type) != 1) {
> + virReportSystemError(errno, "%s",
> + _("Unable to read event type file."));
> + VIR_FORCE_FCLOSE(fp);
> + return -1;
> + }
> + VIR_FORCE_FCLOSE(fp);
> +
> + memset(&cmt_attr, 0, sizeof(struct perf_event_attr));
> + cmt_attr.size = sizeof(struct perf_event_attr);
> + cmt_attr.type = event_type;
> + cmt_attr.config = 1;
> + cmt_attr.inherit = 1;
> + cmt_attr.disabled = 1;
> + cmt_attr.enable_on_exec = 0;
> +
> + priv->cmt_fd = sys_perf_event_open(&cmt_attr, vm->pid, -1, -1,
> + 0);
...while here you assign long (from sys_perf_event_open) to cmt_fd.
> + if (priv->cmt_fd < 0) {
> + virReportSystemError(errno,
> + _("Unable to open perf type=%d for
pid=%d"),
> + event_type, vm->pid);
> + return -1;
> + }
> +
> + if (ioctl(priv->cmt_fd, PERF_EVENT_IOC_ENABLE) < 0) {
> + virReportSystemError(errno, "%s",
> + _("Unable to enable perf event for CMT"));
> + return -1;
> + }
> +
> + return 0;
> +}
> +#else
> +static int qemuCmtEnable(virDomainObjPtr vm) {
> + virReportUnsupportedError();
> + return -1;
> +}
> +#endif
> +
> int qemuProcessStart(virConnectPtr conn,
> virQEMUDriverPtr driver,
> virDomainObjPtr vm, @@ -4954,6 +5026,11 @@ int
> qemuProcessStart(virConnectPtr conn,
> if (virDomainSaveStatus(driver->xmlopt, cfg->stateDir, vm) < 0)
> goto cleanup;
>
> + VIR_DEBUG("Setting CMT perf counter");
> + if (qemuCmtEnable(vm) < 0)
> + virReportSystemError(errno, "%s",
> + _("CMT is not available on this host"));
> +
By setting en error here, you overwrite any useful error set inside
qemuCmtEnable. Anyway since you apparently want to make the error non fatal,
you should not even set it. The messages should be logged as warnings or even
lower.
If there is no performance or other impact of having cmt enabled, we could
enable it on hosts that support it, but we either need to log any errors with info
level or first check whether the host supports cmt and only try to enable it if it
does, in which case we could report the errors as warnings.
However, if a negative impact of enabling cmt is possible, we need an explicit
configuration option (turned off by default) that would ask libvirt to enable cmt.
Any errors could be logged as warnings in this case.
And what if libvirtd is restarted? Shouldn't we call sys_perf_event_open again?
Jirka, thanks for your nice feedback, and I will fix these problems in next patch series.
In addition, performance events won't slow down kernel and applications and so we
could enable it on hosts that support it. Certainly this patch only check if kernel
support cmt feature and does not check if host cpu support it through /proc/cpuinfo ...
If libvirtd is restarted, perf handler need to be got again. I will think it over next
version.
> /* finally we can call the 'started' hook script if
any */
> if (virHookPresent(VIR_HOOK_DRIVER_QEMU)) {
> char *xml = qemuDomainDefFormatXML(driver, vm->def, 0); @@
> -5122,6 +5199,15 @@ void qemuProcessStop(virQEMUDriverPtr driver,
> virPortAllocatorRelease(driver->migrationPorts, priv->nbdPort);
> priv->nbdPort = 0;
>
> + /* Disable CMT */
> + if (priv->cmt_fd > 0) {
cmt_fd >= 0 ?
> + if (ioctl(priv->cmt_fd, PERF_EVENT_IOC_DISABLE) < 0) {
> + virReportSystemError(errno, "%s",
> + _("Unable to disable perf event for
> + CMT"));
A warning should be good enough.
> + }
> + VIR_FORCE_CLOSE(priv->cmt_fd);
> + }
> +
This should go into a dedicated function.
> if (priv->agent) {
> qemuAgentClose(priv->agent);
> priv->agent = NULL;
Jirka