[libvirt] [PATCH RFC v3 0/6] support cpu bandwidth in libvirt

TODO:
  1. We create a sub directory for each vcpu in the cpu subsystem, so we
     should recalculate cpu.shares for each vcpu.

Changelog:
  v3: fix some small bugs, implement the simple way
  v2: almost rewrote the patchset to support controlling each vcpu's bandwidth.
      Limit quota to [-1, 2^64/1000] at the schema level; we will check it at
      the cgroup level.

Wen Congyang (6):
  Introduce the function virCgroupForVcpu
  cgroup: Implement cpu.cfs_period_us and cpu.cfs_quota_us tuning API
  Update XML Schema for new entries
  qemu: Implement period and quota tunable XML configuration and parsing
  qemu: Implement cfs_period and cfs_quota's modification
  doc: Add documentation for new cputune elements period and quota

 docs/formatdomain.html.in                       |   19 ++
 docs/schemas/domain.rng                         |   26 +++-
 src/conf/domain_conf.c                          |   20 ++-
 src/conf/domain_conf.h                          |    2 +
 src/libvirt_private.syms                        |    5 +
 src/qemu/qemu_cgroup.c                          |  127 +++++++++++
 src/qemu/qemu_cgroup.h                          |    4 +
 src/qemu/qemu_driver.c                          |  259 ++++++++++++++++++++---
 src/qemu/qemu_process.c                         |    4 +
 src/util/cgroup.c                               |  153 +++++++++++++-
 src/util/cgroup.h                               |   11 +
 tests/qemuxml2argvdata/qemuxml2argv-cputune.xml |    2 +
 12 files changed, 596 insertions(+), 36 deletions(-)

Introduce the function virCgroupForVcpu() to create sub directory for each vcpu. --- src/libvirt_private.syms | 1 + src/util/cgroup.c | 72 ++++++++++++++++++++++++++++++++++++++++++--- src/util/cgroup.h | 5 +++ 3 files changed, 73 insertions(+), 5 deletions(-) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 3e3b1dd..30804eb 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -67,6 +67,7 @@ virCgroupDenyAllDevices; virCgroupDenyDevicePath; virCgroupForDomain; virCgroupForDriver; +virCgroupForVcpu; virCgroupFree; virCgroupGetBlkioWeight; virCgroupGetCpuShares; diff --git a/src/util/cgroup.c b/src/util/cgroup.c index 740cedf..8994aca 100644 --- a/src/util/cgroup.c +++ b/src/util/cgroup.c @@ -52,6 +52,16 @@ struct virCgroup { struct virCgroupController controllers[VIR_CGROUP_CONTROLLER_LAST]; }; +typedef enum { + VIR_CGROUP_NONE = 0, /* create subdir under each cgroup if possible. */ + VIR_CGROUP_MEM_HIERACHY = 1 << 0, /* call virCgroupSetMemoryUseHierarchy + * before creating subcgroups and + * attaching tasks + */ + VIR_CGROUP_VCPU = 1 << 1, /* create subdir only under the cgroup cpu, + * cpuacct and cpuset if possible. */ +} virCgroupFlags; + /** * virCgroupFree: * @@ -503,7 +513,7 @@ static int virCgroupSetMemoryUseHierarchy(virCgroupPtr group) } static int virCgroupMakeGroup(virCgroupPtr parent, virCgroupPtr group, - int create, bool memory_hierarchy) + int create, unsigned int flags) { int i; int rc = 0; @@ -516,6 +526,13 @@ static int virCgroupMakeGroup(virCgroupPtr parent, virCgroupPtr group, if (!group->controllers[i].mountPoint) continue; + /* We need to control cpu bandwidth for each vcpu now */ + if ((flags & VIR_CGROUP_VCPU) && i != VIR_CGROUP_CONTROLLER_CPU) { + /* treat it as unmounted and we can use virCgroupAddTask */ + VIR_FREE(group->controllers[i].mountPoint); + continue; + } + rc = virCgroupPathOfController(group, i, "", &path); if (rc < 0) return rc; @@ -555,7 +572,7 @@ static int virCgroupMakeGroup(virCgroupPtr parent, virCgroupPtr group, * Note that virCgroupSetMemoryUseHierarchy should always be * called prior to creating subcgroups and attaching tasks. */ - if (memory_hierarchy && + if ((flags & VIR_CGROUP_MEM_HIERACHY) && group->controllers[VIR_CGROUP_CONTROLLER_MEMORY].mountPoint != NULL && (i == VIR_CGROUP_CONTROLLER_MEMORY || STREQ(group->controllers[i].mountPoint, group->controllers[VIR_CGROUP_CONTROLLER_MEMORY].mountPoint))) { @@ -641,7 +658,7 @@ static int virCgroupAppRoot(int privileged, if (rc != 0) goto cleanup; - rc = virCgroupMakeGroup(rootgrp, *group, create, false); + rc = virCgroupMakeGroup(rootgrp, *group, create, VIR_CGROUP_NONE); cleanup: virCgroupFree(&rootgrp); @@ -801,7 +818,7 @@ int virCgroupForDriver(const char *name, VIR_FREE(path); if (rc == 0) { - rc = virCgroupMakeGroup(rootgrp, *group, create, false); + rc = virCgroupMakeGroup(rootgrp, *group, create, VIR_CGROUP_NONE); if (rc != 0) virCgroupFree(group); } @@ -861,7 +878,7 @@ int virCgroupForDomain(virCgroupPtr driver, * a group for driver, is to avoid overhead to track * cumulative usage that we don't need. 
*/ - rc = virCgroupMakeGroup(driver, *group, create, true); + rc = virCgroupMakeGroup(driver, *group, create, VIR_CGROUP_MEM_HIERACHY); if (rc != 0) virCgroupFree(group); } @@ -879,6 +896,51 @@ int virCgroupForDomain(virCgroupPtr driver ATTRIBUTE_UNUSED, #endif /** + * virCgroupForVcpu: + * + * @driver: group for the domain + * @vcpuid: id of the vcpu + * @group: Pointer to returned virCgroupPtr + * + * Returns 0 on success + */ +#if defined HAVE_MNTENT_H && defined HAVE_GETMNTENT_R +int virCgroupForVcpu(virCgroupPtr driver, + int vcpuid, + virCgroupPtr *group, + int create) +{ + int rc; + char *path; + + if (driver == NULL) + return -EINVAL; + + if (virAsprintf(&path, "%s/vcpu%d", driver->path, vcpuid) < 0) + return -ENOMEM; + + rc = virCgroupNew(path, group); + VIR_FREE(path); + + if (rc == 0) { + rc = virCgroupMakeGroup(driver, *group, create, VIR_CGROUP_VCPU); + if (rc != 0) + virCgroupFree(group); + } + + return rc; +} +#else +int virCgroupForVcpu(virCgroupPtr driver ATTRIBUTE_UNUSED, + int vcpuid ATTRIBUTE_UNUSED, + virCgroupPtr *group ATTRIBUTE_UNUSED, + int create ATTRIBUTE_UNUSED) +{ + return -ENXIO; +} +#endif + +/** * virCgroupSetBlkioWeight: * * @group: The cgroup to change io weight for diff --git a/src/util/cgroup.h b/src/util/cgroup.h index 8ae756d..1d04418 100644 --- a/src/util/cgroup.h +++ b/src/util/cgroup.h @@ -40,6 +40,11 @@ int virCgroupForDomain(virCgroupPtr driver, virCgroupPtr *group, int create); +int virCgroupForVcpu(virCgroupPtr driver, + int vcpuid, + virCgroupPtr *group, + int create); + int virCgroupPathOfController(virCgroupPtr group, int controller, const char *key, -- 1.7.1

--- src/libvirt_private.syms | 4 ++ src/util/cgroup.c | 81 ++++++++++++++++++++++++++++++++++++++++++++-- src/util/cgroup.h | 6 +++ 3 files changed, 88 insertions(+), 3 deletions(-) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 30804eb..8b73ec3 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -71,6 +71,8 @@ virCgroupForVcpu; virCgroupFree; virCgroupGetBlkioWeight; virCgroupGetCpuShares; +virCgroupGetCpuCfsPeriod; +virCgroupGetCpuCfsQuota; virCgroupGetCpuacctUsage; virCgroupGetFreezerState; virCgroupGetMemoryHardLimit; @@ -85,6 +87,8 @@ virCgroupPathOfController; virCgroupRemove; virCgroupSetBlkioWeight; virCgroupSetCpuShares; +virCgroupSetCpuCfsPeriod; +virCgroupSetCpuCfsQuota; virCgroupSetFreezerState; virCgroupSetMemory; virCgroupSetMemoryHardLimit; diff --git a/src/util/cgroup.c b/src/util/cgroup.c index 8994aca..378e08a 100644 --- a/src/util/cgroup.c +++ b/src/util/cgroup.c @@ -398,8 +398,6 @@ static int virCgroupSetValueI64(virCgroupPtr group, return rc; } -#if 0 -/* This is included for completeness, but not yet used */ static int virCgroupGetValueI64(virCgroupPtr group, int controller, const char *key, @@ -419,7 +417,6 @@ out: return rc; } -#endif static int virCgroupGetValueU64(virCgroupPtr group, int controller, @@ -1384,6 +1381,84 @@ int virCgroupGetCpuShares(virCgroupPtr group, unsigned long long *shares) "cpu.shares", shares); } +/** + * virCgroupSetCpuCfsPeriod: + * + * @group: The cgroup to change cpu.cfs_period_us for + * @cfs_period: The bandwidth period in usecs + * + * Returns: 0 on success + */ +int virCgroupSetCpuCfsPeriod(virCgroupPtr group, unsigned long long cfs_period) +{ + /* The cfs_period shoule be greater or equal than 1ms, and less or equal + * than 1s. + */ + if (cfs_period < 1000 || cfs_period > 1000000) + return -EINVAL; + + return virCgroupSetValueU64(group, + VIR_CGROUP_CONTROLLER_CPU, + "cpu.cfs_period_us", cfs_period); +} + +/** + * virCgroupGetCpuCfsPeriod: + * + * @group: The cgroup to get cpu.cfs_period_us for + * @cfs_period: Pointer to the returned bandwidth period in usecs + * + * Returns: 0 on success + */ +int virCgroupGetCpuCfsPeriod(virCgroupPtr group, unsigned long long *cfs_period) +{ + return virCgroupGetValueU64(group, + VIR_CGROUP_CONTROLLER_CPU, + "cpu.cfs_period_us", cfs_period); +} + +/** + * virCgroupSetCpuCfsQuota: + * + * @group: The cgroup to change cpu.cfs_quota_us for + * @cfs_quota: the cpu bandwidth (in usecs) that this tg will be allowed to + * consume over period + * + * Returns: 0 on success + */ +int virCgroupSetCpuCfsQuota(virCgroupPtr group, long long cfs_quota) +{ + if (cfs_quota >= 0) { + /* The cfs_quota shoule be greater or equal than 1ms */ + if (cfs_quota < 1000) + return -EINVAL; + + /* check overflow */ + if (cfs_quota > (unsigned long long)~0ULL / 1000) + return -EINVAL; + } + + return virCgroupSetValueI64(group, + VIR_CGROUP_CONTROLLER_CPU, + "cpu.cfs_quota_us", cfs_quota); +} + +/** + * virCgroupGetCpuCfsQuota: + * + * @group: The cgroup to get cpu.cfs_quota_us for + * @cfs_quota: Pointer to the returned cpu bandwidth (in usecs) that this tg + * will be allowed to consume over period + * + * Returns: 0 on success + */ +int virCgroupGetCpuCfsQuota(virCgroupPtr group, long long *cfs_quota) +{ + return virCgroupGetValueI64(group, + VIR_CGROUP_CONTROLLER_CPU, + "cpu.cfs_quota_us", cfs_quota); +} + int virCgroupGetCpuacctUsage(virCgroupPtr group, unsigned long long *usage) { return virCgroupGetValueU64(group, diff --git a/src/util/cgroup.h b/src/util/cgroup.h index 
1d04418..d190bb3 100644 --- a/src/util/cgroup.h +++ b/src/util/cgroup.h @@ -104,6 +104,12 @@ int virCgroupDenyDevicePath(virCgroupPtr group, int virCgroupSetCpuShares(virCgroupPtr group, unsigned long long shares); int virCgroupGetCpuShares(virCgroupPtr group, unsigned long long *shares); +int virCgroupSetCpuCfsPeriod(virCgroupPtr group, unsigned long long cfs_period); +int virCgroupGetCpuCfsPeriod(virCgroupPtr group, unsigned long long *cfs_period); + +int virCgroupSetCpuCfsQuota(virCgroupPtr group, long long cfs_quota); +int virCgroupGetCpuCfsQuota(virCgroupPtr group, long long *cfs_quota); + int virCgroupGetCpuacctUsage(virCgroupPtr group, unsigned long long *usage); int virCgroupSetFreezerState(virCgroupPtr group, const char *state); -- 1.7.1
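To show how the pieces introduced so far fit together, here is a minimal usage sketch (not part of the series) that combines virCgroupForVcpu() from patch 1 with the CFS bandwidth setters from patch 2. The helper name capVcpuBandwidth and the reduced error handling are illustrative only; the real consumer of these APIs is the qemu driver code in the later patches of this series.

#include "cgroup.h"

/* Illustrative helper (hypothetical, not in the patches): create the
 * per-vcpu sub directory and apply a CFS period/quota pair to it.
 * Returns 0 on success or a negative errno value, like the cgroup APIs. */
static int capVcpuBandwidth(virCgroupPtr domain_cgroup,
                            int vcpuid,
                            unsigned long long period,
                            long long quota)
{
    virCgroupPtr cgroup_vcpu = NULL;
    int rc;

    /* create (or look up) <domain cgroup>/vcpu<N> under the cpu controller */
    rc = virCgroupForVcpu(domain_cgroup, vcpuid, &cgroup_vcpu, 1);
    if (rc != 0)
        return rc;

    /* cpu.cfs_period_us must lie in [1000, 1000000] usecs */
    rc = virCgroupSetCpuCfsPeriod(cgroup_vcpu, period);
    if (rc == 0) {
        /* a quota of -1 means "no limit"; positive values must be >= 1000 */
        rc = virCgroupSetCpuCfsQuota(cgroup_vcpu, quota);
    }

    virCgroupFree(&cgroup_vcpu);
    return rc;
}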

--- docs/schemas/domain.rng | 26 +++++++++++++++++++++++++- 1 files changed, 25 insertions(+), 1 deletions(-) diff --git a/docs/schemas/domain.rng b/docs/schemas/domain.rng index 8a4e3fe..5f8151d 100644 --- a/docs/schemas/domain.rng +++ b/docs/schemas/domain.rng @@ -388,6 +388,16 @@ <ref name="cpushares"/> </element> </optional> + <optional> + <element name="period"> + <ref name="cpuperiod"/> + </element> + </optional> + <optional> + <element name="quota"> + <ref name="cpuquota"/> + </element> + </optional> <zeroOrMore> <element name="vcpupin"> <attribute name="vcpu"> @@ -2401,7 +2411,21 @@ <data type="unsignedInt"> <param name="pattern">[0-9]+</param> </data> - </define> + </define> + <define name="cpuperiod"> + <data type="unsignedLong"> + <param name="pattern">[0-9]+</param> + <param name="minInclusive">1000</param> + <param name="maxInclusive">1000000</param> + </data> + </define> + <define name="cpuquota"> + <data type="long"> + <param name="pattern">-?[0-9]+</param> + <param name="maxInclusive">18446744073709551</param> + <param name='minInclusive'>-1</param> + </data> + </define> <define name="PortNumber"> <data type="short"> <param name="minInclusive">-1</param> -- 1.7.1

On 07/18/2011 04:41 AM, Wen Congyang wrote:
--- docs/schemas/domain.rng | 26 +++++++++++++++++++++++++- 1 files changed, 25 insertions(+), 1 deletions(-)
diff --git a/docs/schemas/domain.rng b/docs/schemas/domain.rng index 8a4e3fe..5f8151d 100644 --- a/docs/schemas/domain.rng +++ b/docs/schemas/domain.rng @@ -388,6 +388,16 @@ <ref name="cpushares"/> </element> </optional> + <optional> + <element name="period"> + <ref name="cpuperiod"/> + </element> + </optional> + <optional> + <element name="quota"> + <ref name="cpuquota"/> + </element> + </optional> <zeroOrMore> <element name="vcpupin"> <attribute name="vcpu">
Yes, this is exactly the interface we are looking for.
@@ -2401,7 +2411,21 @@ <data type="unsignedInt"> <param name="pattern">[0-9]+</param> </data> - </define> + </define> + <define name="cpuperiod"> + <data type="unsignedLong"> + <param name="pattern">[0-9]+</param> + <param name="minInclusive">1000</param> + <param name="maxInclusive">1000000</param> + </data> + </define> + <define name="cpuquota"> + <data type="long"> + <param name="pattern">-?[0-9]+</param> + <param name="maxInclusive">18446744073709551</param> + <param name='minInclusive'>-1</param> + </data> + </define> <define name="PortNumber"> <data type="short"> <param name="minInclusive">-1</param>
--
Adam Litke
IBM Linux Technology Center

--- src/conf/domain_conf.c | 20 +++- src/conf/domain_conf.h | 2 + src/qemu/qemu_cgroup.c | 127 +++++++++++++++++++++++ src/qemu/qemu_cgroup.h | 4 + src/qemu/qemu_process.c | 4 + tests/qemuxml2argvdata/qemuxml2argv-cputune.xml | 2 + 6 files changed, 157 insertions(+), 2 deletions(-) diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 8c3e44e..f959a48 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -6021,6 +6021,14 @@ static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps, &def->cputune.shares) < 0) def->cputune.shares = 0; + if (virXPathULongLong("string(./cputune/period[1])", ctxt, + &def->cputune.period) < 0) + def->cputune.period = 0; + + if (virXPathLongLong("string(./cputune/quota[1])", ctxt, + &def->cputune.quota) < 0) + def->cputune.quota = 0; + if ((n = virXPathNodeSet("./cputune/vcpupin", ctxt, &nodes)) < 0) { goto error; } @@ -9721,12 +9729,19 @@ virDomainDefFormatInternal(virDomainDefPtr def, virBufferAsprintf(&buf, " current='%u'", def->vcpus); virBufferAsprintf(&buf, ">%u</vcpu>\n", def->maxvcpus); - if (def->cputune.shares || def->cputune.vcpupin) + if (def->cputune.shares || def->cputune.vcpupin || + def->cputune.period || def->cputune.quota) virBufferAddLit(&buf, " <cputune>\n"); if (def->cputune.shares) virBufferAsprintf(&buf, " <shares>%lu</shares>\n", def->cputune.shares); + if (def->cputune.period) + virBufferAsprintf(&buf, " <period>%llu</period>\n", + def->cputune.period); + if (def->cputune.quota) + virBufferAsprintf(&buf, " <quota>%lld</quota>\n", + def->cputune.quota); if (def->cputune.vcpupin) { int i; for (i = 0; i < def->cputune.nvcpupin; i++) { @@ -9748,7 +9763,8 @@ virDomainDefFormatInternal(virDomainDefPtr def, } } - if (def->cputune.shares || def->cputune.vcpupin) + if (def->cputune.shares || def->cputune.vcpupin || + def->cputune.period || def->cputune.quota) virBufferAddLit(&buf, " </cputune>\n"); if (def->numatune.memory.nodemask) diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index 172d3c2..02e5ae4 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -1195,6 +1195,8 @@ struct _virDomainDef { struct { unsigned long shares; + unsigned long long period; + long long quota; int nvcpupin; virDomainVcpuPinDefPtr *vcpupin; } cputune; diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index b357852..ab87fab 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -24,6 +24,7 @@ #include <config.h> #include "qemu_cgroup.h" +#include "qemu_domain.h" #include "cgroup.h" #include "logging.h" #include "memory.h" @@ -376,6 +377,132 @@ cleanup: return -1; } +int qemuSetupCgroupVcpuBW(virCgroupPtr cgroup, unsigned long long period, + long long quota) +{ + int rc; + unsigned long long old_period; + + if (period == 0 && quota == 0) + return 0; + + if (period) { + /* get old period, and we can rollback if set quota failed */ + rc = virCgroupGetCpuCfsPeriod(cgroup, &old_period); + if (rc < 0) { + virReportSystemError(-rc, + _("%s"), "Unable to get cpu bandwidth period"); + return -1; + } + + rc = virCgroupSetCpuCfsPeriod(cgroup, period); + if (rc < 0) { + virReportSystemError(-rc, + _("%s"), "Unable to set cpu bandwidth period"); + return -1; + } + } + + if (quota) { + rc = virCgroupSetCpuCfsQuota(cgroup, quota); + if (rc < 0) { + virReportSystemError(-rc, + _("%s"), "Unable to set cpu bandwidth quota"); + goto cleanup; + } + } + + return 0; + +cleanup: + if (period) { + rc = virCgroupSetCpuCfsPeriod(cgroup, old_period); + if (rc < 0) + virReportSystemError(-rc, + _("%s"), + "Unable to 
rollback cpu bandwidth period"); + } + + return -1; +} + +int qemuSetupCgroupForVcpu(struct qemud_driver *driver, virDomainObjPtr vm) +{ + virCgroupPtr cgroup = NULL; + virCgroupPtr cgroup_vcpu = NULL; + qemuDomainObjPrivatePtr priv = vm->privateData; + int rc; + unsigned int i; + unsigned long long period = vm->def->cputune.period; + long long quota = vm->def->cputune.quota; + + if (driver->cgroup == NULL) + return 0; /* Not supported, so claim success */ + + rc = virCgroupForDomain(driver->cgroup, vm->def->name, &cgroup, 0); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to find cgroup for %s"), + vm->def->name); + goto cleanup; + } + + if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { + /* If we does not know VCPU<->PID mapping or all vcpu runs in the same + * thread, we can not control each vcpu. + */ + if (period || quota) { + if (qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) { + if (qemuSetupCgroupVcpuBW(cgroup, period, quota) < 0) + goto cleanup; + } + } + return 0; + } + + for (i = 0; i < priv->nvcpupids; i++) { + rc = virCgroupForVcpu(cgroup, i, &cgroup_vcpu, 1); + if (rc < 0) { + virReportSystemError(-rc, + _("Unable to create vcpu cgroup for %s(vcpu:" + " %d)"), + vm->def->name, i); + goto cleanup; + } + + /* move the thread for vcpu to sub dir */ + rc = virCgroupAddTask(cgroup_vcpu, priv->vcpupids[i]); + if (rc < 0) { + virReportSystemError(-rc, + _("unable to add vcpu %d task %d to cgroup"), + i, priv->vcpupids[i]); + goto cleanup; + } + + if (period || quota) { + if (qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) { + if (qemuSetupCgroupVcpuBW(cgroup_vcpu, period, quota) < 0) + goto cleanup; + } + } + + virCgroupFree(&cgroup_vcpu); + } + + virCgroupFree(&cgroup_vcpu); + virCgroupFree(&cgroup); + return 0; + +cleanup: + virCgroupFree(&cgroup_vcpu); + if (cgroup) { + virCgroupRemove(cgroup); + virCgroupFree(&cgroup); + } + + return -1; +} + int qemuRemoveCgroup(struct qemud_driver *driver, virDomainObjPtr vm, diff --git a/src/qemu/qemu_cgroup.h b/src/qemu/qemu_cgroup.h index e8abfb4..17164d9 100644 --- a/src/qemu/qemu_cgroup.h +++ b/src/qemu/qemu_cgroup.h @@ -49,6 +49,10 @@ int qemuSetupHostUsbDeviceCgroup(usbDevice *dev, void *opaque); int qemuSetupCgroup(struct qemud_driver *driver, virDomainObjPtr vm); +int qemuSetupCgroupVcpuBW(virCgroupPtr cgroup, + unsigned long long period, + long long quota); +int qemuSetupCgroupForVcpu(struct qemud_driver *driver, virDomainObjPtr vm); int qemuRemoveCgroup(struct qemud_driver *driver, virDomainObjPtr vm, int quiet); diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index d0085e0..4e6b7d0 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -2789,6 +2789,10 @@ int qemuProcessStart(virConnectPtr conn, if (qemuProcessDetectVcpuPIDs(driver, vm) < 0) goto cleanup; + VIR_DEBUG("Setting cgroup for each VCPU(if required)"); + if (qemuSetupCgroupForVcpu(driver, vm) < 0) + goto cleanup; + VIR_DEBUG("Setting VCPU affinities"); if (qemuProcessSetVcpuAffinites(conn, vm) < 0) goto cleanup; diff --git a/tests/qemuxml2argvdata/qemuxml2argv-cputune.xml b/tests/qemuxml2argvdata/qemuxml2argv-cputune.xml index 0afbadb..091865a 100644 --- a/tests/qemuxml2argvdata/qemuxml2argv-cputune.xml +++ b/tests/qemuxml2argvdata/qemuxml2argv-cputune.xml @@ -6,6 +6,8 @@ <vcpu>2</vcpu> <cputune> <shares>2048</shares> + <period>1000000</period> + <quota>-1</quota> <vcpupin vcpu='0' cpuset='0'/> <vcpupin vcpu='1' cpuset='1'/> </cputune> -- 1.7.1

This is looking good to me. I am pleased to see that the global case is handled as expected when per-vcpu threads are not active.

On 07/18/2011 04:42 AM, Wen Congyang wrote: <snip>
+int qemuSetupCgroupForVcpu(struct qemud_driver *driver, virDomainObjPtr vm) +{ + virCgroupPtr cgroup = NULL; + virCgroupPtr cgroup_vcpu = NULL; + qemuDomainObjPrivatePtr priv = vm->privateData; + int rc; + unsigned int i; + unsigned long long period = vm->def->cputune.period; + long long quota = vm->def->cputune.quota; + + if (driver->cgroup == NULL) + return 0; /* Not supported, so claim success */
I just want to check: Is the above logic still valid? Failure to apply a quota setting (>0) could have fairly substantial implications for system performance. I am not convinced either way but I do think this point merits some discussion.
+ + rc = virCgroupForDomain(driver->cgroup, vm->def->name, &cgroup, 0); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to find cgroup for %s"), + vm->def->name); + goto cleanup; + } + + if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { + /* If we does not know VCPU<->PID mapping or all vcpu runs in the same + * thread, we can not control each vcpu. + */ + if (period || quota) { + if (qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) { + if (qemuSetupCgroupVcpuBW(cgroup, period, quota) < 0) + goto cleanup; + } + } + return 0; + } + + for (i = 0; i < priv->nvcpupids; i++) { + rc = virCgroupForVcpu(cgroup, i, &cgroup_vcpu, 1); + if (rc < 0) { + virReportSystemError(-rc, + _("Unable to create vcpu cgroup for %s(vcpu:" + " %d)"), + vm->def->name, i); + goto cleanup; + } + + /* move the thread for vcpu to sub dir */ + rc = virCgroupAddTask(cgroup_vcpu, priv->vcpupids[i]); + if (rc < 0) { + virReportSystemError(-rc, + _("unable to add vcpu %d task %d to cgroup"), + i, priv->vcpupids[i]); + goto cleanup; + } + + if (period || quota) { + if (qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) { + if (qemuSetupCgroupVcpuBW(cgroup_vcpu, period, quota) < 0) + goto cleanup; + } + } + + virCgroupFree(&cgroup_vcpu); + } + + virCgroupFree(&cgroup_vcpu); + virCgroupFree(&cgroup); + return 0; + +cleanup: + virCgroupFree(&cgroup_vcpu); + if (cgroup) { + virCgroupRemove(cgroup); + virCgroupFree(&cgroup); + } + + return -1; +} +
int qemuRemoveCgroup(struct qemud_driver *driver, virDomainObjPtr vm,
--
Adam Litke
IBM Linux Technology Center

At 07/19/2011 04:35 AM, Adam Litke wrote:
This is looking good to me. I am pleased to see that the global case is handled as expected when per-vcpu threads are not active.
On 07/18/2011 04:42 AM, Wen Congyang wrote: <snip>
+int qemuSetupCgroupForVcpu(struct qemud_driver *driver, virDomainObjPtr vm) +{ + virCgroupPtr cgroup = NULL; + virCgroupPtr cgroup_vcpu = NULL; + qemuDomainObjPrivatePtr priv = vm->privateData; + int rc; + unsigned int i; + unsigned long long period = vm->def->cputune.period; + long long quota = vm->def->cputune.quota; + + if (driver->cgroup == NULL) + return 0; /* Not supported, so claim success */
I just want to check: Is the above logic still valid? Failure to apply
This logic is the same as the logic in the similar function qemuSetupCgroup().
a quota setting (>0) could have fairly substantial implications for system performance. I am not convinced either way but I do think this point merits some discussion.
+ + rc = virCgroupForDomain(driver->cgroup, vm->def->name, &cgroup, 0); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to find cgroup for %s"), + vm->def->name); + goto cleanup; + } + + if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { + /* If we does not know VCPU<->PID mapping or all vcpu runs in the same + * thread, we can not control each vcpu. + */ + if (period || quota) { + if (qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) { + if (qemuSetupCgroupVcpuBW(cgroup, period, quota) < 0) + goto cleanup; + } + } + return 0; + } + + for (i = 0; i < priv->nvcpupids; i++) { + rc = virCgroupForVcpu(cgroup, i, &cgroup_vcpu, 1); + if (rc < 0) { + virReportSystemError(-rc, + _("Unable to create vcpu cgroup for %s(vcpu:" + " %d)"), + vm->def->name, i); + goto cleanup; + } + + /* move the thread for vcpu to sub dir */ + rc = virCgroupAddTask(cgroup_vcpu, priv->vcpupids[i]); + if (rc < 0) { + virReportSystemError(-rc, + _("unable to add vcpu %d task %d to cgroup"), + i, priv->vcpupids[i]); + goto cleanup; + } + + if (period || quota) { + if (qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) { + if (qemuSetupCgroupVcpuBW(cgroup_vcpu, period, quota) < 0) + goto cleanup; + } + } + + virCgroupFree(&cgroup_vcpu); + } + + virCgroupFree(&cgroup_vcpu); + virCgroupFree(&cgroup); + return 0; + +cleanup: + virCgroupFree(&cgroup_vcpu); + if (cgroup) { + virCgroupRemove(cgroup); + virCgroupFree(&cgroup); + } + + return -1; +} +
int qemuRemoveCgroup(struct qemud_driver *driver, virDomainObjPtr vm,

On 07/18/2011 07:46 PM, Wen Congyang wrote:
At 07/19/2011 04:35 AM, Adam Litke wrote:
This is looking good to me. I am pleased to see that the global case is handled as expected when per-vcpu threads are not active.
On 07/18/2011 04:42 AM, Wen Congyang wrote: <snip>
+int qemuSetupCgroupForVcpu(struct qemud_driver *driver, virDomainObjPtr vm) +{ + virCgroupPtr cgroup = NULL; + virCgroupPtr cgroup_vcpu = NULL; + qemuDomainObjPrivatePtr priv = vm->privateData; + int rc; + unsigned int i; + unsigned long long period = vm->def->cputune.period; + long long quota = vm->def->cputune.quota; + + if (driver->cgroup == NULL) + return 0; /* Not supported, so claim success */
I just want to check: Is the above logic still valid? Failure to apply
This logic is the same as the logic in the similar function qemuSetupCgroup().
Yes, I am aware. However, the introduction of cpu quotas is a major feature that defines the amount of computing capacity a domain has available. In my opinion, if a domain XML file specifies a quota then 'virsh create' should fail if that quota cannot be set up for any reason.
a quota setting (>0) could have fairly substantial implications for system performance. I am not convinced either way but I do think this point merits some discussion.
+ + rc = virCgroupForDomain(driver->cgroup, vm->def->name, &cgroup, 0); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to find cgroup for %s"), + vm->def->name); + goto cleanup; + } + + if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { + /* If we does not know VCPU<->PID mapping or all vcpu runs in the same + * thread, we can not control each vcpu. + */ + if (period || quota) { + if (qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) { + if (qemuSetupCgroupVcpuBW(cgroup, period, quota) < 0) + goto cleanup; + } + } + return 0; + } + + for (i = 0; i < priv->nvcpupids; i++) { + rc = virCgroupForVcpu(cgroup, i, &cgroup_vcpu, 1); + if (rc < 0) { + virReportSystemError(-rc, + _("Unable to create vcpu cgroup for %s(vcpu:" + " %d)"), + vm->def->name, i); + goto cleanup; + } + + /* move the thread for vcpu to sub dir */ + rc = virCgroupAddTask(cgroup_vcpu, priv->vcpupids[i]); + if (rc < 0) { + virReportSystemError(-rc, + _("unable to add vcpu %d task %d to cgroup"), + i, priv->vcpupids[i]); + goto cleanup; + } + + if (period || quota) { + if (qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) { + if (qemuSetupCgroupVcpuBW(cgroup_vcpu, period, quota) < 0) + goto cleanup; + } + } + + virCgroupFree(&cgroup_vcpu); + } + + virCgroupFree(&cgroup_vcpu); + virCgroupFree(&cgroup); + return 0; + +cleanup: + virCgroupFree(&cgroup_vcpu); + if (cgroup) { + virCgroupRemove(cgroup); + virCgroupFree(&cgroup); + } + + return -1; +} +
int qemuRemoveCgroup(struct qemud_driver *driver, virDomainObjPtr vm,
--
Adam Litke
IBM Linux Technology Center

On 07/18/2011 04:42 AM, Wen Congyang wrote:
+int qemuSetupCgroupForVcpu(struct qemud_driver *driver, virDomainObjPtr vm) +{ + virCgroupPtr cgroup = NULL; + virCgroupPtr cgroup_vcpu = NULL; + qemuDomainObjPrivatePtr priv = vm->privateData; + int rc; + unsigned int i; + unsigned long long period = vm->def->cputune.period; + long long quota = vm->def->cputune.quota; + + if (driver->cgroup == NULL) + return 0; /* Not supported, so claim success */ + + rc = virCgroupForDomain(driver->cgroup, vm->def->name, &cgroup, 0); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to find cgroup for %s"), + vm->def->name); + goto cleanup; + } + + if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { + /* If we does not know VCPU<->PID mapping or all vcpu runs in the same + * thread, we can not control each vcpu. + */ + if (period || quota) { + if (qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) { + if (qemuSetupCgroupVcpuBW(cgroup, period, quota) < 0) + goto cleanup; + } + } + return 0; + }
I found a problem above. In the case where we are controlling quota at the domain level cgroup we must multiply the user-specified quota by the number of vcpus in the domain in order to get the same performance as we would with per-vcpu cgroups. As written, the vm will be essentially capped at 1 vcpu worth of quota regardless of the number of vcpus. You will also have to apply this logic in reverse when reporting the scheduler statistics so that the quota number is a per-vcpu quantity.

--
Adam Litke
IBM Linux Technology Center
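A minimal sketch of the scaling described above, assuming the fix is simply to multiply a positive per-vcpu quota by the domain's vcpu count before writing it to the whole-process cgroup. The helper name and overflow check below are illustrative; a later revision in this thread implements essentially the same check inline.

#include <errno.h>
#include <limits.h>

/* Hypothetical helper: scale a per-vcpu quota up to a whole-domain quota
 * for the case where vcpu threads are not available.  quota == 0 ("not
 * set") and quota < 0 ("no limit") are passed through unchanged. */
static int scaleQuotaForDomain(long long *quota, unsigned int nvcpus)
{
    if (*quota <= 0)
        return 0;

    /* refuse values that would overflow a long long when multiplied */
    if (nvcpus == 0 || *quota > LLONG_MAX / nvcpus)
        return -EINVAL;

    *quota *= nvcpus;
    return 0;
}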

At 07/19/2011 04:59 AM, Adam Litke wrote:
On 07/18/2011 04:42 AM, Wen Congyang wrote:
+int qemuSetupCgroupForVcpu(struct qemud_driver *driver, virDomainObjPtr vm) +{ + virCgroupPtr cgroup = NULL; + virCgroupPtr cgroup_vcpu = NULL; + qemuDomainObjPrivatePtr priv = vm->privateData; + int rc; + unsigned int i; + unsigned long long period = vm->def->cputune.period; + long long quota = vm->def->cputune.quota; + + if (driver->cgroup == NULL) + return 0; /* Not supported, so claim success */ + + rc = virCgroupForDomain(driver->cgroup, vm->def->name, &cgroup, 0); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to find cgroup for %s"), + vm->def->name); + goto cleanup; + } + + if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { + /* If we does not know VCPU<->PID mapping or all vcpu runs in the same + * thread, we can not control each vcpu. + */ + if (period || quota) { + if (qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) { + if (qemuSetupCgroupVcpuBW(cgroup, period, quota) < 0) + goto cleanup; + } + } + return 0; + }
I found a problem above. In the case where we are controlling quota at the domain level cgroup we must multiply the user-specified quota by the number of vcpus in the domain in order to get the same performance as we would with per-vcpu cgroups. As written, the vm will be essentially capped at 1 vcpu worth of quota regardless of the number of vcpus. You will also have to apply this logic in reverse when reporting the scheduler statistics so that the quota number is a per-vcpu quantity.
When quota is 1000 and per-vcpu threads are not active, we can start the vm successfully. When per-vcpu threads are active and the number of vcpus is more than 1, we can not start the vm if we multiply the user-specified quota. It will confuse the user: sometimes the vm can be started, but sometimes it can not be started with the same configuration.

On 07/18/2011 07:51 PM, Wen Congyang wrote:
At 07/19/2011 04:59 AM, Adam Litke wrote:
On 07/18/2011 04:42 AM, Wen Congyang wrote:
+int qemuSetupCgroupForVcpu(struct qemud_driver *driver, virDomainObjPtr vm) +{ + virCgroupPtr cgroup = NULL; + virCgroupPtr cgroup_vcpu = NULL; + qemuDomainObjPrivatePtr priv = vm->privateData; + int rc; + unsigned int i; + unsigned long long period = vm->def->cputune.period; + long long quota = vm->def->cputune.quota; + + if (driver->cgroup == NULL) + return 0; /* Not supported, so claim success */ + + rc = virCgroupForDomain(driver->cgroup, vm->def->name, &cgroup, 0); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to find cgroup for %s"), + vm->def->name); + goto cleanup; + } + + if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { + /* If we does not know VCPU<->PID mapping or all vcpu runs in the same + * thread, we can not control each vcpu. + */ + if (period || quota) { + if (qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) { + if (qemuSetupCgroupVcpuBW(cgroup, period, quota) < 0) + goto cleanup; + } + } + return 0; + }
I found a problem above. In the case where we are controlling quota at the domain level cgroup we must multiply the user-specified quota by the number of vcpus in the domain in order to get the same performance as we would with per-vcpu cgroups. As written, the vm will be essentially capped at 1 vcpu worth of quota regardless of the number of vcpus. You will also have to apply this logic in reverse when reporting the scheduler statistics so that the quota number is a per-vcpu quantity.
When quota is 1000 and per-vcpu threads are not active, we can start the vm successfully. When per-vcpu threads are active and the number of vcpus is more than 1, we can not start the vm if we multiply the user-specified quota. It will confuse the user: sometimes the vm can be started, but sometimes it can not be started with the same configuration.
I am not sure I understand what you mean. When vcpu threads are active, the patches work correctly. It is only when you disable vcpu threads that you need to multiply the quota by the number of vcpus (since you are now applying it globally). A 4 vcpu guest that is started using an emulator with vcpu threads active will get 4 times the cpu bandwidth as compared to starting the identical configuration using an emulator without vcpu threads. This is because you currently apply the same quota setting to the full process as you were applying to a single vcpu.

I know that what I am asking for is confusing at first. The quota value in a domain XML may not match up with the value actually written to the cgroup filesystem. The same applies for the schedinfo API vs. cgroupfs. However, my suggestion will result in quotas that match user expectation. For a 4 vcpu guest with 50% cpu quota, it is more logical to set period=500000,quota=250000 without having to know if my qemu supports vcpu threads.

For example, to limit a guest to 50% CPU we would have these settings (when period == 500000):

                    1 VCPU    2 VCPU    4 VCPU    8 VCPU
  VCPU-threads ON   250000    250000    250000    250000
  VCPU-threads OFF  250000    500000    1000000   2000000

With VCPU threads on, the value is applied to each VCPU whereas with VCPU threads off it is applied globally. This will yield roughly equivalent performance regardless of whether the underlying qemu process enables vcpu threads.

--
Adam Litke
IBM Linux Technology Center
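Spelling out the 4 VCPU column of that table as arithmetic (same assumptions as above: period == 500000, target of 50%):

  VCPU-threads ON : each of the 4 vcpu cgroups gets 250000/500000 = 0.5 of a
                    host CPU, so the guest can use 4 x 0.5 = 2 host CPUs,
                    i.e. 50% of its 4 vcpus.
  VCPU-threads OFF: the single process-wide cgroup gets 1000000/500000 = 2
                    host CPUs, which is again 50% of 4 vcpus.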

At 07/19/2011 09:27 PM, Adam Litke wrote:
On 07/18/2011 07:51 PM, Wen Congyang wrote:
At 07/19/2011 04:59 AM, Adam Litke wrote:
On 07/18/2011 04:42 AM, Wen Congyang wrote:
+int qemuSetupCgroupForVcpu(struct qemud_driver *driver, virDomainObjPtr vm) +{ + virCgroupPtr cgroup = NULL; + virCgroupPtr cgroup_vcpu = NULL; + qemuDomainObjPrivatePtr priv = vm->privateData; + int rc; + unsigned int i; + unsigned long long period = vm->def->cputune.period; + long long quota = vm->def->cputune.quota; + + if (driver->cgroup == NULL) + return 0; /* Not supported, so claim success */ + + rc = virCgroupForDomain(driver->cgroup, vm->def->name, &cgroup, 0); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to find cgroup for %s"), + vm->def->name); + goto cleanup; + } + + if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { + /* If we does not know VCPU<->PID mapping or all vcpu runs in the same + * thread, we can not control each vcpu. + */ + if (period || quota) { + if (qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) { + if (qemuSetupCgroupVcpuBW(cgroup, period, quota) < 0) + goto cleanup; + } + } + return 0; + }
I found a problem above. In the case where we are controlling quota at the domain level cgroup we must multiply the user-specified quota by the number of vcpus in the domain in order to get the same performance as we would with per-vcpu cgroups. As written, the vm will be essentially capped at 1 vcpu worth of quota regardless of the number of vcpus. You will also have to apply this logic in reverse when reporting the scheduler statistics so that the quota number is a per-vcpu quantity.
When quota is 1000 and per-vcpu threads are not active, we can start the vm successfully. When per-vcpu threads are active and the number of vcpus is more than 1, we can not start the vm if we multiply the user-specified quota. It will confuse the user: sometimes the vm can be started, but sometimes it can not be started with the same configuration.
I am not sure I understand what you mean. When vcpu threads are active, the patches work correctly. It is only when you disable vcpu threads that you need to multiply the quota by the number of vcpus (since you are now applying it globally). A 4 vcpu guest that is started using an emulator with vcpu threads active will get 4 times the cpu bandwidth as compared to starting the identical configuration using an emulator without vcpu threads. This is because you currently apply the same quota setting to the full process as you were applying to a single vcpu.
I know that what I am asking for is confusing at first. The quota value in a domain XML may not match up with the value actually written to the cgroup filesystem. The same applies for the schedinfo API vs. cgroupfs. However, my suggestion will result in quotas that match user expectation. For a 4 vcpu guest with 50% cpu quota, it is more logical to set period=500000,quota=250000 without having to know if my qemu supports vcpu threads.
For example, to limit a guest to 50% CPU we would have these settings (when period == 500000):
                    1 VCPU    2 VCPU    4 VCPU    8 VCPU
  VCPU-threads ON   250000    250000    250000    250000
  VCPU-threads OFF  250000    500000    1000000   2000000
Ah, sorry, I misunderstood your comment :( I will update this patch.
With VCPU threads on, the value is applied to each VCPU whereas with VCPU threads off it is applied globally. This will yield roughly equivalent performance regardless of whether the underlying qemu process enables vcpu threads.

--- src/conf/domain_conf.c | 20 +++- src/conf/domain_conf.h | 2 + src/qemu/qemu_cgroup.c | 137 +++++++++++++++++++++++ src/qemu/qemu_cgroup.h | 4 + src/qemu/qemu_process.c | 4 + tests/qemuxml2argvdata/qemuxml2argv-cputune.xml | 2 + 6 files changed, 167 insertions(+), 2 deletions(-) diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 3c3ab39..dc579ef 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -6026,6 +6026,14 @@ static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps, &def->cputune.shares) < 0) def->cputune.shares = 0; + if (virXPathULongLong("string(./cputune/period[1])", ctxt, + &def->cputune.period) < 0) + def->cputune.period = 0; + + if (virXPathLongLong("string(./cputune/quota[1])", ctxt, + &def->cputune.quota) < 0) + def->cputune.quota = 0; + if ((n = virXPathNodeSet("./cputune/vcpupin", ctxt, &nodes)) < 0) { goto error; } @@ -9727,12 +9735,19 @@ virDomainDefFormatInternal(virDomainDefPtr def, virBufferAsprintf(&buf, " current='%u'", def->vcpus); virBufferAsprintf(&buf, ">%u</vcpu>\n", def->maxvcpus); - if (def->cputune.shares || def->cputune.vcpupin) + if (def->cputune.shares || def->cputune.vcpupin || + def->cputune.period || def->cputune.quota) virBufferAddLit(&buf, " <cputune>\n"); if (def->cputune.shares) virBufferAsprintf(&buf, " <shares>%lu</shares>\n", def->cputune.shares); + if (def->cputune.period) + virBufferAsprintf(&buf, " <period>%llu</period>\n", + def->cputune.period); + if (def->cputune.quota) + virBufferAsprintf(&buf, " <quota>%lld</quota>\n", + def->cputune.quota); if (def->cputune.vcpupin) { int i; for (i = 0; i < def->cputune.nvcpupin; i++) { @@ -9754,7 +9769,8 @@ virDomainDefFormatInternal(virDomainDefPtr def, } } - if (def->cputune.shares || def->cputune.vcpupin) + if (def->cputune.shares || def->cputune.vcpupin || + def->cputune.period || def->cputune.quota) virBufferAddLit(&buf, " </cputune>\n"); if (def->numatune.memory.nodemask) diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index 7a3d72b..fc7668d 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -1190,6 +1190,8 @@ struct _virDomainDef { struct { unsigned long shares; + unsigned long long period; + long long quota; int nvcpupin; virDomainVcpuPinDefPtr *vcpupin; } cputune; diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index b357852..bb1c1c8 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -24,6 +24,7 @@ #include <config.h> #include "qemu_cgroup.h" +#include "qemu_domain.h" #include "cgroup.h" #include "logging.h" #include "memory.h" @@ -376,6 +377,142 @@ cleanup: return -1; } +int qemuSetupCgroupVcpuBW(virCgroupPtr cgroup, unsigned long long period, + long long quota) +{ + int rc; + unsigned long long old_period; + + if (period == 0 && quota == 0) + return 0; + + if (period) { + /* get old period, and we can rollback if set quota failed */ + rc = virCgroupGetCpuCfsPeriod(cgroup, &old_period); + if (rc < 0) { + virReportSystemError(-rc, + _("%s"), "Unable to get cpu bandwidth period"); + return -1; + } + + rc = virCgroupSetCpuCfsPeriod(cgroup, period); + if (rc < 0) { + virReportSystemError(-rc, + _("%s"), "Unable to set cpu bandwidth period"); + return -1; + } + } + + if (quota) { + rc = virCgroupSetCpuCfsQuota(cgroup, quota); + if (rc < 0) { + virReportSystemError(-rc, + _("%s"), "Unable to set cpu bandwidth quota"); + goto cleanup; + } + } + + return 0; + +cleanup: + if (period) { + rc = virCgroupSetCpuCfsPeriod(cgroup, old_period); + if (rc < 0) + virReportSystemError(-rc, + _("%s"), + "Unable to 
rollback cpu bandwidth period"); + } + + return -1; +} + +int qemuSetupCgroupForVcpu(struct qemud_driver *driver, virDomainObjPtr vm) +{ + virCgroupPtr cgroup = NULL; + virCgroupPtr cgroup_vcpu = NULL; + qemuDomainObjPrivatePtr priv = vm->privateData; + int rc; + unsigned int i; + unsigned long long period = vm->def->cputune.period; + long long quota = vm->def->cputune.quota; + + if (driver->cgroup == NULL) + return 0; /* Not supported, so claim success */ + + rc = virCgroupForDomain(driver->cgroup, vm->def->name, &cgroup, 0); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to find cgroup for %s"), + vm->def->name); + goto cleanup; + } + + if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { + /* If we does not know VCPU<->PID mapping or all vcpu runs in the same + * thread, we can not control each vcpu. + */ + if (period || quota) { + if (qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) { + /* Ensure that we can multiply by vcpus without overflowing. */ + if (quota > LLONG_MAX / vm->def->vcpus) { + virReportSystemError(EINVAL, + _("%s"), + "Unable to set cpu bandwidth quota"); + goto cleanup; + } + + if (quota > 0) + quota *= vm->def->vcpus; + if (qemuSetupCgroupVcpuBW(cgroup, period, quota) < 0) + goto cleanup; + } + } + return 0; + } + + for (i = 0; i < priv->nvcpupids; i++) { + rc = virCgroupForVcpu(cgroup, i, &cgroup_vcpu, 1); + if (rc < 0) { + virReportSystemError(-rc, + _("Unable to create vcpu cgroup for %s(vcpu:" + " %d)"), + vm->def->name, i); + goto cleanup; + } + + /* move the thread for vcpu to sub dir */ + rc = virCgroupAddTask(cgroup_vcpu, priv->vcpupids[i]); + if (rc < 0) { + virReportSystemError(-rc, + _("unable to add vcpu %d task %d to cgroup"), + i, priv->vcpupids[i]); + goto cleanup; + } + + if (period || quota) { + if (qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) { + if (qemuSetupCgroupVcpuBW(cgroup_vcpu, period, quota) < 0) + goto cleanup; + } + } + + virCgroupFree(&cgroup_vcpu); + } + + virCgroupFree(&cgroup_vcpu); + virCgroupFree(&cgroup); + return 0; + +cleanup: + virCgroupFree(&cgroup_vcpu); + if (cgroup) { + virCgroupRemove(cgroup); + virCgroupFree(&cgroup); + } + + return -1; +} + int qemuRemoveCgroup(struct qemud_driver *driver, virDomainObjPtr vm, diff --git a/src/qemu/qemu_cgroup.h b/src/qemu/qemu_cgroup.h index e8abfb4..17164d9 100644 --- a/src/qemu/qemu_cgroup.h +++ b/src/qemu/qemu_cgroup.h @@ -49,6 +49,10 @@ int qemuSetupHostUsbDeviceCgroup(usbDevice *dev, void *opaque); int qemuSetupCgroup(struct qemud_driver *driver, virDomainObjPtr vm); +int qemuSetupCgroupVcpuBW(virCgroupPtr cgroup, + unsigned long long period, + long long quota); +int qemuSetupCgroupForVcpu(struct qemud_driver *driver, virDomainObjPtr vm); int qemuRemoveCgroup(struct qemud_driver *driver, virDomainObjPtr vm, int quiet); diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 448b06e..a1fbe06 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -2789,6 +2789,10 @@ int qemuProcessStart(virConnectPtr conn, if (qemuProcessDetectVcpuPIDs(driver, vm) < 0) goto cleanup; + VIR_DEBUG("Setting cgroup for each VCPU(if required)"); + if (qemuSetupCgroupForVcpu(driver, vm) < 0) + goto cleanup; + VIR_DEBUG("Setting VCPU affinities"); if (qemuProcessSetVcpuAffinites(conn, vm) < 0) goto cleanup; diff --git a/tests/qemuxml2argvdata/qemuxml2argv-cputune.xml b/tests/qemuxml2argvdata/qemuxml2argv-cputune.xml index 0afbadb..091865a 100644 --- a/tests/qemuxml2argvdata/qemuxml2argv-cputune.xml +++ 
b/tests/qemuxml2argvdata/qemuxml2argv-cputune.xml @@ -6,6 +6,8 @@ <vcpu>2</vcpu> <cputune> <shares>2048</shares> + <period>1000000</period> + <quota>-1</quota> <vcpupin vcpu='0' cpuset='0'/> <vcpupin vcpu='1' cpuset='1'/> </cputune> -- 1.7.1

--- src/qemu/qemu_driver.c | 259 +++++++++++++++++++++++++++++++++++++++++++----- 1 files changed, 234 insertions(+), 25 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 8d54e58..c5d0e05 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -5111,6 +5111,7 @@ static char *qemuGetSchedulerType(virDomainPtr dom, { struct qemud_driver *driver = dom->conn->privateData; char *ret = NULL; + char *cfs_period_path = NULL; qemuDriverLock(driver); if (!qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) { @@ -5119,14 +5120,29 @@ static char *qemuGetSchedulerType(virDomainPtr dom, goto cleanup; } - if (nparams) - *nparams = 1; + /* check whether the host supports CFS bandwidth */ + if (virCgroupPathOfController(driver->cgroup, VIR_CGROUP_CONTROLLER_CPU, + "cpu.cfs_period_us", &cfs_period_path) < 0) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + "%s", + _("cannot get the path of cgroup CPU controller")); + goto cleanup; + } + + if (nparams) { + if (access(cfs_period_path, F_OK) < 0) { + *nparams = 1; + } else { + *nparams = 3; + } + } ret = strdup("posix"); if (!ret) virReportOOMError(); cleanup: + VIR_FREE(cfs_period_path); qemuDriverUnlock(driver); return ret; } @@ -5753,6 +5769,48 @@ cleanup: return ret; } +static int +qemuSetVcpusBWLive(virDomainObjPtr vm, virCgroupPtr cgroup, + unsigned long long period, long long quota) +{ + int i; + qemuDomainObjPrivatePtr priv = vm->privateData; + virCgroupPtr cgroup_vcpu = NULL; + int rc; + + if (period == 0 && quota == 0) + return 0; + + if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { + /* If we does not know VCPU<->PID mapping or all vcpu runs in the same + * thread, we can not control each vcpu. + */ + return qemuSetupCgroupVcpuBW(cgroup, period, quota); + } + + for (i = 0; i < priv->nvcpupids; i++) { + rc = virCgroupForVcpu(cgroup, i, &cgroup_vcpu, 0); + if (rc < 0) { + virReportSystemError(-rc, + _("Unable to find vcpu cgroup for %s(vcpu:" + " %d)"), + vm->def->name, i); + goto cleanup; + } + + if (qemuSetupCgroupVcpuBW(cgroup_vcpu, period, quota) < 0) + goto cleanup; + + virCgroupFree(&cgroup_vcpu); + } + + return 0; + +cleanup: + virCgroupFree(&cgroup_vcpu); + return -1; +} + static int qemuSetSchedulerParametersFlags(virDomainPtr dom, virTypedParameterPtr params, int nparams, @@ -5762,9 +5820,10 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom, int i; virCgroupPtr group = NULL; virDomainObjPtr vm = NULL; - virDomainDefPtr persistentDef = NULL; + virDomainDefPtr vmdef = NULL; int ret = -1; bool isActive; + int rc; virCheckFlags(VIR_DOMAIN_AFFECT_LIVE | VIR_DOMAIN_AFFECT_CONFIG, -1); @@ -5788,10 +5847,17 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom, flags = VIR_DOMAIN_AFFECT_CONFIG; } - if ((flags & VIR_DOMAIN_AFFECT_CONFIG) && !vm->persistent) { - qemuReportError(VIR_ERR_OPERATION_INVALID, "%s", - _("cannot change persistent config of a transient domain")); - goto cleanup; + if (flags & VIR_DOMAIN_AFFECT_CONFIG) { + if (!vm->persistent) { + qemuReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("cannot change persistent config of a transient domain")); + goto cleanup; + } + + /* Make a copy for updated domain. 
*/ + vmdef = virDomainObjCopyPersistentDef(driver->caps, vm); + if (!vmdef) + goto cleanup; } if (flags & VIR_DOMAIN_AFFECT_LIVE) { @@ -5818,7 +5884,6 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom, virTypedParameterPtr param = ¶ms[i]; if (STREQ(param->field, "cpu_shares")) { - int rc; if (param->type != VIR_TYPED_PARAM_ULLONG) { qemuReportError(VIR_ERR_INVALID_ARG, "%s", _("invalid type for cpu_shares tunable, expected a 'ullong'")); @@ -5837,19 +5902,47 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom, } if (flags & VIR_DOMAIN_AFFECT_CONFIG) { - persistentDef = virDomainObjGetPersistentDef(driver->caps, vm); - if (!persistentDef) { - qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("can't get persistentDef")); + vmdef->cputune.shares = params[i].value.ul; + } + } else if (STREQ(param->field, "cfs_period")) { + if (param->type != VIR_TYPED_PARAM_ULLONG) { + qemuReportError(VIR_ERR_INVALID_ARG, "%s", + _("invalid type for cfs_period tunable," + " expected a 'ullong'")); + goto cleanup; + } + + if (flags & VIR_DOMAIN_AFFECT_LIVE) { + rc = qemuSetVcpusBWLive(vm, group, params[i].value.ul, 0); + if (rc != 0) goto cleanup; - } - persistentDef->cputune.shares = params[i].value.ul; - rc = virDomainSaveConfig(driver->configDir, persistentDef); - if (rc) { - qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("can't save config")); + + if (params[i].value.ul) + vm->def->cputune.period = params[i].value.ul; + } + + if (flags & VIR_DOMAIN_AFFECT_CONFIG) { + vmdef->cputune.period = params[i].value.ul; + } + } else if (STREQ(param->field, "cfs_quota")) { + if (param->type != VIR_TYPED_PARAM_LLONG) { + qemuReportError(VIR_ERR_INVALID_ARG, "%s", + _("invalid type for cfs_quota tunable," + " expected a 'llong'")); + goto cleanup; + } + + if (flags & VIR_DOMAIN_AFFECT_LIVE) { + rc = qemuSetVcpusBWLive(vm, group, 0, params[i].value.l); + if (rc != 0) goto cleanup; - } + + if (params[i].value.l) + vm->def->cputune.quota = params[i].value.l; + } + + if (flags & VIR_DOMAIN_AFFECT_CONFIG) { + vmdef->cputune.quota = params[i].value.l; } } else { qemuReportError(VIR_ERR_INVALID_ARG, @@ -5858,9 +5951,23 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom, } } + if (virDomainSaveStatus(driver->caps, driver->stateDir, vm) < 0) + goto cleanup; + + + if (flags & VIR_DOMAIN_AFFECT_CONFIG) { + rc = virDomainSaveConfig(driver->configDir, vmdef); + if (rc < 0) + goto cleanup; + + virDomainObjAssignDef(vm, vmdef, false); + vmdef = NULL; + } + ret = 0; cleanup: + virDomainDefFree(vmdef); virCgroupFree(&group); if (vm) virDomainObjUnlock(vm); @@ -5879,6 +5986,69 @@ static int qemuSetSchedulerParameters(virDomainPtr dom, } static int +qemuGetVcpuBWLive(virCgroupPtr cgroup, unsigned long long *period, + long long *quota) +{ + int rc; + + rc = virCgroupGetCpuCfsPeriod(cgroup, period); + if (rc < 0) { + virReportSystemError(-rc, "%s", + _("unable to get cpu bandwidth period tunable")); + return -1; + } + + rc = virCgroupGetCpuCfsQuota(cgroup, quota); + if (rc < 0) { + virReportSystemError(-rc, "%s", + _("unable to get cpu bandwidth tunable")); + return -1; + } + + return 0; +} + +static int +qemuGetVcpusBWLive(virDomainObjPtr vm, virCgroupPtr cgroup, + unsigned long long *period, long long *quota) +{ + virCgroupPtr cgroup_vcpu = NULL; + qemuDomainObjPrivatePtr priv = NULL; + int rc; + int ret = -1; + + priv = vm->privateData; + if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { + /* We do not create sub dir for each vcpu */ + rc = qemuGetVcpuBWLive(cgroup, period, quota); + if 
(rc < 0) + goto cleanup; + + goto out; + } + + /* get period and quota for vcpu0 */ + rc = virCgroupForVcpu(cgroup, 0, &cgroup_vcpu, 0); + if (!cgroup_vcpu) { + virReportSystemError(-rc, + _("Unable to find vcpu cgroup for %s(vcpu: 0)"), + vm->def->name); + goto cleanup; + } + + rc = qemuGetVcpuBWLive(cgroup_vcpu, period, quota); + if (rc < 0) + goto cleanup; + +out: + ret = 0; + +cleanup: + virCgroupFree(&cgroup_vcpu); + return ret; +} + +static int qemuGetSchedulerParametersFlags(virDomainPtr dom, virTypedParameterPtr params, int *nparams, @@ -5887,7 +6057,9 @@ qemuGetSchedulerParametersFlags(virDomainPtr dom, struct qemud_driver *driver = dom->conn->privateData; virCgroupPtr group = NULL; virDomainObjPtr vm = NULL; - unsigned long long val; + unsigned long long shares; + unsigned long long period; + long long quota; int ret = -1; int rc; bool isActive; @@ -5943,9 +6115,17 @@ qemuGetSchedulerParametersFlags(virDomainPtr dom, _("can't get persistentDef")); goto cleanup; } - val = persistentDef->cputune.shares; + shares = persistentDef->cputune.shares; + if (*nparams > 1) { + period = persistentDef->cputune.period; + quota = persistentDef->cputune.quota; + } } else { - val = vm->def->cputune.shares; + shares = vm->def->cputune.shares; + if (*nparams > 1) { + period = vm->def->cputune.period; + quota = vm->def->cputune.quota; + } } goto out; } @@ -5968,14 +6148,20 @@ qemuGetSchedulerParametersFlags(virDomainPtr dom, goto cleanup; } - rc = virCgroupGetCpuShares(group, &val); + rc = virCgroupGetCpuShares(group, &shares); if (rc != 0) { virReportSystemError(-rc, "%s", _("unable to get cpu shares tunable")); goto cleanup; } + + if (*nparams > 1) { + rc = qemuGetVcpusBWLive(vm, group, &period, "a); + if (rc != 0) + goto cleanup; + } out: - params[0].value.ul = val; + params[0].value.ul = shares; params[0].type = VIR_TYPED_PARAM_ULLONG; if (virStrcpyStatic(params[0].field, "cpu_shares") == NULL) { qemuReportError(VIR_ERR_INTERNAL_ERROR, @@ -5983,7 +6169,30 @@ out: goto cleanup; } - *nparams = 1; + if (*nparams > 1) { + params[1].value.ul = period; + params[1].type = VIR_TYPED_PARAM_ULLONG; + if (virStrcpyStatic(params[1].field, "cfs_period") == NULL) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + "%s", + _("Field cfs_period too long for destination")); + goto cleanup; + } + + params[2].value.ul = quota; + params[2].type = VIR_TYPED_PARAM_LLONG; + if (virStrcpyStatic(params[2].field, "cfs_quota") == NULL) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + "%s", + _("Field cfs_quota too long for destination")); + goto cleanup; + } + + *nparams = 3; + } else { + *nparams = 1; + } + ret = 0; cleanup: -- 1.7.1

On 07/18/2011 04:42 AM, Wen Congyang wrote:
@@ -5983,7 +6169,30 @@ out:
         goto cleanup;
     }

-    *nparams = 1;
+    if (*nparams > 1) {
+        params[1].value.ul = period;
+        params[1].type = VIR_TYPED_PARAM_ULLONG;
+        if (virStrcpyStatic(params[1].field, "cfs_period") == NULL) {
+            qemuReportError(VIR_ERR_INTERNAL_ERROR,
+                            "%s",
+                            _("Field cfs_period too long for destination"));
+            goto cleanup;
+        }
+
+        params[2].value.ul = quota;
Possible buffer overflow if *nparams == 2 ...
+        params[2].type = VIR_TYPED_PARAM_LLONG;
+        if (virStrcpyStatic(params[2].field, "cfs_quota") == NULL) {
+            qemuReportError(VIR_ERR_INTERNAL_ERROR,
+                            "%s",
+                            _("Field cfs_quota too long for destination"));
+            goto cleanup;
+        }
+
+        *nparams = 3;
+    } else {
+        *nparams = 1;
+    }
+
     ret = 0;
cleanup:
--
Adam Litke
IBM Linux Technology Center
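A hedged sketch of the bounds check this comment asks for, written against the quoted hunk above; whether the extra fields are silently skipped or reported as an error is a design choice left to the author, and error reporting is elided here for brevity. Note also that quota is a long long, so params[2].value.l looks more appropriate than .ul.

    if (*nparams > 1) {
        params[1].value.ul = period;
        params[1].type = VIR_TYPED_PARAM_ULLONG;
        if (virStrcpyStatic(params[1].field, "cfs_period") == NULL)
            goto cleanup;
    }

    if (*nparams > 2) {
        /* only touch params[2] when the caller provided room for it */
        params[2].value.l = quota;
        params[2].type = VIR_TYPED_PARAM_LLONG;
        if (virStrcpyStatic(params[2].field, "cfs_quota") == NULL)
            goto cleanup;
    }

    /* report how many of the caller's slots were actually filled */
    if (*nparams > 3)
        *nparams = 3;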

At 07/19/2011 04:44 AM, Adam Litke wrote:
On 07/18/2011 04:42 AM, Wen Congyang wrote:
@@ -5983,7 +6169,30 @@ out:
         goto cleanup;
     }

-    *nparams = 1;
+    if (*nparams > 1) {
+        params[1].value.ul = period;
+        params[1].type = VIR_TYPED_PARAM_ULLONG;
+        if (virStrcpyStatic(params[1].field, "cfs_period") == NULL) {
+            qemuReportError(VIR_ERR_INTERNAL_ERROR,
+                            "%s",
+                            _("Field cfs_period too long for destination"));
+            goto cleanup;
+        }
+
+        params[2].value.ul = quota;
Possible buffer overflow if *nparams == 2 ...
Yes, I forgot to check the value :(
+        params[2].type = VIR_TYPED_PARAM_LLONG;
+        if (virStrcpyStatic(params[2].field, "cfs_quota") == NULL) {
+            qemuReportError(VIR_ERR_INTERNAL_ERROR,
+                            "%s",
+                            _("Field cfs_quota too long for destination"));
+            goto cleanup;
+        }
+
+        *nparams = 3;
+    } else {
+        *nparams = 1;
+    }
+
     ret = 0;
cleanup:

--- src/qemu/qemu_driver.c | 312 ++++++++++++++++++++++++++++++++++++++++++++---- 1 files changed, 287 insertions(+), 25 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index cd65bce..fd80537 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -5139,11 +5139,46 @@ cleanup: } +/* + * check whether the host supports CFS bandwidth + * + * Return 1 when CFS bandwidth is supported, 0 when CFS bandwidth is not + * supported, -1 on error. + */ +static int qemuGetCpuBWStatus(virCgroupPtr cgroup) +{ + char *cfs_period_path = NULL; + int ret = -1; + + if (!cgroup) + return 0; + + if (virCgroupPathOfController(cgroup, VIR_CGROUP_CONTROLLER_CPU, + "cpu.cfs_period_us", &cfs_period_path) < 0) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + "%s", + _("cannot get the path of cgroup CPU controller")); + goto cleanup; + } + + if (access(cfs_period_path, F_OK) < 0) { + ret = 0; + } else { + ret = 1; + } + +cleanup: + VIR_FREE(cfs_period_path); + return ret; +} + + static char *qemuGetSchedulerType(virDomainPtr dom, int *nparams) { struct qemud_driver *driver = dom->conn->privateData; char *ret = NULL; + int rc; qemuDriverLock(driver); if (!qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) { @@ -5152,8 +5187,15 @@ static char *qemuGetSchedulerType(virDomainPtr dom, goto cleanup; } - if (nparams) - *nparams = 1; + if (nparams) { + rc = qemuGetCpuBWStatus(driver->cgroup); + if (rc < 0) + goto cleanup; + else if (rc == 0) + *nparams = 1; + else + *nparams = 3; + } ret = strdup("posix"); if (!ret) @@ -5786,6 +5828,58 @@ cleanup: return ret; } +static int +qemuSetVcpusBWLive(virDomainObjPtr vm, virCgroupPtr cgroup, + unsigned long long period, long long quota) +{ + int i; + qemuDomainObjPrivatePtr priv = vm->privateData; + virCgroupPtr cgroup_vcpu = NULL; + int rc; + + if (period == 0 && quota == 0) + return 0; + + if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { + /* If we does not know VCPU<->PID mapping or all vcpu runs in the same + * thread, we can not control each vcpu. + */ + /* Ensure that we can multiply by vcpus without overflowing. 
*/ + if (quota > LLONG_MAX / vm->def->vcpus) { + virReportSystemError(EINVAL, + _("%s"), + "Unable to set cpu bandwidth quota"); + goto cleanup; + } + + if (quota > 0) + quota *= vm->def->vcpus; + return qemuSetupCgroupVcpuBW(cgroup, period, quota); + } + + for (i = 0; i < priv->nvcpupids; i++) { + rc = virCgroupForVcpu(cgroup, i, &cgroup_vcpu, 0); + if (rc < 0) { + virReportSystemError(-rc, + _("Unable to find vcpu cgroup for %s(vcpu:" + " %d)"), + vm->def->name, i); + goto cleanup; + } + + if (qemuSetupCgroupVcpuBW(cgroup_vcpu, period, quota) < 0) + goto cleanup; + + virCgroupFree(&cgroup_vcpu); + } + + return 0; + +cleanup: + virCgroupFree(&cgroup_vcpu); + return -1; +} + static int qemuSetSchedulerParametersFlags(virDomainPtr dom, virTypedParameterPtr params, int nparams, @@ -5795,9 +5889,10 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom, int i; virCgroupPtr group = NULL; virDomainObjPtr vm = NULL; - virDomainDefPtr persistentDef = NULL; + virDomainDefPtr vmdef = NULL; int ret = -1; bool isActive; + int rc; virCheckFlags(VIR_DOMAIN_AFFECT_LIVE | VIR_DOMAIN_AFFECT_CONFIG, -1); @@ -5821,10 +5916,17 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom, flags = VIR_DOMAIN_AFFECT_CONFIG; } - if ((flags & VIR_DOMAIN_AFFECT_CONFIG) && !vm->persistent) { - qemuReportError(VIR_ERR_OPERATION_INVALID, "%s", - _("cannot change persistent config of a transient domain")); - goto cleanup; + if (flags & VIR_DOMAIN_AFFECT_CONFIG) { + if (!vm->persistent) { + qemuReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("cannot change persistent config of a transient domain")); + goto cleanup; + } + + /* Make a copy for updated domain. */ + vmdef = virDomainObjCopyPersistentDef(driver->caps, vm); + if (!vmdef) + goto cleanup; } if (flags & VIR_DOMAIN_AFFECT_LIVE) { @@ -5851,7 +5953,6 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom, virTypedParameterPtr param = &params[i]; if (STREQ(param->field, "cpu_shares")) { - int rc; if (param->type != VIR_TYPED_PARAM_ULLONG) { qemuReportError(VIR_ERR_INVALID_ARG, "%s", _("invalid type for cpu_shares tunable, expected a 'ullong'")); goto cleanup; } @@ -5870,19 +5971,47 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom, } if (flags & VIR_DOMAIN_AFFECT_CONFIG) { - persistentDef = virDomainObjGetPersistentDef(driver->caps, vm); - if (!persistentDef) { - qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("can't get persistentDef")); + vmdef->cputune.shares = params[i].value.ul; + } + } else if (STREQ(param->field, "cfs_period")) { + if (param->type != VIR_TYPED_PARAM_ULLONG) { + qemuReportError(VIR_ERR_INVALID_ARG, "%s", + _("invalid type for cfs_period tunable," + " expected a 'ullong'")); + goto cleanup; + } + + if (flags & VIR_DOMAIN_AFFECT_LIVE) { + rc = qemuSetVcpusBWLive(vm, group, params[i].value.ul, 0); + if (rc != 0) goto cleanup; - } - persistentDef->cputune.shares = params[i].value.ul; - rc = virDomainSaveConfig(driver->configDir, persistentDef); - if (rc) { - qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("can't save config")); + + if (params[i].value.ul) + vm->def->cputune.period = params[i].value.ul; + } + + if (flags & VIR_DOMAIN_AFFECT_CONFIG) { + vmdef->cputune.period = params[i].value.ul; + } + } else if (STREQ(param->field, "cfs_quota")) { + if (param->type != VIR_TYPED_PARAM_LLONG) { + qemuReportError(VIR_ERR_INVALID_ARG, "%s", + _("invalid type for cfs_quota tunable," + " expected a 'llong'")); + goto cleanup; + } + + if (flags & VIR_DOMAIN_AFFECT_LIVE) { + rc = qemuSetVcpusBWLive(vm, group, 0, params[i].value.l); + if
(rc != 0) goto cleanup; - } + + if (params[i].value.l) + vm->def->cputune.quota = params[i].value.l; + } + + if (flags & VIR_DOMAIN_AFFECT_CONFIG) { + vmdef->cputune.quota = params[i].value.l; } } else { qemuReportError(VIR_ERR_INVALID_ARG, @@ -5891,9 +6020,23 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom, } } + if (virDomainSaveStatus(driver->caps, driver->stateDir, vm) < 0) + goto cleanup; + + + if (flags & VIR_DOMAIN_AFFECT_CONFIG) { + rc = virDomainSaveConfig(driver->configDir, vmdef); + if (rc < 0) + goto cleanup; + + virDomainObjAssignDef(vm, vmdef, false); + vmdef = NULL; + } + ret = 0; cleanup: + virDomainDefFree(vmdef); virCgroupFree(&group); if (vm) virDomainObjUnlock(vm); @@ -5912,6 +6055,71 @@ static int qemuSetSchedulerParameters(virDomainPtr dom, } static int +qemuGetVcpuBWLive(virCgroupPtr cgroup, unsigned long long *period, + long long *quota) +{ + int rc; + + rc = virCgroupGetCpuCfsPeriod(cgroup, period); + if (rc < 0) { + virReportSystemError(-rc, "%s", + _("unable to get cpu bandwidth period tunable")); + return -1; + } + + rc = virCgroupGetCpuCfsQuota(cgroup, quota); + if (rc < 0) { + virReportSystemError(-rc, "%s", + _("unable to get cpu bandwidth tunable")); + return -1; + } + + return 0; +} + +static int +qemuGetVcpusBWLive(virDomainObjPtr vm, virCgroupPtr cgroup, + unsigned long long *period, long long *quota) +{ + virCgroupPtr cgroup_vcpu = NULL; + qemuDomainObjPrivatePtr priv = NULL; + int rc; + int ret = -1; + + priv = vm->privateData; + if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { + /* We do not create sub dir for each vcpu */ + rc = qemuGetVcpuBWLive(cgroup, period, quota); + if (rc < 0) + goto cleanup; + + if (*quota > 0) + *quota /= vm->def->vcpus; + goto out; + } + + /* get period and quota for vcpu0 */ + rc = virCgroupForVcpu(cgroup, 0, &cgroup_vcpu, 0); + if (!cgroup_vcpu) { + virReportSystemError(-rc, + _("Unable to find vcpu cgroup for %s(vcpu: 0)"), + vm->def->name); + goto cleanup; + } + + rc = qemuGetVcpuBWLive(cgroup_vcpu, period, quota); + if (rc < 0) + goto cleanup; + +out: + ret = 0; + +cleanup: + virCgroupFree(&cgroup_vcpu); + return ret; +} + +static int qemuGetSchedulerParametersFlags(virDomainPtr dom, virTypedParameterPtr params, int *nparams, @@ -5920,10 +6128,14 @@ qemuGetSchedulerParametersFlags(virDomainPtr dom, struct qemud_driver *driver = dom->conn->privateData; virCgroupPtr group = NULL; virDomainObjPtr vm = NULL; - unsigned long long val; + unsigned long long shares; + unsigned long long period; + long long quota; int ret = -1; int rc; bool isActive; + bool cpu_bw_status; + int saved_nparams = 0; virCheckFlags(VIR_DOMAIN_AFFECT_LIVE | VIR_DOMAIN_AFFECT_CONFIG, -1); @@ -5943,6 +6155,13 @@ qemuGetSchedulerParametersFlags(virDomainPtr dom, goto cleanup; } + if (*nparams > 1) { + rc = qemuGetCpuBWStatus(driver->cgroup); + if (rc < 0) + goto cleanup; + cpu_bw_status = !!rc; + } + vm = virDomainFindByUUID(&driver->domains, dom->uuid); if (vm == NULL) { @@ -5976,9 +6195,17 @@ qemuGetSchedulerParametersFlags(virDomainPtr dom, _("can't get persistentDef")); goto cleanup; } - val = persistentDef->cputune.shares; + shares = persistentDef->cputune.shares; + if (*nparams > 1 && cpu_bw_status) { + period = persistentDef->cputune.period; + quota = persistentDef->cputune.quota; + } } else { - val = vm->def->cputune.shares; + shares = vm->def->cputune.shares; + if (*nparams > 1 && cpu_bw_status) { + period = vm->def->cputune.period; + quota = vm->def->cputune.quota; + } } goto out; } @@ -6001,14 +6228,20 @@ 
qemuGetSchedulerParametersFlags(virDomainPtr dom, goto cleanup; } - rc = virCgroupGetCpuShares(group, &val); + rc = virCgroupGetCpuShares(group, &shares); if (rc != 0) { virReportSystemError(-rc, "%s", _("unable to get cpu shares tunable")); goto cleanup; } + + if (*nparams > 1 && cpu_bw_status) { + rc = qemuGetVcpusBWLive(vm, group, &period, &quota); + if (rc != 0) + goto cleanup; + } out: - params[0].value.ul = val; + params[0].value.ul = shares; params[0].type = VIR_TYPED_PARAM_ULLONG; if (virStrcpyStatic(params[0].field, "cpu_shares") == NULL) { qemuReportError(VIR_ERR_INTERNAL_ERROR, @@ -6016,7 +6249,36 @@ out: goto cleanup; } - *nparams = 1; + saved_nparams++; + + if (cpu_bw_status) { + if (*nparams > saved_nparams) { + params[1].value.ul = period; + params[1].type = VIR_TYPED_PARAM_ULLONG; + if (virStrcpyStatic(params[1].field, "cfs_period") == NULL) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + "%s", + _("Field cfs_period too long for destination")); + goto cleanup; + } + saved_nparams++; + } + + if (*nparams > saved_nparams) { + params[2].value.ul = quota; + params[2].type = VIR_TYPED_PARAM_LLONG; + if (virStrcpyStatic(params[2].field, "cfs_quota") == NULL) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + "%s", + _("Field cfs_quota too long for destination")); + goto cleanup; + } + saved_nparams++; + } + } + + *nparams = saved_nparams; + ret = 0; cleanup: -- 1.7.1
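A rough sketch of how a management application might exercise the new tunables through the existing public API (the field names "cfs_period" and "cfs_quota" come from the patch above; the connection URI, domain name and values are placeholders):

#include <stdio.h>
#include <string.h>
#include <libvirt/libvirt.h>

int main(void)
{
    virConnectPtr conn;
    virDomainPtr dom = NULL;
    virTypedParameter params[2];
    int ret = 1;

    conn = virConnectOpen("qemu:///system");
    if (!conn)
        return 1;

    dom = virDomainLookupByName(conn, "demo");   /* placeholder domain name */
    if (!dom)
        goto out;

    memset(params, 0, sizeof(params));

    /* allow each vcpu 50ms of runtime per 100ms period, i.e. 50% of one pCPU */
    strncpy(params[0].field, "cfs_period", sizeof(params[0].field) - 1);
    params[0].type = VIR_TYPED_PARAM_ULLONG;
    params[0].value.ul = 100000;

    strncpy(params[1].field, "cfs_quota", sizeof(params[1].field) - 1);
    params[1].type = VIR_TYPED_PARAM_LLONG;
    params[1].value.l = 50000;

    if (virDomainSetSchedulerParametersFlags(dom, params, 2,
                                             VIR_DOMAIN_AFFECT_LIVE) < 0) {
        fprintf(stderr, "failed to set CFS bandwidth\n");
        goto out;
    }
    ret = 0;

out:
    if (dom)
        virDomainFree(dom);
    virConnectClose(conn);
    return ret;
}

Since the getter now reports the same fields, virsh schedinfo on such a host should list cpu_shares, cfs_period and cfs_quota once the driver detects cpu.cfs_period_us.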

--- docs/formatdomain.html.in | 19 +++++++++++++++++++ 1 files changed, 19 insertions(+), 0 deletions(-) diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in index 269fc30..d388332 100644 --- a/docs/formatdomain.html.in +++ b/docs/formatdomain.html.in @@ -307,6 +307,8 @@ <vcpupin vcpu="2" cpuset="2,3"/> <vcpupin vcpu="3" cpuset="0,4"/> <shares>2048</shares> + <period>1000000</period> + <quota>-1</quota> </cputune> <numatune> <memory mode="strict" nodeset="1-4,^3"/> @@ -400,6 +402,23 @@ 2048 will get twice as much CPU time as a VM configured with value 1024. <span class="since">Since 0.9.0</span> </dd> + <dt><code>period</code></dt> + <dd> + The optional <code>period</code> element specifies the enforcement + interval(unit: microseconds). Within <code>period</code>, the domain + will not be allowed to consume more than <code>quota</code> worth of + runtime. The value should be in range [1000, 1000000]. + <span class="since">Since 0.9.4</span> + </dd> + <dt><code>quota</code></dt> + <dd> + The optional <code>quota</code> element specifies the maximum allowed + bandwidth(unit: microseconds). A domain with <code>quota</code> as any + negative value indicates that the domain has infinite bandwidth, which + means that it is not bandwidth controlled. The value should be in range + [1000, 18446744073709551] or less than 0. + <span class="since">Since 0.9.4</span> + </dd> <dt><code>numatune</code></dt> <dd> The optional <code>numatune</code> element provides details of -- 1.7.1

On 07/18/2011 04:43 AM, Wen Congyang wrote:
--- docs/formatdomain.html.in | 19 +++++++++++++++++++ 1 files changed, 19 insertions(+), 0 deletions(-)
diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in index 269fc30..d388332 100644 --- a/docs/formatdomain.html.in +++ b/docs/formatdomain.html.in @@ -307,6 +307,8 @@ <vcpupin vcpu="2" cpuset="2,3"/> <vcpupin vcpu="3" cpuset="0,4"/> <shares>2048</shares> + <period>1000000</period> + <quota>-1</quota> </cputune> <numatune> <memory mode="strict" nodeset="1-4,^3"/> @@ -400,6 +402,23 @@ 2048 will get twice as much CPU time as a VM configured with value 1024. <span class="since">Since 0.9.0</span> </dd> + <dt><code>period</code></dt> + <dd> + The optional <code>period</code> element specifies the enforcement + interval(unit: microseconds). Within <code>period</code>, the domain + will not be allowed to consume more than <code>quota</code> worth of + runtime. The value should be in range [1000, 1000000]. + <span class="since">Since 0.9.4</span> + </dd>
This should be changed to read "each vcpu will not be allowed to consume more than ...".
+ <dt><code>quota</code></dt> + <dd> + The optional <code>quota</code> element specifies the maximum allowed + bandwidth(unit: microseconds). A domain with <code>quota</code> as any + negative value indicates that the domain has infinite bandwidth, which + means that it is not bandwidth controlled. The value should be in range + [1000, 18446744073709551] or less than 0. + <span class="since">Since 0.9.4</span> + </dd> <dt><code>numatune</code></dt> <dd> The optional <code>numatune</code> element provides details of
-- Adam Litke IBM Linux Technology Center

--- docs/formatdomain.html.in | 19 +++++++++++++++++++ 1 files changed, 19 insertions(+), 0 deletions(-) diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in index a54ee6a..47edd35 100644 --- a/docs/formatdomain.html.in +++ b/docs/formatdomain.html.in @@ -307,6 +307,8 @@ <vcpupin vcpu="2" cpuset="2,3"/> <vcpupin vcpu="3" cpuset="0,4"/> <shares>2048</shares> + <period>1000000</period> + <quota>-1</quota> </cputune> <numatune> <memory mode="strict" nodeset="1-4,^3"/> @@ -400,6 +402,23 @@ 2048 will get twice as much CPU time as a VM configured with value 1024. <span class="since">Since 0.9.0</span> </dd> + <dt><code>period</code></dt> + <dd> + The optional <code>period</code> element specifies the enforcement + interval(unit: microseconds). Within <code>period</code>, each vcpu of + the domain will not be allowed to consume more than <code>quota</code> + worth of runtime. The value should be in range [1000, 1000000]. + <span class="since">Since 0.9.4</span> + </dd> + <dt><code>quota</code></dt> + <dd> + The optional <code>quota</code> element specifies the maximum allowed + bandwidth(unit: microseconds). A domain with <code>quota</code> as any + negative value indicates that the domain has infinite bandwidth, which + means that it is not bandwidth controlled. The value should be in range + [1000, 18446744073709551] or less than 0. + <span class="since">Since 0.9.4</span> + </dd> <dt><code>numatune</code></dt> <dd> The optional <code>numatune</code> element provides details of -- 1.7.1
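As a worked example of the per-vcpu semantics described above (values are only illustrative), a 4-vcpu guest configured with

  <vcpu>4</vcpu>
  <cputune>
    <period>100000</period>
    <quota>50000</quota>
  </cputune>

lets each vcpu run for at most 50ms in every 100ms interval, i.e. each vcpu is capped at half a physical CPU and the guest as a whole at roughly two CPUs' worth of runtime; setting quota to any negative value removes the cap.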

On Mon, 2011-07-18 at 17:34 +0800, Wen Congyang wrote:
TODO: 1. We create sub directory for each vcpu in cpu subsystem. So we should recalculate cpu.shares for each vcpu.
Is the per vcpu cgroup optional? I.e., is it possible to set the period and quota for the entire domain and let the host scheduler deal with it? Caveat: Domain level CFS shares seem to work well--"work well" here means "behaves as I expected" ;-). I have no experience with the period/quota facility and libvirt domains, so maybe it doesn't make sense to cap cpu utilization at the domain level. Regards, Lee
Changelog: v3: fix some small bugs implement the simple way v2: almost rewrite the patchset to support to control each vcpu's bandwidth. Limit quota to [-1, 2^64/1000] at the schemas level. We will check it at cgroup level.
Wen Congyang (6): Introduce the function virCgroupForVcpu cgroup: Implement cpu.cfs_period_us and cpu.cfs_quota_us tuning API Update XML Schema for new entries qemu: Implement period and quota tunable XML configuration and parsing qemu: Implement cfs_period and cfs_quota's modification doc: Add documentation for new cputune elements period and quota
docs/formatdomain.html.in | 19 ++ docs/schemas/domain.rng | 26 +++- src/conf/domain_conf.c | 20 ++- src/conf/domain_conf.h | 2 + src/libvirt_private.syms | 5 + src/qemu/qemu_cgroup.c | 127 +++++++++++ src/qemu/qemu_cgroup.h | 4 + src/qemu/qemu_driver.c | 259 ++++++++++++++++++++--- src/qemu/qemu_process.c | 4 + src/util/cgroup.c | 153 +++++++++++++- src/util/cgroup.h | 11 + tests/qemuxml2argvdata/qemuxml2argv-cputune.xml | 2 + 12 files changed, 596 insertions(+), 36 deletions(-)

At 07/18/2011 09:36 PM, Lee Schermerhorn wrote:
On Mon, 2011-07-18 at 17:34 +0800, Wen Congyang wrote:
TODO: 1. We create sub directory for each vcpu in cpu subsystem. So we should recalculate cpu.shares for each vcpu.
Is the per vcpu cgroup optional? I.e., is it possible to set the period and quota for the entire domain and let the host scheduler deal with it?
Caveat: Domain level CFS shares seem to work well--"work well" here means "behaves as I expected" ;-). I have no experience with the period/quota facility and libvirt domains, so maybe it doesn't make sense to cap cpu utilization at the domain level.
Regards, Lee
The quota value means that all tasks in the task group, as a whole, will not be allowed to consume more than quota (unit: us) worth of runtime within each period (unit: us). When per-vcpu threads are active, each vcpu has its own thread, so if one vcpu consumed the whole quota the other vcpus would starve; the per-vcpu sub-groups exist to prevent that.
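To put numbers on it (illustrative values): with period=100000 and quota=100000 on a single domain-level group, all vcpu threads together get 100ms of runtime per 100ms interval, so one spinning vcpu can consume the whole allowance and leave the others with nothing. With a sub-group per vcpu, each vcpu thread gets its own 100ms per period and cannot starve its siblings.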
Changelog: v3: fix some small bugs implement the simple way v2: almost rewrite the patchset to support to control each vcpu's bandwidth. Limit quota to [-1, 2^64/1000] at the schemas level. We will check it at cgroup level.
Wen Congyang (6): Introduce the function virCgroupForVcpu cgroup: Implement cpu.cfs_period_us and cpu.cfs_quota_us tuning API Update XML Schema for new entries qemu: Implement period and quota tunable XML configuration and parsing qemu: Implement cfs_period and cfs_quota's modification doc: Add documentation for new cputune elements period and quota
docs/formatdomain.html.in | 19 ++ docs/schemas/domain.rng | 26 +++- src/conf/domain_conf.c | 20 ++- src/conf/domain_conf.h | 2 + src/libvirt_private.syms | 5 + src/qemu/qemu_cgroup.c | 127 +++++++++++ src/qemu/qemu_cgroup.h | 4 + src/qemu/qemu_driver.c | 259 ++++++++++++++++++++--- src/qemu/qemu_process.c | 4 + src/util/cgroup.c | 153 +++++++++++++- src/util/cgroup.h | 11 + tests/qemuxml2argvdata/qemuxml2argv-cputune.xml | 2 + 12 files changed, 596 insertions(+), 36 deletions(-)
Participants (3): Adam Litke, Lee Schermerhorn, Wen Congyang