[libvirt] [PATCH RFC 0/5] support cpu bandwidth in libvirt

I need this feature immediately after the CFS bandwidth patchset is merged into the upstream kernel, so I have prepared this patchset and am posting it here for review before CFS bandwidth is merged upstream.

quota is an optional attribute specified in microseconds, not a percentage of period.

TODO:
1. quota should be in the range [1000, 18446744073709551 (2^64/1000)] or less than 0, but I only limit it to be less than or equal to 2^64/1000. Does anyone have a better way to limit quota?

Wen Congyang (5):
  cgroup: Implement cpu.cfs_period_us and cpu.cfs_quota_us tuning API
  Update XML Schema for new entries
  qemu: Implement period and quota tunable XML configuration and parsing.
  qemu: Implement cfs_period and cfs_quota's modification
  doc: Add documentation for new cputune elements period and quota

 docs/formatdomain.html.in                       |   19 +++
 docs/schemas/domain.rng                         |   25 ++++-
 src/conf/domain_conf.c                          |   20 +++-
 src/conf/domain_conf.h                          |    2 +
 src/libvirt_private.syms                        |    4 +
 src/qemu/qemu_cgroup.c                          |   43 +++++-
 src/qemu/qemu_driver.c                          |  162 +++++++++++++++++++----
 src/util/cgroup.c                               |   81 +++++++++++-
 src/util/cgroup.h                               |    6 +
 tests/qemuxml2argvdata/qemuxml2argv-cputune.xml |    2 +
 10 files changed, 328 insertions(+), 36 deletions(-)
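To make the TODO concrete, the check it describes can be written as a small standalone sketch; this is illustrative only, and the helper name cfsQuotaIsValid is invented here, not part of the patchset:

#include <limits.h>
#include <stdbool.h>

/* Hypothetical helper mirroring the intended rule: any negative value is
 * accepted as "unlimited"; otherwise the quota must be at least 1000us and
 * at most ULLONG_MAX / 1000 (18446744073709551), because the kernel stores
 * the quota internally in nanoseconds. */
static bool cfsQuotaIsValid(long long quota_us)
{
    if (quota_us < 0)
        return true;                /* negative means unlimited */
    if (quota_us < 1000)
        return false;               /* less than 1ms is rejected */
    return (unsigned long long)quota_us <= ULLONG_MAX / 1000;
}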

---
 src/libvirt_private.syms |    4 ++
 src/util/cgroup.c        |   81 ++++++++++++++++++++++++++++++++++++++++++++--
 src/util/cgroup.h        |    6 +++
 3 files changed, 88 insertions(+), 3 deletions(-)

diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
index ab110a3..11cd3e3 100644
--- a/src/libvirt_private.syms
+++ b/src/libvirt_private.syms
@@ -70,6 +70,8 @@ virCgroupForDriver;
 virCgroupFree;
 virCgroupGetBlkioWeight;
 virCgroupGetCpuShares;
+virCgroupGetCpuCfsPeriod;
+virCgroupGetCpuCfsQuota;
 virCgroupGetCpuacctUsage;
 virCgroupGetFreezerState;
 virCgroupGetMemoryHardLimit;
@@ -84,6 +86,8 @@ virCgroupPathOfController;
 virCgroupRemove;
 virCgroupSetBlkioWeight;
 virCgroupSetCpuShares;
+virCgroupSetCpuCfsPeriod;
+virCgroupSetCpuCfsQuota;
 virCgroupSetFreezerState;
 virCgroupSetMemory;
 virCgroupSetMemoryHardLimit;
diff --git a/src/util/cgroup.c b/src/util/cgroup.c
index 2e5ef46..6f9eb1f 100644
--- a/src/util/cgroup.c
+++ b/src/util/cgroup.c
@@ -388,8 +388,6 @@ static int virCgroupSetValueI64(virCgroupPtr group,
     return rc;
 }
 
-#if 0
-/* This is included for completeness, but not yet used */
 static int virCgroupGetValueI64(virCgroupPtr group,
                                 int controller,
                                 const char *key,
@@ -409,7 +407,6 @@ out:
 
     return rc;
 }
-#endif
 
 static int virCgroupGetValueU64(virCgroupPtr group,
                                 int controller,
@@ -1322,6 +1319,84 @@ int virCgroupGetCpuShares(virCgroupPtr group, unsigned long long *shares)
                                 "cpu.shares", shares);
 }
 
+/**
+ * virCgroupSetCpuCfsPeriod:
+ *
+ * @group: The cgroup to change cpu.cfs_period_us for
+ * @cfs_period: The bandwidth period in usecs
+ *
+ * Returns: 0 on success
+ */
+int virCgroupSetCpuCfsPeriod(virCgroupPtr group, unsigned long long cfs_period)
+{
+    /* The cfs_period should be greater than or equal to 1ms, and less than
+     * or equal to 1s.
+     */
+    if (cfs_period < 1000 || cfs_period > 1000000)
+        return -EINVAL;
+
+    return virCgroupSetValueU64(group,
+                                VIR_CGROUP_CONTROLLER_CPU,
+                                "cpu.cfs_period_us", cfs_period);
+}
+
+/**
+ * virCgroupGetCpuCfsPeriod:
+ *
+ * @group: The cgroup to get cpu.cfs_period_us for
+ * @cfs_period: Pointer to the returned bandwidth period in usecs
+ *
+ * Returns: 0 on success
+ */
+int virCgroupGetCpuCfsPeriod(virCgroupPtr group, unsigned long long *cfs_period)
+{
+    return virCgroupGetValueU64(group,
+                                VIR_CGROUP_CONTROLLER_CPU,
+                                "cpu.cfs_period_us", cfs_period);
+}
+
+/**
+ * virCgroupSetCpuCfsQuota:
+ *
+ * @group: The cgroup to change cpu.cfs_quota_us for
+ * @cfs_quota: the cpu bandwidth (in usecs) that this tg will be allowed to
+ *             consume over period
+ *
+ * Returns: 0 on success
+ */
+int virCgroupSetCpuCfsQuota(virCgroupPtr group, long long cfs_quota)
+{
+    if (cfs_quota >= 0) {
+        /* The cfs_quota should be greater than or equal to 1ms */
+        if (cfs_quota < 1000)
+            return -EINVAL;
+
+        /* check overflow */
+        if (cfs_quota > (unsigned long long)~0ULL / 1000)
+            return -EINVAL;
+    }
+
+    return virCgroupSetValueI64(group,
+                                VIR_CGROUP_CONTROLLER_CPU,
+                                "cpu.cfs_quota_us", cfs_quota);
+}
+
+/**
+ * virCgroupGetCpuCfsQuota:
+ *
+ * @group: The cgroup to get cpu.cfs_quota_us for
+ * @cfs_quota: Pointer to the returned cpu bandwidth (in usecs) that this tg
+ *             will be allowed to consume over period
+ *
+ * Returns: 0 on success
+ */
+int virCgroupGetCpuCfsQuota(virCgroupPtr group, long long *cfs_quota)
+{
+    return virCgroupGetValueI64(group,
+                                VIR_CGROUP_CONTROLLER_CPU,
+                                "cpu.cfs_quota_us", cfs_quota);
+}
+
 int virCgroupGetCpuacctUsage(virCgroupPtr group, unsigned long long *usage)
 {
     return virCgroupGetValueU64(group,
diff --git a/src/util/cgroup.h b/src/util/cgroup.h
index 8ae756d..e794306 100644
--- a/src/util/cgroup.h
+++ b/src/util/cgroup.h
@@ -99,6 +99,12 @@ int virCgroupDenyDevicePath(virCgroupPtr group,
 int virCgroupSetCpuShares(virCgroupPtr group, unsigned long long shares);
 int virCgroupGetCpuShares(virCgroupPtr group, unsigned long long *shares);
 
+int virCgroupSetCpuCfsPeriod(virCgroupPtr group, unsigned long long cfs_period);
+int virCgroupGetCpuCfsPeriod(virCgroupPtr group, unsigned long long *cfs_period);
+
+int virCgroupSetCpuCfsQuota(virCgroupPtr group, long long cfs_quota);
+int virCgroupGetCpuCfsQuota(virCgroupPtr group, long long *cfs_quota);
+
 int virCgroupGetCpuacctUsage(virCgroupPtr group, unsigned long long *usage);
 
 int virCgroupSetFreezerState(virCgroupPtr group, const char *state);
--
1.7.1
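For reviewers, here is a rough idea of how a caller inside libvirt might drive the new helpers. This is only a sketch, assuming `group` is a valid virCgroupPtr obtained elsewhere; the function name exampleSetHalfCpu is made up for illustration:

/* Sketch: cap the group at half a CPU (50ms of runtime every 100ms),
 * then read the quota back. Negative return values are -errno, as in
 * the rest of cgroup.c. */
static int exampleSetHalfCpu(virCgroupPtr group)
{
    long long quota = 0;
    int rc;

    rc = virCgroupSetCpuCfsPeriod(group, 100000);   /* period: 100ms */
    if (rc != 0)
        return rc;

    rc = virCgroupSetCpuCfsQuota(group, 50000);     /* quota: 50ms */
    if (rc != 0)
        return rc;

    /* A value of -1 read back here would mean the group is unlimited. */
    return virCgroupGetCpuCfsQuota(group, &quota);
}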

---
 docs/schemas/domain.rng |   25 ++++++++++++++++++++++++-
 1 files changed, 24 insertions(+), 1 deletions(-)

diff --git a/docs/schemas/domain.rng b/docs/schemas/domain.rng
index b71778b..8db5752 100644
--- a/docs/schemas/domain.rng
+++ b/docs/schemas/domain.rng
@@ -375,6 +375,16 @@
           <ref name="cpushares"/>
         </element>
       </optional>
+      <optional>
+        <element name="period">
+          <ref name="cpuperiod"/>
+        </element>
+      </optional>
+      <optional>
+        <element name="quota">
+          <ref name="cpuquota"/>
+        </element>
+      </optional>
       <zeroOrMore>
         <element name="vcpupin">
           <attribute name="vcpu">
@@ -2337,7 +2347,20 @@
     <data type="unsignedInt">
       <param name="pattern">[0-9]+</param>
     </data>
-  </define>
+  </define>
+  <define name="cpuperiod">
+    <data type="unsignedLong">
+      <param name="pattern">[0-9]+</param>
+      <param name="minInclusive">1000</param>
+      <param name="maxInclusive">1000000</param>
+    </data>
+  </define>
+  <define name="cpuquota">
+    <data type="long">
+      <param name="pattern">-?[0-9]+</param>
+      <param name="maxInclusive">18446744073709551</param>
+    </data>
+  </define>
   <define name="hostName">
     <data type="string">
       <param name="pattern">[a-zA-Z0-9\.\-]+</param>
--
1.7.1

---
 src/conf/domain_conf.c                          |   20 +++++++++-
 src/conf/domain_conf.h                          |    2 +
 src/qemu/qemu_cgroup.c                          |   43 +++++++++++++++++++----
 tests/qemuxml2argvdata/qemuxml2argv-cputune.xml |    2 +
 4 files changed, 58 insertions(+), 9 deletions(-)

diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
index 7393690..1572043 100644
--- a/src/conf/domain_conf.c
+++ b/src/conf/domain_conf.c
@@ -5841,6 +5841,14 @@ static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps,
                       &def->cputune.shares) < 0)
         def->cputune.shares = 0;
 
+    if (virXPathULongLong("string(./cputune/period[1])", ctxt,
+                          &def->cputune.period) < 0)
+        def->cputune.period = 0;
+
+    if (virXPathLongLong("string(./cputune/quota[1])", ctxt,
+                         &def->cputune.quota) < 0)
+        def->cputune.quota = 0;
+
     if ((n = virXPathNodeSet("./cputune/vcpupin", ctxt, &nodes)) < 0) {
         goto error;
     }
@@ -9426,12 +9434,19 @@ char *virDomainDefFormat(virDomainDefPtr def,
         virBufferAsprintf(&buf, " current='%u'", def->vcpus);
     virBufferAsprintf(&buf, ">%u</vcpu>\n", def->maxvcpus);
 
-    if (def->cputune.shares || def->cputune.vcpupin)
+    if (def->cputune.shares || def->cputune.vcpupin ||
+        def->cputune.period || def->cputune.quota)
         virBufferAddLit(&buf, "  <cputune>\n");
 
     if (def->cputune.shares)
         virBufferAsprintf(&buf, "    <shares>%lu</shares>\n",
                           def->cputune.shares);
+    if (def->cputune.period)
+        virBufferAsprintf(&buf, "    <period>%llu</period>\n",
+                          def->cputune.period);
+    if (def->cputune.quota)
+        virBufferAsprintf(&buf, "    <quota>%lld</quota>\n",
+                          def->cputune.quota);
     if (def->cputune.vcpupin) {
         int i;
         for (i = 0; i < def->cputune.nvcpupin; i++) {
@@ -9453,7 +9468,8 @@ char *virDomainDefFormat(virDomainDefPtr def,
         }
     }
 
-    if (def->cputune.shares || def->cputune.vcpupin)
+    if (def->cputune.shares || def->cputune.vcpupin ||
+        def->cputune.period || def->cputune.quota)
         virBufferAddLit(&buf, "  </cputune>\n");
 
     if (def->sysinfo)
diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h
index 3ef48d1..d2a5804 100644
--- a/src/conf/domain_conf.h
+++ b/src/conf/domain_conf.h
@@ -1151,6 +1151,8 @@ struct _virDomainDef {
 
     struct {
         unsigned long shares;
+        unsigned long long period;
+        long long quota;
         int nvcpupin;
         virDomainVcpupinDefPtr *vcpupin;
     } cputune;
diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c
index 1298924..b14b8b6 100644
--- a/src/qemu/qemu_cgroup.c
+++ b/src/qemu/qemu_cgroup.c
@@ -349,14 +349,43 @@ int qemuSetupCgroup(struct qemud_driver *driver,
         }
     }
 
-    if (vm->def->cputune.shares != 0) {
+    if (vm->def->cputune.shares != 0 ||
+        vm->def->cputune.period != 0 ||
+        vm->def->cputune.quota != 0) {
         if (qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) {
-            rc = virCgroupSetCpuShares(cgroup, vm->def->cputune.shares);
-            if(rc != 0) {
-                virReportSystemError(-rc,
-                                     _("Unable to set io cpu shares for domain %s"),
-                                     vm->def->name);
-                goto cleanup;
+            if (vm->def->cputune.shares != 0) {
+                rc = virCgroupSetCpuShares(cgroup, vm->def->cputune.shares);
+                if(rc != 0) {
+                    virReportSystemError(-rc,
+                                         _("Unable to set io cpu shares for"
+                                           " domain %s"),
+                                         vm->def->name);
+                    goto cleanup;
+                }
+            }
+
+            if (vm->def->cputune.period != 0) {
+                rc = virCgroupSetCpuCfsPeriod(cgroup,
+                                              vm->def->cputune.period);
+                if(rc != 0) {
+                    virReportSystemError(-rc,
+                                         _("Unable to set cpu bandwidth period"
+                                           " for domain %s"),
+                                         vm->def->name);
+                    goto cleanup;
+                }
+            }
+
+            if (vm->def->cputune.quota != 0) {
+                rc = virCgroupSetCpuCfsQuota(cgroup,
+                                             vm->def->cputune.quota);
+                if(rc != 0) {
+                    virReportSystemError(-rc,
+                                         _("Unable to set cpu bandwidth for"
+                                           " domain %s"),
+                                         vm->def->name);
+                    goto cleanup;
+                }
             }
         } else {
             qemuReportError(VIR_ERR_CONFIG_UNSUPPORTED,
diff --git a/tests/qemuxml2argvdata/qemuxml2argv-cputune.xml b/tests/qemuxml2argvdata/qemuxml2argv-cputune.xml
index 0afbadb..091865a 100644
--- a/tests/qemuxml2argvdata/qemuxml2argv-cputune.xml
+++ b/tests/qemuxml2argvdata/qemuxml2argv-cputune.xml
@@ -6,6 +6,8 @@
   <vcpu>2</vcpu>
   <cputune>
     <shares>2048</shares>
+    <period>1000000</period>
+    <quota>-1</quota>
     <vcpupin vcpu='0' cpuset='0'/>
     <vcpupin vcpu='1' cpuset='1'/>
   </cputune>
--
1.7.1

On 06/15/2011 01:31 AM, Wen Congyang wrote:
diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h
index 3ef48d1..d2a5804 100644
--- a/src/conf/domain_conf.h
+++ b/src/conf/domain_conf.h
@@ -1151,6 +1151,8 @@ struct _virDomainDef {
 
     struct {
         unsigned long shares;
+        unsigned long long period;
+        long long quota;
         int nvcpupin;
         virDomainVcpupinDefPtr *vcpupin;
     } cputune;
I think you are incorrectly mixing types here with quota and period.
Shouldn't both of these be unsigned long long (or maybe _virCfsQuota as I
suggested in an earlier email)?

--
Adam Litke
IBM Linux Technology Center

---
 src/qemu/qemu_driver.c |  162 +++++++++++++++++++++++++++++++++++++++++-------
 1 files changed, 139 insertions(+), 23 deletions(-)

diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
index 853c84c..0d83c04 100644
--- a/src/qemu/qemu_driver.c
+++ b/src/qemu/qemu_driver.c
@@ -4764,6 +4764,7 @@ static char *qemuGetSchedulerType(virDomainPtr dom,
 {
     struct qemud_driver *driver = dom->conn->privateData;
     char *ret = NULL;
+    char *cfs_period_path = NULL;
 
     qemuDriverLock(driver);
     if (!qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) {
@@ -4772,14 +4773,29 @@ static char *qemuGetSchedulerType(virDomainPtr dom,
         goto cleanup;
     }
 
-    if (nparams)
-        *nparams = 1;
+    /* check whether the host supports CFS bandwidth */
+    if (virCgroupPathOfController(driver->cgroup, VIR_CGROUP_CONTROLLER_CPU,
+                                  "cpu.cfs_period_us", &cfs_period_path) < 0) {
+        qemuReportError(VIR_ERR_INTERNAL_ERROR,
+                        "%s",
+                        _("cannot get the path of cgroup CPU controller"));
+        goto cleanup;
+    }
+
+    if (nparams) {
+        if (access(cfs_period_path, F_OK) < 0) {
+            *nparams = 1;
+        } else {
+            *nparams = 3;
+        }
+    }
 
     ret = strdup("posix");
     if (!ret)
         virReportOOMError();
 
 cleanup:
+    VIR_FREE(cfs_period_path);
     qemuDriverUnlock(driver);
     return ret;
 }
@@ -5313,9 +5329,10 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom,
     int i;
     virCgroupPtr group = NULL;
     virDomainObjPtr vm = NULL;
-    virDomainDefPtr persistentDef = NULL;
+    virDomainDefPtr vmdef = NULL;
     int ret = -1;
     bool isActive;
+    int rc;
 
     virCheckFlags(VIR_DOMAIN_AFFECT_LIVE |
                   VIR_DOMAIN_AFFECT_CONFIG, -1);
@@ -5339,10 +5356,17 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom,
             flags = VIR_DOMAIN_AFFECT_CONFIG;
     }
 
-    if ((flags & VIR_DOMAIN_AFFECT_CONFIG) && !vm->persistent) {
-        qemuReportError(VIR_ERR_OPERATION_INVALID, "%s",
-                        _("cannot change persistent config of a transient domain"));
-        goto cleanup;
+    if (flags & VIR_DOMAIN_AFFECT_CONFIG) {
+        if (vm->persistent) {
+            qemuReportError(VIR_ERR_OPERATION_INVALID, "%s",
+                            _("cannot change persistent config of a transient domain"));
+            goto cleanup;
+        }
+
+        /* Make a copy for updated domain. */
+        vmdef = virDomainObjCopyPersistentDef(driver->caps, vm);
+        if (!vmdef)
+            goto cleanup;
     }
 
     if (flags & VIR_DOMAIN_AFFECT_LIVE) {
@@ -5369,7 +5393,6 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom,
         virTypedParameterPtr param = &params[i];
 
         if (STREQ(param->field, "cpu_shares")) {
-            int rc;
             if (param->type != VIR_TYPED_PARAM_ULLONG) {
                 qemuReportError(VIR_ERR_INVALID_ARG, "%s",
                                 _("invalid type for cpu_shares tunable, expected a 'ullong'"));
@@ -5388,19 +5411,53 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom,
             }
 
             if (flags & VIR_DOMAIN_AFFECT_CONFIG) {
-                persistentDef = virDomainObjGetPersistentDef(driver->caps, vm);
-                if (!persistentDef) {
-                    qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s",
-                                    _("can't get persistentDef"));
+                vmdef->cputune.shares = params[i].value.ul;
+            }
+        } else if (STREQ(param->field, "cfs_period")) {
+            if (param->type != VIR_TYPED_PARAM_ULLONG) {
+                qemuReportError(VIR_ERR_INVALID_ARG, "%s",
+                                _("invalid type for cfs_period tunable,"
+                                  " expected a 'ullong'"));
+                goto cleanup;
+            }
+
+            if (flags & VIR_DOMAIN_AFFECT_LIVE) {
+                rc = virCgroupSetCpuCfsPeriod(group, params[i].value.ul);
+                if (rc != 0) {
+                    virReportSystemError(-rc, "%s",
+                                         _("unable to set cpu bandwidth period"
+                                           " tunable"));
                     goto cleanup;
                 }
-                persistentDef->cputune.shares = params[i].value.ul;
-                rc = virDomainSaveConfig(driver->configDir, persistentDef);
-                if (rc) {
-                    qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s",
-                                    _("can't save config"));
+
+                vm->def->cputune.period = params[i].value.ul;
+            }
+
+            if (flags & VIR_DOMAIN_AFFECT_CONFIG) {
+                vmdef->cputune.period = params[i].value.ul;
+            }
+        } else if (STREQ(param->field, "cfs_quota")) {
+            if (param->type != VIR_TYPED_PARAM_LLONG) {
+                qemuReportError(VIR_ERR_INVALID_ARG, "%s",
+                                _("invalid type for cfs_quota tunable,"
+                                  " expected a 'llong'"));
+                goto cleanup;
+            }
+
+            if (flags & VIR_DOMAIN_AFFECT_LIVE) {
+                rc = virCgroupSetCpuCfsQuota(group, params[i].value.l);
+                if (rc != 0) {
+                    virReportSystemError(-rc, "%s",
+                                         _("unable to set cpu bandwidth"
+                                           " tunable"));
                     goto cleanup;
                 }
+
+                vm->def->cputune.quota = params[i].value.l;
+            }
+
+            if (flags & VIR_DOMAIN_AFFECT_CONFIG) {
+                vmdef->cputune.quota = params[i].value.l;
             }
         } else {
             qemuReportError(VIR_ERR_INVALID_ARG,
@@ -5409,6 +5466,15 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom,
         }
     }
 
+    if (flags & VIR_DOMAIN_AFFECT_CONFIG) {
+        rc = virDomainSaveConfig(driver->configDir, vmdef);
+        if (rc) {
+            qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                            _("can't save config"));
+            goto cleanup;
+        }
+    }
+
     ret = 0;
 
 cleanup:
@@ -5438,7 +5504,9 @@ qemuGetSchedulerParametersFlags(virDomainPtr dom,
     struct qemud_driver *driver = dom->conn->privateData;
     virCgroupPtr group = NULL;
     virDomainObjPtr vm = NULL;
-    unsigned long long val;
+    unsigned long long shares;
+    unsigned long long period;
+    long long quota;
     int ret = -1;
     int rc;
     bool isActive;
@@ -5493,9 +5561,17 @@ qemuGetSchedulerParametersFlags(virDomainPtr dom,
                                 _("can't get persistentDef"));
                 goto cleanup;
             }
-            val = persistentDef->cputune.shares;
+            shares = persistentDef->cputune.shares;
+            if (*nparams > 1) {
+                period = persistentDef->cputune.period;
+                quota = persistentDef->cputune.quota;
+            }
         } else {
-            val = vm->def->cputune.shares;
+            shares = vm->def->cputune.shares;
+            if (*nparams > 1) {
+                period = vm->def->cputune.period;
+                quota = vm->def->cputune.quota;
+            }
         }
         goto out;
     }
@@ -5518,14 +5594,31 @@ qemuGetSchedulerParametersFlags(virDomainPtr dom,
         goto cleanup;
     }
 
-    rc = virCgroupGetCpuShares(group, &val);
+    rc = virCgroupGetCpuShares(group, &shares);
     if (rc != 0) {
         virReportSystemError(-rc, "%s",
                              _("unable to get cpu shares tunable"));
         goto cleanup;
     }
+
+    if (*nparams > 1) {
+        rc = virCgroupGetCpuCfsPeriod(group, &period);
+        if (rc != 0) {
+            virReportSystemError(-rc, "%s",
+                                 _("unable to get cpu bandwidth period"
+                                   " tunable"));
+            goto cleanup;
+        }
+
+        rc = virCgroupGetCpuCfsQuota(group, &quota);
+        if (rc != 0) {
+            virReportSystemError(-rc, "%s",
+                                 _("unable to get cpu bandwidth tunable"));
+            goto cleanup;
+        }
+    }
 out:
-    params[0].value.ul = val;
+    params[0].value.ul = shares;
     params[0].type = VIR_TYPED_PARAM_ULLONG;
     if (virStrcpyStatic(params[0].field, "cpu_shares") == NULL) {
         qemuReportError(VIR_ERR_INTERNAL_ERROR,
@@ -5533,7 +5626,30 @@ out:
         goto cleanup;
     }
 
-    *nparams = 1;
+    if (*nparams > 1) {
+        params[1].value.ul = period;
+        params[1].type = VIR_TYPED_PARAM_ULLONG;
+        if (virStrcpyStatic(params[1].field, "cfs_period") == NULL) {
+            qemuReportError(VIR_ERR_INTERNAL_ERROR,
+                            "%s",
+                            _("Field cfs_period too long for destination"));
+            goto cleanup;
+        }
+
+        params[2].value.ul = quota;
+        params[2].type = VIR_TYPED_PARAM_LLONG;
+        if (virStrcpyStatic(params[2].field, "cfs_quota") == NULL) {
+            qemuReportError(VIR_ERR_INTERNAL_ERROR,
+                            "%s",
+                            _("Field cfs_quota too long for destination"));
+            goto cleanup;
+        }
+
+        *nparams = 3;
+    } else {
+        *nparams = 1;
+    }
+
     ret = 0;
 
 cleanup:
--
1.7.1
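From an application's point of view the new tunables are reached through the existing scheduler-parameter API. The following client-side sketch is not part of the patch; it assumes a libvirt build that exposes virDomainSetSchedulerParametersFlags and the typed-parameter API, and simply shows how cfs_period and cfs_quota would be passed for the live domain:

#include <string.h>
#include <libvirt/libvirt.h>

/* Sketch: set a 100ms period and a 50ms quota on a running domain. */
static int exampleSetBandwidth(virDomainPtr dom)
{
    virTypedParameter params[2];

    memset(params, 0, sizeof(params));

    strncpy(params[0].field, "cfs_period", VIR_TYPED_PARAM_FIELD_LENGTH - 1);
    params[0].type = VIR_TYPED_PARAM_ULLONG;
    params[0].value.ul = 100000;

    strncpy(params[1].field, "cfs_quota", VIR_TYPED_PARAM_FIELD_LENGTH - 1);
    params[1].type = VIR_TYPED_PARAM_LLONG;
    params[1].value.l = 50000;

    return virDomainSetSchedulerParametersFlags(dom, params, 2,
                                                VIR_DOMAIN_AFFECT_LIVE);
}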

---
 docs/formatdomain.html.in |   19 +++++++++++++++++++
 1 files changed, 19 insertions(+), 0 deletions(-)

diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in
index 267839b..05ebed7 100644
--- a/docs/formatdomain.html.in
+++ b/docs/formatdomain.html.in
@@ -294,6 +294,8 @@
     <vcpupin vcpu="2" cpuset="2,3"/>
     <vcpupin vcpu="3" cpuset="0,4"/>
     <shares>2048</shares>
+    <period>1000000</period>
+    <quota>-1</quota>
   </cputune>
   ...</pre>
 
@@ -382,6 +384,23 @@
       2048 will get twice as much CPU time as a VM configured with value 1024.
       <span class="since">Since 0.9.0</span>
     </dd>
+    <dt><code>period</code></dt>
+    <dd>
+      The optional <code>period</code> element specifies the enforcement
+      interval (unit: microseconds). Within <code>period</code>, the domain
+      will not be allowed to consume more than <code>quota</code> worth of
+      runtime. The value should be in the range [1000, 1000000].
+      <span class="since">Since 0.9.3</span>
+    </dd>
+    <dt><code>quota</code></dt>
+    <dd>
+      The optional <code>quota</code> element specifies the maximum allowed
+      bandwidth (unit: microseconds). A domain with <code>quota</code> set to
+      any negative value has infinite bandwidth, which means that it is not
+      bandwidth controlled. The value should be in the range
+      [1000, 18446744073709551] or less than 0.
+      <span class="since">Since 0.9.3</span>
+    </dd>
   </dl>
 
 <h3><a name="elementsCPU">CPU model and topology</a></h3>
--
1.7.1
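Put differently: within every <period> microseconds the whole domain may consume at most <quota> microseconds of CPU time, so quota/period is the fraction of one physical CPU the domain can use (a quota larger than the period allows more than one CPU's worth). A tiny worked example, with values picked purely for illustration:

#include <stdio.h>

int main(void)
{
    unsigned long long period_us = 100000;  /* 100ms enforcement interval */
    long long quota_us = 50000;             /* 50ms of runtime per interval */

    if (quota_us < 0) {
        printf("bandwidth is unlimited\n");
    } else {
        /* 50000 / 100000 = 0.5, i.e. the domain as a whole is capped at
         * half of one CPU per period. */
        printf("cap: %.0f%% of one CPU\n",
               100.0 * (double)quota_us / (double)period_us);
    }
    return 0;
}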

On 06/15/2011 01:24 AM, Wen Congyang wrote:
I need this feature immediately after the CFS bandwidth patchset is merged into the upstream kernel, so I have prepared this patchset and am posting it here for review before CFS bandwidth is merged upstream.
quota is an optional attribute specified in microseconds, not a percentage of period.
TODO: 1. quota should be in the range [1000, 18446744073709551 (2^64/1000)] or less than 0, but I only limit it to be less than or equal to 2^64/1000. Does anyone have a better way to limit quota?
What are the semantics of quota < 0, of quota == 0? How are you storing such a
large signed integer in the kernel? I will guess that any value less than 0
means: no limit, and 0 means: no quota (ie. the cgroup cannot run on the cpu).

I am not sure what the libvirt convention for dealing with a situation like
this is. I think you have two options: 1) override an invalid value (ie. 1) to
represent the case (n < 0) or, 2) use a struct to represent the quota:

struct _virCfsQuota {
    unsigned long long val;
    int unlimited;
};

When the quota is unlimited, .unlimited == 1 and val is undefined. Otherwise,
the quota is in val.
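If option 2 were taken, the conversion back to the raw value written to cpu.cfs_quota_us might look roughly like this (purely illustrative; the converter name is invented here and nothing like it exists in the posted patches):

struct _virCfsQuota {
    unsigned long long val;   /* quota in usecs, meaningful when !unlimited */
    int unlimited;            /* non-zero: no bandwidth limit */
};

/* Hypothetical converter: map the struct onto the value the cgroup file
 * expects, where any negative number means "infinite bandwidth". */
static long long virCfsQuotaToRaw(const struct _virCfsQuota *quota)
{
    if (quota->unlimited)
        return -1;
    return (long long)quota->val;
}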
--
Adam Litke
IBM Linux Technology Center

At 06/16/2011 03:47 AM, Adam Litke wrote:
On 06/15/2011 01:24 AM, Wen Congyang wrote:
I need this feature immediately after the CFS bandwidth patchset is merged into the upstream kernel, so I have prepared this patchset and am posting it here for review before CFS bandwidth is merged upstream.
quota is an optional attribute specified in microseconds, not a percentage of period.
TODO: 1. quota should be in the range [1000, 18446744073709551 (2^64/1000)] or less than 0, but I only limit it to be less than or equal to 2^64/1000. Does anyone have a better way to limit quota?
What are the semantics of quota < 0, of quota == 0? How are you storing
From Documentation/scheduler/sched-bwc.txt:
=======================================
A group with cpu.cfs_quota_us as -1 indicates that the group has infinite
bandwidth, which means that it is not bandwidth controlled.

Writing any negative value to cpu.cfs_quota_us will turn the group into an
infinite bandwidth group. Reading cpu.cfs_quota_us for an infinite bandwidth
group will always return -1.
=======================================

quota == 0 is invalid.
such a large signed integer in the kernel? I will guess that any value
In the kernel, the value of quota is stored in uint64_t:
=======================================
struct cfs_bandwidth {
#ifdef CONFIG_CFS_BANDWIDTH
    raw_spinlock_t lock;
    ktime_t period;
    u64 quota;          <========== here is u64, not s64
    u64 runtime;
    u64 runtime_expires;
    s64 hierarchal_quota;

    int idle;
    struct hrtimer period_timer, slack_timer;
    struct list_head throttled_cfs_rq;

    /* statistics */
    int nr_periods, nr_throttled;
    u64 throttled_time;
#endif
};
=======================================

The unit of quota in the kernel is ns, but the value we write to
cpu.cfs_quota_us is us, so the max value we can write is 2^64/1000.

In the kernel, if quota is ~0ULL, it means unlimited.
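The 2^64/1000 figure follows directly from that conversion: the interface takes microseconds but the kernel keeps nanoseconds in a u64, so the largest microsecond value that survives the multiplication by 1000 is ULLONG_MAX / 1000. A quick standalone sanity check:

#include <limits.h>
#include <stdio.h>

int main(void)
{
    /* ULLONG_MAX is 2^64 - 1, so integer division by 1000 gives
     * 18446744073709551 -- the upper bound quoted in this thread. */
    printf("max cpu.cfs_quota_us: %llu\n", ULLONG_MAX / 1000ULL);
    return 0;
}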
less than 0 means: no limit, and 0 means: no quota (ie. the cgroup cannot run on the cpu).
If the kernel does not support CFS bandwidth, quota is 0 and we do not allow the user to modify it. 0 is an internal value, and the user does not know about it.
I am not sure what the libvirt convention for dealing with a situation like this is. I think you have two options: 1) override an invalid value (ie. 1) to represent the case (n < 0) or, 2) use a struct to represent the quota:
I do not think so. The value that the user can specify should be similar to what the user can write to cpu.cfs_quota_us. That way, the user can use it easily.
struct _virCfsQuota {
    unsigned long long val;
    int unlimited;
};
When the quota is unlimited, .unlimited == 1 and val is undefined. Otherwise, the quota is in val.

On 06/16/2011 12:44 AM, Wen Congyang wrote:
In the kernel, the value of quota is stored in uint64_t:
=======================================
struct cfs_bandwidth {
#ifdef CONFIG_CFS_BANDWIDTH
    raw_spinlock_t lock;
    ktime_t period;
    u64 quota;          <========== here is u64, not s64
    u64 runtime;
    u64 runtime_expires;
    s64 hierarchal_quota;

    int idle;
    struct hrtimer period_timer, slack_timer;
    struct list_head throttled_cfs_rq;

    /* statistics */
    int nr_periods, nr_throttled;
    u64 throttled_time;
#endif
};
=======================================

The unit of quota in the kernel is ns, but the value we write to
cpu.cfs_quota_us is us, so the max value we can write is 2^64/1000.
In the kernel, if quota is ~0ULL, it means unlimited.
Ok, I would think you can use the same semantics in libvirt then. No need
for a structure.

--
Adam Litke
IBM Linux Technology Center