[libvirt] [PATCH RFC V2 00/10] support cpu bandwidth in libvirt

TODO:
  1. We create a subdirectory for each vcpu in the cpu subsystem, so we
     should recalculate cpu.shares for each vcpu.

Changelog:
v2: almost rewrote the patchset to support controlling each vcpu's bandwidth.
    Limit quota to [-1, 2^64/1000] at the schema level. We will check it at
    the cgroup level.

Wen Congyang (10):
  Introduce the function virCgroupForVcpu
  cgroup: Implement cpu.cfs_period_us and cpu.cfs_quota_us tuning API
  Update XML Schema for new entries
  qemu: Implement period and quota tunable XML configuration and parsing.
  support to pass VIR_DOMAIN_AFFECT_CURRENT to virDomainGetVcpusFlags()
  vcpubandwidth: introduce two new libvirt APIs
  vcpubandwidth: implement the code to support new API for the qemu driver
  vcpubandwidth: implement the remote protocol to address the new API
  vcpubandwidth: Implement virsh support
  doc: Add documentation for new cputune elements period and quota

 daemon/remote.c                                 |  132 +++++++
 docs/formatdomain.html.in                       |   19 +
 docs/schemas/domain.rng                         |   29 ++-
 include/libvirt/libvirt.h.in                    |   41 +++-
 python/generator.py                             |    2 +
 src/conf/domain_conf.c                          |  272 ++++++++++++++-
 src/conf/domain_conf.h                          |   17 +
 src/driver.h                                    |   14 +
 src/libvirt.c                                   |  129 +++++++-
 src/libvirt_private.syms                        |    9 +
 src/libvirt_public.syms                         |    6 +
 src/qemu/qemu_cgroup.c                          |  131 +++++++
 src/qemu/qemu_cgroup.h                          |    2 +
 src/qemu/qemu_driver.c                          |  429 ++++++++++++++++++++++-
 src/qemu/qemu_process.c                         |    4 +
 src/remote/remote_driver.c                      |   64 ++++
 src/remote/remote_protocol.x                    |   32 ++-
 src/rpc/gendispatch.pl                          |   30 ++
 src/util/cgroup.c                               |  153 ++++++++-
 src/util/cgroup.h                               |   11 +
 tests/qemuxml2argvdata/qemuxml2argv-cputune.xml |    2 +
 tools/virsh.c                                   |  142 ++++++++-
 tools/virsh.pod                                 |   16 +
 23 files changed, 1658 insertions(+), 28 deletions(-)

Introduce the function virCgroupForVcpu() to create sub directory for each vcpu. --- src/libvirt_private.syms | 1 + src/util/cgroup.c | 72 ++++++++++++++++++++++++++++++++++++++++++--- src/util/cgroup.h | 5 +++ 3 files changed, 73 insertions(+), 5 deletions(-) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 626ac6c..afde173 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -67,6 +67,7 @@ virCgroupDenyAllDevices; virCgroupDenyDevicePath; virCgroupForDomain; virCgroupForDriver; +virCgroupForVcpu; virCgroupFree; virCgroupGetBlkioWeight; virCgroupGetCpuShares; diff --git a/src/util/cgroup.c b/src/util/cgroup.c index 2e5ef46..ae7567d 100644 --- a/src/util/cgroup.c +++ b/src/util/cgroup.c @@ -52,6 +52,16 @@ struct virCgroup { struct virCgroupController controllers[VIR_CGROUP_CONTROLLER_LAST]; }; +typedef enum { + VIR_CGROUP_NONE = 0, /* create subdir under each cgroup if possible. */ + VIR_CGROUP_MEM_HIERACHY = 1 << 0, /* call virCgroupSetMemoryUseHierarchy + * before creating subcgroups and + * attaching tasks + */ + VIR_CGROUP_VCPU = 1 << 1, /* create subdir only under the cgroup cpu, + * cpuacct and cpuset if possible. */ +} virCgroupFlags; + /** * virCgroupFree: * @@ -503,7 +513,7 @@ static int virCgroupSetMemoryUseHierarchy(virCgroupPtr group) } static int virCgroupMakeGroup(virCgroupPtr parent, virCgroupPtr group, - int create, bool memory_hierarchy) + int create, unsigned int flags) { int i; int rc = 0; @@ -516,6 +526,13 @@ static int virCgroupMakeGroup(virCgroupPtr parent, virCgroupPtr group, if (!group->controllers[i].mountPoint) continue; + /* We need to control cpu bandwidth for each vcpu now */ + if ((flags & VIR_CGROUP_VCPU) && i != VIR_CGROUP_CONTROLLER_CPU) { + /* treat it as unmounted and we can use virCgroupAddTask */ + VIR_FREE(group->controllers[i].mountPoint); + continue; + } + rc = virCgroupPathOfController(group, i, "", &path); if (rc < 0) return rc; @@ -555,7 +572,7 @@ static int virCgroupMakeGroup(virCgroupPtr parent, virCgroupPtr group, * Note that virCgroupSetMemoryUseHierarchy should always be * called prior to creating subcgroups and attaching tasks. */ - if (memory_hierarchy && + if ((flags & VIR_CGROUP_MEM_HIERACHY) && group->controllers[VIR_CGROUP_CONTROLLER_MEMORY].mountPoint != NULL && (i == VIR_CGROUP_CONTROLLER_MEMORY || STREQ(group->controllers[i].mountPoint, group->controllers[VIR_CGROUP_CONTROLLER_MEMORY].mountPoint))) { @@ -641,7 +658,7 @@ static int virCgroupAppRoot(int privileged, if (rc != 0) goto cleanup; - rc = virCgroupMakeGroup(rootgrp, *group, create, false); + rc = virCgroupMakeGroup(rootgrp, *group, create, VIR_CGROUP_NONE); cleanup: virCgroupFree(&rootgrp); @@ -801,7 +818,7 @@ int virCgroupForDriver(const char *name, VIR_FREE(path); if (rc == 0) { - rc = virCgroupMakeGroup(rootgrp, *group, create, false); + rc = virCgroupMakeGroup(rootgrp, *group, create, VIR_CGROUP_NONE); if (rc != 0) virCgroupFree(group); } @@ -861,7 +878,7 @@ int virCgroupForDomain(virCgroupPtr driver, * a group for driver, is to avoid overhead to track * cumulative usage that we don't need. 
*/ - rc = virCgroupMakeGroup(driver, *group, create, true); + rc = virCgroupMakeGroup(driver, *group, create, VIR_CGROUP_MEM_HIERACHY); if (rc != 0) virCgroupFree(group); } @@ -879,6 +896,51 @@ int virCgroupForDomain(virCgroupPtr driver ATTRIBUTE_UNUSED, #endif /** + * virCgroupForVcpu: + * + * @driver: group for the domain + * @vcpuid: id of the vcpu + * @group: Pointer to returned virCgroupPtr + * + * Returns 0 on success + */ +#if defined HAVE_MNTENT_H && defined HAVE_GETMNTENT_R +int virCgroupForVcpu(virCgroupPtr driver, + int vcpuid, + virCgroupPtr *group, + int create) +{ + int rc; + char *path; + + if (driver == NULL) + return -EINVAL; + + if (virAsprintf(&path, "%s/vcpu%d", driver->path, vcpuid) < 0) + return -ENOMEM; + + rc = virCgroupNew(path, group); + VIR_FREE(path); + + if (rc == 0) { + rc = virCgroupMakeGroup(driver, *group, create, VIR_CGROUP_VCPU); + if (rc != 0) + virCgroupFree(group); + } + + return rc; +} +#else +int virCgroupForVcpu(virCgroupPtr driver ATTRIBUTE_UNUSED, + int vcpuid ATTRIBUTE_UNUSED, + virCgroupPtr *group ATTRIBUTE_UNUSED, + int create ATTRIBUTE_UNUSED) +{ + return -ENXIO; +} +#endif + +/** * virCgroupSetBlkioWeight: * * @group: The cgroup to change io weight for diff --git a/src/util/cgroup.h b/src/util/cgroup.h index 8ae756d..1d04418 100644 --- a/src/util/cgroup.h +++ b/src/util/cgroup.h @@ -40,6 +40,11 @@ int virCgroupForDomain(virCgroupPtr driver, virCgroupPtr *group, int create); +int virCgroupForVcpu(virCgroupPtr driver, + int vcpuid, + virCgroupPtr *group, + int create); + int virCgroupPathOfController(virCgroupPtr group, int controller, const char *key, -- 1.7.1
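For illustration only, here is a minimal sketch (not part of the patch) of how a caller might combine the new helper with the existing virCgroupAddTask() to move each vcpu thread into its own sub-cgroup; the function name, and the vcpu PID array and its length, are assumptions of the sketch, much like the bookkeeping the qemu patch later in this series relies on:

/* Sketch: place every vcpu thread of a domain into its own "vcpuN"
 * sub-cgroup.  Error handling is abbreviated. */
static int placeVcpusInCgroups(virCgroupPtr domainGroup,
                               pid_t *vcpupids, int nvcpus)
{
    int i, rc;

    for (i = 0; i < nvcpus; i++) {
        virCgroupPtr vcpuGroup = NULL;

        rc = virCgroupForVcpu(domainGroup, i, &vcpuGroup, 1 /* create */);
        if (rc != 0)
            return rc;                  /* negative errno on failure */

        /* move the vcpu thread into the new sub-cgroup */
        rc = virCgroupAddTask(vcpuGroup, vcpupids[i]);
        virCgroupFree(&vcpuGroup);
        if (rc != 0)
            return rc;
    }

    return 0;
}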

--- src/libvirt_private.syms | 4 ++ src/util/cgroup.c | 81 ++++++++++++++++++++++++++++++++++++++++++++-- src/util/cgroup.h | 6 +++ 3 files changed, 88 insertions(+), 3 deletions(-) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index afde173..9b9b6ce 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -71,6 +71,8 @@ virCgroupForVcpu; virCgroupFree; virCgroupGetBlkioWeight; virCgroupGetCpuShares; +virCgroupGetCpuCfsPeriod; +virCgroupGetCpuCfsQuota; virCgroupGetCpuacctUsage; virCgroupGetFreezerState; virCgroupGetMemoryHardLimit; @@ -85,6 +87,8 @@ virCgroupPathOfController; virCgroupRemove; virCgroupSetBlkioWeight; virCgroupSetCpuShares; +virCgroupSetCpuCfsPeriod; +virCgroupSetCpuCfsQuota; virCgroupSetFreezerState; virCgroupSetMemory; virCgroupSetMemoryHardLimit; diff --git a/src/util/cgroup.c b/src/util/cgroup.c index ae7567d..2d1378b 100644 --- a/src/util/cgroup.c +++ b/src/util/cgroup.c @@ -398,8 +398,6 @@ static int virCgroupSetValueI64(virCgroupPtr group, return rc; } -#if 0 -/* This is included for completeness, but not yet used */ static int virCgroupGetValueI64(virCgroupPtr group, int controller, const char *key, @@ -419,7 +417,6 @@ out: return rc; } -#endif static int virCgroupGetValueU64(virCgroupPtr group, int controller, @@ -1384,6 +1381,84 @@ int virCgroupGetCpuShares(virCgroupPtr group, unsigned long long *shares) "cpu.shares", shares); } +/** + * virCgroupSetCpuCfsPeriod: + * + * @group: The cgroup to change cpu.cfs_period_us for + * @cfs_period: The bandwidth period in usecs + * + * Returns: 0 on success + */ +int virCgroupSetCpuCfsPeriod(virCgroupPtr group, unsigned long long cfs_period) +{ + /* The cfs_period shoule be greater or equal than 1ms, and less or equal + * than 1s. + */ + if (cfs_period < 1000 || cfs_period > 1000000) + return -EINVAL; + + return virCgroupSetValueU64(group, + VIR_CGROUP_CONTROLLER_CPU, + "cpu.cfs_period_us", cfs_period); +} + +/** + * virCgroupGetCpuCfsPeriod: + * + * @group: The cgroup to get cpu.cfs_period_us for + * @cfs_period: Pointer to the returned bandwidth period in usecs + * + * Returns: 0 on success + */ +int virCgroupGetCpuCfsPeriod(virCgroupPtr group, unsigned long long *cfs_period) +{ + return virCgroupGetValueU64(group, + VIR_CGROUP_CONTROLLER_CPU, + "cpu.cfs_period_us", cfs_period); +} + +/** + * virCgroupSetCpuCfsQuota: + * + * @group: The cgroup to change cpu.cfs_quota_us for + * @cfs_quota: the cpu bandwidth (in usecs) that this tg will be allowed to + * consume over period + * + * Returns: 0 on success + */ +int virCgroupSetCpuCfsQuota(virCgroupPtr group, long long cfs_quota) +{ + if (cfs_quota >= 0) { + /* The cfs_quota shoule be greater or equal than 1ms */ + if (cfs_quota < 1000) + return -EINVAL; + + /* check overflow */ + if (cfs_quota > (unsigned long long)~0ULL / 1000) + return -EINVAL; + } + + return virCgroupSetValueI64(group, + VIR_CGROUP_CONTROLLER_CPU, + "cpu.cfs_quota_us", cfs_quota); +} + +/** + * virCgroupGetCpuCfsQuota: + * + * @group: The cgroup to get cpu.cfs_quota_us for + * @cfs_quota: Pointer to the returned cpu bandwidth (in usecs) that this tg + * will be allowed to consume over period + * + * Returns: 0 on success + */ +int virCgroupGetCpuCfsQuota(virCgroupPtr group, long long *cfs_quota) +{ + return virCgroupGetValueI64(group, + VIR_CGROUP_CONTROLLER_CPU, + "cpu.cfs_quota_us", cfs_quota); +} + int virCgroupGetCpuacctUsage(virCgroupPtr group, unsigned long long *usage) { return virCgroupGetValueU64(group, diff --git a/src/util/cgroup.h b/src/util/cgroup.h index 
1d04418..d190bb3 100644 --- a/src/util/cgroup.h +++ b/src/util/cgroup.h @@ -104,6 +104,12 @@ int virCgroupDenyDevicePath(virCgroupPtr group, int virCgroupSetCpuShares(virCgroupPtr group, unsigned long long shares); int virCgroupGetCpuShares(virCgroupPtr group, unsigned long long *shares); +int virCgroupSetCpuCfsPeriod(virCgroupPtr group, unsigned long long cfs_period); +int virCgroupGetCpuCfsPeriod(virCgroupPtr group, unsigned long long *cfs_period); + +int virCgroupSetCpuCfsQuota(virCgroupPtr group, long long cfs_quota); +int virCgroupGetCpuCfsQuota(virCgroupPtr group, long long *cfs_quota); + int virCgroupGetCpuacctUsage(virCgroupPtr group, unsigned long long *usage); int virCgroupSetFreezerState(virCgroupPtr group, const char *state); -- 1.7.1
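As a rough usage sketch (again not part of the patch), the two setters could be combined to cap a cgroup at half of one physical CPU; the 100000/50000 values are only an example, and the rollback of the period after a failed quota write mirrors what the qemu patch later in this series does:

/* Sketch: limit @group to quota/period = 50% of one CPU. */
static int capGroupToHalfCpu(virCgroupPtr group)
{
    unsigned long long old_period;
    int rc;

    /* remember the old period so we can roll back */
    if ((rc = virCgroupGetCpuCfsPeriod(group, &old_period)) < 0)
        return rc;

    if ((rc = virCgroupSetCpuCfsPeriod(group, 100000)) < 0)  /* 100ms */
        return rc;

    if ((rc = virCgroupSetCpuCfsQuota(group, 50000)) < 0) {  /* 50ms */
        /* restore the previous period; ignore a secondary failure */
        virCgroupSetCpuCfsPeriod(group, old_period);
        return rc;
    }

    return 0;
}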

--- docs/schemas/domain.rng | 29 ++++++++++++++++++++++++++++- 1 files changed, 28 insertions(+), 1 deletions(-) diff --git a/docs/schemas/domain.rng b/docs/schemas/domain.rng index 891662d..99c7fd6 100644 --- a/docs/schemas/domain.rng +++ b/docs/schemas/domain.rng @@ -376,6 +376,19 @@ </element> </optional> <zeroOrMore> + <element name="bandwidth"> + <attribute name="vcpu"> + <ref name="vcpuid"/> + </attribute> + <attribute name="period"> + <ref name="cpuperiod"/> + </attribute> + <attribute name="quota"> + <ref name="cpuquota"/> + </attribute> + </element> + </zeroOrMore> + <zeroOrMore> <element name="vcpupin"> <attribute name="vcpu"> <ref name="vcpuid"/> @@ -2371,7 +2384,21 @@ <data type="unsignedInt"> <param name="pattern">[0-9]+</param> </data> - </define> + </define> + <define name="cpuperiod"> + <data type="unsignedLong"> + <param name="pattern">[0-9]+</param> + <param name="minInclusive">1000</param> + <param name="maxInclusive">1000000</param> + </data> + </define> + <define name="cpuquota"> + <data type="long"> + <param name="pattern">-?[0-9]+</param> + <param name="maxInclusive">18446744073709511</param> + <param name="minInclusive">-1</param> + </data> + </define> <define name="hostName"> <data type="string"> <param name="pattern">[a-zA-Z0-9\.\-]+</param> -- 1.7.1
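For reference, a <cputune> fragment that should validate against these additions (it matches the test case added later in this series); period must be within [1000, 1000000] usecs, and quota is either -1 for "unlimited" or at least 1000 usecs:

  <cputune>
    <shares>2048</shares>
    <bandwidth vcpu='0' period='1000000' quota='-1'/>
    <bandwidth vcpu='1' period='1000' quota='1000'/>
    <vcpupin vcpu='0' cpuset='0'/>
    <vcpupin vcpu='1' cpuset='1'/>
  </cputune>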

--- src/conf/domain_conf.c | 272 ++++++++++++++++++++++- src/conf/domain_conf.h | 25 ++ src/libvirt_private.syms | 4 + src/qemu/qemu_cgroup.c | 131 +++++++++++ src/qemu/qemu_cgroup.h | 2 + src/qemu/qemu_process.c | 4 + tests/qemuxml2argvdata/qemuxml2argv-cputune.xml | 2 + 7 files changed, 438 insertions(+), 2 deletions(-) diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 60e0318..0a1f973 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -997,6 +997,21 @@ virDomainVcpuPinDefFree(virDomainVcpuPinDefPtr *def, VIR_FREE(def); } +static void +virDomainVcpuBWDefFree(virDomainVcpuBWDefPtr *def, + int nvcpubw) +{ + int i; + + if (!def || !nvcpubw) + return; + + for(i = 0; i < nvcpubw; i++) + VIR_FREE(def[i]); + + VIR_FREE(def); +} + void virDomainDefFree(virDomainDefPtr def) { unsigned int i; @@ -1089,6 +1104,9 @@ void virDomainDefFree(virDomainDefPtr def) virCPUDefFree(def->cpu); + virDomainVcpuBWDefFree(def->cputune.vcpubw, + def->cputune.nvcpubw); + virDomainVcpuPinDefFree(def->cputune.vcpupin, def->cputune.nvcpupin); VIR_FREE(def->numatune.memory.nodemask); @@ -5715,6 +5733,62 @@ error: goto cleanup; } +/* Parse the XML definition for a vcpubandwidth */ +static virDomainVcpuBWDefPtr +virDomainVcpuBWDefParseXML(const xmlNodePtr node, + xmlXPathContextPtr ctxt, + int maxvcpus) +{ + virDomainVcpuBWDefPtr def; + xmlNodePtr oldnode = ctxt->node; + unsigned int vcpuid; + unsigned long long period; + long long quota; + int ret; + + if (VIR_ALLOC(def) < 0) { + virReportOOMError(); + return NULL; + } + + ctxt->node = node; + + ret = virXPathUInt("string(./@vcpu)", ctxt, &vcpuid); + if (ret == -2) { + virDomainReportError(VIR_ERR_INTERNAL_ERROR, + "%s", _("vcpu id must be an unsigned integer")); + goto error; + } else if (ret == -1) { + virDomainReportError(VIR_ERR_INTERNAL_ERROR, + "%s", _("can't parse vcpupin node")); + goto error; + } + + if (vcpuid >= maxvcpus) { + virDomainReportError(VIR_ERR_INTERNAL_ERROR, + "%s", _("vcpu id must be less than maxvcpus")); + goto error; + } + + if (virXPathULongLong("string(./@period)", ctxt, &period) < 0) + period = 0; + + if (virXPathLongLong("string(./@quota)", ctxt, "a) < 0) + quota = 0; + + def->vcpuid = vcpuid; + def->period = period; + def->quota = quota; + +cleanup: + ctxt->node = oldnode; + return def; + +error: + VIR_FREE(def); + goto cleanup; +} + static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps, xmlDocPtr xml, @@ -5881,6 +5955,49 @@ static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps, &def->cputune.shares) < 0) def->cputune.shares = 0; + if ((n = virXPathNodeSet("./cputune/bandwidth", ctxt, &nodes)) < 0) + goto error; + + if (n > def->maxvcpus) { + virDomainReportError(VIR_ERR_INTERNAL_ERROR, + "%s", _("bandwith nodes must be less than" + " maxvcpus")); + goto error; + } + + if (n && VIR_ALLOC_N(def->cputune.vcpubw, n) < 0) + goto no_memory; + + for (i = 0; i < n; i++) { + virDomainVcpuBWDefPtr vcpubw = NULL; + vcpubw = virDomainVcpuBWDefParseXML(nodes[i], ctxt, def->maxvcpus); + + if (!vcpubw) + goto error; + + if (virDomainVcpuBWIsDuplicate(def->cputune.vcpubw, + def->cputune.nvcpubw, + vcpubw->vcpuid)) { + virDomainReportError(VIR_ERR_INTERNAL_ERROR, + "%s", _("duplicate vcpubandwidth for same" + " vcpu")); + VIR_FREE(vcpubw); + goto error; + } + + if (vcpubw->period || vcpubw->quota) + def->cputune.vcpubw[def->cputune.nvcpubw++] = vcpubw; + else + VIR_FREE(vcpubw); + } + if (def->cputune.nvcpubw) + ignore_value(VIR_REALLOC_N(def->cputune.vcpubw, + def->cputune.nvcpubw)); + else + 
VIR_FREE(def->cputune.vcpubw); + + VIR_FREE(nodes); + if ((n = virXPathNodeSet("./cputune/vcpupin", ctxt, &nodes)) < 0) { goto error; } @@ -8274,6 +8391,144 @@ virDomainVcpuPinDel(virDomainDefPtr def, int vcpu) return 0; } +/* Check if vcpupin with same vcpuid already exists. + * Return 1 if exists, 0 if not. */ +int +virDomainVcpuBWIsDuplicate(virDomainVcpuBWDefPtr *def, + int nvcpubw, + int vcpu) +{ + int i; + + if (!def || !nvcpubw) + return 0; + + for (i = 0; i < nvcpubw; i++) { + if (def[i]->vcpuid == vcpu) + return 1; + } + + return 0; +} + +virDomainVcpuBWDefPtr +virDomainVcpuBWFindByVcpu(virDomainVcpuBWDefPtr *def, + int nvcpubw, + int vcpu) +{ + int i; + + if (!def || !nvcpubw) + return NULL; + + for (i = 0; i < nvcpubw; i++) { + if (def[i]->vcpuid == vcpu) + return def[i]; + } + + return NULL; +} + +int +virDomainVcpuBWAdd(virDomainDefPtr def, + unsigned long long period, + long long quota, + int vcpu) +{ + virDomainVcpuBWDefPtr *vcpubw_list = NULL; + virDomainVcpuBWDefPtr vcpubw = NULL; + + /* No vcpubw exists yet. */ + if (!def->cputune.nvcpubw) { + if (period == 0 && quota == 0) + return 0; + + if (VIR_ALLOC(vcpubw) < 0) + goto no_memory; + + if (VIR_ALLOC(vcpubw_list) < 0) + goto no_memory; + + vcpubw->vcpuid = vcpu; + vcpubw->period = period; + vcpubw->quota = quota; + vcpubw_list[def->cputune.nvcpubw++] = vcpubw; + + def->cputune.vcpubw = vcpubw_list; + } else { + int nvcpubw = def->cputune.nvcpubw; + vcpubw_list = def->cputune.vcpubw; + if (virDomainVcpuBWIsDuplicate(vcpubw_list, nvcpubw, vcpu)) { + vcpubw = virDomainVcpuBWFindByVcpu(vcpubw_list, nvcpubw, vcpu); + if (period == 0 && quota == 0) { + return virDomainVcpuBWDel(def, vcpu); + } else { + vcpubw->vcpuid = vcpu; + vcpubw->period = period; + vcpubw->quota = quota; + } + } else { + if (period == 0 && quota == 0) + return 0; + + if (VIR_ALLOC(vcpubw) < 0) + goto no_memory; + + if (VIR_REALLOC_N(vcpubw_list, nvcpubw + 1) < 0) + goto no_memory; + + vcpubw->vcpuid = vcpu; + vcpubw->period = period; + vcpubw->quota = quota; + vcpubw_list[def->cputune.nvcpubw++] = vcpubw; + } + } + + return 0; + +no_memory: + virReportOOMError(); + VIR_FREE(vcpubw); + return -1; +} + +int +virDomainVcpuBWDel(virDomainDefPtr def, int vcpu) +{ + int n; + bool deleted = false; + virDomainVcpuBWDefPtr *vcpubw_list = def->cputune.vcpubw; + + /* No vcpubw exists yet */ + if (!def->cputune.nvcpubw) + return 0; + + for (n = 0; n < def->cputune.nvcpubw; n++) { + if (vcpubw_list[n]->vcpuid == vcpu) { + VIR_FREE(vcpubw_list[n]); + memmove(&vcpubw_list[n], &vcpubw_list[n+1], + (def->cputune.nvcpubw - n - 1) * + sizeof(virDomainVcpuBWDefPtr)); + deleted = true; + break; + } + } + + if (!deleted) + return 0; + + if (--def->cputune.nvcpubw == 0) { + VIR_FREE(def->cputune.vcpubw); + } else { + if (VIR_REALLOC_N(def->cputune.vcpubw, + def->cputune.nvcpubw) < 0) { + /* ignore, harmless */ + } + } + + return 0; +} + static int virDomainLifecycleDefFormat(virBufferPtr buf, int type, @@ -9553,12 +9808,24 @@ char *virDomainDefFormat(virDomainDefPtr def, virBufferAsprintf(&buf, " current='%u'", def->vcpus); virBufferAsprintf(&buf, ">%u</vcpu>\n", def->maxvcpus); - if (def->cputune.shares || def->cputune.vcpupin) + if (def->cputune.shares || def->cputune.vcpupin || + def->cputune.vcpubw) virBufferAddLit(&buf, " <cputune>\n"); if (def->cputune.shares) virBufferAsprintf(&buf, " <shares>%lu</shares>\n", def->cputune.shares); + if (def->cputune.vcpubw) { + int i; + for (i = 0; i < def->cputune.nvcpubw; i++) { + virBufferAsprintf(&buf, " <bandwidth vcpu='%u' ", + 
def->cputune.vcpubw[i]->vcpuid); + virBufferAsprintf(&buf, "period='%llu' ", + def->cputune.vcpubw[i]->period); + virBufferAsprintf(&buf, "quota='%lld'/>\n", + def->cputune.vcpubw[i]->quota); + } + } if (def->cputune.vcpupin) { int i; for (i = 0; i < def->cputune.nvcpupin; i++) { @@ -9580,7 +9847,8 @@ char *virDomainDefFormat(virDomainDefPtr def, } } - if (def->cputune.shares || def->cputune.vcpupin) + if (def->cputune.shares || def->cputune.vcpupin || + def->cputune.vcpubw) virBufferAddLit(&buf, " </cputune>\n"); if (def->numatune.memory.nodemask) diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index e81977c..a2929b5 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -1108,6 +1108,14 @@ struct _virDomainVcpuPinDef { char *cpumask; }; +typedef struct _virDomainVcpuBWDef virDomainVcpuBWDef; +typedef virDomainVcpuBWDef *virDomainVcpuBWDefPtr; +struct _virDomainVcpuBWDef { + int vcpuid; + unsigned long long period; + long long quota; +}; + int virDomainVcpuPinIsDuplicate(virDomainVcpuPinDefPtr *def, int nvcpupin, int vcpu); @@ -1116,6 +1124,14 @@ virDomainVcpuPinDefPtr virDomainVcpuPinFindByVcpu(virDomainVcpuPinDefPtr *def, int nvcpupin, int vcpu); +int virDomainVcpuBWIsDuplicate(virDomainVcpuBWDefPtr *def, + int nvcpubw, + int vcpu); + +virDomainVcpuBWDefPtr virDomainVcpuBWFindByVcpu(virDomainVcpuBWDefPtr *def, + int nvcpubw, + int vcpu); + enum virDomainNumatuneMemMode { VIR_DOMAIN_NUMATUNE_MEM_STRICT, VIR_DOMAIN_NUMATUNE_MEM_PREFERRED, @@ -1170,6 +1186,8 @@ struct _virDomainDef { struct { unsigned long shares; + int nvcpubw; + virDomainVcpuBWDefPtr *vcpubw; int nvcpupin; virDomainVcpuPinDefPtr *vcpupin; } cputune; @@ -1413,6 +1431,13 @@ int virDomainVcpuPinAdd(virDomainDefPtr def, int virDomainVcpuPinDel(virDomainDefPtr def, int vcpu); +int virDomainVcpuBWAdd(virDomainDefPtr def, + unsigned long long period, + long long quota, + int vcpu); + +int virDomainVcpuBWDel(virDomainDefPtr def, int vcpu); + int virDomainDiskIndexByName(virDomainDefPtr def, const char *name); int virDomainDiskInsert(virDomainDefPtr def, virDomainDiskDefPtr disk); diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 9b9b6ce..aad0c3a 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -371,6 +371,10 @@ virDomainTimerTickpolicyTypeFromString; virDomainTimerTickpolicyTypeToString; virDomainTimerTrackTypeFromString; virDomainTimerTrackTypeToString; +virDomainVcpuBWAdd; +virDomainVcpuBWDel; +virDomainVcpuBWFindByVcpu; +virDomainVcpuBWIsDuplicate; virDomainVcpuPinAdd; virDomainVcpuPinDel; virDomainVcpuPinFindByVcpu; diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index 1298924..201c0b8 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -24,6 +24,7 @@ #include <config.h> #include "qemu_cgroup.h" +#include "qemu_domain.h" #include "cgroup.h" #include "logging.h" #include "memory.h" @@ -376,6 +377,136 @@ cleanup: return -1; } +int qemuSetupCgroupVcpuBW(virCgroupPtr cgroup, virDomainVcpuBWDefPtr vcpubw) +{ + int rc; + unsigned long long old_period; + + if (!vcpubw) + return 0; + + if (vcpubw->period == 0 && vcpubw->quota == 0) + return 0; + + if (vcpubw->period) { + /* get old period, and we can rollback if set quota failed */ + rc = virCgroupGetCpuCfsPeriod(cgroup, &old_period); + if (rc < 0) { + virReportSystemError(-rc, + _("%s"), "Unable to get cpu bandwidth period"); + return -1; + } + + rc = virCgroupSetCpuCfsPeriod(cgroup, vcpubw->period); + if (rc < 0) { + virReportSystemError(-rc, + _("%s"), "Unable to set cpu bandwidth 
period"); + return -1; + } + } + + if (vcpubw->quota) { + rc = virCgroupSetCpuCfsQuota(cgroup, vcpubw->quota); + if (rc < 0) { + virReportSystemError(-rc, + _("%s"), "Unable to set cpu bandwidth quota"); + goto cleanup; + } + } + + return 0; + +cleanup: + if (vcpubw->period) { + rc = virCgroupSetCpuCfsPeriod(cgroup, old_period); + if (rc < 0) + virReportSystemError(-rc, + _("%s"), + "Unable to rollback cpu bandwidth period"); + } + + return -1; +} + +int qemuSetupCgroupForVcpu(struct qemud_driver *driver, virDomainObjPtr vm) +{ + virCgroupPtr cgroup = NULL; + virCgroupPtr cgroup_vcpu = NULL; + qemuDomainObjPrivatePtr priv = vm->privateData; + int rc; + unsigned int i; + virDomainVcpuBWDefPtr *vcpubw_list = vm->def->cputune.vcpubw; + virDomainVcpuBWDefPtr vcpubw = NULL; + int nvcpubw = vm->def->cputune.nvcpubw; + + if (driver->cgroup == NULL) + return 0; /* Not supported, so claim success */ + + rc = virCgroupForDomain(driver->cgroup, vm->def->name, &cgroup, 0); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to find cgroup for %s"), + vm->def->name); + goto cleanup; + } + + if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { + /* If we does not know VCPU<->PID mapping or all vcpu runs in the same + * thread, we can not control each vcpu. So just use the last config. + */ + if (vcpubw_list) { + if (qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) { + if (qemuSetupCgroupVcpuBW(cgroup, vcpubw_list[nvcpubw - 1]) < 0) + goto cleanup; + } + } + return 0; + } + + for (i = 0; i < priv->nvcpupids; i++) { + rc = virCgroupForVcpu(cgroup, i, &cgroup_vcpu, 1); + if (rc < 0) { + virReportSystemError(-rc, + _("Unable to create vcpu cgroup for %s(vcpu:" + " %d)"), + vm->def->name, i); + goto cleanup; + } + + /* move the thread for vcpu to sub dir */ + rc = virCgroupAddTask(cgroup_vcpu, priv->vcpupids[i]); + if (rc < 0) { + virReportSystemError(-rc, + _("unable to add vcpu %d task %d to cgroup"), + i, priv->vcpupids[i]); + goto cleanup; + } + + if (vcpubw_list) { + if (qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) { + vcpubw = virDomainVcpuBWFindByVcpu(vcpubw_list, nvcpubw, i); + if (qemuSetupCgroupVcpuBW(cgroup, vcpubw) < 0) + goto cleanup; + } + } + + virCgroupFree(&cgroup_vcpu); + } + + virCgroupFree(&cgroup_vcpu); + virCgroupFree(&cgroup); + return 0; + +cleanup: + virCgroupFree(&cgroup_vcpu); + if (cgroup) { + virCgroupRemove(cgroup); + virCgroupFree(&cgroup); + } + + return -1; +} + int qemuRemoveCgroup(struct qemud_driver *driver, virDomainObjPtr vm, diff --git a/src/qemu/qemu_cgroup.h b/src/qemu/qemu_cgroup.h index e8abfb4..f0a5cee 100644 --- a/src/qemu/qemu_cgroup.h +++ b/src/qemu/qemu_cgroup.h @@ -49,6 +49,8 @@ int qemuSetupHostUsbDeviceCgroup(usbDevice *dev, void *opaque); int qemuSetupCgroup(struct qemud_driver *driver, virDomainObjPtr vm); +int qemuSetupCgroupVcpuBW(virCgroupPtr cgroup, virDomainVcpuBWDefPtr vcpubw); +int qemuSetupCgroupForVcpu(struct qemud_driver *driver, virDomainObjPtr vm); int qemuRemoveCgroup(struct qemud_driver *driver, virDomainObjPtr vm, int quiet); diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 88a31a3..ce3a4bb 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -2677,6 +2677,10 @@ int qemuProcessStart(virConnectPtr conn, if (qemuProcessDetectVcpuPIDs(driver, vm) < 0) goto cleanup; + VIR_DEBUG("Setting cgroup for each VCPU(if required)"); + if (qemuSetupCgroupForVcpu(driver, vm) < 0) + goto cleanup; + VIR_DEBUG("Setting VCPU affinities"); if (qemuProcessSetVcpuAffinites(conn, vm) < 
0) goto cleanup; diff --git a/tests/qemuxml2argvdata/qemuxml2argv-cputune.xml b/tests/qemuxml2argvdata/qemuxml2argv-cputune.xml index 0afbadb..0a67e40 100644 --- a/tests/qemuxml2argvdata/qemuxml2argv-cputune.xml +++ b/tests/qemuxml2argvdata/qemuxml2argv-cputune.xml @@ -6,6 +6,8 @@ <vcpu>2</vcpu> <cputune> <shares>2048</shares> + <bandwidth vcpu='0' period='1000000' quota='-1'/> + <bandwidth vcpu='1' period='1000' quota='1000'/> <vcpupin vcpu='0' cpuset='0'/> <vcpupin vcpu='1' cpuset='1'/> </cputune> -- 1.7.1

At 06/30/2011 11:09 AM, Wen Congyang wrote:
--- src/conf/domain_conf.c | 272 ++++++++++++++++++++++- src/conf/domain_conf.h | 25 ++ src/libvirt_private.syms | 4 + src/qemu/qemu_cgroup.c | 131 +++++++++++ src/qemu/qemu_cgroup.h | 2 + src/qemu/qemu_process.c | 4 + tests/qemuxml2argvdata/qemuxml2argv-cputune.xml | 2 + 7 files changed, 438 insertions(+), 2 deletions(-)
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 60e0318..0a1f973 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -997,6 +997,21 @@ virDomainVcpuPinDefFree(virDomainVcpuPinDefPtr *def, VIR_FREE(def); }
+static void +virDomainVcpuBWDefFree(virDomainVcpuBWDefPtr *def, + int nvcpubw) +{ + int i; + + if (!def || !nvcpubw) + return; + + for(i = 0; i < nvcpubw; i++) + VIR_FREE(def[i]); + + VIR_FREE(def); +} + void virDomainDefFree(virDomainDefPtr def) { unsigned int i; @@ -1089,6 +1104,9 @@ void virDomainDefFree(virDomainDefPtr def)
virCPUDefFree(def->cpu);
+ virDomainVcpuBWDefFree(def->cputune.vcpubw, + def->cputune.nvcpubw); + virDomainVcpuPinDefFree(def->cputune.vcpupin, def->cputune.nvcpupin);
VIR_FREE(def->numatune.memory.nodemask); @@ -5715,6 +5733,62 @@ error: goto cleanup; }
+/* Parse the XML definition for a vcpubandwidth */ +static virDomainVcpuBWDefPtr +virDomainVcpuBWDefParseXML(const xmlNodePtr node, + xmlXPathContextPtr ctxt, + int maxvcpus) +{ + virDomainVcpuBWDefPtr def; + xmlNodePtr oldnode = ctxt->node; + unsigned int vcpuid; + unsigned long long period; + long long quota; + int ret; + + if (VIR_ALLOC(def) < 0) { + virReportOOMError(); + return NULL; + } + + ctxt->node = node; + + ret = virXPathUInt("string(./@vcpu)", ctxt, &vcpuid); + if (ret == -2) { + virDomainReportError(VIR_ERR_INTERNAL_ERROR, + "%s", _("vcpu id must be an unsigned integer")); + goto error; + } else if (ret == -1) { + virDomainReportError(VIR_ERR_INTERNAL_ERROR, + "%s", _("can't parse vcpupin node")); + goto error; + } + + if (vcpuid >= maxvcpus) { + virDomainReportError(VIR_ERR_INTERNAL_ERROR, + "%s", _("vcpu id must be less than maxvcpus")); + goto error; + } + + if (virXPathULongLong("string(./@period)", ctxt, &period) < 0) + period = 0; + + if (virXPathLongLong("string(./@quota)", ctxt, "a) < 0) + quota = 0; + + def->vcpuid = vcpuid; + def->period = period; + def->quota = quota; + +cleanup: + ctxt->node = oldnode; + return def; + +error: + VIR_FREE(def); + goto cleanup; +} +
static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps, xmlDocPtr xml, @@ -5881,6 +5955,49 @@ static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps, &def->cputune.shares) < 0) def->cputune.shares = 0;
+ if ((n = virXPathNodeSet("./cputune/bandwidth", ctxt, &nodes)) < 0) + goto error; + + if (n > def->maxvcpus) { + virDomainReportError(VIR_ERR_INTERNAL_ERROR, + "%s", _("bandwith nodes must be less than" + " maxvcpus")); + goto error; + } + + if (n && VIR_ALLOC_N(def->cputune.vcpubw, n) < 0) + goto no_memory; + + for (i = 0; i < n; i++) { + virDomainVcpuBWDefPtr vcpubw = NULL; + vcpubw = virDomainVcpuBWDefParseXML(nodes[i], ctxt, def->maxvcpus); + + if (!vcpubw) + goto error; + + if (virDomainVcpuBWIsDuplicate(def->cputune.vcpubw, + def->cputune.nvcpubw, + vcpubw->vcpuid)) { + virDomainReportError(VIR_ERR_INTERNAL_ERROR, + "%s", _("duplicate vcpubandwidth for same" + " vcpu")); + VIR_FREE(vcpubw); + goto error; + } + + if (vcpubw->period || vcpubw->quota) + def->cputune.vcpubw[def->cputune.nvcpubw++] = vcpubw; + else + VIR_FREE(vcpubw); + } + if (def->cputune.nvcpubw) + ignore_value(VIR_REALLOC_N(def->cputune.vcpubw, + def->cputune.nvcpubw)); + else + VIR_FREE(def->cputune.vcpubw); + + VIR_FREE(nodes); + if ((n = virXPathNodeSet("./cputune/vcpupin", ctxt, &nodes)) < 0) { goto error; } @@ -8274,6 +8391,144 @@ virDomainVcpuPinDel(virDomainDefPtr def, int vcpu) return 0; }
+/* Check if vcpupin with same vcpuid already exists. + * Return 1 if exists, 0 if not. */ +int +virDomainVcpuBWIsDuplicate(virDomainVcpuBWDefPtr *def, + int nvcpubw, + int vcpu) +{ + int i; + + if (!def || !nvcpubw) + return 0; + + for (i = 0; i < nvcpubw; i++) { + if (def[i]->vcpuid == vcpu) + return 1; + } + + return 0; +} + +virDomainVcpuBWDefPtr +virDomainVcpuBWFindByVcpu(virDomainVcpuBWDefPtr *def, + int nvcpubw, + int vcpu) +{ + int i; + + if (!def || !nvcpubw) + return NULL; + + for (i = 0; i < nvcpubw; i++) { + if (def[i]->vcpuid == vcpu) + return def[i]; + } + + return NULL; +} + +int +virDomainVcpuBWAdd(virDomainDefPtr def, + unsigned long long period, + long long quota, + int vcpu) +{ + virDomainVcpuBWDefPtr *vcpubw_list = NULL; + virDomainVcpuBWDefPtr vcpubw = NULL; + + /* No vcpubw exists yet. */ + if (!def->cputune.nvcpubw) { + if (period == 0 && quota == 0) + return 0; + + if (VIR_ALLOC(vcpubw) < 0) + goto no_memory; + + if (VIR_ALLOC(vcpubw_list) < 0) + goto no_memory; + + vcpubw->vcpuid = vcpu; + vcpubw->period = period; + vcpubw->quota = quota; + vcpubw_list[def->cputune.nvcpubw++] = vcpubw; + + def->cputune.vcpubw = vcpubw_list; + } else { + int nvcpubw = def->cputune.nvcpubw; + vcpubw_list = def->cputune.vcpubw; + if (virDomainVcpuBWIsDuplicate(vcpubw_list, nvcpubw, vcpu)) { + vcpubw = virDomainVcpuBWFindByVcpu(vcpubw_list, nvcpubw, vcpu); + if (period == 0 && quota == 0) { + return virDomainVcpuBWDel(def, vcpu); + } else { + vcpubw->vcpuid = vcpu; + vcpubw->period = period; + vcpubw->quota = quota; + } + } else { + if (period == 0 && quota == 0) + return 0; + + if (VIR_ALLOC(vcpubw) < 0) + goto no_memory; + + if (VIR_REALLOC_N(vcpubw_list, nvcpubw + 1) < 0) + goto no_memory; + + vcpubw->vcpuid = vcpu; + vcpubw->period = period; + vcpubw->quota = quota; + vcpubw_list[def->cputune.nvcpubw++] = vcpubw; + } + } + + return 0; + +no_memory: + virReportOOMError(); + VIR_FREE(vcpubw); + return -1; +} + +int +virDomainVcpuBWDel(virDomainDefPtr def, int vcpu) +{ + int n; + bool deleted = false; + virDomainVcpuBWDefPtr *vcpubw_list = def->cputune.vcpubw; + + /* No vcpubw exists yet */ + if (!def->cputune.nvcpubw) + return 0; + + for (n = 0; n < def->cputune.nvcpubw; n++) { + if (vcpubw_list[n]->vcpuid == vcpu) { + VIR_FREE(vcpubw_list[n]); + memmove(&vcpubw_list[n], &vcpubw_list[n+1], + (def->cputune.nvcpubw - n - 1) * + sizeof(virDomainVcpuBWDefPtr)); + deleted = true; + break; + } + } + + if (!deleted) + return 0; + + if (--def->cputune.nvcpubw == 0) { + VIR_FREE(def->cputune.vcpubw); + } else { + if (VIR_REALLOC_N(def->cputune.vcpubw, + def->cputune.nvcpubw) < 0) { + /* ignore, harmless */ + } + } + + return 0; +} + static int virDomainLifecycleDefFormat(virBufferPtr buf, int type, @@ -9553,12 +9808,24 @@ char *virDomainDefFormat(virDomainDefPtr def, virBufferAsprintf(&buf, " current='%u'", def->vcpus); virBufferAsprintf(&buf, ">%u</vcpu>\n", def->maxvcpus);
- if (def->cputune.shares || def->cputune.vcpupin) + if (def->cputune.shares || def->cputune.vcpupin || + def->cputune.vcpubw) virBufferAddLit(&buf, " <cputune>\n");
if (def->cputune.shares) virBufferAsprintf(&buf, " <shares>%lu</shares>\n", def->cputune.shares); + if (def->cputune.vcpubw) { + int i; + for (i = 0; i < def->cputune.nvcpubw; i++) { + virBufferAsprintf(&buf, " <bandwidth vcpu='%u' ", + def->cputune.vcpubw[i]->vcpuid); + virBufferAsprintf(&buf, "period='%llu' ", + def->cputune.vcpubw[i]->period); + virBufferAsprintf(&buf, "quota='%lld'/>\n", + def->cputune.vcpubw[i]->quota); + } + } if (def->cputune.vcpupin) { int i; for (i = 0; i < def->cputune.nvcpupin; i++) { @@ -9580,7 +9847,8 @@ char *virDomainDefFormat(virDomainDefPtr def, } }
- if (def->cputune.shares || def->cputune.vcpupin) + if (def->cputune.shares || def->cputune.vcpupin || + def->cputune.vcpubw) virBufferAddLit(&buf, " </cputune>\n");
if (def->numatune.memory.nodemask) diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index e81977c..a2929b5 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -1108,6 +1108,14 @@ struct _virDomainVcpuPinDef { char *cpumask; };
+typedef struct _virDomainVcpuBWDef virDomainVcpuBWDef; +typedef virDomainVcpuBWDef *virDomainVcpuBWDefPtr; +struct _virDomainVcpuBWDef { + int vcpuid; + unsigned long long period; + long long quota; +}; + int virDomainVcpuPinIsDuplicate(virDomainVcpuPinDefPtr *def, int nvcpupin, int vcpu); @@ -1116,6 +1124,14 @@ virDomainVcpuPinDefPtr virDomainVcpuPinFindByVcpu(virDomainVcpuPinDefPtr *def, int nvcpupin, int vcpu);
+int virDomainVcpuBWIsDuplicate(virDomainVcpuBWDefPtr *def, + int nvcpubw, + int vcpu); + +virDomainVcpuBWDefPtr virDomainVcpuBWFindByVcpu(virDomainVcpuBWDefPtr *def, + int nvcpubw, + int vcpu); + enum virDomainNumatuneMemMode { VIR_DOMAIN_NUMATUNE_MEM_STRICT, VIR_DOMAIN_NUMATUNE_MEM_PREFERRED, @@ -1170,6 +1186,8 @@ struct _virDomainDef {
struct { unsigned long shares; + int nvcpubw; + virDomainVcpuBWDefPtr *vcpubw; int nvcpupin; virDomainVcpuPinDefPtr *vcpupin; } cputune; @@ -1413,6 +1431,13 @@ int virDomainVcpuPinAdd(virDomainDefPtr def,
int virDomainVcpuPinDel(virDomainDefPtr def, int vcpu);
+int virDomainVcpuBWAdd(virDomainDefPtr def, + unsigned long long period, + long long quota, + int vcpu); + +int virDomainVcpuBWDel(virDomainDefPtr def, int vcpu); + int virDomainDiskIndexByName(virDomainDefPtr def, const char *name); int virDomainDiskInsert(virDomainDefPtr def, virDomainDiskDefPtr disk); diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 9b9b6ce..aad0c3a 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -371,6 +371,10 @@ virDomainTimerTickpolicyTypeFromString; virDomainTimerTickpolicyTypeToString; virDomainTimerTrackTypeFromString; virDomainTimerTrackTypeToString; +virDomainVcpuBWAdd; +virDomainVcpuBWDel; +virDomainVcpuBWFindByVcpu; +virDomainVcpuBWIsDuplicate; virDomainVcpuPinAdd; virDomainVcpuPinDel; virDomainVcpuPinFindByVcpu; diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index 1298924..201c0b8 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -24,6 +24,7 @@ #include <config.h>
#include "qemu_cgroup.h" +#include "qemu_domain.h" #include "cgroup.h" #include "logging.h" #include "memory.h" @@ -376,6 +377,136 @@ cleanup: return -1; }
+int qemuSetupCgroupVcpuBW(virCgroupPtr cgroup, virDomainVcpuBWDefPtr vcpubw) +{ + int rc; + unsigned long long old_period; + + if (!vcpubw) + return 0; + + if (vcpubw->period == 0 && vcpubw->quota == 0) + return 0; + + if (vcpubw->period) { + /* get old period, and we can rollback if set quota failed */ + rc = virCgroupGetCpuCfsPeriod(cgroup, &old_period); + if (rc < 0) { + virReportSystemError(-rc, + _("%s"), "Unable to get cpu bandwidth period"); + return -1; + } + + rc = virCgroupSetCpuCfsPeriod(cgroup, vcpubw->period); + if (rc < 0) { + virReportSystemError(-rc, + _("%s"), "Unable to set cpu bandwidth period"); + return -1; + } + } + + if (vcpubw->quota) { + rc = virCgroupSetCpuCfsQuota(cgroup, vcpubw->quota); + if (rc < 0) { + virReportSystemError(-rc, + _("%s"), "Unable to set cpu bandwidth quota"); + goto cleanup; + } + } + + return 0; + +cleanup: + if (vcpubw->period) { + rc = virCgroupSetCpuCfsPeriod(cgroup, old_period); + if (rc < 0) + virReportSystemError(-rc, + _("%s"), + "Unable to rollback cpu bandwidth period"); + } + + return -1; +} + +int qemuSetupCgroupForVcpu(struct qemud_driver *driver, virDomainObjPtr vm) +{ + virCgroupPtr cgroup = NULL; + virCgroupPtr cgroup_vcpu = NULL; + qemuDomainObjPrivatePtr priv = vm->privateData; + int rc; + unsigned int i; + virDomainVcpuBWDefPtr *vcpubw_list = vm->def->cputune.vcpubw; + virDomainVcpuBWDefPtr vcpubw = NULL; + int nvcpubw = vm->def->cputune.nvcpubw; + + if (driver->cgroup == NULL) + return 0; /* Not supported, so claim success */ + + rc = virCgroupForDomain(driver->cgroup, vm->def->name, &cgroup, 0); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to find cgroup for %s"), + vm->def->name); + goto cleanup; + } + + if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { + /* If we does not know VCPU<->PID mapping or all vcpu runs in the same + * thread, we can not control each vcpu. So just use the last config. + */ + if (vcpubw_list) { + if (qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) { + if (qemuSetupCgroupVcpuBW(cgroup, vcpubw_list[nvcpubw - 1]) < 0) + goto cleanup; + } + } + return 0; + } + + for (i = 0; i < priv->nvcpupids; i++) { + rc = virCgroupForVcpu(cgroup, i, &cgroup_vcpu, 1); + if (rc < 0) { + virReportSystemError(-rc, + _("Unable to create vcpu cgroup for %s(vcpu:" + " %d)"), + vm->def->name, i); + goto cleanup; + } + + /* move the thread for vcpu to sub dir */ + rc = virCgroupAddTask(cgroup_vcpu, priv->vcpupids[i]); + if (rc < 0) { + virReportSystemError(-rc, + _("unable to add vcpu %d task %d to cgroup"), + i, priv->vcpupids[i]); + goto cleanup; + } + + if (vcpubw_list) { + if (qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) { + vcpubw = virDomainVcpuBWFindByVcpu(vcpubw_list, nvcpubw, i); + if (qemuSetupCgroupVcpuBW(cgroup, vcpubw) < 0)
s/cgroup/cgroup_vcpu/
+ goto cleanup; + } + } + + virCgroupFree(&cgroup_vcpu); + } + + virCgroupFree(&cgroup_vcpu); + virCgroupFree(&cgroup); + return 0; + +cleanup: + virCgroupFree(&cgroup_vcpu); + if (cgroup) { + virCgroupRemove(cgroup); + virCgroupFree(&cgroup); + } + + return -1; +} +
int qemuRemoveCgroup(struct qemud_driver *driver, virDomainObjPtr vm, diff --git a/src/qemu/qemu_cgroup.h b/src/qemu/qemu_cgroup.h index e8abfb4..f0a5cee 100644 --- a/src/qemu/qemu_cgroup.h +++ b/src/qemu/qemu_cgroup.h @@ -49,6 +49,8 @@ int qemuSetupHostUsbDeviceCgroup(usbDevice *dev, void *opaque); int qemuSetupCgroup(struct qemud_driver *driver, virDomainObjPtr vm); +int qemuSetupCgroupVcpuBW(virCgroupPtr cgroup, virDomainVcpuBWDefPtr vcpubw); +int qemuSetupCgroupForVcpu(struct qemud_driver *driver, virDomainObjPtr vm); int qemuRemoveCgroup(struct qemud_driver *driver, virDomainObjPtr vm, int quiet); diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 88a31a3..ce3a4bb 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -2677,6 +2677,10 @@ int qemuProcessStart(virConnectPtr conn, if (qemuProcessDetectVcpuPIDs(driver, vm) < 0) goto cleanup;
+ VIR_DEBUG("Setting cgroup for each VCPU(if required)"); + if (qemuSetupCgroupForVcpu(driver, vm) < 0) + goto cleanup; + VIR_DEBUG("Setting VCPU affinities"); if (qemuProcessSetVcpuAffinites(conn, vm) < 0) goto cleanup; diff --git a/tests/qemuxml2argvdata/qemuxml2argv-cputune.xml b/tests/qemuxml2argvdata/qemuxml2argv-cputune.xml index 0afbadb..0a67e40 100644 --- a/tests/qemuxml2argvdata/qemuxml2argv-cputune.xml +++ b/tests/qemuxml2argvdata/qemuxml2argv-cputune.xml @@ -6,6 +6,8 @@ <vcpu>2</vcpu> <cputune> <shares>2048</shares> + <bandwidth vcpu='0' period='1000000' quota='-1'/> + <bandwidth vcpu='1' period='1000' quota='1000'/> <vcpupin vcpu='0' cpuset='0'/> <vcpupin vcpu='1' cpuset='1'/> </cputune>

We need this feature in the following patch. --- include/libvirt/libvirt.h.in | 3 ++- src/libvirt.c | 12 ++++++------ src/qemu/qemu_driver.c | 18 ++++++++++-------- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/include/libvirt/libvirt.h.in b/include/libvirt/libvirt.h.in index 8e20f75..14d6a88 100644 --- a/include/libvirt/libvirt.h.in +++ b/include/libvirt/libvirt.h.in @@ -1234,8 +1234,9 @@ typedef virVcpuInfo *virVcpuInfoPtr; /* Flags for controlling virtual CPU hot-plugging. */ typedef enum { - /* Must choose at least one of these two bits; SetVcpus can choose both; + /* Must choose at least one of these three bits; SetVcpus can choose both; see virDomainModificationImpact for details. */ + VIR_DOMAIN_VCPU_CURRENT = VIR_DOMAIN_AFFECT_CURRENT, VIR_DOMAIN_VCPU_LIVE = VIR_DOMAIN_AFFECT_LIVE, VIR_DOMAIN_VCPU_CONFIG = VIR_DOMAIN_AFFECT_CONFIG, diff --git a/src/libvirt.c b/src/libvirt.c index e00c64f..9857e3b 100644 --- a/src/libvirt.c +++ b/src/libvirt.c @@ -6923,13 +6923,18 @@ error: * @flags must include either VIR_DOMAIN_AFFECT_LIVE to query a * running domain (which will fail if domain is not active), or * VIR_DOMAIN_AFFECT_CONFIG to query the XML description of the - * domain. It is an error to set both flags. + * domain, or VIR_DOMAIN_AFFECT_CURRENT to query a running domain + * (if domain is active) or query the XML description of the domain + * (if domain is not active). + * It is an error to set two of these three flags. * * If @flags includes VIR_DOMAIN_VCPU_MAXIMUM, then the maximum * virtual CPU limit is queried. Otherwise, this call queries the * current virtual CPU limit. * * Returns 0 in case of success, -1 in case of failure. + * + * NB. Not all hypervisor support VIR_DOMAIN_AFFECT_CURRENT. */ int @@ -6947,11 +6952,6 @@ virDomainGetVcpusFlags(virDomainPtr domain, unsigned int flags) return -1; } - /* Exactly one of these two flags should be set. */ - if (!(flags & VIR_DOMAIN_AFFECT_LIVE) == !(flags & VIR_DOMAIN_AFFECT_CONFIG)) { - virLibDomainError(VIR_ERR_INVALID_ARG, __FUNCTION__); - goto error; - } conn = domain->conn; if (conn->driver->domainGetVcpusFlags) { diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 8b65c26..9ddbc0f 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -3374,18 +3374,12 @@ qemudDomainGetVcpusFlags(virDomainPtr dom, unsigned int flags) virDomainObjPtr vm; virDomainDefPtr def; int ret = -1; + bool isActive; virCheckFlags(VIR_DOMAIN_AFFECT_LIVE | VIR_DOMAIN_AFFECT_CONFIG | VIR_DOMAIN_VCPU_MAXIMUM, -1); - /* Exactly one of LIVE or CONFIG must be set. */ - if (!(flags & VIR_DOMAIN_AFFECT_LIVE) == !(flags & VIR_DOMAIN_AFFECT_CONFIG)) { - qemuReportError(VIR_ERR_INVALID_ARG, - _("invalid flag combination: (0x%x)"), flags); - return -1; - } - qemuDriverLock(driver); vm = virDomainFindByUUID(&driver->domains, dom->uuid); qemuDriverUnlock(driver); @@ -3398,8 +3392,16 @@ qemudDomainGetVcpusFlags(virDomainPtr dom, unsigned int flags) goto cleanup; } + isActive = virDomainObjIsActive(vm); + if ((flags & ~VIR_DOMAIN_VCPU_MAXIMUM) == VIR_DOMAIN_AFFECT_CURRENT) { + if (isActive) + flags |= VIR_DOMAIN_AFFECT_LIVE; + else + flags |= VIR_DOMAIN_AFFECT_CONFIG; + } + if (flags & VIR_DOMAIN_AFFECT_LIVE) { - if (!virDomainObjIsActive(vm)) { + if (!isActive) { qemuReportError(VIR_ERR_OPERATION_INVALID, "%s", _("domain not active")); goto cleanup; -- 1.7.1
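A small illustration (a sketch, not part of the patch) of what the relaxed flag handling enables for callers: with VIR_DOMAIN_VCPU_CURRENT the caller no longer has to pick between the live and persistent definition itself.

/* Query the vcpu count of whichever definition is "current":
 * the live one for a running domain, the config otherwise. */
int ncpus = virDomainGetVcpusFlags(dom, VIR_DOMAIN_VCPU_CURRENT);
if (ncpus < 0) {
    /* error, e.g. the driver does not support VIR_DOMAIN_AFFECT_CURRENT */
}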

We want to control bandwidth for each vcpu, so we can not use the API virDomainSetSchedulerParameters(). Introduce two new APIs to change and query bandwidth for each vcpu. --- include/libvirt/libvirt.h.in | 38 ++++++++++++++ python/generator.py | 2 + src/conf/domain_conf.h | 8 --- src/driver.h | 14 +++++ src/libvirt.c | 117 ++++++++++++++++++++++++++++++++++++++++++ src/libvirt_public.syms | 6 ++ 6 files changed, 177 insertions(+), 8 deletions(-) diff --git a/include/libvirt/libvirt.h.in b/include/libvirt/libvirt.h.in index 14d6a88..03c04b3 100644 --- a/include/libvirt/libvirt.h.in +++ b/include/libvirt/libvirt.h.in @@ -547,6 +547,44 @@ int virDomainSetSchedulerParametersFlags (virDomainPtr domain, unsigned int flags); /** + * virDomainVcpuBWDef: + * + * Vcpu's bandwidth. + */ +typedef struct _virDomainVcpuBWDef virDomainVcpuBWDef; +struct _virDomainVcpuBWDef { + int vcpuid; /* vcpu id */ + unsigned long long period; /* bandwidth period in usecs */ + long long quota; /* cpu bandwidth (in usecs) that this vcpu will be allowed + * to consume over period + */ +}; + +/** + * virDomainVcpuBWDefPtr: + * + * a pointer to a virDomainVcpuBWDef structure. + */ +typedef virDomainVcpuBWDef *virDomainVcpuBWDefPtr; + +/* Management of vcpu bandwidth */ + +/* + * Fetch vcpu bandwidth, caller allocates 'vcpubw' field of size 'nvcpu' + */ +int virDomainGetVcpuBW (virDomainPtr domain, + virDomainVcpuBWDefPtr vcpubw, + int *nvcpu, + unsigned int flags); +/* + * Change vcpu bandwidth. + */ +int virDomainSetVcpuBW (virDomainPtr domain, + virDomainVcpuBWDefPtr vcpubw, + int nvcpu, + unsigned int flags); + +/** * virDomainBlockStats: * * Block device stats for virDomainBlockStats. diff --git a/python/generator.py b/python/generator.py index c27ff73..3b08626 100755 --- a/python/generator.py +++ b/python/generator.py @@ -418,6 +418,8 @@ skip_function = ( "virNWFilterGetConnect", "virStoragePoolGetConnect", "virStorageVolGetConnect", + "virDomainGetVcpuBW", + "virDomainSetVcpuBW", ) # Generate C code, but skip python impl diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index a2929b5..26f2c76 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -1108,14 +1108,6 @@ struct _virDomainVcpuPinDef { char *cpumask; }; -typedef struct _virDomainVcpuBWDef virDomainVcpuBWDef; -typedef virDomainVcpuBWDef *virDomainVcpuBWDefPtr; -struct _virDomainVcpuBWDef { - int vcpuid; - unsigned long long period; - long long quota; -}; - int virDomainVcpuPinIsDuplicate(virDomainVcpuPinDefPtr *def, int nvcpupin, int vcpu); diff --git a/src/driver.h b/src/driver.h index 871a4ae..aa3bde2 100644 --- a/src/driver.h +++ b/src/driver.h @@ -317,6 +317,18 @@ typedef int unsigned int flags); typedef int + (*virDrvDomainGetVcpuBW) (virDomainPtr domain, + virDomainVcpuBWDefPtr vcpubw, + int *nvcpu, + unsigned int flags); + +typedef int + (*virDrvDomainSetVcpuBW) (virDomainPtr domain, + virDomainVcpuBWDefPtr vcpubw, + int nvcpu, + unsigned int flags); + +typedef int (*virDrvDomainBlockStats) (virDomainPtr domain, const char *path, @@ -739,6 +751,8 @@ struct _virDriver { virDrvDomainGetSchedulerParametersFlags domainGetSchedulerParametersFlags; virDrvDomainSetSchedulerParameters domainSetSchedulerParameters; virDrvDomainSetSchedulerParametersFlags domainSetSchedulerParametersFlags; + virDrvDomainGetVcpuBW domainGetVcpuBW; + virDrvDomainSetVcpuBW domainSetVcpuBW; virDrvDomainMigratePrepare domainMigratePrepare; virDrvDomainMigratePerform domainMigratePerform; virDrvDomainMigrateFinish domainMigrateFinish; diff --git 
a/src/libvirt.c b/src/libvirt.c index 9857e3b..baaa4ad 100644 --- a/src/libvirt.c +++ b/src/libvirt.c @@ -5852,6 +5852,123 @@ error: return -1; } +/** + * virDomainGetVcpuBW: + * @domain: pointer to domain object + * @vcpubw: pointer to vcpu bandwidth objects + * @nvcpu: pointer to the number of vcpu bandwidth objects + * @flags: bitwise-OR of virDomainModificationImpact + * + * Get the vcpu bandwidth, the @vcpubw array will be filled with the + * values. + * + * The value of @flags can be exactly VIR_DOMAIN_AFFECT_CURRENT, + * VIR_DOMAIN_AFFECT_LIVE, or VIR_DOMAIN_AFFECT_CONFIG. + * + * Returns -1 in case of error, 0 in case of success. + */ +int +virDomainGetVcpuBW(virDomainPtr domain, virDomainVcpuBWDefPtr vcpubw, + int *nvcpu, unsigned int flags) +{ + virConnectPtr conn; + + VIR_DOMAIN_DEBUG(domain, "vcpubw=%p, nvcpu=%d, flags=%u", + vcpubw, *nvcpu, flags); + + virResetLastError(); + + if (!VIR_IS_CONNECTED_DOMAIN(domain)) { + virLibDomainError(VIR_ERR_INVALID_DOMAIN, __FUNCTION__); + virDispatchError(NULL); + return -1; + } + + if (vcpubw == NULL || *nvcpu < 0) { + virLibDomainError(VIR_ERR_INVALID_ARG, __FUNCTION__); + goto error; + } + + if (domain->conn->flags & VIR_CONNECT_RO) { + virLibDomainError(VIR_ERR_OPERATION_DENIED, __FUNCTION__); + goto error; + } + conn = domain->conn; + + if (conn->driver->domainGetVcpuBW) { + int ret; + ret = conn->driver->domainGetVcpuBW(domain, vcpubw, nvcpu, flags); + if (ret < 0) + goto error; + return ret; + } + + virLibConnError(VIR_ERR_NO_SUPPORT, __FUNCTION__); + +error: + virDispatchError(domain->conn); + return -1; +} + + +/** + * virDomainSetVcpuBW: + * @domain: pointer to domain object + * @params: pointer to vcpu bandwidth objects + * @nparams: number of vcpu bandwidth objects + * (this value can be the same or less than domain's vcpu num) + * @flags: bitwise-OR of virDomainModificationImpact + * + * Change a subset or all vcpu bandwidth. The value of @flags + * should be either VIR_DOMAIN_AFFECT_CURRENT, or a bitwise-or of + * values from VIR_DOMAIN_AFFECT_LIVE and VIR_DOMAIN_AFFECT_CONFIG, + * although hypervisors vary in which flags are supported. + * + * Returns -1 in case of error, 0 in case of success. + */ +int +virDomainSetVcpuBW(virDomainPtr domain, virDomainVcpuBWDefPtr vcpubw, + int nvcpu, unsigned int flags) +{ + virConnectPtr conn; + + VIR_DOMAIN_DEBUG(domain, "vcpubw=%p, nvcpu=%d, flags=%u", + vcpubw, nvcpu, flags); + + virResetLastError(); + + if (!VIR_IS_CONNECTED_DOMAIN(domain)) { + virLibDomainError(VIR_ERR_INVALID_DOMAIN, __FUNCTION__); + virDispatchError(NULL); + return -1; + } + + if (vcpubw == NULL || nvcpu < 0) { + virLibDomainError(VIR_ERR_INVALID_ARG, __FUNCTION__); + goto error; + } + + if (domain->conn->flags & VIR_CONNECT_RO) { + virLibDomainError(VIR_ERR_OPERATION_DENIED, __FUNCTION__); + goto error; + } + conn = domain->conn; + + if (conn->driver->domainSetVcpuBW) { + int ret; + ret = conn->driver->domainSetVcpuBW(domain, vcpubw, nvcpu, flags); + if (ret < 0) + goto error; + return ret; + } + + virLibConnError(VIR_ERR_NO_SUPPORT, __FUNCTION__); + +error: + virDispatchError(domain->conn); + return -1; +} + /** * virDomainBlockStats: diff --git a/src/libvirt_public.syms b/src/libvirt_public.syms index 5f2541a..d4f80a6 100644 --- a/src/libvirt_public.syms +++ b/src/libvirt_public.syms @@ -466,4 +466,10 @@ LIBVIRT_0.9.3 { virNodeGetMemoryStats; } LIBVIRT_0.9.2; +LIBVIRT_0.9.4 { + global: + virDomainGetVcpuBW; + virDomainSetVcpuBW; +} LIBVIRT_0.9.3; + # .... 
define new API here using predicted next version number .... -- 1.7.1
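A sketch of how a management application might drive the proposed APIs (includes such as stdlib.h and libvirt/libvirt.h omitted); the caller allocates the vcpubw array as the header comment requires, and the 100000/25000 values (a quarter of a CPU per vcpu) are only an example. Whether virDomainGetVcpuBW fills in the vcpuid fields is an assumption here, so the sketch sets them explicitly before calling the setter:

/* Sketch: cap every vcpu of a running domain at a quarter of one CPU. */
static int capVcpusToQuarterCpu(virDomainPtr dom)
{
    int i, ret = -1;
    int nvcpu = virDomainGetVcpusFlags(dom, VIR_DOMAIN_VCPU_CURRENT);
    virDomainVcpuBWDefPtr bw;

    if (nvcpu <= 0)
        return -1;
    if (!(bw = calloc(nvcpu, sizeof(*bw))))
        return -1;

    /* fetch the current settings first */
    if (virDomainGetVcpuBW(dom, bw, &nvcpu, VIR_DOMAIN_AFFECT_CURRENT) < 0)
        goto cleanup;

    for (i = 0; i < nvcpu; i++) {
        bw[i].vcpuid = i;
        bw[i].period = 100000;   /* 100ms */
        bw[i].quota  = 25000;    /* 25ms per period, i.e. 1/4 of a CPU */
    }

    if (virDomainSetVcpuBW(dom, bw, nvcpu, VIR_DOMAIN_AFFECT_LIVE) < 0)
        goto cleanup;

    ret = 0;

cleanup:
    free(bw);
    return ret;
}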

At 06/30/2011 11:13 AM, Wen Congyang wrote:
We want to control bandwidth for each vcpu, so we can not use the API virDomainSetSchedulerParameters(). Introduce two new APIs to change and query bandwidth for each vcpu.
+ +/** + * virDomainSetVcpuBW: + * @domain: pointer to domain object + * @params: pointer to vcpu bandwidth objects
another typo: s/params/vcpubw/
+ * @nparams: number of vcpu bandwidth objects
s/nparams/nvcpubw/
+ * (this value can be the same or less than domain's vcpu num) + * @flags: bitwise-OR of virDomainModificationImpact + * + * Change a subset or all vcpu bandwidth. The value of @flags + * should be either VIR_DOMAIN_AFFECT_CURRENT, or a bitwise-or of + * values from VIR_DOMAIN_AFFECT_LIVE and VIR_DOMAIN_AFFECT_CONFIG, + * although hypervisors vary in which flags are supported. + * + * Returns -1 in case of error, 0 in case of success. + */ +int +virDomainSetVcpuBW(virDomainPtr domain, virDomainVcpuBWDefPtr vcpubw, + int nvcpu, unsigned int flags) +{ + virConnectPtr conn; + + VIR_DOMAIN_DEBUG(domain, "vcpubw=%p, nvcpu=%d, flags=%u", + vcpubw, nvcpu, flags); + + virResetLastError(); + + if (!VIR_IS_CONNECTED_DOMAIN(domain)) { + virLibDomainError(VIR_ERR_INVALID_DOMAIN, __FUNCTION__); + virDispatchError(NULL); + return -1; + } + + if (vcpubw == NULL || nvcpu < 0) { + virLibDomainError(VIR_ERR_INVALID_ARG, __FUNCTION__); + goto error; + } + + if (domain->conn->flags & VIR_CONNECT_RO) { + virLibDomainError(VIR_ERR_OPERATION_DENIED, __FUNCTION__); + goto error; + } + conn = domain->conn; + + if (conn->driver->domainSetVcpuBW) { + int ret; + ret = conn->driver->domainSetVcpuBW(domain, vcpubw, nvcpu, flags); + if (ret < 0) + goto error; + return ret; + } + + virLibConnError(VIR_ERR_NO_SUPPORT, __FUNCTION__); + +error: + virDispatchError(domain->conn); + return -1; +} +
/** * virDomainBlockStats: diff --git a/src/libvirt_public.syms b/src/libvirt_public.syms index 5f2541a..d4f80a6 100644 --- a/src/libvirt_public.syms +++ b/src/libvirt_public.syms @@ -466,4 +466,10 @@ LIBVIRT_0.9.3 { virNodeGetMemoryStats; } LIBVIRT_0.9.2;
+LIBVIRT_0.9.4 { + global: + virDomainGetVcpuBW; + virDomainSetVcpuBW; +} LIBVIRT_0.9.3; + # .... define new API here using predicted next version number ....

On Thu, 30 Jun 2011 11:13:18 +0800, Wen Congyang <wency@cn.fujitsu.com> wrote:
We want to control bandwidth for each vcpu, so we can not use the API virDomainSetSchedulerParameters(). Introduce two new APIs to change and query bandwidth for each vcpu.
Will we have different cpu bandwidth for different vcpus? Something like this:

  vcpu1: 1000000/250000  vcpu2: 1000000/500000  vcpu3: 1000000/300000  vcpu4: 1000000/400000

IMO, that is not required; we can have a top-level bandwidth for the VM and then redistribute it among the vcpus equally, without the user knowing about it. Something like this:

VM1 (4 vcpus) has to be throttled at 1 CPU's bandwidth using SetSchedulerParameters. Internally libvirt splits it equally:

  vcpu1: 1000000/250000  vcpu2: 1000000/250000  vcpu3: 1000000/250000  vcpu4: 1000000/250000

So why introduce vcpu-level APIs?

Regards, Nikunj

At 07/04/2011 07:19 PM, Nikunj A. Dadhania wrote:
On Thu, 30 Jun 2011 11:13:18 +0800, Wen Congyang <wency@cn.fujitsu.com> wrote:
We want to control bandwidth for each vcpu, so we can not use the API virDomainSetSchedulerParameters(). Introduce two new APIs to change and query bandwidth for each vcpu.
Will we have different cpu bandwidth for different vcpus?
Something like this:
vcpu1: 1000000/250000 vcpu2: 1000000/500000 vcpu3: 1000000/300000 vcpu4: 1000000/400000
IMO, that is not required; we can have a top-level bandwidth for the VM and then redistribute it among the vcpus equally, without the user knowing about it.
Something like this:
VM1 (4 vcpus) has to be throttled at 1 CPU's bandwidth using SetSchedulerParameters.
Internally libvirt splits it equally: vcpu1: 1000000/250000 vcpu2: 1000000/250000 vcpu3: 1000000/250000 vcpu4: 1000000/250000
So why introduce vcpu-level APIs?
Adam Litke said IBM's performance team needs to control cpu bandwidth for each vcpu.
Regards, Nikunj

On Tue, 05 Jul 2011 15:06:06 +0800, Wen Congyang <wency@cn.fujitsu.com> wrote:
At 07/04/2011 07:19 PM, Nikunj A. Dadhania wrote:
On Thu, 30 Jun 2011 11:13:18 +0800, Wen Congyang <wency@cn.fujitsu.com> wrote:
We want to control bandwidth for each vcpu, so we can not use the API virDomainSetSchedulerParameters(). Introduce two new APIs to change and query bandwidth for each vcpu.
Will we have different cpu bandwidth for different vcpus?
Something like this:
vcpu1: 1000000/250000 vcpu2: 1000000/500000 vcpu3: 1000000/300000 vcpu4: 1000000/400000
IMO, that is not required; we can have a top-level bandwidth for the VM and then redistribute it among the vcpus equally, without the user knowing about it.
Something like this:
VM1 (4 vcpus) has to be throttled at 1 CPU's bandwidth using SetSchedulerParameters.
Internally libvirt splits it equally: vcpu1: 1000000/250000 vcpu2: 1000000/250000 vcpu3: 1000000/250000 vcpu4: 1000000/250000
So why introduce VCPU level apis?
Adam Litke said IBM's performance team nead to control cpu bandwidth for each vcpu. Right, but we do not export that as a User API, that was my suggestion. We can internally control each vcpu's bandwidth, i.e. divide equally.
Regards Nikunj

On 2011-7-5 16:41, Nikunj A. Dadhania wrote:
On Tue, 05 Jul 2011 15:06:06 +0800, Wen Congyang <wency@cn.fujitsu.com> wrote:
At 07/04/2011 07:19 PM, Nikunj A. Dadhania Write:
On Thu, 30 Jun 2011 11:13:18 +0800, Wen Congyang <wency@cn.fujitsu.com> wrote:
We want to control bandwidth for each vcpu, so we can not use the API virDomainSetSchedulerParameters(). Introduce two new APIs to change and query bandwidth for each vcpu.
Will we have different cpu bandwidth for different vcpus?
Something like this:
vcpu1: 1000000/250000 vcpu2: 1000000/500000 vcpu3: 1000000/300000 vcpu4: 1000000/400000
IMO, that is not required, we can have a top level bandwitdh for the VM and then redistribute it among vcpus equally, without user knowing about it.
Something like this:
VM1(4vcpu) has to be throttled at 1CPU bandwidth using SetSchedParamters.
Internally libvirt splits it equally: vcpu1: 1000000/250000 vcpu2: 1000000/250000 vcpu3: 1000000/250000 vcpu4: 1000000/250000
So why introduce VCPU level apis?
Adam Litke said IBM's performance team nead to control cpu bandwidth for each vcpu. Right, but we do not export that as a User API, that was my suggestion. We can internally control each vcpu's bandwidth, i.e. divide equally.
Hmm, I have heard that some servers can run CPUs at different speeds. Maybe this patch can simulate that behavior. Thanks, Gui
Regards Nikunj
-- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list

On Wed, 06 Jul 2011 14:56:27 +0800, Gui Jianfeng <guijianfeng@cn.fujitsu.com> wrote:
On 2011-7-5 16:41, Nikunj A. Dadhania wrote:
On Tue, 05 Jul 2011 15:06:06 +0800, Wen Congyang <wency@cn.fujitsu.com> wrote:
At 07/04/2011 07:19 PM, Nikunj A. Dadhania Write:
On Thu, 30 Jun 2011 11:13:18 +0800, Wen Congyang <wency@cn.fujitsu.com> wrote:
We want to control bandwidth for each vcpu, so we can not use the API virDomainSetSchedulerParameters(). Introduce two new APIs to change and query bandwidth for each vcpu.
Will we have different cpu bandwidth for different vcpus?
Something like this:
vcpu1: 1000000/250000 vcpu2: 1000000/500000 vcpu3: 1000000/300000 vcpu4: 1000000/400000
IMO, that is not required, we can have a top level bandwitdh for the VM and then redistribute it among vcpus equally, without user knowing about it.
Something like this:
VM1(4vcpu) has to be throttled at 1CPU bandwidth using SetSchedParamters.
Internally libvirt splits it equally: vcpu1: 1000000/250000 vcpu2: 1000000/250000 vcpu3: 1000000/250000 vcpu4: 1000000/250000
So why introduce VCPU level apis?
Adam Litke said IBM's performance team nead to control cpu bandwidth for each vcpu. Right, but we do not export that as a User API, that was my suggestion. We can internally control each vcpu's bandwidth, i.e. divide equally.
Hmm, I heard that some server could run CPUs at different speed. May be this patch can simulate this behavior. That happens on my laptop as well: depending on the machine load, the CPU frequency is changed, but it is done transparently.
I am not sure if we are trying to simulate that here. Regards Nikunj

On 2011-7-6 16:56, Nikunj A. Dadhania wrote:
On Wed, 06 Jul 2011 14:56:27 +0800, Gui Jianfeng <guijianfeng@cn.fujitsu.com> wrote:
On 2011-7-5 16:41, Nikunj A. Dadhania wrote:
On Tue, 05 Jul 2011 15:06:06 +0800, Wen Congyang <wency@cn.fujitsu.com> wrote:
At 07/04/2011 07:19 PM, Nikunj A. Dadhania Write:
On Thu, 30 Jun 2011 11:13:18 +0800, Wen Congyang <wency@cn.fujitsu.com> wrote:
We want to control bandwidth for each vcpu, so we can not use the API virDomainSetSchedulerParameters(). Introduce two new APIs to change and query bandwidth for each vcpu.
Will we have different cpu bandwidth for different vcpus?
Something like this:
vcpu1: 1000000/250000 vcpu2: 1000000/500000 vcpu3: 1000000/300000 vcpu4: 1000000/400000
IMO, that is not required, we can have a top level bandwitdh for the VM and then redistribute it among vcpus equally, without user knowing about it.
Something like this:
VM1(4vcpu) has to be throttled at 1CPU bandwidth using SetSchedParamters.
Internally libvirt splits it equally: vcpu1: 1000000/250000 vcpu2: 1000000/250000 vcpu3: 1000000/250000 vcpu4: 1000000/250000
So why introduce VCPU level apis?
Adam Litke said IBM's performance team nead to control cpu bandwidth for each vcpu. Right, but we do not export that as a User API, that was my suggestion. We can internally control each vcpu's bandwidth, i.e. divide equally.
Hmm, I heard that some server could run CPUs at different speed. May be this patch can simulate this behavior. That happens on my laptop as well, depending on the machine load CPU frequency is changed but it is done transparently.
I mean explicitly configuring CPU speed. ;)
I am not sure if we are trying to simulate that here.
So why not leave the flexible interface here, and let users make the decision? Thanks, Gui
Regards Nikunj

So why introduce VCPU level apis?
Adam Litke said IBM's performance team nead to control cpu bandwidth for each vcpu. Right, but we do not export that as a User API, that was my suggestion. We can internally control each vcpu's bandwidth, i.e. divide equally.
Hmm, I heard that some server could run CPUs at different speed. May be this patch can simulate this behavior. That happens on my laptop as well, depending on the machine load CPU frequency is changed but it is done transparently.
I means explicitly CPU speed configuring. ;)
I am not sure if we are trying to simulate that here.
So why not leave the flexible interface here, and let users make the decision?
In my mind, flexibility is not always a good thing; it is nothing but trouble for people who do not want detailed settings. I don't know how many people want this flexibility. I'm for Nikunj's design. Do you have any concerns? -- Best regards, Taku Izumi <izumi.taku@jp.fujitsu.com>

On 2011-7-7 10:32, Taku Izumi wrote:
> So why introduce VCPU level apis?
Adam Litke said IBM's performance team nead to control cpu bandwidth for each vcpu. Right, but we do not export that as a User API, that was my suggestion. We can internally control each vcpu's bandwidth, i.e. divide equally.
Hmm, I heard that some server could run CPUs at different speed. May be this patch can simulate this behavior. That happens on my laptop as well, depending on the machine load CPU frequency is changed but it is done transparently.
I means explicitly CPU speed configuring. ;)
I am not sure if we are trying to simulate that here.
So why not leave the flexible interface here, and let users make the decision?
In my mind, the flexibility is not always a good thing. It is nothing but troublesome for the person who doesn't like detailed setting. I don't know how many people want this flexibility.
I'm for Nikunj's desgin. Do you have any concerns?
My concern is whether we need to simulate CPUs running at different speeds or not. If you all think this isn't a problem, I'm also fine with Nikunj's idea. Thanks, Gui
-- Best regards, Taku Izumi <izumi.taku@jp.fujitsu.com>

At 07/07/2011 10:32 AM, Taku Izumi Write:
> So why introduce VCPU level apis?
Adam Litke said IBM's performance team nead to control cpu bandwidth for each vcpu. Right, but we do not export that as a User API, that was my suggestion. We can internally control each vcpu's bandwidth, i.e. divide equally.
Hmm, I heard that some server could run CPUs at different speed. May be this patch can simulate this behavior. That happens on my laptop as well, depending on the machine load CPU frequency is changed but it is done transparently.
I means explicitly CPU speed configuring. ;)
I am not sure if we are trying to simulate that here.
So why not leave the flexible interface here, and let users make the decision?
In my mind, the flexibility is not always a good thing. It is nothing but troublesome for the person who doesn't like detailed setting. I don't know how many people want this flexibility.
I think we should implement the flexibility. If we do not implement it now and want it later, we cannot reuse this code (we would have to add a new element and reimplement it). I'd prefer the flexibility. IBM's performance team has tested this patch, and they said it produced the expected and desired performance characteristics.
I'm for Nikunj's desgin. Do you have any concerns?
Hi, Izumi-san, do you still object to this implementation?
-- Best regards, Taku Izumi <izumi.taku@jp.fujitsu.com>
-- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list

On Wed, 13 Jul 2011 14:26:23 +0800, Wen Congyang <wency@cn.fujitsu.com> wrote:
At 07/07/2011 10:32 AM, Taku Izumi Write:
>> So why introduce VCPU level apis? > > Adam Litke said IBM's performance team nead to control cpu bandwidth for each > vcpu. Right, but we do not export that as a User API, that was my suggestion. We can internally control each vcpu's bandwidth, i.e. divide equally.
Hmm, I heard that some server could run CPUs at different speed. May be this patch can simulate this behavior. That happens on my laptop as well, depending on the machine load CPU frequency is changed but it is done transparently.
I means explicitly CPU speed configuring. ;)
I am not sure if we are trying to simulate that here.
So why not leave the flexible interface here, and let users make the decision?
In my mind, the flexibility is not always a good thing. It is nothing but troublesome for the person who doesn't like detailed setting. I don't know how many people want this flexibility.
I think we should implement the flexibility. If we do not implement, and we want it later, we can not reuse these codes(add new element, and reimplement). IMHO, at present we can use the current SetSchedulerParameters API and whenever we need flexibility an API as suggested in this series could be added.
Thanks Nikunj

At 07/13/2011 04:50 PM, Nikunj A. Dadhania Write:
On Wed, 13 Jul 2011 14:26:23 +0800, Wen Congyang <wency@cn.fujitsu.com> wrote:
At 07/07/2011 10:32 AM, Taku Izumi Write:
>>> So why introduce VCPU level apis? >> >> Adam Litke said IBM's performance team nead to control cpu bandwidth for each >> vcpu. > Right, but we do not export that as a User API, that was my suggestion. > We can internally control each vcpu's bandwidth, i.e. divide equally.
Hmm, I heard that some server could run CPUs at different speed. May be this patch can simulate this behavior. That happens on my laptop as well, depending on the machine load CPU frequency is changed but it is done transparently.
I means explicitly CPU speed configuring. ;)
I am not sure if we are trying to simulate that here.
So why not leave the flexible interface here, and let users make the decision?
In my mind, the flexibility is not always a good thing. It is nothing but troublesome for the person who doesn't like detailed setting. I don't know how many people want this flexibility.
I think we should implement the flexibility. If we do not implement, and we want it later, we can not reuse these codes(add new element, and reimplement). IMHO, at present we can use the current SetSchedulerParameters API and whenever we need flexibility an API as suggested in this series could be
If we need flexibility, not only would an API have to be added; we would also have to add a new element to the XML config file. It would mean that libvirt has to support both the inflexible and the flexible interface, which is very bad. If we want to support flexibility later, it is better to support it now. Thanks. Wen Congyang
added.
Thanks Nikunj

On Wed, 13 Jul 2011 16:55:28 +0800 Wen Congyang <wency@cn.fujitsu.com> wrote:
At 07/13/2011 04:50 PM, Nikunj A. Dadhania Write:
On Wed, 13 Jul 2011 14:26:23 +0800, Wen Congyang <wency@cn.fujitsu.com> wrote:
At 07/07/2011 10:32 AM, Taku Izumi Write:
>>>> So why introduce VCPU level apis? >>> >>> Adam Litke said IBM's performance team nead to control cpu bandwidth for each >>> vcpu. >> Right, but we do not export that as a User API, that was my suggestion. >> We can internally control each vcpu's bandwidth, i.e. divide equally. > > Hmm, I heard that some server could run CPUs at different speed. > May be this patch can simulate this behavior. That happens on my laptop as well, depending on the machine load CPU frequency is changed but it is done transparently.
I means explicitly CPU speed configuring. ;)
I am not sure if we are trying to simulate that here.
So why not leave the flexible interface here, and let users make the decision?
In my mind, the flexibility is not always a good thing. It is nothing but troublesome for the person who doesn't like detailed setting. I don't know how many people want this flexibility.
I think we should implement the flexibility. If we do not implement, and we want it later, we can not reuse these codes(add new element, and reimplement). IMHO, at present we can use the current SetSchedulerParameters API and whenever we need flexibility an API as suggested in this series could be
If we need flexibilty, not only an API shoule be added. We should add new element in the XML config file. It means that libvirt should support inflexibility and flexibilty. It is very bad. If we want to support flexibility later, it is better to support it now.
I think nobody needs such flexibility in the future and I like the simpler way. But the worst thing is prolonging the decision. If the IBM people can accept the current implementation, so can I. Can you accept this, Nikunj? If you can't, shall we decide by lot? ;) -- Best regards, Taku Izumi <izumi.taku@jp.fujitsu.com>

On Thu, 14 Jul 2011 16:14:01 +0900, Taku Izumi <izumi.taku@jp.fujitsu.com> wrote:
On Wed, 13 Jul 2011 16:55:28 +0800 Wen Congyang <wency@cn.fujitsu.com> wrote:
At 07/13/2011 04:50 PM, Nikunj A. Dadhania Write:
On Wed, 13 Jul 2011 14:26:23 +0800, Wen Congyang <wency@cn.fujitsu.com> wrote:
At 07/07/2011 10:32 AM, Taku Izumi Write:
>>>>> So why introduce VCPU level apis? >>>> >>>> Adam Litke said IBM's performance team nead to control cpu bandwidth for each >>>> vcpu. >>> Right, but we do not export that as a User API, that was my suggestion. >>> We can internally control each vcpu's bandwidth, i.e. divide equally. >> >> Hmm, I heard that some server could run CPUs at different speed. >> May be this patch can simulate this behavior. > That happens on my laptop as well, depending on the machine load CPU > frequency is changed but it is done transparently.
I means explicitly CPU speed configuring. ;)
> > I am not sure if we are trying to simulate that here.
So why not leave the flexible interface here, and let users make the decision?
In my mind, the flexibility is not always a good thing. It is nothing but troublesome for the person who doesn't like detailed setting. I don't know how many people want this flexibility.
I think we should implement the flexibility. If we do not implement, and we want it later, we can not reuse these codes(add new element, and reimplement). IMHO, at present we can use the current SetSchedulerParameters API and whenever we need flexibility an API as suggested in this series could be
If we need flexibilty, not only an API shoule be added. We should add new element in the XML config file. It means that libvirt should support inflexibility and flexibilty. It is very bad. If we want to support flexibility later, it is better to support it now.
I think nobody needs such flexibility in the future and I like the simpler way.
I had an offline discussion with folks (Adam and Ryan) here at IBM. Simulating a machine with different cpu speeds does not seem like a compelling enough case to justify the added configuration complexity. An explicit quota setting for each vcpu would require the user to audit the cputune section and add/remove tunings just to change the vcpu count of the domain. We do not feel that this is the desired behaviour. In our experience, users don't want to make these kinds of decisions. They would rather prefer libvirt to just do the right thing by default. Adam pointed out that in the future it is no problem to have the simple API and a more detailed API combined:
<cputune>
  <period>...</period>
  <quota>...</quota>
  <vcpu id='0'>
    <quota>...</quota>
  </vcpu>
</cputune>
But, the worst thing is the decision is prolonged. If IBM people can accept the current implementation, I also do.
Can you accept this, Nikunj ? Still not convinced.
If you can't, shall we decide by lot? ;) Not sure if this is the right approach. :-)
danpb/DV, what do you think? Regards Nikunj

At 07/15/2011 12:03 PM, Nikunj A. Dadhania Write:
On Thu, 14 Jul 2011 16:14:01 +0900, Taku Izumi <izumi.taku@jp.fujitsu.com> wrote:
On Wed, 13 Jul 2011 16:55:28 +0800 Wen Congyang <wency@cn.fujitsu.com> wrote:
At 07/13/2011 04:50 PM, Nikunj A. Dadhania Write:
On Wed, 13 Jul 2011 14:26:23 +0800, Wen Congyang <wency@cn.fujitsu.com> wrote:
At 07/07/2011 10:32 AM, Taku Izumi Write:
>>>>>> So why introduce VCPU level apis? >>>>> >>>>> Adam Litke said IBM's performance team nead to control cpu bandwidth for each >>>>> vcpu. >>>> Right, but we do not export that as a User API, that was my suggestion. >>>> We can internally control each vcpu's bandwidth, i.e. divide equally. >>> >>> Hmm, I heard that some server could run CPUs at different speed. >>> May be this patch can simulate this behavior. >> That happens on my laptop as well, depending on the machine load CPU >> frequency is changed but it is done transparently. > > I means explicitly CPU speed configuring. ;) > >> >> I am not sure if we are trying to simulate that here. > > So why not leave the flexible interface here, and let users make > the decision?
In my mind, the flexibility is not always a good thing. It is nothing but troublesome for the person who doesn't like detailed setting. I don't know how many people want this flexibility.
I think we should implement the flexibility. If we do not implement, and we want it later, we can not reuse these codes(add new element, and reimplement). IMHO, at present we can use the current SetSchedulerParameters API and whenever we need flexibility an API as suggested in this series could be
If we need flexibilty, not only an API shoule be added. We should add new element in the XML config file. It means that libvirt should support inflexibility and flexibilty. It is very bad. If we want to support flexibility later, it is better to support it now.
I think nobody needs such a flexibility in the future and I like the simpler way.
Okay, I will update the patchset and implement the simple way.
I had an offline discussion with folks(Adam and Ryan) here at IBM. Simulating a machine with different cpu speed does not seem like a compelling enough case to justify the added configuration complexity. An explicit qouta setting for each vcpu will require user to audit the cputune section to add/remove tunings just to change the vcpu count to the domain. We do not feel that this is the desired behaviour.
In our experience, user don't want to make these kinds of decisions. They would rather prefer libvirt to just do the right thing by default.
Adam pointed that in the future it is no problem to have the simple API and a more detailed API combined:
<cputune> <period> <quota> <vcpu id=0> <quota>...</quota> </vcpu> </cputune>
But, the worst thing is the decision is prolonged. If IBM people can accept the current implementation, I also do.
Can you accept this, Nikunj ? Still not convinced.
If you can't, shall we decide by lot? ;) Not sure if this is the right approach. :-)
danpb/DV, what do you think?
Regards Nikunj

On 07/05/2011 02:06 AM, Wen Congyang wrote:
At 07/04/2011 07:19 PM, Nikunj A. Dadhania Write:
On Thu, 30 Jun 2011 11:13:18 +0800, Wen Congyang <wency@cn.fujitsu.com> wrote:
We want to control bandwidth for each vcpu, so we can not use the API virDomainSetSchedulerParameters(). Introduce two new APIs to change and query bandwidth for each vcpu.
Will we have different cpu bandwidth for different vcpus?
Something like this:
vcpu1: 1000000/250000 vcpu2: 1000000/500000 vcpu3: 1000000/300000 vcpu4: 1000000/400000
IMO, that is not required, we can have a top level bandwitdh for the VM and then redistribute it among vcpus equally, without user knowing about it.
Something like this:
VM1(4vcpu) has to be throttled at 1CPU bandwidth using SetSchedParamters.
Internally libvirt splits it equally: vcpu1: 1000000/250000 vcpu2: 1000000/250000 vcpu3: 1000000/250000 vcpu4: 1000000/250000
So why introduce VCPU level apis?
Adam Litke said IBM's performance team nead to control cpu bandwidth for each vcpu.
Nikunj is correct here. We only need to ensure that the bandwidth is distributed equally among all of the vcpus. This can be accomplished internally by creating a cgroup for each vcpu and ensuring that 'cpu.shares' is the same for each vcpu cgroup. -- Adam Litke IBM Linux Technology Center
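A rough sketch of the equal-split policy described here: a single domain-level period/quota pair is divided across N per-vcpu cgroups so every vcpu gets the same share (12500/100000 for the 50000/100000, 4-vcpu example in the follow-up mail). The helper name and struct are illustrative only, not the actual libvirt code; -1 follows the cgroup convention for "no limit".

/* Illustrative only: not the actual libvirt helpers. */
#include <stdio.h>

struct vcpu_bw {
    unsigned long long period;  /* maps to cpu.cfs_period_us */
    long long quota;            /* maps to cpu.cfs_quota_us, -1 == no limit */
};

static void split_domain_quota(unsigned long long period, long long quota,
                               int nvcpus, struct vcpu_bw *out)
{
    int i;

    for (i = 0; i < nvcpus; i++) {
        out[i].period = period;
        /* Propagate "unlimited" unchanged; otherwise divide the quota evenly. */
        out[i].quota = quota < 0 ? quota : quota / nvcpus;
    }
}

int main(void)
{
    struct vcpu_bw bw[4];
    int i;

    split_domain_quota(100000, 50000, 4, bw);
    for (i = 0; i < 4; i++)
        printf("vcpu%d: %lld/%llu\n", i, bw[i].quota, bw[i].period);
    return 0;
}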

Nikunj is correct here. We only need to ensure that the bandwidth is distributed equally between all of the cpus. This can be accomplished internally by creating a cgroup for each vcpu and ensuring that 'cpu.share' is the same for each vcpu cgroup.
When taken together, the virsh I/F and domain XML schema are the following:

# virsh schedinfo VM --set cpuperiod=100000 --config
# virsh schedinfo VM --set cpuquota=50000 --config

..
<vcpu>4</vcpu>
<cputune>
  <period>100000</period>
  <quota>50000</quota>
</cputune>

When the above is specified, the qemu driver creates a cgroup for each vcpu and sets them as follows (the quota value is divided by the number of vcpus):

vcpu#: (quota/period)
vcpu0: 12500/100000
vcpu1: 12500/100000
vcpu2: 12500/100000
vcpu3: 12500/100000

But these values cannot be confirmed from the domain XML file. Is this what you intend? -- Taku Izumi <izumi.taku@jp.fujitsu.com>

On Wed, 6 Jul 2011 10:18:05 +0900, Taku Izumi <izumi.taku@jp.fujitsu.com> wrote:
Nikunj is correct here. We only need to ensure that the bandwidth is distributed equally between all of the cpus. This can be accomplished internally by creating a cgroup for each vcpu and ensuring that 'cpu.share' is the same for each vcpu cgroup.
When taken together, virsh I/F and domain XML schema are the following:
#virsh schedinfo VM --set cpuperiod=100000 --config #virsh schedinfo VM --set cpuquota=50000 --config
.. <vcpu>4</vcpu> <cputune> <period>100000</period> <quota>50000</quota> </cputune>
When specified above, the qemu driver creates cgroup for each vcpus and specified as follows:
The quota value is divided by the number of vcpus. vcpu#: (quota/period) vcpu0: 12500/100000 vcpu1: 12500/100000 vcpu2: 12500/100000 vcpu3: 12500/100000
I am with you up to this point. This is how I was visualizing the interface; thanks for putting it down.
But these values cannot be confirmed from the domain XML file. I do not understand what you mean here.
Regards, Nikunj

On Wed, 06 Jul 2011 10:40:47 +0530 "Nikunj A. Dadhania" <nikunj@linux.vnet.ibm.com> wrote:
On Wed, 6 Jul 2011 10:18:05 +0900, Taku Izumi <izumi.taku@jp.fujitsu.com> wrote:
Nikunj is correct here. We only need to ensure that the bandwidth is distributed equally between all of the cpus. This can be accomplished internally by creating a cgroup for each vcpu and ensuring that 'cpu.share' is the same for each vcpu cgroup.
When taken together, virsh I/F and domain XML schema are the following:
#virsh schedinfo VM --set cpuperiod=100000 --config #virsh schedinfo VM --set cpuquota=50000 --config
.. <vcpu>4</vcpu> <cputune> <period>100000</period> <quota>50000</quota> </cputune>
When specified above, the qemu driver creates cgroup for each vcpus and specified as follows:
The quota value is divided by the number of vcpus. vcpu#: (quota/period) vcpu0: 12500/100000 vcpu1: 12500/100000 vcpu2: 12500/100000 vcpu3: 12500/100000
I am with you till this point. This is what I was visualizing the interface as, thanks for putting it.
But, these values cannot be confirmed form the domain XML file. Do not understand what do you mean here.
Sorry. What I wanted to say is: in the above example, the administrator cannot know that each vcpu's quota is set to 12500, because the domain's XML definition only shows that the domain's quota is 50000. Additionally, this information cannot be obtained via the virsh command, because it is only displayed as the following:

# virsh schedinfo VM
Scheduler : posix
cpuperiod : 100000
cpuquota  : 50000

I was a little anxious that the actual settings are hidden. Anyway, I understand what you intend. Thank you. -- Taku Izumi <izumi.taku@jp.fujitsu.com>
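Since neither the domain XML nor virsh schedinfo shows the per-vcpu values, one way to confirm them is to read the cgroup files directly. The following is only a rough sketch: the mount point (/sys/fs/cgroup/cpu), the libvirt/qemu/<domain> prefix, and the per-vcpu directory name (vcpu<N>) are assumptions about the layout this patchset creates and may differ on a given host.

/* Sketch only: the cgroup paths below are assumptions, adjust to the host. */
#include <stdio.h>

static int read_vcpu_bw(const char *domain, int vcpu,
                        unsigned long long *period, long long *quota)
{
    char path[256];
    FILE *fp;

    snprintf(path, sizeof(path),
             "/sys/fs/cgroup/cpu/libvirt/qemu/%s/vcpu%d/cpu.cfs_period_us",
             domain, vcpu);
    if (!(fp = fopen(path, "r")))
        return -1;
    if (fscanf(fp, "%llu", period) != 1) {
        fclose(fp);
        return -1;
    }
    fclose(fp);

    snprintf(path, sizeof(path),
             "/sys/fs/cgroup/cpu/libvirt/qemu/%s/vcpu%d/cpu.cfs_quota_us",
             domain, vcpu);
    if (!(fp = fopen(path, "r")))
        return -1;
    if (fscanf(fp, "%lld", quota) != 1) {
        fclose(fp);
        return -1;
    }
    fclose(fp);
    return 0;
}

int main(void)
{
    unsigned long long period;
    long long quota;

    if (read_vcpu_bw("VM", 0, &period, &quota) == 0)
        printf("vcpu0: %lld/%llu\n", quota, period);
    return 0;
}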

--- src/qemu/qemu_driver.c | 411 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 files changed, 411 insertions(+), 0 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 9ddbc0f..03b8b9b 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -5942,6 +5942,415 @@ qemuGetSchedulerParameters(virDomainPtr dom, VIR_DOMAIN_AFFECT_CURRENT); } +static int +qemuGetVcpusBWConfig(struct qemud_driver *driver, virDomainObjPtr vm, + bool active, virDomainVcpuBWDefPtr vcpubw_list, int *nvcpu) +{ + int i; + virDomainVcpuBWDefPtr vcpubw = NULL; + virDomainDefPtr vmdef = NULL; + + if (active) { + vmdef = virDomainObjGetPersistentDef(driver->caps, vm); + if (!vmdef) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("can't get persistentDef")); + return -1; + } + } else { + vmdef = vm->def; + } + + for (i = 0; i < *nvcpu; i++) { + if (i >= vmdef->vcpus) + break; + + vcpubw = virDomainVcpuBWFindByVcpu(vmdef->cputune.vcpubw, + vmdef->cputune.nvcpubw, + i); + vcpubw_list[i].vcpuid = i; + if (vcpubw) { + vcpubw_list[i].quota = vcpubw->quota; + vcpubw_list[i].period = vcpubw->period; + } else { + vcpubw_list[i].quota = 0; + vcpubw_list[i].period = 0; + } + } + + *nvcpu = i; + return 0; +} + +static int +qemuGetVcpuBWLive(virCgroupPtr cgroup, virDomainVcpuBWDefPtr vcpubw) +{ + unsigned long long period; + long long quota; + int rc; + + rc = virCgroupGetCpuCfsPeriod(cgroup, &period); + if (rc < 0) { + virReportSystemError(-rc, "%s", + _("unable to get cpu bandwidth period tunable")); + return -1; + } + + rc = virCgroupGetCpuCfsQuota(cgroup, "a); + if (rc < 0) { + virReportSystemError(-rc, "%s", + _("unable to get cpu bandwidth tunable")); + return -1; + } + + vcpubw->period = period; + vcpubw->quota = quota; + + return 0; +} + +static int +qemuGetVcpusBWLive(struct qemud_driver *driver, virDomainObjPtr vm, + virDomainVcpuBWDefPtr vcpubw_list, int *nvcpu) +{ + virCgroupPtr cgroup = NULL; + virCgroupPtr cgroup_vcpu = NULL; + qemuDomainObjPrivatePtr priv = NULL; + int rc; + int i; + int ret = -1; + + if (!qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) { + qemuReportError(VIR_ERR_OPERATION_INVALID, + "%s", _("cgroup CPU controller is not mounted")); + return -1; + } + + if (virCgroupForDomain(driver->cgroup, vm->def->name, &cgroup, 0) != 0) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + _("cannot find cgroup for domain %s"), vm->def->name); + return -1; + } + + priv = vm->privateData; + if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { + /* We do not create sub dir for each vcpu */ + for (i = 0; i < *nvcpu; i++) { + if (i >= vm->def->vcpus) + break; + + vcpubw_list[i].vcpuid = i; + rc = qemuGetVcpuBWLive(cgroup, &vcpubw_list[i]); + if (rc < 0) + goto cleanup; + } + *nvcpu = i; + goto out; + } + + for (i = 0; i < *nvcpu; i++) { + if (i >= vm->def->vcpus) + break; + + rc = virCgroupForVcpu(cgroup, i, &cgroup_vcpu, 0); + if (!cgroup_vcpu) { + virReportSystemError(-rc, + _("Unable to find vcpu cgroup for %s(vcpu:" + " %d)"), + vm->def->name, i); + goto cleanup; + } + + vcpubw_list[i].vcpuid = i; + rc = qemuGetVcpuBWLive(cgroup_vcpu, &vcpubw_list[i]); + if (rc < 0) + goto cleanup; + virCgroupFree(&cgroup_vcpu); + } + *nvcpu = i; + +out: + ret = 0; + +cleanup: + virCgroupFree(&cgroup_vcpu); + virCgroupFree(&cgroup); + return ret; +} + +static int +qemuGetVcpuBW(virDomainPtr dom, virDomainVcpuBWDefPtr vcpubw, + int *nvcpu, unsigned int flags) +{ + struct qemud_driver *driver = dom->conn->privateData; + virDomainObjPtr vm = NULL; + int ret = -1; + bool 
isActive; + + virCheckFlags(VIR_DOMAIN_AFFECT_LIVE | + VIR_DOMAIN_AFFECT_CONFIG, -1); + + if ((flags & (VIR_DOMAIN_AFFECT_LIVE | VIR_DOMAIN_AFFECT_CONFIG)) == + (VIR_DOMAIN_AFFECT_LIVE | VIR_DOMAIN_AFFECT_CONFIG)) { + qemuReportError(VIR_ERR_INVALID_ARG, "%s", + _("cannot query live and config together")); + return -1; + } + + if (*nvcpu < 1) { + qemuReportError(VIR_ERR_INVALID_ARG, + "%s", _("Invalid vcpu count")); + return -1; + } + + qemuDriverLock(driver); + vm = virDomainFindByUUID(&driver->domains, dom->uuid); + + if (vm == NULL) { + char uuidstr[VIR_UUID_STRING_BUFLEN]; + virUUIDFormat(dom->uuid, uuidstr); + qemuReportError(VIR_ERR_NO_DOMAIN, + _("no domain with matching uuid '%s'"), uuidstr); + goto cleanup; + } + + isActive = virDomainObjIsActive(vm); + + if (flags == VIR_DOMAIN_AFFECT_CURRENT) { + if (isActive) + flags = VIR_DOMAIN_AFFECT_LIVE; + else + flags = VIR_DOMAIN_AFFECT_CONFIG; + } + + if (flags & VIR_DOMAIN_AFFECT_CONFIG) { + if (!vm->persistent) { + qemuReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("cannot query persistent config of a transient domain")); + goto cleanup; + } + + ret = qemuGetVcpusBWConfig(driver, vm, isActive, vcpubw, nvcpu); + if (ret < 0) + goto cleanup; + } + + if (flags & VIR_DOMAIN_AFFECT_LIVE) { + if (!isActive) { + qemuReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("domain is not running")); + goto cleanup; + } + + ret = qemuGetVcpusBWLive(driver, vm, vcpubw, nvcpu); + if (ret < 0) + goto cleanup; + } + + ret = 0; + +cleanup: + if (vm) + virDomainObjUnlock(vm); + qemuDriverUnlock(driver); + return ret; +} + +static int +qemuSetVcpuBWConfig(virDomainDefPtr vmdef, virDomainVcpuBWDefPtr vcpubw, + int nvcpu) +{ + int i; + + for (i = 0; i < nvcpu; i++) { + if (vcpubw[i].vcpuid < 0 || vcpubw[i].vcpuid >= vmdef->vcpus) { + qemuReportError(VIR_ERR_INVALID_ARG, "%s", _("Invalid vcpu id")); + return -1; + } + + if (vcpubw[i].period == 0 && vcpubw[i].quota == 0) { + if (virDomainVcpuBWDel(vmdef, vcpubw[i].vcpuid) < 0) + return -1; + } else { + if (virDomainVcpuBWAdd(vmdef, vcpubw[i].period, + vcpubw[i].quota, vcpubw[i].vcpuid) < 0) + return -1; + } + } + + return 0; +} + +static int +qemuSetVcpuBWLive(struct qemud_driver *driver, virDomainObjPtr vm, + virCgroupPtr cgroup, virDomainVcpuBWDefPtr vcpubw) +{ + if (qemuSetupCgroupVcpuBW(cgroup, vcpubw) < 0) + return -1; + + if (qemuSetVcpuBWConfig(vm->def, vcpubw, 1) < 0) + return -1; + + if (virDomainSaveStatus(driver->caps, driver->stateDir, vm) < 0) + return -1; + + return 0; +} + +static int +qemuSetVcpusBWLive(struct qemud_driver *driver, virDomainObjPtr vm, + virCgroupPtr cgroup, virDomainVcpuBWDefPtr vcpubw, + int nvcpubw) +{ + int i; + qemuDomainObjPrivatePtr priv = vm->privateData; + virCgroupPtr cgroup_vcpu = NULL; + int rc; + + /* check vcpu id first */ + for (i = 0; i < nvcpubw; i++) { + if (vcpubw[i].vcpuid < 0 || vcpubw[i].vcpuid >= vm->def->vcpus) { + qemuReportError(VIR_ERR_INVALID_ARG, "%s", _("Invalid vcpu id")); + return -1; + } + } + + if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { + /* If we does not know VCPU<->PID mapping or all vcpu runs in the same + * thread, we can not control each vcpu. So just use the last config. 
+ */ + if (vcpubw[nvcpubw -1].period || vcpubw[nvcpubw - 1].quota) { + return qemuSetVcpuBWLive(driver, vm, cgroup, &vcpubw[nvcpubw - 1]); + } + return 0; + } + + for (i = 0; i < nvcpubw; i++) { + if (vcpubw[i].period == 0 && vcpubw[i].quota == 0) + continue; + + rc = virCgroupForVcpu(cgroup, vcpubw[i].vcpuid, &cgroup_vcpu, 0); + if (rc < 0) { + virReportSystemError(-rc, + _("Unable to find vcpu cgroup for %s(vcpu:" + " %d)"), + vm->def->name, i); + goto cleanup; + } + + if (qemuSetVcpuBWLive(driver, vm, cgroup_vcpu, &vcpubw[i]) < 0) + goto cleanup; + + virCgroupFree(&cgroup_vcpu); + } + + return 0; + +cleanup: + virCgroupFree(&cgroup_vcpu); + return -1; +} + +static int +qemuSetVcpuBW(virDomainPtr dom, virDomainVcpuBWDefPtr vcpubw, + int nvcpu, unsigned int flags) +{ + struct qemud_driver *driver = dom->conn->privateData; + virCgroupPtr cgroup = NULL; + virDomainObjPtr vm = NULL; + int ret = -1; + bool isActive; + virDomainDefPtr vmdef = NULL; + + virCheckFlags(VIR_DOMAIN_AFFECT_LIVE | + VIR_DOMAIN_AFFECT_CONFIG, -1); + + if (nvcpu < 1) { + qemuReportError(VIR_ERR_INVALID_ARG, "%s", _("Invalid vcpu count")); + return -1; + } + + qemuDriverLock(driver); + vm = virDomainFindByUUID(&driver->domains, dom->uuid); + + if (vm == NULL) { + char uuidstr[VIR_UUID_STRING_BUFLEN]; + virUUIDFormat(dom->uuid, uuidstr); + qemuReportError(VIR_ERR_NO_DOMAIN, + _("no domain with matching uuid '%s'"), uuidstr); + goto cleanup; + } + + isActive = virDomainObjIsActive(vm); + + if (flags == VIR_DOMAIN_AFFECT_CURRENT) { + if (isActive) + flags = VIR_DOMAIN_AFFECT_LIVE; + else + flags = VIR_DOMAIN_AFFECT_CONFIG; + } + + if ((flags & VIR_DOMAIN_AFFECT_CONFIG) && !vm->persistent) { + qemuReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("cannot change persistent config of a transient domain")); + goto cleanup; + } + + if (flags & VIR_DOMAIN_AFFECT_LIVE) { + if (!isActive) { + qemuReportError(VIR_ERR_OPERATION_INVALID, + "%s", _("domain is not running")); + goto cleanup; + } + + if (!qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) { + qemuReportError(VIR_ERR_OPERATION_INVALID, + "%s", _("cgroup CPU controller is not mounted")); + goto cleanup; + } + if (virCgroupForDomain(driver->cgroup, vm->def->name, &cgroup, 0) != 0) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + _("cannot find cgroup for domain %s"), + vm->def->name); + goto cleanup; + } + } + + if (flags & VIR_DOMAIN_AFFECT_CONFIG) { + /* Make a copy for updated domain. */ + vmdef = virDomainObjCopyPersistentDef(driver->caps, vm); + if (!vmdef) + goto cleanup; + + if (qemuSetVcpuBWConfig(vmdef, vcpubw, nvcpu) < 0) + goto cleanup; + } + + if (flags & VIR_DOMAIN_AFFECT_LIVE) { + if (qemuSetVcpusBWLive(driver, vm, cgroup, vcpubw, nvcpu) < 0) + goto cleanup; + } + + /* Finally, if no error until here, we can save config. */ + if (flags & VIR_DOMAIN_AFFECT_CONFIG) { + ret = virDomainSaveConfig(driver->configDir, vmdef); + if (ret < 0) + goto cleanup; + virDomainObjAssignDef(vm, vmdef, false); + vmdef = NULL; + } + + ret = 0; + +cleanup: + virCgroupFree(&cgroup); + virDomainDefFree(vmdef); + if (vm) + virDomainObjUnlock(vm); + qemuDriverUnlock(driver); + return ret; +} + /* This uses the 'info blockstats' monitor command which was * integrated into both qemu & kvm in late 2007. If the command is * not supported we detect this and return the appropriate error. 
@@ -8477,6 +8886,8 @@ static virDriver qemuDriver = { .domainGetSchedulerParametersFlags = qemuGetSchedulerParametersFlags, /* 0.9.2 */ .domainSetSchedulerParameters = qemuSetSchedulerParameters, /* 0.7.0 */ .domainSetSchedulerParametersFlags = qemuSetSchedulerParametersFlags, /* 0.9.2 */ + .domainGetVcpuBW = qemuGetVcpuBW, /* 0.9.4 */ + .domainSetVcpuBW = qemuSetVcpuBW, /* 0.9.4 */ .domainMigratePerform = qemudDomainMigratePerform, /* 0.5.0 */ .domainBlockStats = qemudDomainBlockStats, /* 0.4.1 */ .domainInterfaceStats = qemudDomainInterfaceStats, /* 0.4.1 */ -- 1.7.1

--- daemon/remote.c | 132 ++++++++++++++++++++++++++++++++++++++++++ src/remote/remote_driver.c | 64 ++++++++++++++++++++ src/remote/remote_protocol.x | 32 ++++++++++- src/rpc/gendispatch.pl | 30 ++++++++++ 4 files changed, 257 insertions(+), 1 deletions(-) diff --git a/daemon/remote.c b/daemon/remote.c index 9e6cf77..c83a8fa 100644 --- a/daemon/remote.c +++ b/daemon/remote.c @@ -100,6 +100,17 @@ remoteDeserializeTypedParameters(remote_typed_param *args_params_val, u_int args_params_len, int limit, int *nparams); +static int +remoteSerializeVcpuBandwidth(virDomainVcpuBWDefPtr vcpubw, + int nvcpubw, + remote_vcpu_bandwidth **args_vcpubw_val, + u_int *args_vcpubw_len); +static virDomainVcpuBWDefPtr +remoteDeserializeVcpuBandwidth(remote_vcpu_bandwidth *ret_vcpubw_val, + u_int ret_vcpubw_len, + int limit, + int *nvcpubw); + #include "remote_dispatch.h" #include "qemu_dispatch.h" @@ -2857,6 +2868,127 @@ cleanup: } +/* Helper to serialize vcpu bandwidth. */ +static int remoteSerializeVcpuBandwidth(virDomainVcpuBWDefPtr vcpubw, + int nvcpubw, + remote_vcpu_bandwidth **ret_vcpubw_val, + u_int *ret_vcpubw_len) +{ + int i; + int rv = -1; + remote_vcpu_bandwidth *val; + + if (VIR_ALLOC_N(val, nvcpubw) < 0) { + virReportOOMError(); + goto cleanup; + } + + for (i = 0; i < nvcpubw; i++) { + val[i].vcpuid = vcpubw[i].vcpuid; + val[i].period = vcpubw[i].period; + val[i].quota = vcpubw[i].quota; + } + + *ret_vcpubw_val = val; + *ret_vcpubw_len = nvcpubw; + val = NULL; + rv = 0; + +cleanup: + VIR_FREE(val); + return rv; +} + + +/* Helper to deserialize vcpu bandwidth. */ +static virDomainVcpuBWDefPtr +remoteDeserializeVcpuBandwidth(remote_vcpu_bandwidth *args_vcpubw_val, + u_int args_vcpubw_len, + int limit, + int *nvcpubw) +{ + int i; + virDomainVcpuBWDefPtr vcpubw = NULL; + + /* Check the length of the returned list carefully. */ + if (args_vcpubw_len > limit) { + virNetError(VIR_ERR_INTERNAL_ERROR, "%s", + _("returned number of vcpubw exceeds limit")); + goto cleanup; + } + if (VIR_ALLOC_N(vcpubw, args_vcpubw_len) < 0) { + virReportOOMError(); + goto cleanup; + } + + /* Deserialise the result. 
*/ + for (i = 0; i < args_vcpubw_len; i++) { + vcpubw[i].vcpuid = args_vcpubw_val[i].vcpuid; + vcpubw[i].period = args_vcpubw_val[i].period; + vcpubw[i].quota = args_vcpubw_val[i].quota; + } + + *nvcpubw = args_vcpubw_len; + +cleanup: + return vcpubw; +} + + +static int +remoteDispatchDomainGetVcpuBW(virNetServerPtr server ATTRIBUTE_UNUSED, + virNetServerClientPtr client ATTRIBUTE_UNUSED, + virNetMessageHeaderPtr hdr ATTRIBUTE_UNUSED, + virNetMessageErrorPtr rerr, + remote_domain_get_vcpu_b_w_args *args, + remote_domain_get_vcpu_b_w_ret *ret) +{ + virDomainPtr dom = NULL; + virDomainVcpuBWDefPtr vcpubw = NULL; + int nvcpubw = args->nvcpubw_list; + int rv = -1; + struct daemonClientPrivate *priv = + virNetServerClientGetPrivateData(client); + + if (!priv->conn) { + virNetError(VIR_ERR_INTERNAL_ERROR, "%s", _("connection not open")); + goto cleanup; + } + + if (nvcpubw > REMOTE_DOMAIN_VCPU_MAX) { + virNetError(VIR_ERR_INTERNAL_ERROR, "%s", _("nvcpubw too large")); + goto cleanup; + } + if (VIR_ALLOC_N(vcpubw, nvcpubw) < 0) + goto no_memory; + + if (!(dom = get_nonnull_domain(priv->conn, args->dom))) + goto cleanup; + + if (virDomainGetVcpuBW(dom, vcpubw, &nvcpubw, args->flags) < 0) + goto cleanup; + + if (remoteSerializeVcpuBandwidth(vcpubw, nvcpubw, + &ret->vcpubw_list.vcpubw_list_val, + &ret->vcpubw_list.vcpubw_list_len) < 0) + goto cleanup; + + rv = 0; + +cleanup: + if (rv < 0) + virNetMessageSaveError(rerr); + if (dom) + virDomainFree(dom); + VIR_FREE(vcpubw); + return rv; + +no_memory: + virReportOOMError(); + goto cleanup; +} + + /*----- Helpers. -----*/ /* get_nonnull_domain and get_nonnull_network turn an on-wire diff --git a/src/remote/remote_driver.c b/src/remote/remote_driver.c index 0e68bc5..fe3a4ab 100644 --- a/src/remote/remote_driver.c +++ b/src/remote/remote_driver.c @@ -1317,6 +1317,68 @@ cleanup: return rv; } +/* Helper to serialize vcpu bandwidth. */ +static int remoteSerializeVcpuBandwidth(virDomainVcpuBWDefPtr vcpubw, + int nvcpubw, + remote_vcpu_bandwidth **args_vcpubw_val, + u_int *args_vcpubw_len) +{ + int i; + int rv = -1; + remote_vcpu_bandwidth *val; + + if (VIR_ALLOC_N(val, nvcpubw) < 0) { + virReportOOMError(); + goto cleanup; + } + + for (i = 0; i < nvcpubw; i++) { + val[i].vcpuid = vcpubw[i].vcpuid; + val[i].period = vcpubw[i].period; + val[i].quota = vcpubw[i].quota; + } + + *args_vcpubw_val = val; + *args_vcpubw_len = nvcpubw; + val = NULL; + rv = 0; + +cleanup: + VIR_FREE(val); + return rv; +} + +/* Helper to deserialize vcpu bandwidth. */ +static int remoteDeserializeVcpuBandwidth(remote_vcpu_bandwidth *ret_vcpubw_val, + u_int ret_vcpubw_len, + int limit, + virDomainVcpuBWDefPtr vcpubw, + int *nvcpubw) +{ + int i; + int rv = -1; + + /* Check the length of the returned list carefully. */ + if (ret_vcpubw_len > limit || ret_vcpubw_len > *nvcpubw) { + remoteError(VIR_ERR_RPC, "%s", + _("returned number of vcpubw exceeds limit")); + goto cleanup; + } + + /* Deserialise the result. 
*/ + for (i = 0; i < ret_vcpubw_len; i++) { + vcpubw[i].vcpuid = ret_vcpubw_val[i].vcpuid; + vcpubw[i].period = ret_vcpubw_val[i].period; + vcpubw[i].quota = ret_vcpubw_val[i].quota; + } + + rv = 0; + *nvcpubw = ret_vcpubw_len; + +cleanup: + return rv; +} + static int remoteDomainGetMemoryParameters (virDomainPtr domain, virTypedParameterPtr params, int *nparams, @@ -4234,6 +4296,8 @@ static virDriver remote_driver = { .domainMigrateFinish3 = remoteDomainMigrateFinish3, /* 0.9.2 */ .domainMigrateConfirm3 = remoteDomainMigrateConfirm3, /* 0.9.2 */ .domainSendKey = remoteDomainSendKey, /* 0.9.3 */ + .domainGetVcpuBW = remoteDomainGetVcpuBW, /* 0.9.4 */ + .domainSetVcpuBW = remoteDomainSetVcpuBW, /* 0.9.4 */ }; static virNetworkDriver network_driver = { diff --git a/src/remote/remote_protocol.x b/src/remote/remote_protocol.x index 7c3539c..3771ed1 100644 --- a/src/remote/remote_protocol.x +++ b/src/remote/remote_protocol.x @@ -193,6 +193,11 @@ const REMOTE_CPU_BASELINE_MAX = 256; */ const REMOTE_DOMAIN_SEND_KEY_MAX = 16; +/* + * Max number of vcpu + */ +const REMOTE_DOMAIN_VCPU_MAX = 4096; + /* UUID. VIR_UUID_BUFLEN definition comes from libvirt.h */ typedef opaque remote_uuid[VIR_UUID_BUFLEN]; @@ -331,6 +336,12 @@ struct remote_node_get_memory_stats { unsigned hyper value; }; +struct remote_vcpu_bandwidth { + int vcpuid; + uint64_t period; + int64_t quota; +}; + /*----- Calls. -----*/ /* For each call we may have a 'remote_CALL_args' and 'remote_CALL_ret' @@ -2115,6 +2126,22 @@ struct remote_domain_get_control_info_ret { /* insert@1 */ unsigned hyper stateTime; }; +struct remote_domain_get_vcpu_b_w_args { + remote_nonnull_domain dom; + int nvcpubw_list; /* call-by-reference */ + unsigned int flags; +}; + +struct remote_domain_get_vcpu_b_w_ret { + remote_vcpu_bandwidth vcpubw_list<REMOTE_DOMAIN_VCPU_MAX>; /* insert@1 */ +}; + +struct remote_domain_set_vcpu_b_w_args { + remote_nonnull_domain dom; + remote_vcpu_bandwidth vcpubw_list<REMOTE_DOMAIN_VCPU_MAX>; + unsigned int flags; +}; + /*----- Protocol. -----*/ /* Define the program number, protocol version and procedure numbers here. */ @@ -2383,7 +2410,10 @@ enum remote_procedure { REMOTE_PROC_NODE_GET_CPU_STATS = 227, /* skipgen skipgen */ REMOTE_PROC_NODE_GET_MEMORY_STATS = 228, /* skipgen skipgen */ REMOTE_PROC_DOMAIN_GET_CONTROL_INFO = 229, /* autogen autogen */ - REMOTE_PROC_DOMAIN_GET_VCPU_PIN_INFO = 230 /* skipgen skipgen */ + REMOTE_PROC_DOMAIN_GET_VCPU_PIN_INFO = 230, /* skipgen skipgen */ + + REMOTE_PROC_DOMAIN_GET_VCPU_B_W = 231, /* skipgen autogen */ + REMOTE_PROC_DOMAIN_SET_VCPU_B_W = 232 /* autogen autogen */ /* * Notice how the entries are grouped in sets of 10 ? diff --git a/src/rpc/gendispatch.pl b/src/rpc/gendispatch.pl index 027560c..7ea1c52 100755 --- a/src/rpc/gendispatch.pl +++ b/src/rpc/gendispatch.pl @@ -423,6 +423,17 @@ elsif ($opt_b) { " &n$1)) == NULL)\n" . " goto cleanup;\n"); push(@free_list, " VIR_FREE(params);"); + } elsif ($args_member =~ m/^remote_vcpu_bandwidth (\S+)<(\S+)>;/) { + push(@vars_list, "virDomainVcpuBWDefPtr $1 = NULL"); + push(@vars_list, "int n$1"); + push(@args_list, "$1"); + push(@args_list, "n$1"); + push(@getters_list, " if (($1 = remoteDeserializeVcpuBandwidth(args->$1.$1_val,\n" . + " args->$1.$1_len,\n" . + " $2,\n" . + " &n$1)) == NULL)\n" . 
+ " goto cleanup;\n"); + push(@free_list, " VIR_FREE(vcpubw_list);"); } elsif ($args_member =~ m/<\S+>;/ or $args_member =~ m/\[\S+\];/) { # just make all other array types fail die "unhandled type for argument value: $args_member"; @@ -1099,6 +1110,13 @@ elsif ($opt_k) { " xdr_free((xdrproc_t)xdr_$call->{args}, (char *)&args);\n" . " goto done;\n" . " }"); + } elsif ($args_member =~ m/^remote_vcpu_bandwidth (\S+)<(\S+)>;/) { + push(@args_list, "virDomainVcpuBWDefPtr $1"); + push(@args_list, "int n$1"); + push(@setters_list2, "if (remoteSerializeVcpuBandwidth($1, n$1, &args.$1.$1_val, &args.$1.$1_len) < 0) {\n" . + " xdr_free((xdrproc_t)xdr_$call->{args}, (char *)&args);\n" . + " goto done;\n" . + " }"); } elsif ($args_member =~ m/^((?:unsigned )?int) (\S+);\s*\/\*\s*call-by-reference\s*\*\//) { my $type_name = "$1 *"; my $arg_name = $2; @@ -1264,6 +1282,18 @@ elsif ($opt_k) { } elsif ($ret_member =~ m/^remote_typed_param (\S+)<\S+>;/) { # error out on unannotated arrays die "remote_typed_param array without insert@<offset> annotation: $ret_member"; + } elsif ($ret_member =~ m/^remote_vcpu_bandwidth (\S+)<(\S+)>;\s*\/\*\s*insert@(\d+)\s*\*\//) { + splice(@args_list, int($3), 0, ("virDomainVcpuBWDefPtr $1")); + push(@ret_list2, "if (remoteDeserializeVcpuBandwidth(ret.$1.$1_val,\n" . + " ret.$1.$1_len,\n" . + " $2,\n" . + " $1,\n" . + " n$1) < 0)\n" . + " goto cleanup;\n"); + $single_ret_cleanup = 1; + } elsif ($ret_member =~ m/^remote_vcpu_bandwidth (\S+)<\S+>;/) { + # error out on unannotated arrays + die "remote_vcpu_bandwidth array without insert@<offset> annotation: $ret_member"; } elsif ($ret_member =~ m/^int (\S+);/) { my $arg_name = $1; -- 1.7.1

Introduce new command vcpu-bandwidth to change and query bandwidth for each vcpu. Usage: 1. query bandwidth for all vcpus: # virsh vcpu-bandwidth <domain> 2. query bandwidth for a vcpu: # virsh vcpu-bandwidth <domain> <vcpuid> 3. change bandwidth for a vcpu: # virsh vcpu-bandwidth <domain> <vcpuid> <period> <quota> You can omit period or quota. The option --live, --config, and --current is the same as the other commands. --- tools/virsh.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- tools/virsh.pod | 16 ++++++ 2 files changed, 157 insertions(+), 1 deletions(-) diff --git a/tools/virsh.c b/tools/virsh.c index d15d206..b5a49dd 100644 --- a/tools/virsh.c +++ b/tools/virsh.c @@ -3049,7 +3049,7 @@ cmdVcpuPin(vshControl *ctl, const vshCmd *cmd) flags |= VIR_DOMAIN_AFFECT_LIVE; /* neither option is specified */ if (!live && !config) - flags = -1; + flags = VIR_DOMAIN_AFFECT_CURRENT; } if (!vshConnectionUsability(ctl, ctl->conn)) @@ -3239,6 +3239,144 @@ parse_error: } /* + * "vcpu-bandwidth" command + */ +static const vshCmdInfo info_vcpu_bandwidth[] = { + {"help", N_("control or query domain vcpu bandwidth")}, + {"desc", N_("Change or query the bandwidth of virtual CPUs in the guest domain.")}, + {NULL, NULL} +}; + +static const vshCmdOptDef opts_vcpu_bandwidth[] = { + {"domain", VSH_OT_DATA, VSH_OFLAG_REQ, N_("domain name, id or uuid")}, + {"vcpu", VSH_OT_INT, 0, N_("vcpu number")}, + {"period", VSH_OT_INT, VSH_OFLAG_EMPTY_OK, N_("bandwidth period in usecs")}, + {"quota", VSH_OT_INT, VSH_OFLAG_EMPTY_OK, + N_("cpu bandwidth (in usecs) that this vcpu will be allowed to consume over period")}, + {"config", VSH_OT_BOOL, 0, N_("affect next boot")}, + {"live", VSH_OT_BOOL, 0, N_("affect running domain")}, + {"current", VSH_OT_BOOL, 0, N_("affect current domain")}, + {NULL, 0, 0, NULL} +}; + +static bool +cmdVcpuBandwidth(vshControl *ctl, const vshCmd *cmd) +{ + virDomainPtr dom; + int vcpu = -1; + int max_vcpus, nvcpus = 0; + unsigned long long period = 0; + long long quota = 0; + int period_found, quota_found; + virDomainVcpuBWDefPtr vcpubw_list = NULL; + virDomainVcpuBWDefPtr vcpubw = NULL; + bool ret = false; + bool config = vshCommandOptBool(cmd, "config"); + bool live = vshCommandOptBool(cmd, "live"); + bool current = vshCommandOptBool(cmd, "current"); + bool query = false; /* Query mode if no period and no quota */ + int flags = 0; + int rc; + int i; + + if (current) { + if (live || config) { + vshError(ctl, "%s", _("--current must be specified exclusively")); + return false; + } + flags = VIR_DOMAIN_AFFECT_CURRENT; + } else { + if (config) + flags |= VIR_DOMAIN_AFFECT_CONFIG; + if (live) + flags |= VIR_DOMAIN_AFFECT_LIVE; + /* neither option is specified */ + if (!live && !config) + flags = VIR_DOMAIN_AFFECT_CURRENT; + } + + if (!vshConnectionUsability(ctl, ctl->conn)) + return false; + + if (!(dom = vshCommandOptDomain(ctl, cmd, NULL))) + return false; + + if ((period_found = vshCommandOptULongLong(cmd, "period", &period)) < 0) { + vshError(ctl, "%s", _("Invalid value of period")); + goto cleanup; + } + + if ((quota_found = vshCommandOptLongLong(cmd, "quota", "a)) < 0) { + vshError(ctl, "%s", _("Invalid value of quota")); + goto cleanup; + } + + query = quota_found == 0 && period_found == 0; + + /* In query mode, "vcpu" is optional */ + if (vshCommandOptInt(cmd, "vcpu", &vcpu) < !query) { + vshError(ctl, "%s", _("Invalid or missing VCPU number.")); + goto cleanup; + } + + max_vcpus = virDomainGetVcpusFlags(dom, flags | VIR_DOMAIN_VCPU_MAXIMUM); + if (max_vcpus < 0) + goto 
cleanup; + + /* In query mode, we need to know current setting. In change mode, if user + * does not specify period or quota, we need to know its origin setting. + */ + if (quota_found == 0 || period_found == 0) { + nvcpus = max_vcpus; + vcpubw_list = vshMalloc(ctl, nvcpus * sizeof(*vcpubw_list)); + rc = virDomainGetVcpuBW(dom, vcpubw_list, &nvcpus, flags); + if (rc < 0) + goto cleanup; + } + + /* Query mode: show CPU bandwidth information then exit.*/ + if (query) { + vshPrint(ctl, "%s %s %s\n", _("vcpu:"), _("period"), _("quota")); + vshPrint(ctl, "----------------------------------\n"); + for (i = 0; i < nvcpus; i++) { + if (vcpu != -1 && i != vcpu) + continue; + vshPrint(ctl, "%4d: ", i); + vshPrint(ctl, "%6llu ", vcpubw_list[i].period); + vshPrint(ctl, "%5lld\n", vcpubw_list[i].quota); + } + goto cleanup; + } + + if (vcpu >= max_vcpus) { + vshError(ctl, _("Virtual CPU %d doesn't exist."), vcpu); + goto cleanup; + } + + vcpubw = vshMalloc(ctl, sizeof(*vcpubw)); + vcpubw[0].vcpuid = vcpu; + if (period_found) + vcpubw[0].period = period; + else + vcpubw[0].period = vcpubw_list[vcpu].period; + if (quota_found) + vcpubw[0].quota = quota; + else + vcpubw[0].quota = vcpubw_list[vcpu].quota; + rc = virDomainSetVcpuBW(dom, vcpubw, 1, flags); + if (rc < 0) + goto cleanup; + + ret = true; + +cleanup: + VIR_FREE(vcpubw_list); + VIR_FREE(vcpubw); + virDomainFree(dom); + return ret; +} + +/* * "setvcpus" command */ static const vshCmdInfo info_setvcpus[] = { @@ -11675,6 +11813,8 @@ static const vshCmdDef domManagementCmds[] = { {"vcpucount", cmdVcpucount, opts_vcpucount, info_vcpucount, 0}, {"vcpuinfo", cmdVcpuinfo, opts_vcpuinfo, info_vcpuinfo, 0}, {"vcpupin", cmdVcpuPin, opts_vcpupin, info_vcpupin, 0}, + {"vcpu-bandwidth", cmdVcpuBandwidth, opts_vcpu_bandwidth, + info_vcpu_bandwidth, 0}, {"version", cmdVersion, opts_version, info_version, 0}, {"vncdisplay", cmdVNCDisplay, opts_vncdisplay, info_vncdisplay, 0}, {NULL, NULL, NULL, NULL, 0} diff --git a/tools/virsh.pod b/tools/virsh.pod index 736b919..686d8ba 100644 --- a/tools/virsh.pod +++ b/tools/virsh.pod @@ -849,6 +849,22 @@ If no flag is specified, behavior is different depending on hypervisor. B<Note>: The expression is sequentially evaluated, so "0-15,^8" is identical to "9-14,0-7,15" but not identical to "^8,0-15". +=item B<vcpu-bandwidth> I<domain-id> optional I<vcpu> I<period> I<quota> +I<--live> I<--config> I<--current> + +Change or query the bandwidth of domain vcpus. To change a single I<vcpu>, +specify I<period> or I<quota>; otherwise, you can query one I<vcpu> or omit +I<vcpu> to list all at once. + +I<period> is bandwidth period in usecs, and I<quota> is cpu bandwidth (in usecs) +that this vcpu will be allowed to consume over period. +If I<--live> is specified, affect a running guest. +If I<--config> is specified, affect the next boot of a persistent guest. +If I<--current> is specified, affect the current guest state. +Both I<--live> and I<--config> flags may be given if I<period> or I<quota> is +present, but I<--current> is exclusive. +If no flag is specified, it is identical to I<--current>. + =item B<vncdisplay> I<domain-id> Output the IP address and port number for the VNC display. If the information -- 1.7.1

On Thu, 30 Jun 2011 11:19:14 +0800 Wen Congyang <wency@cn.fujitsu.com> wrote:
Introduce new command vcpu-bandwidth to change and query bandwidth for each vcpu.
Usage: 1. query bandwidth for all vcpus: # virsh vcpu-bandwidth <domain>
2. query bandwidth for a vcpu: # virsh vcpu-bandwidth <domain> <vcpuid>
3. change bandwidth for a vcpu: # virsh vcpu-bandwidth <domain> <vcpuid> <period> <quota> You can omit period or quota.
The option --live, --config, and --current is the same as the other commands.
I tried the following:

# virsh domstate VM
shut off

# virsh vcpu-bandwidth VM --config
vcpu: period quota
----------------------------------
   0: 100000 50000
   1: 100000 50000
   2: 100000 50000
   3: 100000 50000

# virsh start VM
Domain VM started

# virsh vcpu-bandwidth VM --live
vcpu: period quota
----------------------------------
   0: 100000    -1
   1: 100000    -1
   2: 100000    -1
   3: 100000    -1

This behavior doesn't make sense to me. Is this what you intend? Or did I make a mistake?
--- tools/virsh.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- tools/virsh.pod | 16 ++++++ 2 files changed, 157 insertions(+), 1 deletions(-)
diff --git a/tools/virsh.c b/tools/virsh.c index d15d206..b5a49dd 100644 --- a/tools/virsh.c +++ b/tools/virsh.c @@ -3049,7 +3049,7 @@ cmdVcpuPin(vshControl *ctl, const vshCmd *cmd) flags |= VIR_DOMAIN_AFFECT_LIVE; /* neither option is specified */ if (!live && !config) - flags = -1; + flags = VIR_DOMAIN_AFFECT_CURRENT; }
if (!vshConnectionUsability(ctl, ctl->conn)) @@ -3239,6 +3239,144 @@ parse_error: }
/* + * "vcpu-bandwidth" command + */ +static const vshCmdInfo info_vcpu_bandwidth[] = { + {"help", N_("control or query domain vcpu bandwidth")}, + {"desc", N_("Change or query the bandwidth of virtual CPUs in the guest domain.")}, + {NULL, NULL} +}; + +static const vshCmdOptDef opts_vcpu_bandwidth[] = { + {"domain", VSH_OT_DATA, VSH_OFLAG_REQ, N_("domain name, id or uuid")}, + {"vcpu", VSH_OT_INT, 0, N_("vcpu number")}, + {"period", VSH_OT_INT, VSH_OFLAG_EMPTY_OK, N_("bandwidth period in usecs")}, + {"quota", VSH_OT_INT, VSH_OFLAG_EMPTY_OK, + N_("cpu bandwidth (in usecs) that this vcpu will be allowed to consume over period")}, + {"config", VSH_OT_BOOL, 0, N_("affect next boot")}, + {"live", VSH_OT_BOOL, 0, N_("affect running domain")}, + {"current", VSH_OT_BOOL, 0, N_("affect current domain")}, + {NULL, 0, 0, NULL} +}; + +static bool +cmdVcpuBandwidth(vshControl *ctl, const vshCmd *cmd) +{ + virDomainPtr dom; + int vcpu = -1; + int max_vcpus, nvcpus = 0; + unsigned long long period = 0; + long long quota = 0; + int period_found, quota_found; + virDomainVcpuBWDefPtr vcpubw_list = NULL; + virDomainVcpuBWDefPtr vcpubw = NULL; + bool ret = false; + bool config = vshCommandOptBool(cmd, "config"); + bool live = vshCommandOptBool(cmd, "live"); + bool current = vshCommandOptBool(cmd, "current"); + bool query = false; /* Query mode if no period and no quota */ + int flags = 0; + int rc; + int i; + + if (current) { + if (live || config) { + vshError(ctl, "%s", _("--current must be specified exclusively")); + return false; + } + flags = VIR_DOMAIN_AFFECT_CURRENT; + } else { + if (config) + flags |= VIR_DOMAIN_AFFECT_CONFIG; + if (live) + flags |= VIR_DOMAIN_AFFECT_LIVE; + /* neither option is specified */ + if (!live && !config) + flags = VIR_DOMAIN_AFFECT_CURRENT; + } + + if (!vshConnectionUsability(ctl, ctl->conn)) + return false; + + if (!(dom = vshCommandOptDomain(ctl, cmd, NULL))) + return false; + + if ((period_found = vshCommandOptULongLong(cmd, "period", &period)) < 0) { + vshError(ctl, "%s", _("Invalid value of period")); + goto cleanup; + } + + if ((quota_found = vshCommandOptLongLong(cmd, "quota", "a)) < 0) { + vshError(ctl, "%s", _("Invalid value of quota")); + goto cleanup; + } + + query = quota_found == 0 && period_found == 0; + + /* In query mode, "vcpu" is optional */ + if (vshCommandOptInt(cmd, "vcpu", &vcpu) < !query) { + vshError(ctl, "%s", _("Invalid or missing VCPU number.")); + goto cleanup; + } + + max_vcpus = virDomainGetVcpusFlags(dom, flags | VIR_DOMAIN_VCPU_MAXIMUM); + if (max_vcpus < 0) + goto cleanup;
Is the following case OK? VCPU#4-7 are not present. <vcpu current='4'>8</vcpu>
+ + /* In query mode, we need to know current setting. In change mode, if user + * does not specify period or quota, we need to know its origin setting. + */ + if (quota_found == 0 || period_found == 0) { + nvcpus = max_vcpus; + vcpubw_list = vshMalloc(ctl, nvcpus * sizeof(*vcpubw_list)); + rc = virDomainGetVcpuBW(dom, vcpubw_list, &nvcpus, flags); + if (rc < 0) + goto cleanup; + } + + /* Query mode: show CPU bandwidth information then exit.*/ + if (query) { + vshPrint(ctl, "%s %s %s\n", _("vcpu:"), _("period"), _("quota")); + vshPrint(ctl, "----------------------------------\n"); + for (i = 0; i < nvcpus; i++) { + if (vcpu != -1 && i != vcpu) + continue; + vshPrint(ctl, "%4d: ", i); + vshPrint(ctl, "%6llu ", vcpubw_list[i].period); + vshPrint(ctl, "%5lld\n", vcpubw_list[i].quota); + } + goto cleanup; + } + + if (vcpu >= max_vcpus) { + vshError(ctl, _("Virtual CPU %d doesn't exist."), vcpu); + goto cleanup; + }
Shouldn't this check be done before querying?
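Something like this, perhaps (just a rough sketch to illustrate, not part of the posted patch): do the range check right after max_vcpus is fetched, before virDomainGetVcpuBW() is called:

    max_vcpus = virDomainGetVcpusFlags(dom, flags | VIR_DOMAIN_VCPU_MAXIMUM);
    if (max_vcpus < 0)
        goto cleanup;

    /* reject an out-of-range vcpu before any query is issued */
    if (!query && vcpu >= max_vcpus) {
        vshError(ctl, _("Virtual CPU %d doesn't exist."), vcpu);
        goto cleanup;
    }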
+
+    vcpubw = vshMalloc(ctl, sizeof(*vcpubw));
+    vcpubw[0].vcpuid = vcpu;
+    if (period_found)
+        vcpubw[0].period = period;
+    else
+        vcpubw[0].period = vcpubw_list[vcpu].period;
+    if (quota_found)
+        vcpubw[0].quota = quota;
+    else
+        vcpubw[0].quota = vcpubw_list[vcpu].quota;
+    rc = virDomainSetVcpuBW(dom, vcpubw, 1, flags);
+    if (rc < 0)
+        goto cleanup;
+
+    ret = true;
+
+cleanup:
+    VIR_FREE(vcpubw_list);
+    VIR_FREE(vcpubw);
+    virDomainFree(dom);
+    return ret;
+}
+
+/*
  * "setvcpus" command
  */
 static const vshCmdInfo info_setvcpus[] = {
@@ -11675,6 +11813,8 @@ static const vshCmdDef domManagementCmds[] = {
     {"vcpucount", cmdVcpucount, opts_vcpucount, info_vcpucount, 0},
     {"vcpuinfo", cmdVcpuinfo, opts_vcpuinfo, info_vcpuinfo, 0},
     {"vcpupin", cmdVcpuPin, opts_vcpupin, info_vcpupin, 0},
+    {"vcpu-bandwidth", cmdVcpuBandwidth, opts_vcpu_bandwidth,
+     info_vcpu_bandwidth, 0},
     {"version", cmdVersion, opts_version, info_version, 0},
     {"vncdisplay", cmdVNCDisplay, opts_vncdisplay, info_vncdisplay, 0},
     {NULL, NULL, NULL, NULL, 0}
diff --git a/tools/virsh.pod b/tools/virsh.pod
index 736b919..686d8ba 100644
--- a/tools/virsh.pod
+++ b/tools/virsh.pod
@@ -849,6 +849,22 @@ If no flag is specified, behavior is different depending on hypervisor.
 B<Note>: The expression is sequentially evaluated, so "0-15,^8" is
 identical to "9-14,0-7,15" but not identical to "^8,0-15".
+=item B<vcpu-bandwidth> I<domain-id> optional I<vcpu> I<period> I<quota>
+I<--live> I<--config> I<--current>
+
+Change or query the bandwidth of domain vcpus. To change a single I<vcpu>,
+specify I<period> or I<quota>; otherwise, you can query one I<vcpu> or omit
+I<vcpu> to list all vcpus at once.
+
+I<period> is the bandwidth period in usecs, and I<quota> is the cpu bandwidth
+(in usecs) that this vcpu is allowed to consume within each period.
+If I<--live> is specified, affect a running guest.
+If I<--config> is specified, affect the next boot of a persistent guest.
+If I<--current> is specified, affect the current guest state.
+Both the I<--live> and I<--config> flags may be given if I<period> or I<quota>
+is specified, but I<--current> cannot be combined with either of them.
+If no flag is specified, behavior is identical to I<--current>.
+
 =item B<vncdisplay> I<domain-id>
 Output the IP address and port number for the VNC display. If the information

-- 
1.7.1
-- Taku Izumi <izumi.taku@jp.fujitsu.com>

At 06/30/2011 03:27 PM, Taku Izumi wrote:
On Thu, 30 Jun 2011 11:19:14 +0800 Wen Congyang <wency@cn.fujitsu.com> wrote:
Introduce a new command, vcpu-bandwidth, to change and query the bandwidth of each vcpu.
Usage:
1. query bandwidth for all vcpus:
   # virsh vcpu-bandwidth <domain>
2. query bandwidth for a vcpu:
   # virsh vcpu-bandwidth <domain> <vcpuid>
3. change bandwidth for a vcpu:
   # virsh vcpu-bandwidth <domain> <vcpuid> <period> <quota>
   You can omit period or quota.
The options --live, --config, and --current behave the same as in other commands.
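For example, to cap vcpu 2 of a running guest and keep the setting across reboots, then read it back (the period/quota values here are only illustrative):

# virsh vcpu-bandwidth <domain> 2 100000 50000 --live --config
# virsh vcpu-bandwidth <domain> 2 --live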
I tried the following:
# virsh domstate VM
shut off

# virsh vcpu-bandwidth VM --config
vcpu: period quota
----------------------------------
   0: 100000 50000
   1: 100000 50000
   2: 100000 50000
   3: 100000 50000

# virsh start VM
Domain VM started

# virsh vcpu-bandwidth VM --live
vcpu: period quota
----------------------------------
   0: 100000    -1
   1: 100000    -1
   2: 100000    -1
   3: 100000    -1
This behavior doesn't make sense to me. Is this what you intended, or did I make a mistake?
Ah, it is a bug in my patchset (patch 4/10): the quota was written to the domain-level cgroup instead of the per-vcpu cgroup, so the per-vcpu groups keep the default quota of -1 on a running domain. Please use this temporary fix for testing:

diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c
index 201c0b8..2bf7715 100644
--- a/src/qemu/qemu_cgroup.c
+++ b/src/qemu/qemu_cgroup.c
@@ -485,7 +485,7 @@ int qemuSetupCgroupForVcpu(struct qemud_driver *driver, virDomainObjPtr vm)
         if (vcpubw_list) {
             if (qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) {
                 vcpubw = virDomainVcpuBWFindByVcpu(vcpubw_list, nvcpubw, i);
-                if (qemuSetupCgroupVcpuBW(cgroup, vcpubw) < 0)
+                if (qemuSetupCgroupVcpuBW(cgroup_vcpu, vcpubw) < 0)
                     goto cleanup;
             }
         }
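With that one-line change, the values should land in the per-vcpu groups and be visible directly in the cgroup filesystem; for example (the mount point and the per-vcpu directory name below are assumptions and depend on your cgroup setup):

# cat /cgroup/cpu/libvirt/qemu/VM/vcpu0/cpu.cfs_period_us
100000
# cat /cgroup/cpu/libvirt/qemu/VM/vcpu0/cpu.cfs_quota_us
50000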

---
 docs/formatdomain.html.in |   19 +++++++++++++++++++
 1 files changed, 19 insertions(+), 0 deletions(-)

diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in
index 3a64983..7e1e5f0 100644
--- a/docs/formatdomain.html.in
+++ b/docs/formatdomain.html.in
@@ -294,6 +294,8 @@
     <vcpupin vcpu="2" cpuset="2,3"/>
     <vcpupin vcpu="3" cpuset="0,4"/>
     <shares>2048</shares>
+    <period>1000000000</period>
+    <quota>-1</quota>
   </cputune>
   <numatune>
     <memory mode="strict" nodeset="1-4,^3"/>
@@ -385,6 +387,23 @@
       2048 will get twice as much CPU time as a VM configured with value 1024.
       <span class="since">Since 0.9.0</span>
     </dd>
+    <dt><code>period</code></dt>
+    <dd>
+      The optional <code>period</code> element specifies the enforcement
+      interval (unit: microseconds). Within <code>period</code>, the domain
+      will not be allowed to consume more than <code>quota</code> worth of
+      runtime. The value should be in the range [1000, 1000000].
+      <span class="since">Since 0.9.4</span>
+    </dd>
+    <dt><code>quota</code></dt>
+    <dd>
+      The optional <code>quota</code> element specifies the maximum allowed
+      bandwidth (unit: microseconds). A domain with <code>quota</code> set to
+      any negative value has infinite bandwidth, which means that it is not
+      bandwidth controlled. The value should be in the range
+      [1000, 18446744073709551] or less than 0.
+      <span class="since">Since 0.9.4</span>
+    </dd>
     <dt><code>numatune</code></dt>
     <dd>
       The optional <code>numatune</code> element provides details of
-- 
1.7.1