[libvirt] [PATCH RESEND RFC v4 0/6] support cpu bandwidth in libvirt

Resend this patchset.

TODO:
1. We create a sub directory for each vcpu in the cpu subsystem, so we should recalculate cpu.shares for each vcpu.

Changelog:
v4: address Adam Litke's comments
v3: fix some small bugs
    implement the simple way
v2: almost rewrite the patchset to support controlling each vcpu's bandwidth
    Limit quota to [-1, 2^64/1000] at the schema level. We will check it at the cgroup level.

Wen Congyang (6):
  Introduce the function virCgroupForVcpu
  cgroup: Implement cpu.cfs_period_us and cpu.cfs_quota_us tuning API
  Update XML Schema for new entries
  qemu: Implement period and quota tunable XML configuration and parsing
  qemu: Implement cfs_period and cfs_quota's modification
  doc: Add documentation for new cputune elements period and quota

 docs/formatdomain.html.in                       | 19 ++
 docs/schemas/domain.rng                         | 26 ++-
 src/conf/domain_conf.c                          | 20 ++-
 src/conf/domain_conf.h                          | 2 +
 src/libvirt_private.syms                        | 5 +
 src/qemu/qemu_cgroup.c                          | 137 ++++++++++
 src/qemu/qemu_cgroup.h                          | 4 +
 src/qemu/qemu_driver.c                          | 312 +++++++++++++++++++++--
 src/qemu/qemu_process.c                         | 4 +
 src/util/cgroup.c                               | 153 +++++++++++-
 src/util/cgroup.h                               | 11 +
 tests/qemuxml2argvdata/qemuxml2argv-cputune.xml | 2 +
 12 files changed, 659 insertions(+), 36 deletions(-)

Introduce the function virCgroupForVcpu() to create sub directory for each vcpu. --- src/libvirt_private.syms | 1 + src/util/cgroup.c | 72 ++++++++++++++++++++++++++++++++++++++++++--- src/util/cgroup.h | 5 +++ 3 files changed, 73 insertions(+), 5 deletions(-) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 3e3b1dd..30804eb 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -67,6 +67,7 @@ virCgroupDenyAllDevices; virCgroupDenyDevicePath; virCgroupForDomain; virCgroupForDriver; +virCgroupForVcpu; virCgroupFree; virCgroupGetBlkioWeight; virCgroupGetCpuShares; diff --git a/src/util/cgroup.c b/src/util/cgroup.c index 740cedf..8994aca 100644 --- a/src/util/cgroup.c +++ b/src/util/cgroup.c @@ -52,6 +52,16 @@ struct virCgroup { struct virCgroupController controllers[VIR_CGROUP_CONTROLLER_LAST]; }; +typedef enum { + VIR_CGROUP_NONE = 0, /* create subdir under each cgroup if possible. */ + VIR_CGROUP_MEM_HIERACHY = 1 << 0, /* call virCgroupSetMemoryUseHierarchy + * before creating subcgroups and + * attaching tasks + */ + VIR_CGROUP_VCPU = 1 << 1, /* create subdir only under the cgroup cpu, + * cpuacct and cpuset if possible. */ +} virCgroupFlags; + /** * virCgroupFree: * @@ -503,7 +513,7 @@ static int virCgroupSetMemoryUseHierarchy(virCgroupPtr group) } static int virCgroupMakeGroup(virCgroupPtr parent, virCgroupPtr group, - int create, bool memory_hierarchy) + int create, unsigned int flags) { int i; int rc = 0; @@ -516,6 +526,13 @@ static int virCgroupMakeGroup(virCgroupPtr parent, virCgroupPtr group, if (!group->controllers[i].mountPoint) continue; + /* We need to control cpu bandwidth for each vcpu now */ + if ((flags & VIR_CGROUP_VCPU) && i != VIR_CGROUP_CONTROLLER_CPU) { + /* treat it as unmounted and we can use virCgroupAddTask */ + VIR_FREE(group->controllers[i].mountPoint); + continue; + } + rc = virCgroupPathOfController(group, i, "", &path); if (rc < 0) return rc; @@ -555,7 +572,7 @@ static int virCgroupMakeGroup(virCgroupPtr parent, virCgroupPtr group, * Note that virCgroupSetMemoryUseHierarchy should always be * called prior to creating subcgroups and attaching tasks. */ - if (memory_hierarchy && + if ((flags & VIR_CGROUP_MEM_HIERACHY) && group->controllers[VIR_CGROUP_CONTROLLER_MEMORY].mountPoint != NULL && (i == VIR_CGROUP_CONTROLLER_MEMORY || STREQ(group->controllers[i].mountPoint, group->controllers[VIR_CGROUP_CONTROLLER_MEMORY].mountPoint))) { @@ -641,7 +658,7 @@ static int virCgroupAppRoot(int privileged, if (rc != 0) goto cleanup; - rc = virCgroupMakeGroup(rootgrp, *group, create, false); + rc = virCgroupMakeGroup(rootgrp, *group, create, VIR_CGROUP_NONE); cleanup: virCgroupFree(&rootgrp); @@ -801,7 +818,7 @@ int virCgroupForDriver(const char *name, VIR_FREE(path); if (rc == 0) { - rc = virCgroupMakeGroup(rootgrp, *group, create, false); + rc = virCgroupMakeGroup(rootgrp, *group, create, VIR_CGROUP_NONE); if (rc != 0) virCgroupFree(group); } @@ -861,7 +878,7 @@ int virCgroupForDomain(virCgroupPtr driver, * a group for driver, is to avoid overhead to track * cumulative usage that we don't need. 
*/ - rc = virCgroupMakeGroup(driver, *group, create, true); + rc = virCgroupMakeGroup(driver, *group, create, VIR_CGROUP_MEM_HIERACHY); if (rc != 0) virCgroupFree(group); } @@ -879,6 +896,51 @@ int virCgroupForDomain(virCgroupPtr driver ATTRIBUTE_UNUSED, #endif /** + * virCgroupForVcpu: + * + * @driver: group for the domain + * @vcpuid: id of the vcpu + * @group: Pointer to returned virCgroupPtr + * + * Returns 0 on success + */ +#if defined HAVE_MNTENT_H && defined HAVE_GETMNTENT_R +int virCgroupForVcpu(virCgroupPtr driver, + int vcpuid, + virCgroupPtr *group, + int create) +{ + int rc; + char *path; + + if (driver == NULL) + return -EINVAL; + + if (virAsprintf(&path, "%s/vcpu%d", driver->path, vcpuid) < 0) + return -ENOMEM; + + rc = virCgroupNew(path, group); + VIR_FREE(path); + + if (rc == 0) { + rc = virCgroupMakeGroup(driver, *group, create, VIR_CGROUP_VCPU); + if (rc != 0) + virCgroupFree(group); + } + + return rc; +} +#else +int virCgroupForVcpu(virCgroupPtr driver ATTRIBUTE_UNUSED, + int vcpuid ATTRIBUTE_UNUSED, + virCgroupPtr *group ATTRIBUTE_UNUSED, + int create ATTRIBUTE_UNUSED) +{ + return -ENXIO; +} +#endif + +/** * virCgroupSetBlkioWeight: * * @group: The cgroup to change io weight for diff --git a/src/util/cgroup.h b/src/util/cgroup.h index 8ae756d..1d04418 100644 --- a/src/util/cgroup.h +++ b/src/util/cgroup.h @@ -40,6 +40,11 @@ int virCgroupForDomain(virCgroupPtr driver, virCgroupPtr *group, int create); +int virCgroupForVcpu(virCgroupPtr driver, + int vcpuid, + virCgroupPtr *group, + int create); + int virCgroupPathOfController(virCgroupPtr group, int controller, const char *key, -- 1.7.1
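To make the intended use of the new helper concrete, here is a minimal sketch (not part of the patch; the helper name attach_vcpu_thread is invented and error handling is abbreviated) of how a caller might combine virCgroupForVcpu() with the existing virCgroupAddTask() that the patch comment refers to:

#include <sys/types.h>
#include "cgroup.h"

static int attach_vcpu_thread(virCgroupPtr domain_group, int vcpuid, pid_t tid)
{
    virCgroupPtr vcpu_group = NULL;
    int rc;

    /* Creates <domain-cgroup>/vcpu<N>, in the cpu controller only */
    rc = virCgroupForVcpu(domain_group, vcpuid, &vcpu_group, 1);
    if (rc != 0)
        return rc;

    /* Move the vcpu thread into the new sub-cgroup */
    rc = virCgroupAddTask(vcpu_group, tid);

    virCgroupFree(&vcpu_group);
    return rc;
}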

On Thu, Jul 21, 2011 at 10:08:03AM +0800, Wen Congyang wrote:
Introduce the function virCgroupForVcpu() to create sub directory for each vcpu. --- src/libvirt_private.syms | 1 + src/util/cgroup.c | 72 ++++++++++++++++++++++++++++++++++++++++++--- src/util/cgroup.h | 5 +++ 3 files changed, 73 insertions(+), 5 deletions(-)
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 3e3b1dd..30804eb 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -67,6 +67,7 @@ virCgroupDenyAllDevices; virCgroupDenyDevicePath; virCgroupForDomain; virCgroupForDriver; +virCgroupForVcpu; virCgroupFree; virCgroupGetBlkioWeight; virCgroupGetCpuShares; diff --git a/src/util/cgroup.c b/src/util/cgroup.c index 740cedf..8994aca 100644 --- a/src/util/cgroup.c +++ b/src/util/cgroup.c @@ -52,6 +52,16 @@ struct virCgroup { struct virCgroupController controllers[VIR_CGROUP_CONTROLLER_LAST]; };
+typedef enum { + VIR_CGROUP_NONE = 0, /* create subdir under each cgroup if possible. */ + VIR_CGROUP_MEM_HIERACHY = 1 << 0, /* call virCgroupSetMemoryUseHierarchy + * before creating subcgroups and + * attaching tasks + */ + VIR_CGROUP_VCPU = 1 << 1, /* create subdir only under the cgroup cpu, + * cpuacct and cpuset if possible. */ +} virCgroupFlags; + /** * virCgroupFree: * @@ -503,7 +513,7 @@ static int virCgroupSetMemoryUseHierarchy(virCgroupPtr group) }
static int virCgroupMakeGroup(virCgroupPtr parent, virCgroupPtr group, - int create, bool memory_hierarchy) + int create, unsigned int flags) { int i; int rc = 0; @@ -516,6 +526,13 @@ static int virCgroupMakeGroup(virCgroupPtr parent, virCgroupPtr group, if (!group->controllers[i].mountPoint) continue;
+ /* We need to control cpu bandwidth for each vcpu now */ + if ((flags & VIR_CGROUP_VCPU) && i != VIR_CGROUP_CONTROLLER_CPU) {
Small nit: I would put () on both sides of the &&.
+ /* treat it as unmounted and we can use virCgroupAddTask */ + VIR_FREE(group->controllers[i].mountPoint); + continue; + } + rc = virCgroupPathOfController(group, i, "", &path); if (rc < 0) return rc; @@ -555,7 +572,7 @@ static int virCgroupMakeGroup(virCgroupPtr parent, virCgroupPtr group, * Note that virCgroupSetMemoryUseHierarchy should always be * called prior to creating subcgroups and attaching tasks. */ - if (memory_hierarchy && + if ((flags & VIR_CGROUP_MEM_HIERACHY) && group->controllers[VIR_CGROUP_CONTROLLER_MEMORY].mountPoint != NULL &&
same here
(i == VIR_CGROUP_CONTROLLER_MEMORY || STREQ(group->controllers[i].mountPoint, group->controllers[VIR_CGROUP_CONTROLLER_MEMORY].mountPoint))) { @@ -641,7 +658,7 @@ static int virCgroupAppRoot(int privileged, if (rc != 0) goto cleanup;
- rc = virCgroupMakeGroup(rootgrp, *group, create, false); + rc = virCgroupMakeGroup(rootgrp, *group, create, VIR_CGROUP_NONE);
cleanup: virCgroupFree(&rootgrp); @@ -801,7 +818,7 @@ int virCgroupForDriver(const char *name, VIR_FREE(path);
if (rc == 0) { - rc = virCgroupMakeGroup(rootgrp, *group, create, false); + rc = virCgroupMakeGroup(rootgrp, *group, create, VIR_CGROUP_NONE); if (rc != 0) virCgroupFree(group); } @@ -861,7 +878,7 @@ int virCgroupForDomain(virCgroupPtr driver, * a group for driver, is to avoid overhead to track * cumulative usage that we don't need. */ - rc = virCgroupMakeGroup(driver, *group, create, true); + rc = virCgroupMakeGroup(driver, *group, create, VIR_CGROUP_MEM_HIERACHY); if (rc != 0) virCgroupFree(group); } @@ -879,6 +896,51 @@ int virCgroupForDomain(virCgroupPtr driver ATTRIBUTE_UNUSED, #endif
/** + * virCgroupForVcpu: + * + * @driver: group for the domain + * @vcpuid: id of the vcpu + * @group: Pointer to returned virCgroupPtr + * + * Returns 0 on success + */ +#if defined HAVE_MNTENT_H && defined HAVE_GETMNTENT_R +int virCgroupForVcpu(virCgroupPtr driver, + int vcpuid, + virCgroupPtr *group, + int create) +{ + int rc; + char *path; + + if (driver == NULL) + return -EINVAL; + + if (virAsprintf(&path, "%s/vcpu%d", driver->path, vcpuid) < 0) + return -ENOMEM; + + rc = virCgroupNew(path, group); + VIR_FREE(path); + + if (rc == 0) { + rc = virCgroupMakeGroup(driver, *group, create, VIR_CGROUP_VCPU); + if (rc != 0) + virCgroupFree(group); + } + + return rc; +} +#else +int virCgroupForVcpu(virCgroupPtr driver ATTRIBUTE_UNUSED, + int vcpuid ATTRIBUTE_UNUSED, + virCgroupPtr *group ATTRIBUTE_UNUSED, + int create ATTRIBUTE_UNUSED) +{ + return -ENXIO; +} +#endif + +/** * virCgroupSetBlkioWeight: * * @group: The cgroup to change io weight for diff --git a/src/util/cgroup.h b/src/util/cgroup.h index 8ae756d..1d04418 100644 --- a/src/util/cgroup.h +++ b/src/util/cgroup.h @@ -40,6 +40,11 @@ int virCgroupForDomain(virCgroupPtr driver, virCgroupPtr *group, int create);
+int virCgroupForVcpu(virCgroupPtr driver, + int vcpuid, + virCgroupPtr *group, + int create); + int virCgroupPathOfController(virCgroupPtr group, int controller, const char *key, -- 1.7.1
ACK, Daniel -- Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/ daniel@veillard.com | Rpmfind RPM search engine http://rpmfind.net/ http://veillard.com/ | virtualization library http://libvirt.org/

On Thu, Jul 21, 2011 at 10:08:03AM +0800, Wen Congyang wrote:
Introduce the function virCgroupForVcpu() to create sub directory for each vcpu.
I'm far from convinced that this is a good idea. Setting policies on individual VCPUs is giving the host admin a misleading illusion of control over individual guest VCPUs. The problem is that QEMU has many non-VCPU threads which consume non-trivial CPU time. CPU time generated by a CPU in the guest does not necessarily map to a VCPU thread in the host, but could instead go to an I/O thread or a display thread, etc. IMHO we should only be doing controls at the whole VM level here.

Daniel
--
|: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org -o- http://virt-manager.org :|
|: http://autobuild.org -o- http://search.cpan.org/~danberr/ :|
|: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On Thu, Jul 21, 2011 at 11:09:26AM +0100, Daniel P. Berrange wrote:
On Thu, Jul 21, 2011 at 10:08:03AM +0800, Wen Congyang wrote:
Introduce the function virCgroupForVcpu() to create sub directory for each vcpu.
I'm far from convinced that this is a good idea. Setting policies on individual VCPUs is giving the host admin a misleading illusion of control over individual guest VCPUs. The problem is that QEMU has many non-VCPU threads which consume non-trivial CPU time. CPU time generated by a CPU in the guest, does not neccessarily map to a VCPU thread in the host, but could instead go to a I/O thread or a display thread, etc.
In fact the behaviour is worse than I describe, because the hard limit setting is not applied at the VM cgroup at all, so if the VM is doing heavy I/O, then no CPU limits are being applied to QEMU's I/O threads at all. The existing cpu_shares tunable is being set for the VM as a whole, so I think it is really confusing to have the period/quota only affecting VCPU threads and not the VM as a whole. Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

Added Anthony to give him the opportunity to address the finer points of this one especially with respect to the qemu IO thread(s). This feature is really about capping the compute performance of a VM such that we get consistent top end performance. Yes, qemu has non-VCPU threads that this patch set doesn't govern, but that's the point. We are not attempting to throttle IO or device emulation with this feature. It's true that an IO-intensive guest may consume more host resources than a compute intensive guest, but they should still have equal top-end CPU performance when viewed from the guest's perspective. On 07/21/2011 05:09 AM, Daniel P. Berrange wrote:
On Thu, Jul 21, 2011 at 10:08:03AM +0800, Wen Congyang wrote:
Introduce the function virCgroupForVcpu() to create sub directory for each vcpu.
I'm far from convinced that this is a good idea. Setting policies on individual VCPUs is giving the host admin a misleading illusion of control over individual guest VCPUs. The problem is that QEMU has many non-VCPU threads which consume non-trivial CPU time. CPU time generated by a CPU in the guest, does not neccessarily map to a VCPU thread in the host, but could instead go to a I/O thread or a display thread, etc.
IMHO we should only be doing controls at the whole VM level here.
Daniel
-- Adam Litke IBM Linux Technology Center

On Thu, Jul 21, 2011 at 07:54:05AM -0500, Adam Litke wrote:
Added Anthony to give him the opportunity to address the finer points of this one especially with respect to the qemu IO thread(s).
This feature is really about capping the compute performance of a VM such that we get consistent top end performance. Yes, qemu has non-VCPU threads that this patch set doesn't govern, but that's the point. We are not attempting to throttle IO or device emulation with this feature. It's true that an IO-intensive guest may consume more host resources than a compute intensive guest, but they should still have equal top-end CPU performance when viewed from the guest's perspective.
I could be misunderstanding what you're trying to achieve here, so perhaps we should consider an example.

- A machine has 4 physical CPUs
- There are 4 guests on the machine
- Each guest has 2 virtual CPUs

So we've overcommitted the host CPU resources x2 here.

Let's say that we want to use this feature to ensure consistent top end performance of every guest, splitting the host pCPU resources evenly across all guests, so each guest is ensured 1 pCPU worth of CPU time overall.

This patch lets you do this by assigning caps per VCPU. So in this example, each VCPU cgroup would have to be configured to cap the VCPUs at 50% of a single pCPU.

This leaves the other QEMU threads uncapped / unaccounted for. If any one guest causes non-trivial compute load in a non-VCPU thread, this can/will impact the top-end compute performance of all the other guests on the machine.

If we did caps per VM, then you could set the VM cgroup such that the VM as a whole had 100% of a single pCPU. If a guest is 100% compute bound, it can use its full 100% of a pCPU allocation in vCPU threads. If any other guest is causing CPU time in a non-VCPU thread, it cannot impact the top end compute performance of VCPU threads in the other guests.

A per-VM cap would, however, mean a guest with 2 vCPUs could have unequal scheduling, where one vCPU claimed 75% of the pCPU and the other vCPU got left with only 25%.

So AFAICT, per-VM cgroups are better for ensuring top end compute performance of a guest as a whole, but per-VCPU cgroups can ensure consistent top end performance across vCPUs within a guest. IMHO, per-VM cgroups are the more useful because they are the only way to stop guests impacting each other, but there could be additional benefits of *also* having per-VCPU cgroups if you want to ensure fairness of top-end performance across vCPUs inside a single VM.

Regards,
Daniel
--
|: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org -o- http://virt-manager.org :|
|: http://autobuild.org -o- http://search.cpan.org/~danberr/ :|
|: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|
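Purely to put concrete numbers on the two capping schemes in this example, here is a throwaway sketch (an arbitrary 100ms period is assumed; none of this comes from the patch):

#include <stdio.h>

int main(void)
{
    const unsigned long long period = 100000; /* 100ms, in usecs as in cpu.cfs_period_us */
    const int nvcpus = 2;

    /* Per-VCPU capping (this series): each vCPU capped at 50% of one pCPU */
    unsigned long long vcpu_quota = period / nvcpus;   /* 50000 usec */

    /* Per-VM capping (the alternative): the whole VM capped at one pCPU */
    unsigned long long vm_quota = period;              /* 100000 usec */

    printf("per-vCPU cgroup: cfs_quota=%llu of cfs_period=%llu\n", vcpu_quota, period);
    printf("per-VM cgroup:   cfs_quota=%llu of cfs_period=%llu\n", vm_quota, period);
    return 0;
}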

On 07/21/2011 08:34 AM, Daniel P. Berrange wrote:
On Thu, Jul 21, 2011 at 07:54:05AM -0500, Adam Litke wrote:
Added Anthony to give him the opportunity to address the finer points of this one especially with respect to the qemu IO thread(s).
This feature is really about capping the compute performance of a VM such that we get consistent top end performance. Yes, qemu has non-VCPU threads that this patch set doesn't govern, but that's the point. We are not attempting to throttle IO or device emulation with this feature. It's true that an IO-intensive guest may consume more host resources than a compute intensive guest, but they should still have equal top-end CPU performance when viewed from the guest's perspective.
I could be mis-understanding, what you're trying to achieve, here, so perhaps we should consider an example.
- A machine has 4 physical CPUs - There are 4 guests on the machine - Each guest has 2 virtual CPUs
So we've overcommit the host CPU resources x2 here.
Lets say that we want to use this feature to ensure consistent top end performance of every guest, splitting the host pCPUs resources evenly across all guests, so each guest is ensured 1 pCPU worth of CPU time overall.
This patch lets you do this by assigning caps per VCPU. So in this example, each VCPU cgroup would have to be configured to cap the VCPUs at 50% of a single pCPU.
This leaves the other QEMU threads uncapped / unaccounted for. If any one guest causes non-trivial compute load in a non-VCPU thread, this can/will impact the top-end compute performance of all the other guests on the machine.
But this is not undesirable behavior. You're mixing up consistency and top end performance. They are totally different things and I think most consumers of capping really only care about the latter.
If we did caps per VM, then you could set the VM cgroup such that the VM as a whole had 100% of a single pCPU.
Consistent performance is very hard to achieve. The desire is to cap performance not just within a box, but also across multiple different boxes with potentially different versions of KVM. The I/O threads are basically hypervisor overhead. That's going to change over time.
If a guest is 100% compute bound, it can use its full 100% of a pCPU allocation in vCPU threads. If any other guest is causing CPU time in a non-VCPU thread, it cannot impact the top end compute performance of VCPU threads in the other guests.
A per-VM cap would, however, mean a guest with 2 vCPUs could have unequal scheduling, where one vCPU claimed 75% of the pCPU and the othe vCPU got left with only 25%.
So AFAICT, per-VM cgroups is better for ensuring top end compute performance of a guest as a whole, but per-VCPU cgroups can ensure consistent top end performance across vCPUs within a guest.
And the latter is the primary use case as far as I can tell.

If I'm a user, I'll be very confused if I have a 4 VCPU guest, run 1 instance of specint, and get X as the result. I then run 4 instances of specint and get X as the result. My expectation is to get 4X as the result because I'm running 4 instances.

Getting X as the result is going to appear to be a bug. I cannot imagine that any normal user would expect this to be the behavior.

Regards,

Anthony Liguori
IMHO, per-VM cgroups is the more useful because it is the only way to stop guests impacting each other, but there could be additional benefits of *also* have per-VCPU cgroups if you want to ensure fairness of top-end performance across vCPUs inside a single VM.
Regards, Daniel

On Thu, Jul 21, 2011 at 08:44:35AM -0500, Anthony Liguori wrote:
On 07/21/2011 08:34 AM, Daniel P. Berrange wrote:
On Thu, Jul 21, 2011 at 07:54:05AM -0500, Adam Litke wrote:
Added Anthony to give him the opportunity to address the finer points of this one especially with respect to the qemu IO thread(s).
This feature is really about capping the compute performance of a VM such that we get consistent top end performance. Yes, qemu has non-VCPU threads that this patch set doesn't govern, but that's the point. We are not attempting to throttle IO or device emulation with this feature. It's true that an IO-intensive guest may consume more host resources than a compute intensive guest, but they should still have equal top-end CPU performance when viewed from the guest's perspective.
I could be mis-understanding, what you're trying to achieve, here, so perhaps we should consider an example.
- A machine has 4 physical CPUs - There are 4 guests on the machine - Each guest has 2 virtual CPUs
So we've overcommit the host CPU resources x2 here.
Lets say that we want to use this feature to ensure consistent top end performance of every guest, splitting the host pCPUs resources evenly across all guests, so each guest is ensured 1 pCPU worth of CPU time overall.
This patch lets you do this by assigning caps per VCPU. So in this example, each VCPU cgroup would have to be configured to cap the VCPUs at 50% of a single pCPU.
This leaves the other QEMU threads uncapped / unaccounted for. If any one guest causes non-trivial compute load in a non-VCPU thread, this can/will impact the top-end compute performance of all the other guests on the machine.
But this is not undesirable behavior. You're mixing up consistency and top end performance. They are totally different things and I think most consumers of capping really only care about the later.
If we did caps per VM, then you could set the VM cgroup such that the VM as a whole had 100% of a single pCPU.
Consistent performance is very hard to achieve. The desire is to cap performance not just within a box, but also across multiple different boxes with potentially different versions of KVM. The I/O threads are basically hypervisor overhead. That's going to change over time.
If a guest is 100% compute bound, it can use its full 100% of a pCPU allocation in vCPU threads. If any other guest is causing CPU time in a non-VCPU thread, it cannot impact the top end compute performance of VCPU threads in the other guests.
A per-VM cap would, however, mean a guest with 2 vCPUs could have unequal scheduling, where one vCPU claimed 75% of the pCPU and the othe vCPU got left with only 25%.
So AFAICT, per-VM cgroups is better for ensuring top end compute performance of a guest as a whole, but per-VCPU cgroups can ensure consistent top end performance across vCPUs within a guest.
And the later is the primary use case as far as I can tell.
If I'm a user, I'll be very confused if I have a 4 VCPU guest, run 1 instance of specint, and get X as the result. I then run 4 instances of specint and get X as the result. My expectation is to get 4X as the result because I'm running 4 instances.
I don't see why doing limits at the per-VM vs per-VCPU level has an impact on performance when adding extra guests. It would certainly change behaviour when adding extra vCPUs to a guest though.

Daniel
--
|: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org -o- http://virt-manager.org :|
|: http://autobuild.org -o- http://search.cpan.org/~danberr/ :|
|: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 07/21/2011 09:21 AM, Daniel P. Berrange wrote:
On Thu, Jul 21, 2011 at 08:44:35AM -0500, Anthony Liguori wrote:
On 07/21/2011 08:34 AM, Daniel P. Berrange wrote:
On Thu, Jul 21, 2011 at 07:54:05AM -0500, Adam Litke wrote:
Added Anthony to give him the opportunity to address the finer points of this one especially with respect to the qemu IO thread(s).
This feature is really about capping the compute performance of a VM such that we get consistent top end performance. Yes, qemu has non-VCPU threads that this patch set doesn't govern, but that's the point. We are not attempting to throttle IO or device emulation with this feature. It's true that an IO-intensive guest may consume more host resources than a compute intensive guest, but they should still have equal top-end CPU performance when viewed from the guest's perspective.
I could be mis-understanding, what you're trying to achieve, here, so perhaps we should consider an example.
- A machine has 4 physical CPUs - There are 4 guests on the machine - Each guest has 2 virtual CPUs
So we've overcommit the host CPU resources x2 here.
Lets say that we want to use this feature to ensure consistent top end performance of every guest, splitting the host pCPUs resources evenly across all guests, so each guest is ensured 1 pCPU worth of CPU time overall.
This patch lets you do this by assigning caps per VCPU. So in this example, each VCPU cgroup would have to be configured to cap the VCPUs at 50% of a single pCPU.
This leaves the other QEMU threads uncapped / unaccounted for. If any one guest causes non-trivial compute load in a non-VCPU thread, this can/will impact the top-end compute performance of all the other guests on the machine.
But this is not undesirable behavior. You're mixing up consistency and top end performance. They are totally different things and I think most consumers of capping really only care about the later.
If we did caps per VM, then you could set the VM cgroup such that the VM as a whole had 100% of a single pCPU.
Consistent performance is very hard to achieve. The desire is to cap performance not just within a box, but also across multiple different boxes with potentially different versions of KVM. The I/O threads are basically hypervisor overhead. That's going to change over time.
If a guest is 100% compute bound, it can use its full 100% of a pCPU allocation in vCPU threads. If any other guest is causing CPU time in a non-VCPU thread, it cannot impact the top end compute performance of VCPU threads in the other guests.
A per-VM cap would, however, mean a guest with 2 vCPUs could have unequal scheduling, where one vCPU claimed 75% of the pCPU and the othe vCPU got left with only 25%.
So AFAICT, per-VM cgroups is better for ensuring top end compute performance of a guest as a whole, but per-VCPU cgroups can ensure consistent top end performance across vCPUs within a guest.
And the later is the primary use case as far as I can tell.
If I'm a user, I'll be very confused if I have a 4 VCPU guest, run 1 instance of specint, and get X as the result. I then run 4 instances of specint and get X as the result. My expectation is to get 4X as the result because I'm running 4 instances.
I don't see why doing limits at the per-VM vs per-VCPU level has an impact on performance when adding extra guests. It would certainly change behaviour when adding extra vCPUs to a guest though.
My use of "instance" was referring to a copy of specint. IOW, 4-VCPU guest: $ ./specint score - 490 $ ./specint --threads=4 score - 490 That would confuse a user. The expectation is: $ ./specint --threads=4 score - 1960 Regards, Anthony Liguori
Daniel

On 07/21/2011 08:34 AM, Daniel P. Berrange wrote:
On Thu, Jul 21, 2011 at 07:54:05AM -0500, Adam Litke wrote:
Added Anthony to give him the opportunity to address the finer points of this one especially with respect to the qemu IO thread(s).
This feature is really about capping the compute performance of a VM such that we get consistent top end performance. Yes, qemu has non-VCPU threads that this patch set doesn't govern, but that's the point. We are not attempting to throttle IO or device emulation with this feature. It's true that an IO-intensive guest may consume more host resources than a compute intensive guest, but they should still have equal top-end CPU performance when viewed from the guest's perspective.
I could be mis-understanding, what you're trying to achieve, here, so perhaps we should consider an example.
From your example, it's clear to me that you understand the use case well.
- A machine has 4 physical CPUs - There are 4 guests on the machine - Each guest has 2 virtual CPUs
So we've overcommit the host CPU resources x2 here.
Lets say that we want to use this feature to ensure consistent top end performance of every guest, splitting the host pCPUs resources evenly across all guests, so each guest is ensured 1 pCPU worth of CPU time overall.
This patch lets you do this by assigning caps per VCPU. So in this example, each VCPU cgroup would have to be configured to cap the VCPUs at 50% of a single pCPU.
This leaves the other QEMU threads uncapped / unaccounted for. If any one guest causes non-trivial compute load in a non-VCPU thread, this can/will impact the top-end compute performance of all the other guests on the machine.
If we did caps per VM, then you could set the VM cgroup such that the VM as a whole had 100% of a single pCPU.
If a guest is 100% compute bound, it can use its full 100% of a pCPU allocation in vCPU threads. If any other guest is causing CPU time in a non-VCPU thread, it cannot impact the top end compute performance of VCPU threads in the other guests.
A per-VM cap would, however, mean a guest with 2 vCPUs could have unequal scheduling, where one vCPU claimed 75% of the pCPU and the othe vCPU got left with only 25%.
So AFAICT, per-VM cgroups is better for ensuring top end compute performance of a guest as a whole, but per-VCPU cgroups can ensure consistent top end performance across vCPUs within a guest.
IMHO, per-VM cgroups is the more useful because it is the only way to stop guests impacting each other, but there could be additional benefits of *also* have per-VCPU cgroups if you want to ensure fairness of top-end performance across vCPUs inside a single VM.
What this says to me is that per-VM cgroups _in_addition_to_ per-vcpu cgroups is the _most_ useful situation. Since I can't think of any cases where someone would want per-vm and not per-vcpu, how about we always do both when supported. We can still use one pair of tunables (<period> and <quota>) and try to do the right thing. For example:

  <vcpus>2</vcpus>
  <cputune>
    <period>500000</period>
    <quota>250000</quota>
  </cputune>

would have the following behavior for qemu-kvm (vcpu threads):

  Global VM cgroup:  cfs_period: 500000  cfs_quota: 500000
  Each vcpu cgroup:  cfs_period: 500000  cfs_quota: 250000

and this behavior for qemu with no vcpu threads:

  Global VM cgroup:  cfs_period: 500000  cfs_quota: 500000

It's true that IO could still throw off the scheduling balance somewhat among vcpus _within_ a VM, but this effect would be confined within the vm itself. Best of both worlds?

--
Adam Litke
IBM Linux Technology Center
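A rough sketch of that behaviour in terms of the setters added later in this series (virCgroupSetCpuCfsPeriod/Quota from patch 2); the function name and signature here are invented, error handling is trimmed, and both tunables are assumed to have been set in the XML:

#include "cgroup.h"

static int set_cpu_bandwidth(virCgroupPtr vm_group,
                             virCgroupPtr *vcpu_groups, /* NULL if no vcpu threads */
                             int nvcpus,
                             unsigned long long period,
                             long long quota)
{
    /* Whole-VM cap is nvcpus times the per-vCPU quota (-1 stays "unlimited") */
    long long vm_quota = quota > 0 ? quota * nvcpus : quota;
    int i, rc;

    if ((rc = virCgroupSetCpuCfsPeriod(vm_group, period)) < 0 ||
        (rc = virCgroupSetCpuCfsQuota(vm_group, vm_quota)) < 0)
        return rc;

    if (!vcpu_groups)
        return 0; /* plain qemu: no vcpu threads, VM-level cap only */

    /* Per-vCPU caps keep scheduling fair between vCPUs inside the VM */
    for (i = 0; i < nvcpus; i++) {
        if ((rc = virCgroupSetCpuCfsPeriod(vcpu_groups[i], period)) < 0 ||
            (rc = virCgroupSetCpuCfsQuota(vcpu_groups[i], quota)) < 0)
            return rc;
    }
    return 0;
}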

On Thu, Jul 21, 2011 at 08:49:28AM -0500, Adam Litke wrote:
On 07/21/2011 08:34 AM, Daniel P. Berrange wrote:
On Thu, Jul 21, 2011 at 07:54:05AM -0500, Adam Litke wrote:
Added Anthony to give him the opportunity to address the finer points of this one especially with respect to the qemu IO thread(s).
This feature is really about capping the compute performance of a VM such that we get consistent top end performance. Yes, qemu has non-VCPU threads that this patch set doesn't govern, but that's the point. We are not attempting to throttle IO or device emulation with this feature. It's true that an IO-intensive guest may consume more host resources than a compute intensive guest, but they should still have equal top-end CPU performance when viewed from the guest's perspective.
I could be mis-understanding, what you're trying to achieve, here, so perhaps we should consider an example.
From your example, it's clear to me that you understand the use case well.
- A machine has 4 physical CPUs - There are 4 guests on the machine - Each guest has 2 virtual CPUs
So we've overcommit the host CPU resources x2 here.
Lets say that we want to use this feature to ensure consistent top end performance of every guest, splitting the host pCPUs resources evenly across all guests, so each guest is ensured 1 pCPU worth of CPU time overall.
This patch lets you do this by assigning caps per VCPU. So in this example, each VCPU cgroup would have to be configured to cap the VCPUs at 50% of a single pCPU.
This leaves the other QEMU threads uncapped / unaccounted for. If any one guest causes non-trivial compute load in a non-VCPU thread, this can/will impact the top-end compute performance of all the other guests on the machine.
If we did caps per VM, then you could set the VM cgroup such that the VM as a whole had 100% of a single pCPU.
If a guest is 100% compute bound, it can use its full 100% of a pCPU allocation in vCPU threads. If any other guest is causing CPU time in a non-VCPU thread, it cannot impact the top end compute performance of VCPU threads in the other guests.
A per-VM cap would, however, mean a guest with 2 vCPUs could have unequal scheduling, where one vCPU claimed 75% of the pCPU and the othe vCPU got left with only 25%.
So AFAICT, per-VM cgroups is better for ensuring top end compute performance of a guest as a whole, but per-VCPU cgroups can ensure consistent top end performance across vCPUs within a guest.
IMHO, per-VM cgroups is the more useful because it is the only way to stop guests impacting each other, but there could be additional benefits of *also* have per-VCPU cgroups if you want to ensure fairness of top-end performance across vCPUs inside a single VM.
What this says to me is that per-VM cgroups _in_addition_to_ per-vcpu cgroups is the _most_ useful situation. Since I can't think of any cases where someone would want per-vm and not per-vcpu, how about we always do both when supported. We can still use one pair of tunables (<period> and <quota>) and try to do the right thing. For example:
<vcpus>2</vcpus> <cputune> <period>500000</period> <quota>250000</quota> </cputune>
Would have the following behavior for qemu-kvm (vcpu threads)
Global VM cgroup: cfs_period:500000 cfs_quota:500000 Each vcpu cgroup: cfs_period:500000 cfs_quota:250000
and this behavior for qemu with no vcpu threads
So, whatever quota value is in the XML, you would multiply that by the number of vCPUs and use it to set the VM quota value?

I'm trying to think if there is ever a case where you don't want the VM value to be a plain multiple of the VCPU value, but I can't think of one. So the only real discussion point here is whether the quota value in the XML is treated as a per-VM value or a per-VCPU value.

cpu_shares is treated as a per-VM value, period doesn't matter, but cpu_quota would be a per-VCPU value, multiplied to get a per-VM value when needed. I still find this mis-match rather weird, to be fair.

So the current behaviour:

  if vCPU threads
      set quota in vCPU group
  else
      set nVCPUs * quota in VM group

would change to:

  set nVCPUs * quota in VM group
  if vCPU threads
      set quota in vCPU group

?

We need to remember to update the VM cgroup if we change the number of vCPUs on a running guest, of course.

Regards,
Daniel
--
|: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org -o- http://virt-manager.org :|
|: http://autobuild.org -o- http://search.cpan.org/~danberr/ :|
|: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 07/21/2011 09:29 AM, Daniel P. Berrange wrote:
On Thu, Jul 21, 2011 at 08:49:28AM -0500, Adam Litke wrote:
On 07/21/2011 08:34 AM, Daniel P. Berrange wrote:
On Thu, Jul 21, 2011 at 07:54:05AM -0500, Adam Litke wrote:
Added Anthony to give him the opportunity to address the finer points of this one especially with respect to the qemu IO thread(s).
This feature is really about capping the compute performance of a VM such that we get consistent top end performance. Yes, qemu has non-VCPU threads that this patch set doesn't govern, but that's the point. We are not attempting to throttle IO or device emulation with this feature. It's true that an IO-intensive guest may consume more host resources than a compute intensive guest, but they should still have equal top-end CPU performance when viewed from the guest's perspective.
I could be mis-understanding, what you're trying to achieve, here, so perhaps we should consider an example.
From your example, it's clear to me that you understand the use case well.
- A machine has 4 physical CPUs - There are 4 guests on the machine - Each guest has 2 virtual CPUs
So we've overcommit the host CPU resources x2 here.
Lets say that we want to use this feature to ensure consistent top end performance of every guest, splitting the host pCPUs resources evenly across all guests, so each guest is ensured 1 pCPU worth of CPU time overall.
This patch lets you do this by assigning caps per VCPU. So in this example, each VCPU cgroup would have to be configured to cap the VCPUs at 50% of a single pCPU.
This leaves the other QEMU threads uncapped / unaccounted for. If any one guest causes non-trivial compute load in a non-VCPU thread, this can/will impact the top-end compute performance of all the other guests on the machine.
If we did caps per VM, then you could set the VM cgroup such that the VM as a whole had 100% of a single pCPU.
If a guest is 100% compute bound, it can use its full 100% of a pCPU allocation in vCPU threads. If any other guest is causing CPU time in a non-VCPU thread, it cannot impact the top end compute performance of VCPU threads in the other guests.
A per-VM cap would, however, mean a guest with 2 vCPUs could have unequal scheduling, where one vCPU claimed 75% of the pCPU and the othe vCPU got left with only 25%.
So AFAICT, per-VM cgroups is better for ensuring top end compute performance of a guest as a whole, but per-VCPU cgroups can ensure consistent top end performance across vCPUs within a guest.
IMHO, per-VM cgroups is the more useful because it is the only way to stop guests impacting each other, but there could be additional benefits of *also* have per-VCPU cgroups if you want to ensure fairness of top-end performance across vCPUs inside a single VM.
What this says to me is that per-VM cgroups _in_addition_to_ per-vcpu cgroups is the _most_ useful situation. Since I can't think of any cases where someone would want per-vm and not per-vcpu, how about we always do both when supported. We can still use one pair of tunables (<period> and <quota>) and try to do the right thing. For example:
<vcpus>2</vcpus> <cputune> <period>500000</period> <quota>250000</quota> </cputune>
Would have the following behavior for qemu-kvm (vcpu threads)
Global VM cgroup: cfs_period:500000 cfs_quota:500000 Each vcpu cgroup: cfs_period:500000 cfs_quota:250000
and this behavior for qemu with no vcpu threads
So, whatever quota value is in the XML, you would multiply that by the number of vCPUS and use it to set the VM quota value ?
Yep.
I'm trying to think if there is ever a case where you don't want the VM to be a plain multiple of the VCPU value, but I can't think of one.
So only the real discussion point here, is whether the quota value in the XML, is treated as a per-VM value, or a per-VCPU value.
I think it has to be per-VCPU. Otherwise the user will have to remember to do the multiplication themselves. If they forget to do this they will get a nasty performance surprise.
cpu_shares is treated as a per-VM value, period doesn't matter but cpu_quota would be a per-VCPU value, multiplied to get a per-VM value when needed. I still find this mis-match rather wierd to be fair.
Yes, this is unfortunate. But cpu_shares is a comparative value whereas quota is quantitative. In the future we could apply 'shares' at the vcpu level too. In that case we'd just pick some arbitrary number and apply it to each vcpu cgroup.
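If that were ever done, a sketch might be as simple as the following (equalize_vcpu_shares is a made-up name; 1024 is the kernel's default cpu.shares weight; virCgroupSetCpuShares is the existing setter in src/util/cgroup.h):

#include "cgroup.h"

static int equalize_vcpu_shares(virCgroupPtr *vcpu_groups, int nvcpus)
{
    int i, rc;

    for (i = 0; i < nvcpus; i++) {
        /* Same arbitrary weight for every vcpu cgroup */
        rc = virCgroupSetCpuShares(vcpu_groups[i], 1024);
        if (rc < 0)
            return rc;
    }
    return 0;
}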
So current behaviour
if vCPU threads set quota in vCPU group else set nVCPUs * quota in VM group
Would change to
set nVCPUs * quota in VM group if vCPU threads set quota in vCPU group
?
Yes.
We need to remember to update the VM cgroup if we change the number of vCPUs on a running guest of course
When can this happen? Does libvirt support cpu hotplug? -- Adam Litke IBM Linux Technology Center

On 07/21/2011 09:25 AM, Adam Litke wrote:
We need to remember to update the VM cgroup if we change the number of vCPUs on a running guest of course
When can this happen? Does libvirt support cpu hotplug?
Libvirt - yes, via virDomainSetVcpusFlags and similar API. Hypervisors - different story. Xen supports it, and at one point, qemu used to, but the qemu implementation was so buggy that current qemu does not support cpu hotplug. -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org
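For reference, the libvirt call Eric mentions looks roughly like this from a management application's point of view (sketch only; the wrapper name is invented, and any cgroup quota update would still have to happen inside the qemu driver):

#include <libvirt/libvirt.h>

/* Change the live vCPU count of a running domain.  If this succeeds, the
 * per-VM cgroup quota (nvcpus * per-vCPU quota) would need recomputing. */
int grow_vcpus(virDomainPtr dom, unsigned int nvcpus)
{
    return virDomainSetVcpusFlags(dom, nvcpus, VIR_DOMAIN_VCPU_LIVE);
}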

At 07/21/2011 11:25 PM, Adam Litke wrote:
On 07/21/2011 09:29 AM, Daniel P. Berrange wrote:
On Thu, Jul 21, 2011 at 08:49:28AM -0500, Adam Litke wrote:
On 07/21/2011 08:34 AM, Daniel P. Berrange wrote:
On Thu, Jul 21, 2011 at 07:54:05AM -0500, Adam Litke wrote:
Added Anthony to give him the opportunity to address the finer points of this one especially with respect to the qemu IO thread(s).
This feature is really about capping the compute performance of a VM such that we get consistent top end performance. Yes, qemu has non-VCPU threads that this patch set doesn't govern, but that's the point. We are not attempting to throttle IO or device emulation with this feature. It's true that an IO-intensive guest may consume more host resources than a compute intensive guest, but they should still have equal top-end CPU performance when viewed from the guest's perspective.
I could be mis-understanding, what you're trying to achieve, here, so perhaps we should consider an example.
From your example, it's clear to me that you understand the use case well.
- A machine has 4 physical CPUs - There are 4 guests on the machine - Each guest has 2 virtual CPUs
So we've overcommit the host CPU resources x2 here.
Lets say that we want to use this feature to ensure consistent top end performance of every guest, splitting the host pCPUs resources evenly across all guests, so each guest is ensured 1 pCPU worth of CPU time overall.
This patch lets you do this by assigning caps per VCPU. So in this example, each VCPU cgroup would have to be configured to cap the VCPUs at 50% of a single pCPU.
This leaves the other QEMU threads uncapped / unaccounted for. If any one guest causes non-trivial compute load in a non-VCPU thread, this can/will impact the top-end compute performance of all the other guests on the machine.
If we did caps per VM, then you could set the VM cgroup such that the VM as a whole had 100% of a single pCPU.
If a guest is 100% compute bound, it can use its full 100% of a pCPU allocation in vCPU threads. If any other guest is causing CPU time in a non-VCPU thread, it cannot impact the top end compute performance of VCPU threads in the other guests.
A per-VM cap would, however, mean a guest with 2 vCPUs could have unequal scheduling, where one vCPU claimed 75% of the pCPU and the othe vCPU got left with only 25%.
So AFAICT, per-VM cgroups is better for ensuring top end compute performance of a guest as a whole, but per-VCPU cgroups can ensure consistent top end performance across vCPUs within a guest.
IMHO, per-VM cgroups is the more useful because it is the only way to stop guests impacting each other, but there could be additional benefits of *also* have per-VCPU cgroups if you want to ensure fairness of top-end performance across vCPUs inside a single VM.
What this says to me is that per-VM cgroups _in_addition_to_ per-vcpu cgroups is the _most_ useful situation. Since I can't think of any cases where someone would want per-vm and not per-vcpu, how about we always do both when supported. We can still use one pair of tunables (<period> and <quota>) and try to do the right thing. For example:
<vcpus>2</vcpus> <cputune> <period>500000</period> <quota>250000</quota> </cputune>
Would have the following behavior for qemu-kvm (vcpu threads)
Global VM cgroup: cfs_period:500000 cfs_quota:500000 Each vcpu cgroup: cfs_period:500000 cfs_quota:250000
and this behavior for qemu with no vcpu threads
So, whatever quota value is in the XML, you would multiply that by the number of vCPUS and use it to set the VM quota value ?
Yep.
I'm trying to think if there is ever a case where you don't want the VM to be a plain multiple of the VCPU value, but I can't think of one.
So only the real discussion point here, is whether the quota value in the XML, is treated as a per-VM value, or a per-VCPU value.
I think it has to be per-VCPU. Otherwise the user will have to remember to do the multiplication themselves. If they forget to do this they will get a nasty performance surprise.
cpu_shares is treated as a per-VM value, period doesn't matter but cpu_quota would be a per-VCPU value, multiplied to get a per-VM value when needed. I still find this mis-match rather wierd to be fair.
Yes, this is unfortunate. But cpu_shares is a comparative value whereas quota is quantitative. In the future we could apply 'shares' at the vcpu level too. In that case we'd just pick some arbitrary number and apply it to each vcpu cgroup.
So current behaviour
if vCPU threads set quota in vCPU group else set nVCPUs * quota in VM group
Would change to
set nVCPUs * quota in VM group if vCPU threads set quota in vCPU group
?
Yes.
I take this answer to mean that you agree with Daniel P. Berrange's idea. If so, I will implement it.
We need to remember to update the VM cgroup if we change the number of vCPUs on a running guest of course
When can this happen? Does libvirt support cpu hotplug?

On 07/22/2011 01:56 AM, Wen Congyang wrote:
So current behaviour
if vCPU threads set quota in vCPU group else set nVCPUs * quota in VM group
Would change to
set nVCPUs * quota in VM group if vCPU threads set quota in vCPU group
?
Yes.
I treat this answer as you agree with Daniel P. Berrange's idea. If so, I will implement it.
Yes, I am in agreement. -- Adam Litke IBM Linux Technology Center

--- src/libvirt_private.syms | 4 ++ src/util/cgroup.c | 81 ++++++++++++++++++++++++++++++++++++++++++++-- src/util/cgroup.h | 6 +++ 3 files changed, 88 insertions(+), 3 deletions(-) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 30804eb..8b73ec3 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -71,6 +71,8 @@ virCgroupForVcpu; virCgroupFree; virCgroupGetBlkioWeight; virCgroupGetCpuShares; +virCgroupGetCpuCfsPeriod; +virCgroupGetCpuCfsQuota; virCgroupGetCpuacctUsage; virCgroupGetFreezerState; virCgroupGetMemoryHardLimit; @@ -85,6 +87,8 @@ virCgroupPathOfController; virCgroupRemove; virCgroupSetBlkioWeight; virCgroupSetCpuShares; +virCgroupSetCpuCfsPeriod; +virCgroupSetCpuCfsQuota; virCgroupSetFreezerState; virCgroupSetMemory; virCgroupSetMemoryHardLimit; diff --git a/src/util/cgroup.c b/src/util/cgroup.c index 8994aca..378e08a 100644 --- a/src/util/cgroup.c +++ b/src/util/cgroup.c @@ -398,8 +398,6 @@ static int virCgroupSetValueI64(virCgroupPtr group, return rc; } -#if 0 -/* This is included for completeness, but not yet used */ static int virCgroupGetValueI64(virCgroupPtr group, int controller, const char *key, @@ -419,7 +417,6 @@ out: return rc; } -#endif static int virCgroupGetValueU64(virCgroupPtr group, int controller, @@ -1384,6 +1381,84 @@ int virCgroupGetCpuShares(virCgroupPtr group, unsigned long long *shares) "cpu.shares", shares); } +/** + * virCgroupSetCpuCfsPeriod: + * + * @group: The cgroup to change cpu.cfs_period_us for + * @cfs_period: The bandwidth period in usecs + * + * Returns: 0 on success + */ +int virCgroupSetCpuCfsPeriod(virCgroupPtr group, unsigned long long cfs_period) +{ + /* The cfs_period shoule be greater or equal than 1ms, and less or equal + * than 1s. + */ + if (cfs_period < 1000 || cfs_period > 1000000) + return -EINVAL; + + return virCgroupSetValueU64(group, + VIR_CGROUP_CONTROLLER_CPU, + "cpu.cfs_period_us", cfs_period); +} + +/** + * virCgroupGetCpuCfsPeriod: + * + * @group: The cgroup to get cpu.cfs_period_us for + * @cfs_period: Pointer to the returned bandwidth period in usecs + * + * Returns: 0 on success + */ +int virCgroupGetCpuCfsPeriod(virCgroupPtr group, unsigned long long *cfs_period) +{ + return virCgroupGetValueU64(group, + VIR_CGROUP_CONTROLLER_CPU, + "cpu.cfs_period_us", cfs_period); +} + +/** + * virCgroupSetCpuCfsQuota: + * + * @group: The cgroup to change cpu.cfs_quota_us for + * @cfs_quota: the cpu bandwidth (in usecs) that this tg will be allowed to + * consume over period + * + * Returns: 0 on success + */ +int virCgroupSetCpuCfsQuota(virCgroupPtr group, long long cfs_quota) +{ + if (cfs_quota >= 0) { + /* The cfs_quota shoule be greater or equal than 1ms */ + if (cfs_quota < 1000) + return -EINVAL; + + /* check overflow */ + if (cfs_quota > (unsigned long long)~0ULL / 1000) + return -EINVAL; + } + + return virCgroupSetValueI64(group, + VIR_CGROUP_CONTROLLER_CPU, + "cpu.cfs_quota_us", cfs_quota); +} + +/** + * virCgroupGetCpuCfsQuota: + * + * @group: The cgroup to get cpu.cfs_quota_us for + * @cfs_quota: Pointer to the returned cpu bandwidth (in usecs) that this tg + * will be allowed to consume over period + * + * Returns: 0 on success + */ +int virCgroupGetCpuCfsQuota(virCgroupPtr group, long long *cfs_quota) +{ + return virCgroupGetValueI64(group, + VIR_CGROUP_CONTROLLER_CPU, + "cpu.cfs_quota_us", cfs_quota); +} + int virCgroupGetCpuacctUsage(virCgroupPtr group, unsigned long long *usage) { return virCgroupGetValueU64(group, diff --git a/src/util/cgroup.h b/src/util/cgroup.h index 
1d04418..d190bb3 100644 --- a/src/util/cgroup.h +++ b/src/util/cgroup.h @@ -104,6 +104,12 @@ int virCgroupDenyDevicePath(virCgroupPtr group, int virCgroupSetCpuShares(virCgroupPtr group, unsigned long long shares); int virCgroupGetCpuShares(virCgroupPtr group, unsigned long long *shares); +int virCgroupSetCpuCfsPeriod(virCgroupPtr group, unsigned long long cfs_period); +int virCgroupGetCpuCfsPeriod(virCgroupPtr group, unsigned long long *cfs_period); + +int virCgroupSetCpuCfsQuota(virCgroupPtr group, long long cfs_quota); +int virCgroupGetCpuCfsQuota(virCgroupPtr group, long long *cfs_quota); + int virCgroupGetCpuacctUsage(virCgroupPtr group, unsigned long long *usage); int virCgroupSetFreezerState(virCgroupPtr group, const char *state); -- 1.7.1
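A small read-back sketch using the new getters (illustration only; report_cpu_bandwidth is a made-up name, and the cgroup is assumed to have been obtained with virCgroupForDomain() or virCgroupForVcpu()):

#include <stdio.h>
#include "cgroup.h"

static void report_cpu_bandwidth(virCgroupPtr group)
{
    unsigned long long period = 0;
    long long quota = 0;

    if (virCgroupGetCpuCfsPeriod(group, &period) == 0 &&
        virCgroupGetCpuCfsQuota(group, &quota) == 0)
        fprintf(stderr, "cfs_period_us=%llu cfs_quota_us=%lld\n",
                period, quota);
}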

Please add a small comment before committing.

On Thu, Jul 21, 2011 at 10:08:47AM +0800, Wen Congyang wrote:
--- src/libvirt_private.syms | 4 ++ src/util/cgroup.c | 81 ++++++++++++++++++++++++++++++++++++++++++++-- src/util/cgroup.h | 6 +++ 3 files changed, 88 insertions(+), 3 deletions(-)
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 30804eb..8b73ec3 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -71,6 +71,8 @@ virCgroupForVcpu; virCgroupFree; virCgroupGetBlkioWeight; virCgroupGetCpuShares; +virCgroupGetCpuCfsPeriod; +virCgroupGetCpuCfsQuota; virCgroupGetCpuacctUsage; virCgroupGetFreezerState; virCgroupGetMemoryHardLimit; @@ -85,6 +87,8 @@ virCgroupPathOfController; virCgroupRemove; virCgroupSetBlkioWeight; virCgroupSetCpuShares; +virCgroupSetCpuCfsPeriod; +virCgroupSetCpuCfsQuota; virCgroupSetFreezerState; virCgroupSetMemory; virCgroupSetMemoryHardLimit; diff --git a/src/util/cgroup.c b/src/util/cgroup.c index 8994aca..378e08a 100644 --- a/src/util/cgroup.c +++ b/src/util/cgroup.c @@ -398,8 +398,6 @@ static int virCgroupSetValueI64(virCgroupPtr group, return rc; }
-#if 0 -/* This is included for completeness, but not yet used */ static int virCgroupGetValueI64(virCgroupPtr group, int controller, const char *key, @@ -419,7 +417,6 @@ out:
return rc; } -#endif
static int virCgroupGetValueU64(virCgroupPtr group, int controller, @@ -1384,6 +1381,84 @@ int virCgroupGetCpuShares(virCgroupPtr group, unsigned long long *shares) "cpu.shares", shares); }
+/** + * virCgroupSetCpuCfsPeriod: + * + * @group: The cgroup to change cpu.cfs_period_us for + * @cfs_period: The bandwidth period in usecs + * + * Returns: 0 on success + */ +int virCgroupSetCpuCfsPeriod(virCgroupPtr group, unsigned long long cfs_period) +{ + /* The cfs_period shoule be greater or equal than 1ms, and less or equal + * than 1s. + */ + if (cfs_period < 1000 || cfs_period > 1000000) + return -EINVAL; + + return virCgroupSetValueU64(group, + VIR_CGROUP_CONTROLLER_CPU, + "cpu.cfs_period_us", cfs_period); +} + +/** + * virCgroupGetCpuCfsPeriod: + * + * @group: The cgroup to get cpu.cfs_period_us for + * @cfs_period: Pointer to the returned bandwidth period in usecs + * + * Returns: 0 on success + */ +int virCgroupGetCpuCfsPeriod(virCgroupPtr group, unsigned long long *cfs_period) +{ + return virCgroupGetValueU64(group, + VIR_CGROUP_CONTROLLER_CPU, + "cpu.cfs_period_us", cfs_period); +} + +/** + * virCgroupSetCpuCfsQuota: + * + * @group: The cgroup to change cpu.cfs_quota_us for + * @cfs_quota: the cpu bandwidth (in usecs) that this tg will be allowed to + * consume over period + * + * Returns: 0 on success + */ +int virCgroupSetCpuCfsQuota(virCgroupPtr group, long long cfs_quota) +{ + if (cfs_quota >= 0) { + /* The cfs_quota shoule be greater or equal than 1ms */ + if (cfs_quota < 1000) + return -EINVAL; + + /* check overflow */ + if (cfs_quota > (unsigned long long)~0ULL / 1000) + return -EINVAL; + } + + return virCgroupSetValueI64(group, + VIR_CGROUP_CONTROLLER_CPU, + "cpu.cfs_quota_us", cfs_quota); +} + +/** + * virCgroupGetCpuCfsQuota: + * + * @group: The cgroup to get cpu.cfs_quota_us for + * @cfs_quota: Pointer to the returned cpu bandwidth (in usecs) that this tg + * will be allowed to consume over period + * + * Returns: 0 on success + */ +int virCgroupGetCpuCfsQuota(virCgroupPtr group, long long *cfs_quota) +{ + return virCgroupGetValueI64(group, + VIR_CGROUP_CONTROLLER_CPU, + "cpu.cfs_quota_us", cfs_quota); +} + int virCgroupGetCpuacctUsage(virCgroupPtr group, unsigned long long *usage) { return virCgroupGetValueU64(group, diff --git a/src/util/cgroup.h b/src/util/cgroup.h index 1d04418..d190bb3 100644 --- a/src/util/cgroup.h +++ b/src/util/cgroup.h @@ -104,6 +104,12 @@ int virCgroupDenyDevicePath(virCgroupPtr group, int virCgroupSetCpuShares(virCgroupPtr group, unsigned long long shares); int virCgroupGetCpuShares(virCgroupPtr group, unsigned long long *shares);
+int virCgroupSetCpuCfsPeriod(virCgroupPtr group, unsigned long long cfs_period); +int virCgroupGetCpuCfsPeriod(virCgroupPtr group, unsigned long long *cfs_period); + +int virCgroupSetCpuCfsQuota(virCgroupPtr group, long long cfs_quota); +int virCgroupGetCpuCfsQuota(virCgroupPtr group, long long *cfs_quota); + int virCgroupGetCpuacctUsage(virCgroupPtr group, unsigned long long *usage);
int virCgroupSetFreezerState(virCgroupPtr group, const char *state);
Okay, the check for overflow could probably be done with the ULL equivalent of MAX_INT, but what's there sounds right too. ACK

Daniel
--
Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/
daniel@veillard.com | Rpmfind RPM search engine http://rpmfind.net/
http://veillard.com/ | virtualization library http://libvirt.org/
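Spelled out with the named constant being alluded to, the check might look like this (a sketch; the bound is identical to the patch's (unsigned long long)~0ULL / 1000 test, and ULLONG_MAX comes from <limits.h>):

#include <errno.h>
#include <limits.h>

static int check_cfs_quota(long long cfs_quota)
{
    if (cfs_quota >= 0) {
        if (cfs_quota < 1000)                            /* at least 1ms */
            return -EINVAL;
        if (cfs_quota > (long long)(ULLONG_MAX / 1000))  /* avoid overflow */
            return -EINVAL;
    }
    return 0;
}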

On Thu, Jul 21, 2011 at 10:08:47AM +0800, Wen Congyang wrote:
--- src/libvirt_private.syms | 4 ++ src/util/cgroup.c | 81 ++++++++++++++++++++++++++++++++++++++++++++-- src/util/cgroup.h | 6 +++ 3 files changed, 88 insertions(+), 3 deletions(-)
+int virCgroupSetCpuCfsPeriod(virCgroupPtr group, unsigned long long cfs_period) +{ + /* The cfs_period shoule be greater or equal than 1ms, and less or equal + * than 1s. + */ + if (cfs_period < 1000 || cfs_period > 1000000) + return -EINVAL; + + return virCgroupSetValueU64(group, + VIR_CGROUP_CONTROLLER_CPU, + "cpu.cfs_period_us", cfs_period); +}
+int virCgroupGetCpuCfsPeriod(virCgroupPtr group, unsigned long long *cfs_period) +{ + return virCgroupGetValueU64(group, + VIR_CGROUP_CONTROLLER_CPU, + "cpu.cfs_period_us", cfs_period); +}
BTW, what kernel version is required for this support? In my Fedora 15 kernels I only see rt_period_us, no cfs_period_us, and I can't find it in the latest 3.0 kernel tree either.

Daniel
--
|: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org -o- http://virt-manager.org :|
|: http://autobuild.org -o- http://search.cpan.org/~danberr/ :|
|: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

--- docs/schemas/domain.rng | 26 +++++++++++++++++++++++++- 1 files changed, 25 insertions(+), 1 deletions(-) diff --git a/docs/schemas/domain.rng b/docs/schemas/domain.rng index 8a4e3fe..5f8151d 100644 --- a/docs/schemas/domain.rng +++ b/docs/schemas/domain.rng @@ -388,6 +388,16 @@ <ref name="cpushares"/> </element> </optional> + <optional> + <element name="period"> + <ref name="cpuperiod"/> + </element> + </optional> + <optional> + <element name="quota"> + <ref name="cpuquota"/> + </element> + </optional> <zeroOrMore> <element name="vcpupin"> <attribute name="vcpu"> @@ -2401,7 +2411,21 @@ <data type="unsignedInt"> <param name="pattern">[0-9]+</param> </data> - </define> + </define> + <define name="cpuperiod"> + <data type="unsignedLong"> + <param name="pattern">[0-9]+</param> + <param name="minInclusive">1000</param> + <param name="maxInclusive">1000000</param> + </data> + </define> + <define name="cpuquota"> + <data type="long"> + <param name="pattern">-?[0-9]+</param> + <param name="maxInclusive">18446744073709551</param> + <param name='minInclusive'>-1</param> + </data> + </define> <define name="PortNumber"> <data type="short"> <param name="minInclusive">-1</param> -- 1.7.1
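As a hypothetical illustration (not part of the patch) of what the new cpuperiod/cpuquota definitions accept, a <cputune> fragment could now carry:

  <cputune>
    <shares>2048</shares>
    <period>100000</period>  <!-- accepted: within [1000, 1000000] -->
    <quota>-1</quota>        <!-- accepted: -1 disables the bandwidth limit -->
  </cputune>

whereas, for example, <period>500</period> (below the 1000 minimum) or <quota>18446744073709552</quota> (above the schema's maximum) would fail schema validation.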

On Thu, Jul 21, 2011 at 10:09:46AM +0800, Wen Congyang wrote:
--- docs/schemas/domain.rng | 26 +++++++++++++++++++++++++- 1 files changed, 25 insertions(+), 1 deletions(-)
diff --git a/docs/schemas/domain.rng b/docs/schemas/domain.rng index 8a4e3fe..5f8151d 100644 --- a/docs/schemas/domain.rng +++ b/docs/schemas/domain.rng @@ -388,6 +388,16 @@ <ref name="cpushares"/> </element> </optional> + <optional> + <element name="period"> + <ref name="cpuperiod"/> + </element> + </optional> + <optional> + <element name="quota"> + <ref name="cpuquota"/> + </element> + </optional> <zeroOrMore> <element name="vcpupin"> <attribute name="vcpu"> @@ -2401,7 +2411,21 @@ <data type="unsignedInt"> <param name="pattern">[0-9]+</param> </data> - </define> + </define> + <define name="cpuperiod"> + <data type="unsignedLong"> + <param name="pattern">[0-9]+</param> + <param name="minInclusive">1000</param> + <param name="maxInclusive">1000000</param> + </data> + </define> + <define name="cpuquota"> + <data type="long"> + <param name="pattern">-?[0-9]+</param> + <param name="maxInclusive">18446744073709551</param> + <param name='minInclusive'>-1</param> + </data> + </define> <define name="PortNumber"> <data type="short"> <param name="minInclusive">-1</param>
Perfect :-) ACK Daniel -- Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/ daniel@veillard.com | Rpmfind RPM search engine http://rpmfind.net/ http://veillard.com/ | virtualization library http://libvirt.org/

--- src/conf/domain_conf.c | 20 +++- src/conf/domain_conf.h | 2 + src/qemu/qemu_cgroup.c | 137 +++++++++++++++++++++++ src/qemu/qemu_cgroup.h | 4 + src/qemu/qemu_process.c | 4 + tests/qemuxml2argvdata/qemuxml2argv-cputune.xml | 2 + 6 files changed, 167 insertions(+), 2 deletions(-) diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 3c3ab39..dc579ef 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -6026,6 +6026,14 @@ static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps, &def->cputune.shares) < 0) def->cputune.shares = 0; + if (virXPathULongLong("string(./cputune/period[1])", ctxt, + &def->cputune.period) < 0) + def->cputune.period = 0; + + if (virXPathLongLong("string(./cputune/quota[1])", ctxt, + &def->cputune.quota) < 0) + def->cputune.quota = 0; + if ((n = virXPathNodeSet("./cputune/vcpupin", ctxt, &nodes)) < 0) { goto error; } @@ -9727,12 +9735,19 @@ virDomainDefFormatInternal(virDomainDefPtr def, virBufferAsprintf(&buf, " current='%u'", def->vcpus); virBufferAsprintf(&buf, ">%u</vcpu>\n", def->maxvcpus); - if (def->cputune.shares || def->cputune.vcpupin) + if (def->cputune.shares || def->cputune.vcpupin || + def->cputune.period || def->cputune.quota) virBufferAddLit(&buf, " <cputune>\n"); if (def->cputune.shares) virBufferAsprintf(&buf, " <shares>%lu</shares>\n", def->cputune.shares); + if (def->cputune.period) + virBufferAsprintf(&buf, " <period>%llu</period>\n", + def->cputune.period); + if (def->cputune.quota) + virBufferAsprintf(&buf, " <quota>%lld</quota>\n", + def->cputune.quota); if (def->cputune.vcpupin) { int i; for (i = 0; i < def->cputune.nvcpupin; i++) { @@ -9754,7 +9769,8 @@ virDomainDefFormatInternal(virDomainDefPtr def, } } - if (def->cputune.shares || def->cputune.vcpupin) + if (def->cputune.shares || def->cputune.vcpupin || + def->cputune.period || def->cputune.quota) virBufferAddLit(&buf, " </cputune>\n"); if (def->numatune.memory.nodemask) diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index 7a3d72b..fc7668d 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -1190,6 +1190,8 @@ struct _virDomainDef { struct { unsigned long shares; + unsigned long long period; + long long quota; int nvcpupin; virDomainVcpuPinDefPtr *vcpupin; } cputune; diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index b357852..bb1c1c8 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -24,6 +24,7 @@ #include <config.h> #include "qemu_cgroup.h" +#include "qemu_domain.h" #include "cgroup.h" #include "logging.h" #include "memory.h" @@ -376,6 +377,142 @@ cleanup: return -1; } +int qemuSetupCgroupVcpuBW(virCgroupPtr cgroup, unsigned long long period, + long long quota) +{ + int rc; + unsigned long long old_period; + + if (period == 0 && quota == 0) + return 0; + + if (period) { + /* get old period, and we can rollback if set quota failed */ + rc = virCgroupGetCpuCfsPeriod(cgroup, &old_period); + if (rc < 0) { + virReportSystemError(-rc, + _("%s"), "Unable to get cpu bandwidth period"); + return -1; + } + + rc = virCgroupSetCpuCfsPeriod(cgroup, period); + if (rc < 0) { + virReportSystemError(-rc, + _("%s"), "Unable to set cpu bandwidth period"); + return -1; + } + } + + if (quota) { + rc = virCgroupSetCpuCfsQuota(cgroup, quota); + if (rc < 0) { + virReportSystemError(-rc, + _("%s"), "Unable to set cpu bandwidth quota"); + goto cleanup; + } + } + + return 0; + +cleanup: + if (period) { + rc = virCgroupSetCpuCfsPeriod(cgroup, old_period); + if (rc < 0) + virReportSystemError(-rc, + _("%s"), + "Unable to 
rollback cpu bandwidth period"); + } + + return -1; +} + +int qemuSetupCgroupForVcpu(struct qemud_driver *driver, virDomainObjPtr vm) +{ + virCgroupPtr cgroup = NULL; + virCgroupPtr cgroup_vcpu = NULL; + qemuDomainObjPrivatePtr priv = vm->privateData; + int rc; + unsigned int i; + unsigned long long period = vm->def->cputune.period; + long long quota = vm->def->cputune.quota; + + if (driver->cgroup == NULL) + return 0; /* Not supported, so claim success */ + + rc = virCgroupForDomain(driver->cgroup, vm->def->name, &cgroup, 0); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to find cgroup for %s"), + vm->def->name); + goto cleanup; + } + + if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { + /* If we does not know VCPU<->PID mapping or all vcpu runs in the same + * thread, we can not control each vcpu. + */ + if (period || quota) { + if (qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) { + /* Ensure that we can multiply by vcpus without overflowing. */ + if (quota > LLONG_MAX / vm->def->vcpus) { + virReportSystemError(EINVAL, + _("%s"), + "Unable to set cpu bandwidth quota"); + goto cleanup; + } + + if (quota > 0) + quota *= vm->def->vcpus; + if (qemuSetupCgroupVcpuBW(cgroup, period, quota) < 0) + goto cleanup; + } + } + return 0; + } + + for (i = 0; i < priv->nvcpupids; i++) { + rc = virCgroupForVcpu(cgroup, i, &cgroup_vcpu, 1); + if (rc < 0) { + virReportSystemError(-rc, + _("Unable to create vcpu cgroup for %s(vcpu:" + " %d)"), + vm->def->name, i); + goto cleanup; + } + + /* move the thread for vcpu to sub dir */ + rc = virCgroupAddTask(cgroup_vcpu, priv->vcpupids[i]); + if (rc < 0) { + virReportSystemError(-rc, + _("unable to add vcpu %d task %d to cgroup"), + i, priv->vcpupids[i]); + goto cleanup; + } + + if (period || quota) { + if (qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) { + if (qemuSetupCgroupVcpuBW(cgroup_vcpu, period, quota) < 0) + goto cleanup; + } + } + + virCgroupFree(&cgroup_vcpu); + } + + virCgroupFree(&cgroup_vcpu); + virCgroupFree(&cgroup); + return 0; + +cleanup: + virCgroupFree(&cgroup_vcpu); + if (cgroup) { + virCgroupRemove(cgroup); + virCgroupFree(&cgroup); + } + + return -1; +} + int qemuRemoveCgroup(struct qemud_driver *driver, virDomainObjPtr vm, diff --git a/src/qemu/qemu_cgroup.h b/src/qemu/qemu_cgroup.h index e8abfb4..17164d9 100644 --- a/src/qemu/qemu_cgroup.h +++ b/src/qemu/qemu_cgroup.h @@ -49,6 +49,10 @@ int qemuSetupHostUsbDeviceCgroup(usbDevice *dev, void *opaque); int qemuSetupCgroup(struct qemud_driver *driver, virDomainObjPtr vm); +int qemuSetupCgroupVcpuBW(virCgroupPtr cgroup, + unsigned long long period, + long long quota); +int qemuSetupCgroupForVcpu(struct qemud_driver *driver, virDomainObjPtr vm); int qemuRemoveCgroup(struct qemud_driver *driver, virDomainObjPtr vm, int quiet); diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 448b06e..a1fbe06 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -2789,6 +2789,10 @@ int qemuProcessStart(virConnectPtr conn, if (qemuProcessDetectVcpuPIDs(driver, vm) < 0) goto cleanup; + VIR_DEBUG("Setting cgroup for each VCPU(if required)"); + if (qemuSetupCgroupForVcpu(driver, vm) < 0) + goto cleanup; + VIR_DEBUG("Setting VCPU affinities"); if (qemuProcessSetVcpuAffinites(conn, vm) < 0) goto cleanup; diff --git a/tests/qemuxml2argvdata/qemuxml2argv-cputune.xml b/tests/qemuxml2argvdata/qemuxml2argv-cputune.xml index 0afbadb..091865a 100644 --- a/tests/qemuxml2argvdata/qemuxml2argv-cputune.xml +++ 
b/tests/qemuxml2argvdata/qemuxml2argv-cputune.xml @@ -6,6 +6,8 @@ <vcpu>2</vcpu> <cputune> <shares>2048</shares> + <period>1000000</period> + <quota>-1</quota> <vcpupin vcpu='0' cpuset='0'/> <vcpupin vcpu='1' cpuset='1'/> </cputune> -- 1.7.1

On Thu, Jul 21, 2011 at 10:10:31AM +0800, Wen Congyang wrote:
--- src/conf/domain_conf.c | 20 +++- src/conf/domain_conf.h | 2 + src/qemu/qemu_cgroup.c | 137 +++++++++++++++++++++++ src/qemu/qemu_cgroup.h | 4 + src/qemu/qemu_process.c | 4 + tests/qemuxml2argvdata/qemuxml2argv-cputune.xml | 2 + 6 files changed, 167 insertions(+), 2 deletions(-)
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 3c3ab39..dc579ef 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -6026,6 +6026,14 @@ static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps, &def->cputune.shares) < 0) def->cputune.shares = 0;
+ if (virXPathULongLong("string(./cputune/period[1])", ctxt, + &def->cputune.period) < 0) + def->cputune.period = 0; + + if (virXPathLongLong("string(./cputune/quota[1])", ctxt, + &def->cputune.quota) < 0) + def->cputune.quota = 0; + if ((n = virXPathNodeSet("./cputune/vcpupin", ctxt, &nodes)) < 0) { goto error; } @@ -9727,12 +9735,19 @@ virDomainDefFormatInternal(virDomainDefPtr def, virBufferAsprintf(&buf, " current='%u'", def->vcpus); virBufferAsprintf(&buf, ">%u</vcpu>\n", def->maxvcpus);
- if (def->cputune.shares || def->cputune.vcpupin) + if (def->cputune.shares || def->cputune.vcpupin || + def->cputune.period || def->cputune.quota) virBufferAddLit(&buf, " <cputune>\n");
if (def->cputune.shares) virBufferAsprintf(&buf, " <shares>%lu</shares>\n", def->cputune.shares); + if (def->cputune.period) + virBufferAsprintf(&buf, " <period>%llu</period>\n", + def->cputune.period); + if (def->cputune.quota) + virBufferAsprintf(&buf, " <quota>%lld</quota>\n", + def->cputune.quota); if (def->cputune.vcpupin) { int i; for (i = 0; i < def->cputune.nvcpupin; i++) { @@ -9754,7 +9769,8 @@ virDomainDefFormatInternal(virDomainDefPtr def, } }
- if (def->cputune.shares || def->cputune.vcpupin) + if (def->cputune.shares || def->cputune.vcpupin || + def->cputune.period || def->cputune.quota) virBufferAddLit(&buf, " </cputune>\n");
if (def->numatune.memory.nodemask) diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index 7a3d72b..fc7668d 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -1190,6 +1190,8 @@ struct _virDomainDef {
struct { unsigned long shares; + unsigned long long period; + long long quota; int nvcpupin; virDomainVcpuPinDefPtr *vcpupin; } cputune; diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index b357852..bb1c1c8 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -24,6 +24,7 @@ #include <config.h>
#include "qemu_cgroup.h" +#include "qemu_domain.h" #include "cgroup.h" #include "logging.h" #include "memory.h" @@ -376,6 +377,142 @@ cleanup: return -1; }
+int qemuSetupCgroupVcpuBW(virCgroupPtr cgroup, unsigned long long period, + long long quota) +{ + int rc; + unsigned long long old_period; + + if (period == 0 && quota == 0) + return 0; + + if (period) { + /* get old period, and we can rollback if set quota failed */ + rc = virCgroupGetCpuCfsPeriod(cgroup, &old_period); + if (rc < 0) { + virReportSystemError(-rc, + _("%s"), "Unable to get cpu bandwidth period"); + return -1; + } + + rc = virCgroupSetCpuCfsPeriod(cgroup, period); + if (rc < 0) { + virReportSystemError(-rc, + _("%s"), "Unable to set cpu bandwidth period"); + return -1; + } + } + + if (quota) { + rc = virCgroupSetCpuCfsQuota(cgroup, quota); + if (rc < 0) { + virReportSystemError(-rc, + _("%s"), "Unable to set cpu bandwidth quota"); + goto cleanup; + } + } + + return 0; + +cleanup: + if (period) { + rc = virCgroupSetCpuCfsPeriod(cgroup, old_period); + if (rc < 0) + virReportSystemError(-rc, + _("%s"), + "Unable to rollback cpu bandwidth period"); + } + + return -1; +} + +int qemuSetupCgroupForVcpu(struct qemud_driver *driver, virDomainObjPtr vm) +{ + virCgroupPtr cgroup = NULL; + virCgroupPtr cgroup_vcpu = NULL; + qemuDomainObjPrivatePtr priv = vm->privateData; + int rc; + unsigned int i; + unsigned long long period = vm->def->cputune.period; + long long quota = vm->def->cputune.quota; + + if (driver->cgroup == NULL) + return 0; /* Not supported, so claim success */ + + rc = virCgroupForDomain(driver->cgroup, vm->def->name, &cgroup, 0); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to find cgroup for %s"), + vm->def->name); + goto cleanup; + } + + if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { + /* If we does not know VCPU<->PID mapping or all vcpu runs in the same + * thread, we can not control each vcpu. + */ + if (period || quota) { + if (qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) { + /* Ensure that we can multiply by vcpus without overflowing. */ + if (quota > LLONG_MAX / vm->def->vcpus) { + virReportSystemError(EINVAL, + _("%s"), + "Unable to set cpu bandwidth quota"); + goto cleanup; + } + + if (quota > 0) + quota *= vm->def->vcpus; + if (qemuSetupCgroupVcpuBW(cgroup, period, quota) < 0) + goto cleanup; + } + } + return 0; + } + + for (i = 0; i < priv->nvcpupids; i++) { + rc = virCgroupForVcpu(cgroup, i, &cgroup_vcpu, 1); + if (rc < 0) { + virReportSystemError(-rc, + _("Unable to create vcpu cgroup for %s(vcpu:" + " %d)"), + vm->def->name, i); + goto cleanup; + } + + /* move the thread for vcpu to sub dir */ + rc = virCgroupAddTask(cgroup_vcpu, priv->vcpupids[i]); + if (rc < 0) { + virReportSystemError(-rc, + _("unable to add vcpu %d task %d to cgroup"), + i, priv->vcpupids[i]); + goto cleanup; + } + + if (period || quota) { + if (qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) { + if (qemuSetupCgroupVcpuBW(cgroup_vcpu, period, quota) < 0) + goto cleanup; + } + } + + virCgroupFree(&cgroup_vcpu); + } + + virCgroupFree(&cgroup_vcpu); + virCgroupFree(&cgroup); + return 0; + +cleanup: + virCgroupFree(&cgroup_vcpu); + if (cgroup) { + virCgroupRemove(cgroup); + virCgroupFree(&cgroup); + } + + return -1; +} +
int qemuRemoveCgroup(struct qemud_driver *driver, virDomainObjPtr vm, diff --git a/src/qemu/qemu_cgroup.h b/src/qemu/qemu_cgroup.h index e8abfb4..17164d9 100644 --- a/src/qemu/qemu_cgroup.h +++ b/src/qemu/qemu_cgroup.h @@ -49,6 +49,10 @@ int qemuSetupHostUsbDeviceCgroup(usbDevice *dev, void *opaque); int qemuSetupCgroup(struct qemud_driver *driver, virDomainObjPtr vm); +int qemuSetupCgroupVcpuBW(virCgroupPtr cgroup, + unsigned long long period, + long long quota); +int qemuSetupCgroupForVcpu(struct qemud_driver *driver, virDomainObjPtr vm); int qemuRemoveCgroup(struct qemud_driver *driver, virDomainObjPtr vm, int quiet); diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 448b06e..a1fbe06 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -2789,6 +2789,10 @@ int qemuProcessStart(virConnectPtr conn, if (qemuProcessDetectVcpuPIDs(driver, vm) < 0) goto cleanup;
+ VIR_DEBUG("Setting cgroup for each VCPU(if required)"); + if (qemuSetupCgroupForVcpu(driver, vm) < 0) + goto cleanup; + VIR_DEBUG("Setting VCPU affinities"); if (qemuProcessSetVcpuAffinites(conn, vm) < 0) goto cleanup; diff --git a/tests/qemuxml2argvdata/qemuxml2argv-cputune.xml b/tests/qemuxml2argvdata/qemuxml2argv-cputune.xml index 0afbadb..091865a 100644 --- a/tests/qemuxml2argvdata/qemuxml2argv-cputune.xml +++ b/tests/qemuxml2argvdata/qemuxml2argv-cputune.xml @@ -6,6 +6,8 @@ <vcpu>2</vcpu> <cputune> <shares>2048</shares> + <period>1000000</period> + <quota>-1</quota> <vcpupin vcpu='0' cpuset='0'/> <vcpupin vcpu='1' cpuset='1'/> </cputune>
Okay, again please add some comment about the code addition in the commit message. We should make clear somehow that a quota or period of zero is simply ignored, but that looks fine, ACK, Daniel -- Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/ daniel@veillard.com | Rpmfind RPM search engine http://rpmfind.net/ http://veillard.com/ | virtualization library http://libvirt.org/
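To spell that out: a zero period or quota is skipped by the if (period) / if (quota) guards in qemuSetupCgroupVcpuBW, and when both are zero the function returns immediately. Purely for illustration, a call like

  qemuSetupCgroupVcpuBW(cgroup, 0, 0);  /* period == 0 && quota == 0:
                                           returns 0, nothing is written */

leaves cgroupfs untouched.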

--- src/qemu/qemu_driver.c | 312 ++++++++++++++++++++++++++++++++++++++++++++---- 1 files changed, 287 insertions(+), 25 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index cd65bce..fd80537 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -5139,11 +5139,46 @@ cleanup: } +/* + * check whether the host supports CFS bandwidth + * + * Return 1 when CFS bandwidth is supported, 0 when CFS bandwidth is not + * supported, -1 on error. + */ +static int qemuGetCpuBWStatus(virCgroupPtr cgroup) +{ + char *cfs_period_path = NULL; + int ret = -1; + + if (!cgroup) + return 0; + + if (virCgroupPathOfController(cgroup, VIR_CGROUP_CONTROLLER_CPU, + "cpu.cfs_period_us", &cfs_period_path) < 0) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + "%s", + _("cannot get the path of cgroup CPU controller")); + goto cleanup; + } + + if (access(cfs_period_path, F_OK) < 0) { + ret = 0; + } else { + ret = 1; + } + +cleanup: + VIR_FREE(cfs_period_path); + return ret; +} + + static char *qemuGetSchedulerType(virDomainPtr dom, int *nparams) { struct qemud_driver *driver = dom->conn->privateData; char *ret = NULL; + int rc; qemuDriverLock(driver); if (!qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) { @@ -5152,8 +5187,15 @@ static char *qemuGetSchedulerType(virDomainPtr dom, goto cleanup; } - if (nparams) - *nparams = 1; + if (nparams) { + rc = qemuGetCpuBWStatus(driver->cgroup); + if (rc < 0) + goto cleanup; + else if (rc == 0) + *nparams = 1; + else + *nparams = 3; + } ret = strdup("posix"); if (!ret) @@ -5786,6 +5828,58 @@ cleanup: return ret; } +static int +qemuSetVcpusBWLive(virDomainObjPtr vm, virCgroupPtr cgroup, + unsigned long long period, long long quota) +{ + int i; + qemuDomainObjPrivatePtr priv = vm->privateData; + virCgroupPtr cgroup_vcpu = NULL; + int rc; + + if (period == 0 && quota == 0) + return 0; + + if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { + /* If we does not know VCPU<->PID mapping or all vcpu runs in the same + * thread, we can not control each vcpu. + */ + /* Ensure that we can multiply by vcpus without overflowing. 
*/ + if (quota > LLONG_MAX / vm->def->vcpus) { + virReportSystemError(EINVAL, + _("%s"), + "Unable to set cpu bandwidth quota"); + goto cleanup; + } + + if (quota > 0) + quota *= vm->def->vcpus; + return qemuSetupCgroupVcpuBW(cgroup, period, quota); + } + + for (i = 0; i < priv->nvcpupids; i++) { + rc = virCgroupForVcpu(cgroup, i, &cgroup_vcpu, 0); + if (rc < 0) { + virReportSystemError(-rc, + _("Unable to find vcpu cgroup for %s(vcpu:" + " %d)"), + vm->def->name, i); + goto cleanup; + } + + if (qemuSetupCgroupVcpuBW(cgroup_vcpu, period, quota) < 0) + goto cleanup; + + virCgroupFree(&cgroup_vcpu); + } + + return 0; + +cleanup: + virCgroupFree(&cgroup_vcpu); + return -1; +} + static int qemuSetSchedulerParametersFlags(virDomainPtr dom, virTypedParameterPtr params, int nparams, @@ -5795,9 +5889,10 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom, int i; virCgroupPtr group = NULL; virDomainObjPtr vm = NULL; - virDomainDefPtr persistentDef = NULL; + virDomainDefPtr vmdef = NULL; int ret = -1; bool isActive; + int rc; virCheckFlags(VIR_DOMAIN_AFFECT_LIVE | VIR_DOMAIN_AFFECT_CONFIG, -1); @@ -5821,10 +5916,17 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom, flags = VIR_DOMAIN_AFFECT_CONFIG; } - if ((flags & VIR_DOMAIN_AFFECT_CONFIG) && !vm->persistent) { - qemuReportError(VIR_ERR_OPERATION_INVALID, "%s", - _("cannot change persistent config of a transient domain")); - goto cleanup; + if (flags & VIR_DOMAIN_AFFECT_CONFIG) { + if (!vm->persistent) { + qemuReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("cannot change persistent config of a transient domain")); + goto cleanup; + } + + /* Make a copy for updated domain. */ + vmdef = virDomainObjCopyPersistentDef(driver->caps, vm); + if (!vmdef) + goto cleanup; } if (flags & VIR_DOMAIN_AFFECT_LIVE) { @@ -5851,7 +5953,6 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom, virTypedParameterPtr param = ¶ms[i]; if (STREQ(param->field, "cpu_shares")) { - int rc; if (param->type != VIR_TYPED_PARAM_ULLONG) { qemuReportError(VIR_ERR_INVALID_ARG, "%s", _("invalid type for cpu_shares tunable, expected a 'ullong'")); @@ -5870,19 +5971,47 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom, } if (flags & VIR_DOMAIN_AFFECT_CONFIG) { - persistentDef = virDomainObjGetPersistentDef(driver->caps, vm); - if (!persistentDef) { - qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("can't get persistentDef")); + vmdef->cputune.shares = params[i].value.ul; + } + } else if (STREQ(param->field, "cfs_period")) { + if (param->type != VIR_TYPED_PARAM_ULLONG) { + qemuReportError(VIR_ERR_INVALID_ARG, "%s", + _("invalid type for cfs_period tunable," + " expected a 'ullong'")); + goto cleanup; + } + + if (flags & VIR_DOMAIN_AFFECT_LIVE) { + rc = qemuSetVcpusBWLive(vm, group, params[i].value.ul, 0); + if (rc != 0) goto cleanup; - } - persistentDef->cputune.shares = params[i].value.ul; - rc = virDomainSaveConfig(driver->configDir, persistentDef); - if (rc) { - qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("can't save config")); + + if (params[i].value.ul) + vm->def->cputune.period = params[i].value.ul; + } + + if (flags & VIR_DOMAIN_AFFECT_CONFIG) { + vmdef->cputune.period = params[i].value.ul; + } + } else if (STREQ(param->field, "cfs_quota")) { + if (param->type != VIR_TYPED_PARAM_LLONG) { + qemuReportError(VIR_ERR_INVALID_ARG, "%s", + _("invalid type for cfs_quota tunable," + " expected a 'llong'")); + goto cleanup; + } + + if (flags & VIR_DOMAIN_AFFECT_LIVE) { + rc = qemuSetVcpusBWLive(vm, group, 0, params[i].value.l); + if 
(rc != 0) goto cleanup; - } + + if (params[i].value.l) + vm->def->cputune.quota = params[i].value.l; + } + + if (flags & VIR_DOMAIN_AFFECT_CONFIG) { + vmdef->cputune.quota = params[i].value.l; } } else { qemuReportError(VIR_ERR_INVALID_ARG, @@ -5891,9 +6020,23 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom, } } + if (virDomainSaveStatus(driver->caps, driver->stateDir, vm) < 0) + goto cleanup; + + + if (flags & VIR_DOMAIN_AFFECT_CONFIG) { + rc = virDomainSaveConfig(driver->configDir, vmdef); + if (rc < 0) + goto cleanup; + + virDomainObjAssignDef(vm, vmdef, false); + vmdef = NULL; + } + ret = 0; cleanup: + virDomainDefFree(vmdef); virCgroupFree(&group); if (vm) virDomainObjUnlock(vm); @@ -5912,6 +6055,71 @@ static int qemuSetSchedulerParameters(virDomainPtr dom, } static int +qemuGetVcpuBWLive(virCgroupPtr cgroup, unsigned long long *period, + long long *quota) +{ + int rc; + + rc = virCgroupGetCpuCfsPeriod(cgroup, period); + if (rc < 0) { + virReportSystemError(-rc, "%s", + _("unable to get cpu bandwidth period tunable")); + return -1; + } + + rc = virCgroupGetCpuCfsQuota(cgroup, quota); + if (rc < 0) { + virReportSystemError(-rc, "%s", + _("unable to get cpu bandwidth tunable")); + return -1; + } + + return 0; +} + +static int +qemuGetVcpusBWLive(virDomainObjPtr vm, virCgroupPtr cgroup, + unsigned long long *period, long long *quota) +{ + virCgroupPtr cgroup_vcpu = NULL; + qemuDomainObjPrivatePtr priv = NULL; + int rc; + int ret = -1; + + priv = vm->privateData; + if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { + /* We do not create sub dir for each vcpu */ + rc = qemuGetVcpuBWLive(cgroup, period, quota); + if (rc < 0) + goto cleanup; + + if (*quota > 0) + *quota /= vm->def->vcpus; + goto out; + } + + /* get period and quota for vcpu0 */ + rc = virCgroupForVcpu(cgroup, 0, &cgroup_vcpu, 0); + if (!cgroup_vcpu) { + virReportSystemError(-rc, + _("Unable to find vcpu cgroup for %s(vcpu: 0)"), + vm->def->name); + goto cleanup; + } + + rc = qemuGetVcpuBWLive(cgroup_vcpu, period, quota); + if (rc < 0) + goto cleanup; + +out: + ret = 0; + +cleanup: + virCgroupFree(&cgroup_vcpu); + return ret; +} + +static int qemuGetSchedulerParametersFlags(virDomainPtr dom, virTypedParameterPtr params, int *nparams, @@ -5920,10 +6128,14 @@ qemuGetSchedulerParametersFlags(virDomainPtr dom, struct qemud_driver *driver = dom->conn->privateData; virCgroupPtr group = NULL; virDomainObjPtr vm = NULL; - unsigned long long val; + unsigned long long shares; + unsigned long long period; + long long quota; int ret = -1; int rc; bool isActive; + bool cpu_bw_status; + int saved_nparams = 0; virCheckFlags(VIR_DOMAIN_AFFECT_LIVE | VIR_DOMAIN_AFFECT_CONFIG, -1); @@ -5943,6 +6155,13 @@ qemuGetSchedulerParametersFlags(virDomainPtr dom, goto cleanup; } + if (*nparams > 1) { + rc = qemuGetCpuBWStatus(driver->cgroup); + if (rc < 0) + goto cleanup; + cpu_bw_status = !!rc; + } + vm = virDomainFindByUUID(&driver->domains, dom->uuid); if (vm == NULL) { @@ -5976,9 +6195,17 @@ qemuGetSchedulerParametersFlags(virDomainPtr dom, _("can't get persistentDef")); goto cleanup; } - val = persistentDef->cputune.shares; + shares = persistentDef->cputune.shares; + if (*nparams > 1 && cpu_bw_status) { + period = persistentDef->cputune.period; + quota = persistentDef->cputune.quota; + } } else { - val = vm->def->cputune.shares; + shares = vm->def->cputune.shares; + if (*nparams > 1 && cpu_bw_status) { + period = vm->def->cputune.period; + quota = vm->def->cputune.quota; + } } goto out; } @@ -6001,14 +6228,20 @@ 
qemuGetSchedulerParametersFlags(virDomainPtr dom, goto cleanup; } - rc = virCgroupGetCpuShares(group, &val); + rc = virCgroupGetCpuShares(group, &shares); if (rc != 0) { virReportSystemError(-rc, "%s", _("unable to get cpu shares tunable")); goto cleanup; } + + if (*nparams > 1 && cpu_bw_status) { + rc = qemuGetVcpusBWLive(vm, group, &period, "a); + if (rc != 0) + goto cleanup; + } out: - params[0].value.ul = val; + params[0].value.ul = shares; params[0].type = VIR_TYPED_PARAM_ULLONG; if (virStrcpyStatic(params[0].field, "cpu_shares") == NULL) { qemuReportError(VIR_ERR_INTERNAL_ERROR, @@ -6016,7 +6249,36 @@ out: goto cleanup; } - *nparams = 1; + saved_nparams++; + + if (cpu_bw_status) { + if (*nparams > saved_nparams) { + params[1].value.ul = period; + params[1].type = VIR_TYPED_PARAM_ULLONG; + if (virStrcpyStatic(params[1].field, "cfs_period") == NULL) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + "%s", + _("Field cfs_period too long for destination")); + goto cleanup; + } + saved_nparams++; + } + + if (*nparams > saved_nparams) { + params[2].value.ul = quota; + params[2].type = VIR_TYPED_PARAM_LLONG; + if (virStrcpyStatic(params[2].field, "cfs_quota") == NULL) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + "%s", + _("Field cfs_quota too long for destination")); + goto cleanup; + } + saved_nparams++; + } + } + + *nparams = saved_nparams; + ret = 0; cleanup: -- 1.7.1

On Thu, Jul 21, 2011 at 10:11:32AM +0800, Wen Congyang wrote:
--- src/qemu/qemu_driver.c | 312 ++++++++++++++++++++++++++++++++++++++++++++---- 1 files changed, 287 insertions(+), 25 deletions(-)
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index cd65bce..fd80537 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -5139,11 +5139,46 @@ cleanup: }
+/* + * check whether the host supports CFS bandwidth + * + * Return 1 when CFS bandwidth is supported, 0 when CFS bandwidth is not + * supported, -1 on error. + */ +static int qemuGetCpuBWStatus(virCgroupPtr cgroup) +{ + char *cfs_period_path = NULL; + int ret = -1; + + if (!cgroup) + return 0; + + if (virCgroupPathOfController(cgroup, VIR_CGROUP_CONTROLLER_CPU, + "cpu.cfs_period_us", &cfs_period_path) < 0) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + "%s", + _("cannot get the path of cgroup CPU controller"));
Hum, I'm not sure we should really flag this as an error here; it should be made an INFO, I think. What should get an error is if we try to start using cpu control on a guest while the host doesn't support it. In practice we need to check proper handling in 3 cases:
- at guest startup
- on migration, when checking the target host
- when activated at runtime
+ goto cleanup; + } + + if (access(cfs_period_path, F_OK) < 0) { + ret = 0; + } else { + ret = 1; + } + +cleanup: + VIR_FREE(cfs_period_path); + return ret; +} + + static char *qemuGetSchedulerType(virDomainPtr dom, int *nparams) { struct qemud_driver *driver = dom->conn->privateData; char *ret = NULL; + int rc;
qemuDriverLock(driver); if (!qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) { @@ -5152,8 +5187,15 @@ static char *qemuGetSchedulerType(virDomainPtr dom, goto cleanup; }
- if (nparams) - *nparams = 1; + if (nparams) { + rc = qemuGetCpuBWStatus(driver->cgroup); + if (rc < 0) + goto cleanup; + else if (rc == 0) + *nparams = 1; + else + *nparams = 3; + }
ret = strdup("posix"); if (!ret) @@ -5786,6 +5828,58 @@ cleanup: return ret; }
+static int +qemuSetVcpusBWLive(virDomainObjPtr vm, virCgroupPtr cgroup, + unsigned long long period, long long quota) +{ + int i; + qemuDomainObjPrivatePtr priv = vm->privateData; + virCgroupPtr cgroup_vcpu = NULL; + int rc; + + if (period == 0 && quota == 0) + return 0; + + if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { + /* If we does not know VCPU<->PID mapping or all vcpu runs in the same + * thread, we can not control each vcpu. + */ + /* Ensure that we can multiply by vcpus without overflowing. */ + if (quota > LLONG_MAX / vm->def->vcpus) { + virReportSystemError(EINVAL, + _("%s"), + "Unable to set cpu bandwidth quota");
should probably give a hint of why in the error message
+ goto cleanup; + } + + if (quota > 0) + quota *= vm->def->vcpus; + return qemuSetupCgroupVcpuBW(cgroup, period, quota); + } + + for (i = 0; i < priv->nvcpupids; i++) { + rc = virCgroupForVcpu(cgroup, i, &cgroup_vcpu, 0); + if (rc < 0) { + virReportSystemError(-rc, + _("Unable to find vcpu cgroup for %s(vcpu:" + " %d)"), + vm->def->name, i); + goto cleanup; + } + + if (qemuSetupCgroupVcpuBW(cgroup_vcpu, period, quota) < 0) + goto cleanup; + + virCgroupFree(&cgroup_vcpu); + } + + return 0; + +cleanup: + virCgroupFree(&cgroup_vcpu); + return -1; +} + static int qemuSetSchedulerParametersFlags(virDomainPtr dom, virTypedParameterPtr params, int nparams, @@ -5795,9 +5889,10 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom, int i; virCgroupPtr group = NULL; virDomainObjPtr vm = NULL; - virDomainDefPtr persistentDef = NULL; + virDomainDefPtr vmdef = NULL; int ret = -1; bool isActive; + int rc;
virCheckFlags(VIR_DOMAIN_AFFECT_LIVE | VIR_DOMAIN_AFFECT_CONFIG, -1); @@ -5821,10 +5916,17 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom, flags = VIR_DOMAIN_AFFECT_CONFIG; }
- if ((flags & VIR_DOMAIN_AFFECT_CONFIG) && !vm->persistent) { - qemuReportError(VIR_ERR_OPERATION_INVALID, "%s", - _("cannot change persistent config of a transient domain")); - goto cleanup; + if (flags & VIR_DOMAIN_AFFECT_CONFIG) { + if (!vm->persistent) { + qemuReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("cannot change persistent config of a transient domain")); + goto cleanup; + } + + /* Make a copy for updated domain. */ + vmdef = virDomainObjCopyPersistentDef(driver->caps, vm); + if (!vmdef) + goto cleanup; }
if (flags & VIR_DOMAIN_AFFECT_LIVE) { @@ -5851,7 +5953,6 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom, virTypedParameterPtr param = ¶ms[i];
if (STREQ(param->field, "cpu_shares")) { - int rc; if (param->type != VIR_TYPED_PARAM_ULLONG) { qemuReportError(VIR_ERR_INVALID_ARG, "%s", _("invalid type for cpu_shares tunable, expected a 'ullong'")); @@ -5870,19 +5971,47 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom, }
if (flags & VIR_DOMAIN_AFFECT_CONFIG) { - persistentDef = virDomainObjGetPersistentDef(driver->caps, vm); - if (!persistentDef) { - qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("can't get persistentDef")); + vmdef->cputune.shares = params[i].value.ul; + } + } else if (STREQ(param->field, "cfs_period")) { + if (param->type != VIR_TYPED_PARAM_ULLONG) { + qemuReportError(VIR_ERR_INVALID_ARG, "%s", + _("invalid type for cfs_period tunable," + " expected a 'ullong'")); + goto cleanup; + } + + if (flags & VIR_DOMAIN_AFFECT_LIVE) { + rc = qemuSetVcpusBWLive(vm, group, params[i].value.ul, 0); + if (rc != 0) goto cleanup; - } - persistentDef->cputune.shares = params[i].value.ul; - rc = virDomainSaveConfig(driver->configDir, persistentDef); - if (rc) { - qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("can't save config")); + + if (params[i].value.ul) + vm->def->cputune.period = params[i].value.ul; + } + + if (flags & VIR_DOMAIN_AFFECT_CONFIG) { + vmdef->cputune.period = params[i].value.ul; + } + } else if (STREQ(param->field, "cfs_quota")) { + if (param->type != VIR_TYPED_PARAM_LLONG) { + qemuReportError(VIR_ERR_INVALID_ARG, "%s", + _("invalid type for cfs_quota tunable," + " expected a 'llong'")); + goto cleanup; + } + + if (flags & VIR_DOMAIN_AFFECT_LIVE) { + rc = qemuSetVcpusBWLive(vm, group, 0, params[i].value.l); + if (rc != 0) goto cleanup; - } + + if (params[i].value.l) + vm->def->cputune.quota = params[i].value.l; + } + + if (flags & VIR_DOMAIN_AFFECT_CONFIG) { + vmdef->cputune.quota = params[i].value.l; } } else { qemuReportError(VIR_ERR_INVALID_ARG, @@ -5891,9 +6020,23 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom, } }
+ if (virDomainSaveStatus(driver->caps, driver->stateDir, vm) < 0) + goto cleanup; + + + if (flags & VIR_DOMAIN_AFFECT_CONFIG) { + rc = virDomainSaveConfig(driver->configDir, vmdef); + if (rc < 0) + goto cleanup; + + virDomainObjAssignDef(vm, vmdef, false); + vmdef = NULL; + } + ret = 0;
cleanup: + virDomainDefFree(vmdef); virCgroupFree(&group); if (vm) virDomainObjUnlock(vm); @@ -5912,6 +6055,71 @@ static int qemuSetSchedulerParameters(virDomainPtr dom, }
static int +qemuGetVcpuBWLive(virCgroupPtr cgroup, unsigned long long *period, + long long *quota) +{ + int rc; + + rc = virCgroupGetCpuCfsPeriod(cgroup, period); + if (rc < 0) { + virReportSystemError(-rc, "%s", + _("unable to get cpu bandwidth period tunable")); + return -1; + } + + rc = virCgroupGetCpuCfsQuota(cgroup, quota); + if (rc < 0) { + virReportSystemError(-rc, "%s", + _("unable to get cpu bandwidth tunable")); + return -1; + } + + return 0; +} + +static int +qemuGetVcpusBWLive(virDomainObjPtr vm, virCgroupPtr cgroup, + unsigned long long *period, long long *quota) +{ + virCgroupPtr cgroup_vcpu = NULL; + qemuDomainObjPrivatePtr priv = NULL; + int rc; + int ret = -1; + + priv = vm->privateData; + if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { + /* We do not create sub dir for each vcpu */ + rc = qemuGetVcpuBWLive(cgroup, period, quota); + if (rc < 0) + goto cleanup; + + if (*quota > 0) + *quota /= vm->def->vcpus; + goto out; + } + + /* get period and quota for vcpu0 */ + rc = virCgroupForVcpu(cgroup, 0, &cgroup_vcpu, 0); + if (!cgroup_vcpu) { + virReportSystemError(-rc, + _("Unable to find vcpu cgroup for %s(vcpu: 0)"), + vm->def->name); + goto cleanup; + } + + rc = qemuGetVcpuBWLive(cgroup_vcpu, period, quota); + if (rc < 0) + goto cleanup; + +out: + ret = 0; + +cleanup: + virCgroupFree(&cgroup_vcpu); + return ret; +} + +static int qemuGetSchedulerParametersFlags(virDomainPtr dom, virTypedParameterPtr params, int *nparams, @@ -5920,10 +6128,14 @@ qemuGetSchedulerParametersFlags(virDomainPtr dom, struct qemud_driver *driver = dom->conn->privateData; virCgroupPtr group = NULL; virDomainObjPtr vm = NULL; - unsigned long long val; + unsigned long long shares; + unsigned long long period; + long long quota; int ret = -1; int rc; bool isActive; + bool cpu_bw_status; + int saved_nparams = 0;
virCheckFlags(VIR_DOMAIN_AFFECT_LIVE | VIR_DOMAIN_AFFECT_CONFIG, -1); @@ -5943,6 +6155,13 @@ qemuGetSchedulerParametersFlags(virDomainPtr dom, goto cleanup; }
+ if (*nparams > 1) { + rc = qemuGetCpuBWStatus(driver->cgroup); + if (rc < 0) + goto cleanup; + cpu_bw_status = !!rc; + } + vm = virDomainFindByUUID(&driver->domains, dom->uuid);
if (vm == NULL) { @@ -5976,9 +6195,17 @@ qemuGetSchedulerParametersFlags(virDomainPtr dom, _("can't get persistentDef")); goto cleanup; } - val = persistentDef->cputune.shares; + shares = persistentDef->cputune.shares; + if (*nparams > 1 && cpu_bw_status) { + period = persistentDef->cputune.period; + quota = persistentDef->cputune.quota; + } } else { - val = vm->def->cputune.shares; + shares = vm->def->cputune.shares; + if (*nparams > 1 && cpu_bw_status) { + period = vm->def->cputune.period; + quota = vm->def->cputune.quota; + } } goto out; } @@ -6001,14 +6228,20 @@ qemuGetSchedulerParametersFlags(virDomainPtr dom, goto cleanup; }
- rc = virCgroupGetCpuShares(group, &val); + rc = virCgroupGetCpuShares(group, &shares); if (rc != 0) { virReportSystemError(-rc, "%s", _("unable to get cpu shares tunable")); goto cleanup; } + + if (*nparams > 1 && cpu_bw_status) { + rc = qemuGetVcpusBWLive(vm, group, &period, "a); + if (rc != 0) + goto cleanup; + } out: - params[0].value.ul = val; + params[0].value.ul = shares; params[0].type = VIR_TYPED_PARAM_ULLONG; if (virStrcpyStatic(params[0].field, "cpu_shares") == NULL) { qemuReportError(VIR_ERR_INTERNAL_ERROR, @@ -6016,7 +6249,36 @@ out: goto cleanup; }
- *nparams = 1; + saved_nparams++; + + if (cpu_bw_status) { + if (*nparams > saved_nparams) { + params[1].value.ul = period; + params[1].type = VIR_TYPED_PARAM_ULLONG; + if (virStrcpyStatic(params[1].field, "cfs_period") == NULL) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + "%s", + _("Field cfs_period too long for destination")); + goto cleanup; + } + saved_nparams++; + } + + if (*nparams > saved_nparams) { + params[2].value.ul = quota; + params[2].type = VIR_TYPED_PARAM_LLONG; + if (virStrcpyStatic(params[2].field, "cfs_quota") == NULL) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + "%s", + _("Field cfs_quota too long for destination")); + goto cleanup; + } + saved_nparams++; + } + } + + *nparams = saved_nparams; + ret = 0;
cleanup:
ACK but please add a commit message Daniel -- Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/ daniel@veillard.com | Rpmfind RPM search engine http://rpmfind.net/ http://veillard.com/ | virtualization library http://libvirt.org/

At 07/21/2011 12:34 PM, Daniel Veillard wrote:
On Thu, Jul 21, 2011 at 10:11:32AM +0800, Wen Congyang wrote:
--- src/qemu/qemu_driver.c | 312 ++++++++++++++++++++++++++++++++++++++++++++---- 1 files changed, 287 insertions(+), 25 deletions(-)
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index cd65bce..fd80537 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -5139,11 +5139,46 @@ cleanup: }
+/* + * check whether the host supports CFS bandwidth + * + * Return 1 when CFS bandwidth is supported, 0 when CFS bandwidth is not + * supported, -1 on error. + */ +static int qemuGetCpuBWStatus(virCgroupPtr cgroup) +{ + char *cfs_period_path = NULL; + int ret = -1; + + if (!cgroup) + return 0; + + if (virCgroupPathOfController(cgroup, VIR_CGROUP_CONTROLLER_CPU, + "cpu.cfs_period_us", &cfs_period_path) < 0) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + "%s", + _("cannot get the path of cgroup CPU controller"));
Hum, I'm not sure we should really flag this as an error here; it should be made an INFO, I think. What should get an error is if we try to start using cpu control on a guest while the host doesn't support it. In practice we need to check proper handling in 3 cases:
- at guest startup
- on migration, when checking the target host
- when activated at runtime
Okay, I will change the message level.
+ goto cleanup; + } + + if (access(cfs_period_path, F_OK) < 0) { + ret = 0; + } else { + ret = 1; + } + +cleanup: + VIR_FREE(cfs_period_path); + return ret; +} + + static char *qemuGetSchedulerType(virDomainPtr dom, int *nparams) { struct qemud_driver *driver = dom->conn->privateData; char *ret = NULL; + int rc;
qemuDriverLock(driver); if (!qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_CPU)) { @@ -5152,8 +5187,15 @@ static char *qemuGetSchedulerType(virDomainPtr dom, goto cleanup; }
- if (nparams) - *nparams = 1; + if (nparams) { + rc = qemuGetCpuBWStatus(driver->cgroup); + if (rc < 0) + goto cleanup; + else if (rc == 0) + *nparams = 1; + else + *nparams = 3; + }
ret = strdup("posix"); if (!ret) @@ -5786,6 +5828,58 @@ cleanup: return ret; }
+static int +qemuSetVcpusBWLive(virDomainObjPtr vm, virCgroupPtr cgroup, + unsigned long long period, long long quota) +{ + int i; + qemuDomainObjPrivatePtr priv = vm->privateData; + virCgroupPtr cgroup_vcpu = NULL; + int rc; + + if (period == 0 && quota == 0) + return 0; + + if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { + /* If we does not know VCPU<->PID mapping or all vcpu runs in the same + * thread, we can not control each vcpu. + */ + /* Ensure that we can multiply by vcpus without overflowing. */ + if (quota > LLONG_MAX / vm->def->vcpus) { + virReportSystemError(EINVAL, + _("%s"), + "Unable to set cpu bandwidth quota");
should probably give a hint of why in the error message
EINVAL means the value is invalid. If you think that is not enough, I will change the error message. Thanks, Wen Congyang
+ goto cleanup; + } +
[snip]
ret = 0;
cleanup:
ACK but please add a commit message
Daniel

On Thu, Jul 21, 2011 at 10:11:32AM +0800, Wen Congyang wrote:
@@ -5851,7 +5953,6 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom, virTypedParameterPtr param = ¶ms[i];
if (STREQ(param->field, "cpu_shares")) { - int rc; if (param->type != VIR_TYPED_PARAM_ULLONG) { qemuReportError(VIR_ERR_INVALID_ARG, "%s", _("invalid type for cpu_shares tunable, expected a 'ullong'")); @@ -5870,19 +5971,47 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom, }
if (flags & VIR_DOMAIN_AFFECT_CONFIG) { - persistentDef = virDomainObjGetPersistentDef(driver->caps, vm); - if (!persistentDef) { - qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("can't get persistentDef")); + vmdef->cputune.shares = params[i].value.ul; + } + } else if (STREQ(param->field, "cfs_period")) {
[snip]
+ } else if (STREQ(param->field, "cfs_quota")) {
In the XML file we now have:

  <cputune>
    <shares>1024</shares>
    <period>90000</period>
    <quota>0</quota>
  </cputune>

But the schedinfo parameters are being named:

  cpu_shares: 1024
  cfs_period: 90000
  cfs_quota: 0

These new tunables should be named 'cpu_period' and 'cpu_quota' too.
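As a purely illustrative sketch of the proposed naming (this is not output from the posted patches, which still report the cfs_ names, the exact virsh formatting may differ, and 'demo' is a placeholder domain name):

  # virsh schedinfo demo
  Scheduler      : posix
  cpu_shares     : 1024
  cpu_period     : 90000
  cpu_quota      : 0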
+static int +qemuGetVcpusBWLive(virDomainObjPtr vm, virCgroupPtr cgroup, + unsigned long long *period, long long *quota) +{ + virCgroupPtr cgroup_vcpu = NULL; + qemuDomainObjPrivatePtr priv = NULL; + int rc; + int ret = -1; + + priv = vm->privateData; + if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { + /* We do not create sub dir for each vcpu */ + rc = qemuGetVcpuBWLive(cgroup, period, quota); + if (rc < 0) + goto cleanup; + + if (*quota > 0) + *quota /= vm->def->vcpus; + goto out; + } + + /* get period and quota for vcpu0 */ + rc = virCgroupForVcpu(cgroup, 0, &cgroup_vcpu, 0); + if (!cgroup_vcpu) { + virReportSystemError(-rc, + _("Unable to find vcpu cgroup for %s(vcpu: 0)"), + vm->def->name); + goto cleanup; + } + + rc = qemuGetVcpuBWLive(cgroup_vcpu, period, quota); + if (rc < 0) + goto cleanup;
This is also bad IMHO, giving different semantics for the operation of the API, depending on whether the QEMU impl has VCPU threads or not. Again this just says to me that we need this to apply at the VM level, not the VCPU level. If we really do need finer per-VCPU controls, in addition to a per-VM control, I think that should likely be done as a separate tunable, or separate CPU; eg, we could have 2 sets of tunables, one that applies to the VM and the second set that adds further controls to the VCPUs:

  <cputune>
    <shares>1024</shares>
    <period>90000</period>
    <quota>0</quota>
    <vcpu_shares>1024</vcpu_shares>
    <vcpu_period>90000</vcpu_period>
    <vcpu_quota>0</vcpu_quota>
  </cputune>

Daniel. -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 07/21/2011 06:01 AM, Daniel P. Berrange wrote:
On Thu, Jul 21, 2011 at 10:11:32AM +0800, Wen Congyang wrote:
@@ -5851,7 +5953,6 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom, virTypedParameterPtr param = ¶ms[i];
if (STREQ(param->field, "cpu_shares")) { - int rc; if (param->type != VIR_TYPED_PARAM_ULLONG) { qemuReportError(VIR_ERR_INVALID_ARG, "%s", _("invalid type for cpu_shares tunable, expected a 'ullong'")); @@ -5870,19 +5971,47 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom, }
if (flags & VIR_DOMAIN_AFFECT_CONFIG) { - persistentDef = virDomainObjGetPersistentDef(driver->caps, vm); - if (!persistentDef) { - qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("can't get persistentDef")); + vmdef->cputune.shares = params[i].value.ul; + } + } else if (STREQ(param->field, "cfs_period")) {
[snip]
+ } else if (STREQ(param->field, "cfs_quota")) {
In the XML file we now have
  <cputune>
    <shares>1024</shares>
    <period>90000</period>
    <quota>0</quota>
  </cputune>
But the schedinfo parameters are being named:
  cpu_shares: 1024
  cfs_period: 90000
  cfs_quota: 0
These new tunables should be named 'cpu_period' and 'cpu_quota' too.
You mean 'cfs_period' and 'cfs_quota', right? The only problem I see with that naming (specifically for the quota) is that it implies a direct relationship to the tunable in cgroupfs. While true for 'cpu_shares', it is not true for quota since the proper setting to apply depends on the threading behavior of qemu.
+static int +qemuGetVcpusBWLive(virDomainObjPtr vm, virCgroupPtr cgroup, + unsigned long long *period, long long *quota) +{ + virCgroupPtr cgroup_vcpu = NULL; + qemuDomainObjPrivatePtr priv = NULL; + int rc; + int ret = -1; + + priv = vm->privateData; + if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { + /* We do not create sub dir for each vcpu */ + rc = qemuGetVcpuBWLive(cgroup, period, quota); + if (rc < 0) + goto cleanup; + + if (*quota > 0) + *quota /= vm->def->vcpus; + goto out; + } + + /* get period and quota for vcpu0 */ + rc = virCgroupForVcpu(cgroup, 0, &cgroup_vcpu, 0); + if (!cgroup_vcpu) { + virReportSystemError(-rc, + _("Unable to find vcpu cgroup for %s(vcpu: 0)"), + vm->def->name); + goto cleanup; + } + + rc = qemuGetVcpuBWLive(cgroup_vcpu, period, quota); + if (rc < 0) + goto cleanup;
This is also bad IMHO, giving different semantics for the operation of the API, depending on whether the QEMU impl has VCPUs threads or not. Again this just says to me that we need this to apply at the VM level not the VCPU level.
IMO, this firms up the semantics of the libvirt API. As these patches are written, the libvirt semantics are: "Apply cpu capping to this domain. Period is the measurement interval. Quota is the amount of time each vcpu may run within the period." We are able to insulate the users from details of the underlying qemu -- whether it supports threads or not.

Another reason a per-vcpu quota setting is more desirable than a global setting is seen when changing the number of vcpus assigned to a domain. Each time you change the vcpu count, you would have to change the quota number too. Imagine all of the complaints from users when they increase the vcpu count and find their domain actually performs worse.
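To make those semantics concrete, here is an illustrative calculation (the numbers are made up; the multiply-by-nvcpus fallback is the behaviour of qemuSetVcpusBWLive in the patch):

  period = 100000us, quota = 50000us   -> each vcpu may run 50ms per 100ms,
                                          i.e. 50% of one physical CPU
  4 vcpus, qemu with vcpu threads      -> 50000 is written to each per-vcpu
                                          cgroup; the VM as a whole may use
                                          up to 200% CPU
  4 vcpus, qemu without vcpu threads   -> no per-vcpu cgroups exist, so
                                          4 * 50000 = 200000 is written to
                                          the VM cgroup, giving the same
                                          overall cap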
If we really do need finer per-VCPU controls, in addition to a per-VM control, I think that should likely be done as a separate tunable, or separate CPU
eg, we could have 2 sets of tunables, one that applies to the VM and the second set that adds further controls to the VCPUs.
How would this behave when qemu doesn't support vcpu threads? We'd either have to throw an error, or just ignore the vcpu_ settings. Either way, you're exposing users to qemu internals. They would need to change their domain.xml files when moving from a multi-threaded qemu to a single-threaded qemu.
<cputune> <shares>1024</shares> <period>90000</period> <quota>0</quota>
<vcpu_shares>1024</vcpu_shares> <vcpu_period>90000</vcpu_period> <vcpu_quota>0</vcpu_quota> </cputune>
I actually do like the <vcpu_*> naming because it makes two things more clear: Units apply at the vcpu level, and these stats don't necessarily map directly to cgroupfs. However, I would still maintain that the current logic should be used for applying the settings (per-vcpu if supported, multiply by nvcpus and apply globally if vcpu threads are not supported). -- Adam Litke IBM Linux Technology Center

On Thu, Jul 21, 2011 at 08:20:25AM -0500, Adam Litke wrote:
On 07/21/2011 06:01 AM, Daniel P. Berrange wrote:
On Thu, Jul 21, 2011 at 10:11:32AM +0800, Wen Congyang wrote:
@@ -5851,7 +5953,6 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom, virTypedParameterPtr param = ¶ms[i];
if (STREQ(param->field, "cpu_shares")) { - int rc; if (param->type != VIR_TYPED_PARAM_ULLONG) { qemuReportError(VIR_ERR_INVALID_ARG, "%s", _("invalid type for cpu_shares tunable, expected a 'ullong'")); @@ -5870,19 +5971,47 @@ static int qemuSetSchedulerParametersFlags(virDomainPtr dom, }
if (flags & VIR_DOMAIN_AFFECT_CONFIG) { - persistentDef = virDomainObjGetPersistentDef(driver->caps, vm); - if (!persistentDef) { - qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("can't get persistentDef")); + vmdef->cputune.shares = params[i].value.ul; + } + } else if (STREQ(param->field, "cfs_period")) {
[snip]
+ } else if (STREQ(param->field, "cfs_quota")) {
In the XML file we now have
  <cputune>
    <shares>1024</shares>
    <period>90000</period>
    <quota>0</quota>
  </cputune>
But the schedinfo parameters are being named:
  cpu_shares: 1024
  cfs_period: 90000
  cfs_quota: 0
These new tunables should be named 'cpu_period' and 'cpu_quota' too.
You mean 'cfs_period' and 'cfs_quota', right?
No, we should *not* be using 'cfs_' in the public interface, since this is just an internal impl of the current Linux scheduler. If Linux gains a different scheduler in the future which can still provide period/quota caps, the naming 'cfs_period' and 'cfs_quota' will be bad. The XML already has this correct, by not using 'cfs_' in the XML naming. The scheduler info parameters are wrong, since they have got the 'cfs_' prefix.
+ priv = vm->privateData; + if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { + /* We do not create sub dir for each vcpu */ + rc = qemuGetVcpuBWLive(cgroup, period, quota); + if (rc < 0) + goto cleanup; + + if (*quota > 0) + *quota /= vm->def->vcpus; + goto out; + } + + /* get period and quota for vcpu0 */ + rc = virCgroupForVcpu(cgroup, 0, &cgroup_vcpu, 0); + if (!cgroup_vcpu) { + virReportSystemError(-rc, + _("Unable to find vcpu cgroup for %s(vcpu: 0)"), + vm->def->name); + goto cleanup; + } + + rc = qemuGetVcpuBWLive(cgroup_vcpu, period, quota); + if (rc < 0) + goto cleanup;
This is also bad IMHO, giving different semantics for the operation of the API, depending on whether the QEMU impl has VCPUs threads or not. Again this just says to me that we need this to apply at the VM level not the VCPU level.
IMO, this firms up the semantics of the libvirt API. As these patches are written, the libvirt semantics are: "Apply cpu capping to this domain. Period is the measurement interval. Quota is the amount of time each vcpu may run within the period." We are able to insulate the users from details of the underlying qemu -- whether it supports threads or not.
Another reason a per-vcpu quota setting is more desirable than a global setting is seen when changing the number of vcpus assigned to a domain. Each time you change the vcpu count, you would have to change the quota number too. Imagine all of the complaints from users when they increase the vcpu count and find their domain actually performs worse.
I think it is possible to argue that last point either way really, and we in fact already have that behaviour with the existing cpu_shares parameter. Having different behaviour for these new parameters vs cpu_shares is an even worse problem. Since there is no clear-cut answer to your last point either way, this again suggests to me that we should have two sets of CPU tunables here, one applying to the whole VM and the other to the per-VCPUs (if supported).
If we really do need finer per-VCPU controls, in addition to a per-VM control, I think that should likely be done as a separate tunable, or separate CPU
eg, we could have 2 sets of tunables, one that applies to the VM and the second set that adds further controls to the VCPUs.
How would this behave when qemu doesn't support vcpu threads? We'd either have to throw an error, or just ignore the vcpu_ settings. Either way, you're exposing users to qemu internals. They would need to change their domain.xml files when moving from a multi-threaded qemu to a single-threaded qemu.
If the user requests a per-VCPU control and we can't support that, we should return an error to the user, rather than silently implementing a different type of control. This doesn't imply that we're exposing users to QEMU internals. We can simply tell them that we have been unable to support the configuration that they requested. This is good, because knowing that the config could not be supported means they have the knowledge to be able to apply a different policy. With this current impl users have no idea we are secretly applying different semantics and limiting the whole VM. Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 07/20/2011 09:11 PM, Wen Congyang wrote:
+static int +qemuGetVcpusBWLive(virDomainObjPtr vm, virCgroupPtr cgroup, + unsigned long long *period, long long *quota) +{ + virCgroupPtr cgroup_vcpu = NULL; + qemuDomainObjPrivatePtr priv = NULL; + int rc; + int ret = -1; + + priv = vm->privateData; + if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { + /* We do not create sub dir for each vcpu */ + rc = qemuGetVcpuBWLive(cgroup, period, quota); + if (rc < 0) + goto cleanup; + + if (*quota > 0) + *quota /= vm->def->vcpus; + goto out; + }
Are you sure the above is correct? Based on my earlier suggestion, <quota> is always specified as the amount of runtime afforded to a single vcpu. Hence, if you are changing quota to cover for all of a vm's vcpus, wouldn't you want to: *quota *= vm->def->vcpus; ? -- Adam Litke IBM Linux Technology Center

On Thu, Jul 21, 2011 at 08:00:21AM -0500, Adam Litke wrote:
On 07/20/2011 09:11 PM, Wen Congyang wrote:
+static int +qemuGetVcpusBWLive(virDomainObjPtr vm, virCgroupPtr cgroup, + unsigned long long *period, long long *quota) +{ + virCgroupPtr cgroup_vcpu = NULL; + qemuDomainObjPrivatePtr priv = NULL; + int rc; + int ret = -1; + + priv = vm->privateData; + if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { + /* We do not create sub dir for each vcpu */ + rc = qemuGetVcpuBWLive(cgroup, period, quota); + if (rc < 0) + goto cleanup; + + if (*quota > 0) + *quota /= vm->def->vcpus; + goto out; + }
Are you sure the above is correct? Based on my earlier suggestion, <quota> is always specified as the amount of runtime afforded to a single vcpu. Hence, if you are changing quota to cover for all of a vm's vcpus, wouldn't you want to:
*quota *= vm->def->vcpus;
This is a good example of why I think the current design is flawed and needs separate limits explicitly tracked, one for the VM as a whole, and another for individual VCPUs. The per-VCPU limit would then simply be unsupported for the case of a QEMU without vCPU threads, but the per-VM limit could still be used. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

At 07/21/2011 09:00 PM, Adam Litke wrote:
On 07/20/2011 09:11 PM, Wen Congyang wrote:
[...]
+        if (*quota > 0)
+            *quota /= vm->def->vcpus;
Are you sure the above is correct? Based on my earlier suggestion, <quota> is always specified as the amount of runtime afforded to a single vcpu. Hence, if you are changing quota to cover for all of a vm's vcpus, wouldn't you want to:
*quota *= vm->def->vcpus; ?

When we set the value, we use '*quota *= vm->def->vcpus;', so when we read it back we should use '*quota /= vm->def->vcpus;'.

Thanks,
Wen Congyang
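A small worked example of the symmetry described above, assuming the whole-VM fallback path multiplies the per-vcpu quota on write (the helpers below are hypothetical, written only to show the arithmetic):

#include <assert.h>

/* XML <quota> is per vcpu; a single whole-VM cgroup must cover all
 * vcpus, so multiply on write and divide on read. */
static long long vm_quota_on_write(long long per_vcpu_quota, unsigned int vcpus)
{
    return per_vcpu_quota * vcpus;   /* value written to cpu.cfs_quota_us */
}

static long long per_vcpu_quota_on_read(long long vm_quota, unsigned int vcpus)
{
    return vm_quota / vcpus;         /* value reported back to the user */
}

int main(void)
{
    /* 50ms of runtime per vcpu per 100ms period, on a 4-vcpu guest */
    long long written = vm_quota_on_write(50000, 4);      /* 200000 */
    assert(per_vcpu_quota_on_read(written, 4) == 50000);  /* round trip */
    return 0;
}

In other words, the divide in qemuGetVcpusBWLive() is the inverse of the multiply applied when the limit was set on the whole-VM cgroup.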

---
 docs/formatdomain.html.in |   19 +++++++++++++++++++
 1 files changed, 19 insertions(+), 0 deletions(-)

diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in
index a54ee6a..47edd35 100644
--- a/docs/formatdomain.html.in
+++ b/docs/formatdomain.html.in
@@ -307,6 +307,8 @@
     <vcpupin vcpu="2" cpuset="2,3"/>
     <vcpupin vcpu="3" cpuset="0,4"/>
     <shares>2048</shares>
+    <period>1000000</period>
+    <quota>-1</quota>
   </cputune>
   <numatune>
     <memory mode="strict" nodeset="1-4,^3"/>
@@ -400,6 +402,23 @@
       2048 will get twice as much CPU time as a VM configured with value
       1024. <span class="since">Since 0.9.0</span>
     </dd>
+    <dt><code>period</code></dt>
+    <dd>
+      The optional <code>period</code> element specifies the enforcement
+      interval(unit: microseconds). Within <code>period</code>, each vcpu of
+      the domain will not be allowed to consume more than <code>quota</code>
+      worth of runtime. The value should be in range [1000, 1000000].
+      <span class="since">Since 0.9.4</span>
+    </dd>
+    <dt><code>quota</code></dt>
+    <dd>
+      The optional <code>quota</code> element specifies the maximum allowed
+      bandwidth(unit: microseconds). A domain with <code>quota</code> as any
+      negative value indicates that the domain has infinite bandwidth, which
+      means that it is not bandwidth controlled. The value should be in range
+      [1000, 18446744073709551] or less than 0.
+      <span class="since">Since 0.9.4</span>
+    </dd>
     <dt><code>numatune</code></dt>
     <dd>
       The optional <code>numatune</code> element provides details of
--
1.7.1
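For readers who have not used the CFS bandwidth controller directly, the following standalone sketch shows how the documented ranges map onto the kernel's cpu.cfs_period_us and cpu.cfs_quota_us files. It is not the libvirt implementation (libvirt goes through its virCgroup helpers), and the cgroup path in main() is a made-up example:

#include <stdio.h>
#include <stdbool.h>

static bool period_valid(unsigned long long period)
{
    return period >= 1000 && period <= 1000000;   /* [1000, 1000000] us */
}

static bool quota_valid(long long quota)
{
    /* any negative value means unlimited; otherwise [1000, 18446744073709551] us */
    return quota < 0 || (quota >= 1000 && quota <= 18446744073709551LL);
}

static int write_cfs_bandwidth(const char *cgroup_dir,
                               unsigned long long period, long long quota)
{
    char path[512];
    FILE *fp;

    if (!period_valid(period) || !quota_valid(quota))
        return -1;

    snprintf(path, sizeof(path), "%s/cpu.cfs_period_us", cgroup_dir);
    if (!(fp = fopen(path, "w")))
        return -1;
    fprintf(fp, "%llu\n", period);
    fclose(fp);

    snprintf(path, sizeof(path), "%s/cpu.cfs_quota_us", cgroup_dir);
    if (!(fp = fopen(path, "w")))
        return -1;
    fprintf(fp, "%lld\n", quota < 0 ? -1LL : quota);   /* -1 disables the limit */
    fclose(fp);
    return 0;
}

int main(void)
{
    /* matches the XML example above: period 1000000, quota -1 (unlimited) */
    return write_cfs_bandwidth("/sys/fs/cgroup/cpu/machine/demo",
                               1000000, -1) == 0 ? 0 : 1;
}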

On Thu, Jul 21, 2011 at 10:12:14AM +0800, Wen Congyang wrote:
[...]
I think we need to expand this a bit:

- first state that 0 means no value for both tunables
- then express that the implementation is based on CFS for QEmu/KVM and what the use cases really are; it seems to me that it's not really for fine-grained resource control but rather to keep the limits of CPU usage consistent
- also a small explanation of how well those tunables may or may not work in case of migration seems important

ACK, and don't forget the commit message :-)

thanks,

Daniel

At 07/21/2011 12:39 PM, Daniel Veillard wrote:
On Thu, Jul 21, 2011 at 10:12:14AM +0800, Wen Congyang wrote:
[...]
I think we need to expand this a bit:

- first state that 0 means no value for both tunables
- then express that the implementation is based on CFS for QEmu/KVM and what the use cases really are; it seems to me that it's not really for fine-grained resource control but rather to keep the limits of CPU usage consistent
- also a small explanation of how well those tunables may or may not work in case of migration seems important
I added the first two. I do not know the relationship between cpu bandwidth and migration; if someone knows it, we can add it later.
ACK
and don't forget the commit message :-)
I addressed your comments, added a commit message for each patch, and pushed this patch set.

Thanks,
Wen Congyang
thanks
Daniel

At 07/21/2011 05:22 PM, Wen Congyang wrote:
At 07/21/2011 12:39 PM, Daniel Veillard wrote:
On Thu, Jul 21, 2011 at 10:12:14AM +0800, Wen Congyang wrote:
[...]

I addressed your comments, added a commit message for each patch, and pushed this patch set.
Oops, I forgot to run make syntax-check :(. I will fix it soon.
participants (6): Adam Litke, Anthony Liguori, Daniel P. Berrange, Daniel Veillard, Eric Blake, Wen Congyang