On Fri, Jun 28, 2013 at 17:24:42 -0400, Laine Stump wrote:
On 06/28/2013 11:04 AM, Jiri Denemark wrote:
> ---
> src/qemu/qemu_cgroup.c | 21 ++-------------------
> src/qemu/qemu_domain.c | 29 +++++++++++++++++++++++++++++
> src/qemu/qemu_domain.h | 2 ++
> 3 files changed, 33 insertions(+), 19 deletions(-)
>
> diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c
> index 5f54ca6..22bf78e 100644
> --- a/src/qemu/qemu_cgroup.c
> +++ b/src/qemu/qemu_cgroup.c
> @@ -461,9 +461,7 @@ static int
> qemuSetupMemoryCgroup(virDomainObjPtr vm)
> {
> qemuDomainObjPrivatePtr priv = vm->privateData;
> - unsigned long long hard_limit;
> int rc;
> - int i;
>
> if (!virCgroupHasController(priv->cgroup,VIR_CGROUP_CONTROLLER_MEMORY)) {
> if (vm->def->mem.hard_limit != 0 ||
> @@ -477,23 +475,8 @@ qemuSetupMemoryCgroup(virDomainObjPtr vm)
> }
> }
>
> - hard_limit = vm->def->mem.hard_limit;
> - if (!hard_limit) {
> - /* If there is no hard_limit set, set a reasonable one to avoid
> - * system thrashing caused by exploited qemu. A 'reasonable
> - * limit' has been chosen:
> - * (1 + k) * (domain memory + total video memory) + (32MB for
> - * cache per each disk) + F
> - * where k = 0.5 and F = 200MB. The cache for disks is important as
> - * kernel cache on the host side counts into the RSS limit. */
> - hard_limit = vm->def->mem.max_balloon;
> - for (i = 0; i < vm->def->nvideos; i++)
> - hard_limit += vm->def->videos[i]->vram;
> - hard_limit = hard_limit * 1.5 + 204800;
> - hard_limit += vm->def->ndisks * 32768;
> - }
> -
> - rc = virCgroupSetMemoryHardLimit(priv->cgroup, hard_limit);
> + rc = virCgroupSetMemoryHardLimit(priv->cgroup,
> + qemuDomainMemoryLimit(vm->def));
> if (rc != 0) {
> virReportSystemError(-rc,
> _("Unable to set memory hard limit for domain
%s"),
> diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
> index 8d79066..77b94ec 100644
> --- a/src/qemu/qemu_domain.c
> +++ b/src/qemu/qemu_domain.c
> @@ -2181,3 +2181,32 @@ cleanup:
> virObjectUnref(cfg);
> return ret;
> }
> +
> +
> +unsigned long long
> +qemuDomainMemoryLimit(virDomainDefPtr def)
> +{
> + unsigned long long mem;
> + int i;
> +
> + if (def->mem.hard_limit) {
> + mem = def->mem.hard_limit;
> + } else {
> + /* If there is no hard_limit set, compute a reasonable one to avoid
> + * system thrashing caused by exploited qemu. A 'reasonable
> + * limit' has been chosen:
> + * (1 + k) * (domain memory + total video memory) + (32MB for
> + * cache per each disk) + F
> + * where k = 0.5 and F = 200MB. The cache for disks is important as
> + * kernel cache on the host side counts into the RSS limit.
> + */
> + mem = def->mem.max_balloon;
> + for (i = 0; i < def->nvideos; i++)
> + mem += def->videos[i]->vram;
> + mem *= 1.5;
> + mem += def->ndisks * 32768;
> + mem += 204800;
> + }
I know you're just doing a cut-paste here, but I'm curious how we
arrived at this formula. On systems using vfio, it ends up locking *way*
more memory than previously. For my test guest with 4GB of ram assigned,
the old computation would lock 5GB of ram for the qemu process. With the
new method in this patch it ends up locking just about 7GB.
That's because previously this computation was made only for hard_limit
and the limit for locking was just completely ignoring it and used
max_balloon + 1GB. However, this computation does not count with VFIO in
any way so I just combined the two computations by adding the extra 1GB
for VFIO into this shared code (in patch 2/3). I'm not sure if that's
the right think to do or not but it's certainly better than before when
memory locking limit completely ignore the need for VRAM and per-disk
cache. Unfortunately, the original formula was suggested by Avi, who
moved to new challenges. Perhaps others could jump in and share their
opinions (Paolo? :-P). Be careful, though, that we're actually
discussing qemuDomainMemoryLimit code after patch 2/3 is applied, that
is with the VFIO stuff counted in. The complete code of this function
is:
unsigned long long
qemuDomainMemoryLimit(virDomainDefPtr def)
{
unsigned long long mem;
int i;
if (def->mem.hard_limit) {
mem = def->mem.hard_limit;
} else {
/* If there is no hard_limit set, compute a reasonable one to avoid
* system thrashing caused by exploited qemu. A 'reasonable
* limit' has been chosen:
* (1 + k) * (domain memory + total video memory) + (32MB for
* cache per each disk) + F
* where k = 0.5 and F = 200MB. The cache for disks is important as
* kernel cache on the host side counts into the RSS limit.
*
* Moreover, VFIO requires some amount for IO space. Alex Williamson
* suggested adding 1GiB for IO space just to be safe (some finer
* tuning might be nice, though).
*/
mem = def->mem.max_balloon;
for (i = 0; i < def->nvideos; i++)
mem += def->videos[i]->vram;
mem *= 1.5;
mem += def->ndisks * 32768;
mem += 204800;
for (i = 0; i < def->nhostdevs; i++) {
virDomainHostdevDefPtr hostdev = def->hostdevs[i];
if (hostdev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS &&
hostdev->source.subsys.type ==
VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI &&
hostdev->source.subsys.u.pci.backend ==
VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) {
mem += 1024 * 1024;
break;
}
}
}
return mem;
}
Jirka