
On Fri, Jun 28, 2013 at 17:24:42 -0400, Laine Stump wrote:
On 06/28/2013 11:04 AM, Jiri Denemark wrote:
--- src/qemu/qemu_cgroup.c | 21 ++------------------- src/qemu/qemu_domain.c | 29 +++++++++++++++++++++++++++++ src/qemu/qemu_domain.h | 2 ++ 3 files changed, 33 insertions(+), 19 deletions(-)
diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index 5f54ca6..22bf78e 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -461,9 +461,7 @@ static int qemuSetupMemoryCgroup(virDomainObjPtr vm) { qemuDomainObjPrivatePtr priv = vm->privateData; - unsigned long long hard_limit; int rc; - int i;
if (!virCgroupHasController(priv->cgroup,VIR_CGROUP_CONTROLLER_MEMORY)) { if (vm->def->mem.hard_limit != 0 || @@ -477,23 +475,8 @@ qemuSetupMemoryCgroup(virDomainObjPtr vm) } }
- hard_limit = vm->def->mem.hard_limit; - if (!hard_limit) { - /* If there is no hard_limit set, set a reasonable one to avoid - * system thrashing caused by exploited qemu. A 'reasonable - * limit' has been chosen: - * (1 + k) * (domain memory + total video memory) + (32MB for - * cache per each disk) + F - * where k = 0.5 and F = 200MB. The cache for disks is important as - * kernel cache on the host side counts into the RSS limit. */ - hard_limit = vm->def->mem.max_balloon; - for (i = 0; i < vm->def->nvideos; i++) - hard_limit += vm->def->videos[i]->vram; - hard_limit = hard_limit * 1.5 + 204800; - hard_limit += vm->def->ndisks * 32768; - } - - rc = virCgroupSetMemoryHardLimit(priv->cgroup, hard_limit); + rc = virCgroupSetMemoryHardLimit(priv->cgroup, + qemuDomainMemoryLimit(vm->def)); if (rc != 0) { virReportSystemError(-rc, _("Unable to set memory hard limit for domain %s"), diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index 8d79066..77b94ec 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -2181,3 +2181,32 @@ cleanup: virObjectUnref(cfg); return ret; } + + +unsigned long long +qemuDomainMemoryLimit(virDomainDefPtr def) +{ + unsigned long long mem; + int i; + + if (def->mem.hard_limit) { + mem = def->mem.hard_limit; + } else { + /* If there is no hard_limit set, compute a reasonable one to avoid + * system thrashing caused by exploited qemu. A 'reasonable + * limit' has been chosen: + * (1 + k) * (domain memory + total video memory) + (32MB for + * cache per each disk) + F + * where k = 0.5 and F = 200MB. The cache for disks is important as + * kernel cache on the host side counts into the RSS limit. + */ + mem = def->mem.max_balloon; + for (i = 0; i < def->nvideos; i++) + mem += def->videos[i]->vram; + mem *= 1.5; + mem += def->ndisks * 32768; + mem += 204800; + }
I know you're just doing a cut-paste here, but I'm curious how we arrived at this formula. On systems using vfio, it ends up locking *way* more memory than previously. For my test guest with 4GB of ram assigned, the old computation would lock 5GB of ram for the qemu process. With the new method in this patch it ends up locking just about 7GB.
That's because previously this computation was made only for hard_limit and the limit for locking was just completely ignoring it and used max_balloon + 1GB. However, this computation does not count with VFIO in any way so I just combined the two computations by adding the extra 1GB for VFIO into this shared code (in patch 2/3). I'm not sure if that's the right think to do or not but it's certainly better than before when memory locking limit completely ignore the need for VRAM and per-disk cache. Unfortunately, the original formula was suggested by Avi, who moved to new challenges. Perhaps others could jump in and share their opinions (Paolo? :-P). Be careful, though, that we're actually discussing qemuDomainMemoryLimit code after patch 2/3 is applied, that is with the VFIO stuff counted in. The complete code of this function is: unsigned long long qemuDomainMemoryLimit(virDomainDefPtr def) { unsigned long long mem; int i; if (def->mem.hard_limit) { mem = def->mem.hard_limit; } else { /* If there is no hard_limit set, compute a reasonable one to avoid * system thrashing caused by exploited qemu. A 'reasonable * limit' has been chosen: * (1 + k) * (domain memory + total video memory) + (32MB for * cache per each disk) + F * where k = 0.5 and F = 200MB. The cache for disks is important as * kernel cache on the host side counts into the RSS limit. * * Moreover, VFIO requires some amount for IO space. Alex Williamson * suggested adding 1GiB for IO space just to be safe (some finer * tuning might be nice, though). */ mem = def->mem.max_balloon; for (i = 0; i < def->nvideos; i++) mem += def->videos[i]->vram; mem *= 1.5; mem += def->ndisks * 32768; mem += 204800; for (i = 0; i < def->nhostdevs; i++) { virDomainHostdevDefPtr hostdev = def->hostdevs[i]; if (hostdev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS && hostdev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI && hostdev->source.subsys.u.pci.backend == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) { mem += 1024 * 1024; break; } } } return mem; } Jirka