[libvirt] [PATCH v3] Pin guest to memory node on NUMA system

Version 3: Addressed comments on V2. Version 2: Fixed the string formatting errors in v1. The patch contains the fix for defect 1009880 reported at redhat bugzilla. The root cause is, ever since the subcpusets(vcpu,emulator) were introduced, the parent cpuset cannot be modified to remove the nodes that are in use by the subcpusets. The fix is to break the memory node modification into three steps as to assign new nodes into the parent first. Change the nodes in the child nodes. Then remove the old nodes on the parent node. Signed-off-by: Shivaprasad G Bhat <sbhat@linux.vnet.ibm.com> --- src/qemu/qemu_driver.c | 115 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 110 insertions(+), 5 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 8a1eefd..4bc9d1d 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -8132,6 +8132,47 @@ cleanup: } static int +qemuSetVcpuCpusetMems(virDomainObjPtr vm, + char *nodeset_str) +{ + size_t j = 0; + qemuDomainObjPrivatePtr priv = vm->privateData; + virCgroupPtr cgroup_vcpu = NULL; + + for (j = 0; j < priv->nvcpupids; j++) { + if (virCgroupNewVcpu(priv->cgroup, j, false, &cgroup_vcpu) < 0) { + return -1; + } + if (virCgroupSetCpusetMems(cgroup_vcpu, nodeset_str) < 0) { + virCgroupFree(&cgroup_vcpu); + return -1; + } + virCgroupFree(&cgroup_vcpu); + } + + return 0; +} + +static int +qemuSetEmulatorCpusetMems(virDomainObjPtr vm, + char *nodeset_str) +{ + qemuDomainObjPrivatePtr priv = vm->privateData; + virCgroupPtr cgroup_emulator = NULL; + + if (virCgroupNewEmulator(priv->cgroup, false, &cgroup_emulator) < 0) { + return -1; + } + if (virCgroupSetCpusetMems(cgroup_emulator, nodeset_str) < 0) { + virCgroupFree(&cgroup_emulator); + return -1; + } + virCgroupFree(&cgroup_emulator); + + return 0; +} + +static int qemuDomainSetNumaParameters(virDomainPtr dom, virTypedParameterPtr params, int nparams, @@ -8198,7 +8239,11 @@ qemuDomainSetNumaParameters(virDomainPtr dom, } } else if (STREQ(param->field, VIR_DOMAIN_NUMA_NODESET)) { virBitmapPtr nodeset = NULL; + virBitmapPtr old_nodeset = NULL; + virBitmapPtr temp_nodeset = NULL; char *nodeset_str = NULL; + char *old_nodeset_str = NULL; + char *temp_nodeset_str = NULL; if (virBitmapParse(params[i].value.s, 0, &nodeset, @@ -8208,32 +8253,92 @@ qemuDomainSetNumaParameters(virDomainPtr dom, } if (flags & VIR_DOMAIN_AFFECT_LIVE) { + size_t j; + if (vm->def->numatune.memory.mode != VIR_DOMAIN_NUMATUNE_MEM_STRICT) { virReportError(VIR_ERR_OPERATION_INVALID, "%s", _("change of nodeset for running domain " "requires strict numa mode")); - virBitmapFree(nodeset); ret = -1; - continue; + goto next; } /* Ensure the cpuset string is formated before passing to cgroup */ if (!(nodeset_str = virBitmapFormat(nodeset))) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("Failed to format nodeset")); - virBitmapFree(nodeset); ret = -1; - continue; + goto next; + } + + /*Get Exisitng nodeset values */ + if (virCgroupGetCpusetMems(priv->cgroup, &old_nodeset_str) < 0) { + ret = -1; + goto next; + } + if (virBitmapParse(old_nodeset_str, 0, &old_nodeset, + VIR_DOMAIN_CPUMASK_LEN) < 0){ + ret = -1; + goto next; + } + + /* Merge the existing and new nodeset values */ + if ((temp_nodeset = virBitmapNewCopy(old_nodeset)) == NULL) { + ret = -1; + goto next; + } + + for (j = 0; j < caps->host.nnumaCell; j++) { + bool result; + if (virBitmapGetBit(nodeset, j, &result) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("Failed to get cpuset bit values")); + ret = -1; + goto next; + } + if (result && (virBitmapSetBit(temp_nodeset, j) < 0)) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("Failed to set temporary cpuset bit values")); + ret = -1; + goto next; + } + } + + if (!(temp_nodeset_str = virBitmapFormat(temp_nodeset))) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("Failed to format nodeset")); + ret = -1; + goto next; + } + + if (virCgroupSetCpusetMems(priv->cgroup, temp_nodeset_str) < 0) { + ret = -1; + goto next; + } + + if (qemuSetVcpuCpusetMems(vm, nodeset_str) || + qemuSetEmulatorCpusetMems(vm, nodeset_str)) { + ret = -1; + goto next; } if (virCgroupSetCpusetMems(priv->cgroup, nodeset_str) < 0) { virBitmapFree(nodeset); VIR_FREE(nodeset_str); ret = -1; - continue; + goto next; } +next : VIR_FREE(nodeset_str); + VIR_FREE(old_nodeset_str); + virBitmapFree(old_nodeset); + VIR_FREE(temp_nodeset_str); + virBitmapFree(temp_nodeset); + if (ret) { + virBitmapFree(nodeset); + continue; + } /* update vm->def here so that dumpxml can read the new * values from vm->def. */

On Tue, Nov 26, 2013 at 07:59:31PM +0530, Shivaprasad G Bhat wrote:
Version 3: Addressed comments on V2.
Version 2: Fixed the string formatting errors in v1.
The patch contains the fix for defect 1009880 reported at redhat bugzilla. The root cause is, ever since the subcpusets(vcpu,emulator) were introduced, the parent cpuset cannot be modified to remove the nodes that are in use by the subcpusets. The fix is to break the memory node modification into three steps as to assign new nodes into the parent first. Change the nodes in the child nodes. Then remove the old nodes on the parent node.
Signed-off-by: Shivaprasad G Bhat <sbhat@linux.vnet.ibm.com> --- src/qemu/qemu_driver.c | 115 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 110 insertions(+), 5 deletions(-)
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 8a1eefd..4bc9d1d 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -8132,6 +8132,47 @@ cleanup: }
static int +qemuSetVcpuCpusetMems(virDomainObjPtr vm, + char *nodeset_str) +{ + size_t j = 0; + qemuDomainObjPrivatePtr priv = vm->privateData; + virCgroupPtr cgroup_vcpu = NULL; + + for (j = 0; j < priv->nvcpupids; j++) { + if (virCgroupNewVcpu(priv->cgroup, j, false, &cgroup_vcpu) < 0) { + return -1; + } + if (virCgroupSetCpusetMems(cgroup_vcpu, nodeset_str) < 0) { + virCgroupFree(&cgroup_vcpu); + return -1; + } + virCgroupFree(&cgroup_vcpu); + } + + return 0; +} + +static int +qemuSetEmulatorCpusetMems(virDomainObjPtr vm, + char *nodeset_str) +{ + qemuDomainObjPrivatePtr priv = vm->privateData; + virCgroupPtr cgroup_emulator = NULL; + + if (virCgroupNewEmulator(priv->cgroup, false, &cgroup_emulator) < 0) { + return -1; + } + if (virCgroupSetCpusetMems(cgroup_emulator, nodeset_str) < 0) { + virCgroupFree(&cgroup_emulator); + return -1; + } + virCgroupFree(&cgroup_emulator); + + return 0; +} +
I suggested to offload this to a different function just in case it is used in more places than this one. If it is not then it just adds more code.
+static int qemuDomainSetNumaParameters(virDomainPtr dom, virTypedParameterPtr params, int nparams, @@ -8198,7 +8239,11 @@ qemuDomainSetNumaParameters(virDomainPtr dom, } } else if (STREQ(param->field, VIR_DOMAIN_NUMA_NODESET)) { virBitmapPtr nodeset = NULL; + virBitmapPtr old_nodeset = NULL; + virBitmapPtr temp_nodeset = NULL; char *nodeset_str = NULL; + char *old_nodeset_str = NULL; + char *temp_nodeset_str = NULL;
if (virBitmapParse(params[i].value.s, 0, &nodeset, @@ -8208,32 +8253,92 @@ qemuDomainSetNumaParameters(virDomainPtr dom, }
if (flags & VIR_DOMAIN_AFFECT_LIVE) { + size_t j; + if (vm->def->numatune.memory.mode != VIR_DOMAIN_NUMATUNE_MEM_STRICT) { virReportError(VIR_ERR_OPERATION_INVALID, "%s", _("change of nodeset for running domain " "requires strict numa mode")); - virBitmapFree(nodeset); ret = -1; - continue; + goto next; }
/* Ensure the cpuset string is formated before passing to cgroup */ if (!(nodeset_str = virBitmapFormat(nodeset))) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("Failed to format nodeset")); - virBitmapFree(nodeset); ret = -1; - continue; + goto next; + } + + /*Get Exisitng nodeset values */ + if (virCgroupGetCpusetMems(priv->cgroup, &old_nodeset_str) < 0) { + ret = -1; + goto next; + } + if (virBitmapParse(old_nodeset_str, 0, &old_nodeset, + VIR_DOMAIN_CPUMASK_LEN) < 0){ + ret = -1; + goto next; + } + + /* Merge the existing and new nodeset values */ + if ((temp_nodeset = virBitmapNewCopy(old_nodeset)) == NULL) { + ret = -1; + goto next; + } + + for (j = 0; j < caps->host.nnumaCell; j++) { + bool result; + if (virBitmapGetBit(nodeset, j, &result) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("Failed to get cpuset bit values")); + ret = -1; + goto next; + } + if (result && (virBitmapSetBit(temp_nodeset, j) < 0)) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("Failed to set temporary cpuset bit values")); + ret = -1; + goto next; + } + } + + if (!(temp_nodeset_str = virBitmapFormat(temp_nodeset))) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("Failed to format nodeset")); + ret = -1; + goto next; + } + + if (virCgroupSetCpusetMems(priv->cgroup, temp_nodeset_str) < 0) { + ret = -1; + goto next; + } + + if (qemuSetVcpuCpusetMems(vm, nodeset_str) || + qemuSetEmulatorCpusetMems(vm, nodeset_str)) { + ret = -1; + goto next; }
if (virCgroupSetCpusetMems(priv->cgroup, nodeset_str) < 0) { virBitmapFree(nodeset); VIR_FREE(nodeset_str); ret = -1; - continue; + goto next; } +next : VIR_FREE(nodeset_str); + VIR_FREE(old_nodeset_str); + virBitmapFree(old_nodeset); + VIR_FREE(temp_nodeset_str); + virBitmapFree(temp_nodeset); + if (ret) { + virBitmapFree(nodeset); + continue; + }
This label makes the code unclean, but when I tried "refactoring" it in the way I had on my mind (Free-ing the pointers on start of the cycle, changing it to continue, etc.), it looked also ugly. This is however perfect piece of code to make it into a function. You can then do a 'cleanup:' label there, clean all the memory after it, set 'ret = -1' on start and reset it before the label; basically the same way we do it everywhere else. The problem is that this for() cycle is special in the way that it continues working with other options when a problem appears. Sorry I haven't mentioned it at first, but when looking at the code today it seems like it should be done that way. You'll get rid of those "ret = -1; goto next;" thanks to that. Thanks, Martin

Thanks a lot Martin. I have sent v4 for review. Regards, Shiva On Mon, Dec 2, 2013 at 7:55 PM, Martin Kletzander <mkletzan@redhat.com> wrote:
On Tue, Nov 26, 2013 at 07:59:31PM +0530, Shivaprasad G Bhat wrote:
Version 3: Addressed comments on V2.
Version 2: Fixed the string formatting errors in v1.
The patch contains the fix for defect 1009880 reported at redhat bugzilla. The root cause is, ever since the subcpusets(vcpu,emulator) were introduced, the parent cpuset cannot be modified to remove the nodes that are in use by the subcpusets. The fix is to break the memory node modification into three steps as to assign new nodes into the parent first. Change the nodes in the child nodes. Then remove the old nodes on the parent node.
Signed-off-by: Shivaprasad G Bhat <sbhat@linux.vnet.ibm.com> --- src/qemu/qemu_driver.c | 115 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 110 insertions(+), 5 deletions(-)
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 8a1eefd..4bc9d1d 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -8132,6 +8132,47 @@ cleanup: }
static int +qemuSetVcpuCpusetMems(virDomainObjPtr vm, + char *nodeset_str) +{ + size_t j = 0; + qemuDomainObjPrivatePtr priv = vm->privateData; + virCgroupPtr cgroup_vcpu = NULL; + + for (j = 0; j < priv->nvcpupids; j++) { + if (virCgroupNewVcpu(priv->cgroup, j, false, &cgroup_vcpu) < 0) { + return -1; + } + if (virCgroupSetCpusetMems(cgroup_vcpu, nodeset_str) < 0) { + virCgroupFree(&cgroup_vcpu); + return -1; + } + virCgroupFree(&cgroup_vcpu); + } + + return 0; +} + +static int +qemuSetEmulatorCpusetMems(virDomainObjPtr vm, + char *nodeset_str) +{ + qemuDomainObjPrivatePtr priv = vm->privateData; + virCgroupPtr cgroup_emulator = NULL; + + if (virCgroupNewEmulator(priv->cgroup, false, &cgroup_emulator) < 0) { + return -1; + } + if (virCgroupSetCpusetMems(cgroup_emulator, nodeset_str) < 0) { + virCgroupFree(&cgroup_emulator); + return -1; + } + virCgroupFree(&cgroup_emulator); + + return 0; +} +
I suggested to offload this to a different function just in case it is used in more places than this one. If it is not then it just adds more code.
+static int qemuDomainSetNumaParameters(virDomainPtr dom, virTypedParameterPtr params, int nparams, @@ -8198,7 +8239,11 @@ qemuDomainSetNumaParameters(virDomainPtr dom, } } else if (STREQ(param->field, VIR_DOMAIN_NUMA_NODESET)) { virBitmapPtr nodeset = NULL; + virBitmapPtr old_nodeset = NULL; + virBitmapPtr temp_nodeset = NULL; char *nodeset_str = NULL; + char *old_nodeset_str = NULL; + char *temp_nodeset_str = NULL;
if (virBitmapParse(params[i].value.s, 0, &nodeset, @@ -8208,32 +8253,92 @@ qemuDomainSetNumaParameters(virDomainPtr dom, }
if (flags & VIR_DOMAIN_AFFECT_LIVE) { + size_t j; + if (vm->def->numatune.memory.mode != VIR_DOMAIN_NUMATUNE_MEM_STRICT) { virReportError(VIR_ERR_OPERATION_INVALID, "%s", _("change of nodeset for running domain " "requires strict numa mode")); - virBitmapFree(nodeset); ret = -1; - continue; + goto next; }
/* Ensure the cpuset string is formated before passing to cgroup */ if (!(nodeset_str = virBitmapFormat(nodeset))) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("Failed to format nodeset")); - virBitmapFree(nodeset); ret = -1; - continue; + goto next; + } + + /*Get Exisitng nodeset values */ + if (virCgroupGetCpusetMems(priv->cgroup, &old_nodeset_str) < 0) { + ret = -1; + goto next; + } + if (virBitmapParse(old_nodeset_str, 0, &old_nodeset, + VIR_DOMAIN_CPUMASK_LEN) < 0){ + ret = -1; + goto next; + } + + /* Merge the existing and new nodeset values */ + if ((temp_nodeset = virBitmapNewCopy(old_nodeset)) == NULL) { + ret = -1; + goto next; + } + + for (j = 0; j < caps->host.nnumaCell; j++) { + bool result; + if (virBitmapGetBit(nodeset, j, &result) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("Failed to get cpuset bit values")); + ret = -1; + goto next; + } + if (result && (virBitmapSetBit(temp_nodeset, j) < 0)) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("Failed to set temporary cpuset bit values")); + ret = -1; + goto next; + } + } + + if (!(temp_nodeset_str = virBitmapFormat(temp_nodeset))) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("Failed to format nodeset")); + ret = -1; + goto next; + } + + if (virCgroupSetCpusetMems(priv->cgroup, temp_nodeset_str) < 0) { + ret = -1; + goto next; + } + + if (qemuSetVcpuCpusetMems(vm, nodeset_str) || + qemuSetEmulatorCpusetMems(vm, nodeset_str)) { + ret = -1; + goto next; }
if (virCgroupSetCpusetMems(priv->cgroup, nodeset_str) < 0) { virBitmapFree(nodeset); VIR_FREE(nodeset_str); ret = -1; - continue; + goto next; } +next : VIR_FREE(nodeset_str); + VIR_FREE(old_nodeset_str); + virBitmapFree(old_nodeset); + VIR_FREE(temp_nodeset_str); + virBitmapFree(temp_nodeset); + if (ret) { + virBitmapFree(nodeset); + continue; + }
This label makes the code unclean, but when I tried "refactoring" it in the way I had on my mind (Free-ing the pointers on start of the cycle, changing it to continue, etc.), it looked also ugly. This is however perfect piece of code to make it into a function. You can then do a 'cleanup:' label there, clean all the memory after it, set 'ret = -1' on start and reset it before the label; basically the same way we do it everywhere else. The problem is that this for() cycle is special in the way that it continues working with other options when a problem appears. Sorry I haven't mentioned it at first, but when looking at the code today it seems like it should be done that way.
You'll get rid of those "ret = -1; goto next;" thanks to that.
Thanks, Martin
participants (3)
-
Martin Kletzander
-
Shivaprasad bhat
-
Shivaprasad G Bhat