On 06/21/2011 21:58, Dave Allan wrote:
On Tue, Jun 21, 2011 at 11:51:44AM +0100, Daniel P. Berrange wrote:
> On Fri, Jun 17, 2011 at 06:22:57PM +0800, Osier Yang wrote:
>> Implemented by setting the NUMA policy between fork and exec as a hook,
>> using libnuma. Only memory tuning of the domain process is supported
>> currently.
>>
>> For a nodemask that is out of range, a soft warning will be reported
>> instead of a hard error in the libvirt layer. (The kernel will be
>> silent as long as one of the set bits in the nodemask is valid on the
>> host. E.g. for a host that has two NUMA nodes, the kernel will be
>> silent for nodemask "01010101".) So a soft warning is the only thing
>> libvirt can do, as one might want to specify the NUMA policy for a
>> node that doesn't exist yet but may be hotplugged soon.
>> ---
>> src/qemu/qemu_process.c | 168 ++++++++++++++++++++++++++++++++++++++++++++++-
>> 1 files changed, 167 insertions(+), 1 deletions(-)
>>
>> diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
>> index fe66100..8e09e52 100644
>> --- a/src/qemu/qemu_process.c
>> +++ b/src/qemu/qemu_process.c
>> @@ -39,6 +39,10 @@
>> #include "qemu_hotplug.h"
>> #include "qemu_bridge_filter.h"
>>
>> +#if HAVE_NUMACTL
>> +# include <numa.h>
>> +#endif
>> +
>> #include "datatypes.h"
>> #include "logging.h"
>> #include "virterror_internal.h"
>> @@ -1175,6 +1179,166 @@ qemuProcessDetectVcpuPIDs(struct qemud_driver *driver,
>> return 0;
>> }
>>
>> +
>> +/*
>> + * Set NUMA memory policy for qemu process, to be run between
>> + * fork/exec of QEMU only.
>> + */
>> +#if HAVE_NUMACTL
>> +static int
>> +qemuProcessInitNumaMemoryPolicy(virDomainObjPtr vm)
>> +{
>> + struct bitmask *mask = NULL;
>> + virErrorPtr orig_err = NULL;
>> + virErrorPtr err = NULL;
>> + int mode = -1;
>> + int node = -1;
>> + int ret = -1;
>> + int i = 0;
>> + int maxnode = 0;
>> +
>> + VIR_DEBUG("Setting NUMA memory policy");
>> +
>> + if (numa_available() < 0) {
>> + qemuReportError(VIR_ERR_INTERNAL_ERROR,
>> + "%s", _("Host kernel is not aware of NUMA."));
>> + return -1;
>> + }
>> +
>> + if (!vm->def->numatune.memory.nodemask)
>> + return 0;
>> +
>> + if (VIR_ALLOC(mask) < 0) {
>> + virReportOOMError();
>> + return -1;
>> + }
>> +
>> + mask->size = VIR_DOMAIN_CPUMASK_LEN / sizeof (unsigned long);
>> + if (VIR_ALLOC_N(mask->maskp, mask->size) < 0) {
>> + virReportOOMError();
>> + goto cleanup;
>> + }
>> + /* Convert nodemask to NUMA bitmask. */
>> + for (i = 0; i < VIR_DOMAIN_CPUMASK_LEN; i++) {
>> + int cur = 0;
>> + int mod = 0;
>> +
>> + if (i) {
>> + cur = i / (8 * sizeof (unsigned long));
>> + mod = i % (8 * sizeof (unsigned long));
>> + }
>> +
>> + if (vm->def->numatune.memory.nodemask[i]) {
>> + mask->maskp[cur] |= 1 << mod;
>> + }
>> + }
>> +
>> + maxnode = numa_max_node() + 1;
>> +
>> + for (i = 0; i < mask->size; i++) {
>> + if (i < (maxnode % (8 * sizeof (unsigned long)))) {
>> + if (mask->maskp[i] & ~maxnode) {
>> + VIR_WARN("nodeset is out of range, there is only %d NUMA "
>> + "nodes on host", maxnode);
>
> This should be a fatal error.
We discussed this question with a bunch of NUMA people and the
consensus was that since it's possible to hotplug nodes, it's not
definitively wrong to set a mask containing nodes that do not
currently exist. Since it's an unusual thing to do, we thought it was
appropriate to warn the user, but the consensus was that it should not
be categorically rejected.
Yes, this was explained in [PATCH 0/5]:
<snip>
One may specify a nodemask that is out of range; in that case,
libvirt will report a soft warning instead of a hard error.
(The kernel is silent as long as one of the set bits is valid.
E.g. for a host with 2 NUMA nodes, the kernel will be silent
for nodemask "0101010101", as the first "bit", which represents
the first NUMA node, is set.) The reason for reporting a soft
warning instead of a hard error is that one may want to define
the memory tuning policy in advance for a node that doesn't
exist yet but may be hotplugged soon.
</snip>
The point is that the kernel allows it, so our thought is that we'd
better not reject it at the libvirt layer.
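
For illustration, a nodeset like the following (just a sketch; this example
assumes a host that currently has only nodes 0-1, with nodes 2-3 possibly
arriving later via hotplug) would get the soft warning but would not prevent
the guest from starting:

    <numatune>
      <memory mode='strict' nodeset='0-3'/>
    </numatune>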
>> + break;
>> + }
>> +
>> + continue;
>> + }
>> +
>> + if (mask->maskp[i]) {
>> + VIR_WARN("nodeset is out of range, there is only %d NUMA nodes
"
>> + "on host", maxnode);
>
> This should be a fatal error
>
>> + }
>> + }
>> +
>> + orig_err = virSaveLastError();
>> + mode = vm->def->numatune.memory.mode;
>> +
>> + if (mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) {
>> + numa_set_bind_policy(1);
>> + numa_set_membind(mask);
>> + numa_set_bind_policy(0);
>> +
>> + err = virGetLastError();
>> + if ((err && (err->code != orig_err->code)) ||
>> + (err && !orig_err)) {
>> + qemuReportError(VIR_ERR_INTERNAL_ERROR,
>> + _("Failed to bind memory to specified nodeset: %s"),
>> + err ? err->message : _("unknown error"));
>> + virResetLastError();
>
> What ?????
>
> There is no code here between the virSaveLastError call
> and virGetLastError that will ever set an error. This then
> calls qemuReportError(), and immediately clears it again by
> calling virResetLastError. The numa_XXXX API calls can't
> fail, so I don't see what this error reporting code is trying
> to do. AFAICT this all needs to be deleted.
Agreed, will update.
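Roughly like this, I think (just a sketch of the planned cleanup against the
same function: the orig_err/err locals and the whole virSaveLastError /
virGetLastError / virResetLastError dance go away entirely, since the numa_*
calls never raise a libvirt error):

    mode = vm->def->numatune.memory.mode;

    if (mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) {
        numa_set_bind_policy(1);
        numa_set_membind(mask);
        numa_set_bind_policy(0);
    } else if (mode == VIR_DOMAIN_NUMATUNE_MEM_PREFERRED) {
        int nnodes = 0;
        for (i = 0; i < mask->size; i++) {
            if (numa_bitmask_isbitset(mask, i)) {
                node = i;
                nnodes++;
            }
        }

        if (nnodes != 1) {
            qemuReportError(VIR_ERR_INTERNAL_ERROR,
                            "%s", _("NUMA memory tuning in 'preferred' mode "
                                    "only supports single node"));
            goto cleanup;
        }

        numa_set_bind_policy(0);
        numa_set_preferred(node);
    } else if (mode == VIR_DOMAIN_NUMATUNE_MEM_INTERLEAVE) {
        numa_set_interleave_mask(mask);
    } else {
        /* Shouldn't happen, the mode was already validated when
         * parsing the domain XML. */
        qemuReportError(VIR_ERR_XML_ERROR,
                        "%s", _("Invalid mode for memory NUMA tuning."));
        goto cleanup;
    }

    ret = 0;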
>
>
>> + goto cleanup;
>> + }
>> + } else if (mode == VIR_DOMAIN_NUMATUNE_MEM_PREFERRED) {
>> + int nnodes = 0;
>> + for (i = 0; i < mask->size; i++) {
>> + if (numa_bitmask_isbitset(mask, i)) {
>> + node = i;
>> + nnodes++;
>> + }
>> + }
>> +
>> + if (nnodes != 1) {
>> + qemuReportError(VIR_ERR_INTERNAL_ERROR,
>> + "%s", _("NUMA memory tuning in
'preferred' mode "
>> + "only supports single node"));
>> + goto cleanup;
>> + }
>> +
>> + numa_set_bind_policy(0);
>> + numa_set_preferred(node);
>> +
>> + err = virGetLastError();
>> + if ((err && (err->code != orig_err->code)) ||
>> + (err && !orig_err)) {
>> + qemuReportError(VIR_ERR_INTERNAL_ERROR,
>> + _("Failed to set memory policy as preferred to specified "
>> + "node: %s"), err ? err->message : _("unknown error"));
>> + virResetLastError();
>> + goto cleanup;
>> + }
>
> Again, this code seems just bogus
>
>> + } else if (mode == VIR_DOMAIN_NUMATUNE_MEM_INTERLEAVE) {
>> + numa_set_interleave_mask(mask);
>> +
>> + err = virGetLastError();
>> + if ((err && (err->code != orig_err->code)) ||
>> + (err && !orig_err)) {
>> + qemuReportError(VIR_ERR_INTERNAL_ERROR,
>> + _("Failed to interleave memory to specified nodeset: %s"),
>> + err ? err->message : _("unknown error"));
>> + virResetLastError();
>> + goto cleanup;
>> + }
>
> And this code
>
>> + } else {
>> + /* XXX: Shouldn't go here, as we already do checking when
>> + * parsing domain XML.
>> + */
>> + qemuReportError(VIR_ERR_XML_ERROR,
>> + "%s", _("Invalid mode for memory NUMA
tuning."));
>> + goto cleanup;
>> + }
>> +
>> + ret = 0;
>> +
>> +cleanup:
>> + numa_bitmask_free(mask);
>> + return ret;
>> +}
>> +#else
>> +static int
>> +qemuProcessInitNumaMemoryPolicy(virDomainObjPtr vm)
>> +{
>> + if (vm->def->numatune.memory.nodemask) {
>> + qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s",
>> + _("libvirt is compiled without NUMA tuning
support"));
>> +
>> + return -1;
>> + }
>> +
>> + return 0;
>> +}
>> +#endif
>
> Since this appears to have already been committed, please send a follow-up
> patch to fix these problems.
>
> Regards,
> Daniel
> --
> |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :|
> |: http://libvirt.org -o- http://virt-manager.org :|
> |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :|
> |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|
>
> --
> libvir-list mailing list
> libvir-list(a)redhat.com
> https://www.redhat.com/mailman/listinfo/libvir-list