[libvirt] [PATCH v2 0/3] libxl: implement some chunks of the NUMA interface

Hi everyone,

This is take 2 of the series about introducing (some) NUMA support for the libxl driver. What was patch 2 in v1 has already been applied, so it is no longer part of the series. I think I addressed all the comments raised during the v1 review (more details in the individual changelogs). Further comments are, as usual, welcome.

Thanks and Regards,
Dario

---

Dario Faggioli (3):
      libxl: implement NUMA capabilities reporting
      libxl: advertise the support for VIR_TYPED_PARAM_STRING
      libxl: implement virDomainGetNumaParameters

 src/libxl/libxl_conf.c   | 141 ++++++++++++++++++++++++++++++++++++++++
 src/libxl/libxl_driver.c | 161 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 300 insertions(+), 2 deletions(-)

--
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

Starting from Xen 4.2, libxl has all the bits and pieces in place for retrieving an adequate amount of information about the host NUMA topology. It is therefore possible, after a bit of shuffling, to arrange that information in the way libvirt wants to present it to the outside world.

Therefore, with this patch, the <topology> section of the host capabilities is properly populated when running on Xen, so that we can figure out whether or not we're running on a NUMA host, and what its characteristics are.

[raistlin@Zhaman ~]$ sudo virsh --connect xen:/// capabilities
<capabilities>
  <host>
    <cpu>
    ....
    <topology>
      <cells num='2'>
        <cell id='0'>
          <memory unit='KiB'>6291456</memory>
          <cpus num='8'>
            <cpu id='0' socket_id='1' core_id='0' siblings='0-1'/>
            <cpu id='1' socket_id='1' core_id='0' siblings='0-1'/>
            <cpu id='2' socket_id='1' core_id='1' siblings='2-3'/>
            <cpu id='3' socket_id='1' core_id='1' siblings='2-3'/>
            <cpu id='4' socket_id='1' core_id='9' siblings='4-5'/>
            <cpu id='5' socket_id='1' core_id='9' siblings='4-5'/>
            <cpu id='6' socket_id='1' core_id='10' siblings='6-7'/>
            <cpu id='7' socket_id='1' core_id='10' siblings='6-7'/>
          </cpus>
        </cell>
        <cell id='1'>
          <memory unit='KiB'>6881280</memory>
          <cpus num='8'>
            <cpu id='8' socket_id='0' core_id='0' siblings='8-9'/>
            <cpu id='9' socket_id='0' core_id='0' siblings='8-9'/>
            <cpu id='10' socket_id='0' core_id='1' siblings='10-11'/>
            <cpu id='11' socket_id='0' core_id='1' siblings='10-11'/>
            <cpu id='12' socket_id='0' core_id='9' siblings='12-13'/>
            <cpu id='13' socket_id='0' core_id='9' siblings='12-13'/>
            <cpu id='14' socket_id='0' core_id='10' siblings='14-15'/>
            <cpu id='15' socket_id='0' core_id='10' siblings='14-15'/>
          </cpus>
        </cell>
      </cells>
    </topology>
  </host>
....

Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
---
Changes from v1:
 * fixed a typo in the commit message, as requested during review;
 * fixed coding style (one function parameter per line and no spaces between variable definitions), as requested during review;
 * avoid zero-filling memory after VIR_ALLOC_N(), since it does that already, as requested during review;
 * improved out of memory error reporting, as requested during review;
 * libxlMakeNumaCapabilities() created, accommodating all the NUMA related additions, instead of having them within libxlMakeCapabilitiesInternal(), as suggested during review.
---
 src/libxl/libxl_conf.c | 141 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 140 insertions(+), 1 deletion(-)

diff --git a/src/libxl/libxl_conf.c b/src/libxl/libxl_conf.c
index e170357..7515995 100644
--- a/src/libxl/libxl_conf.c
+++ b/src/libxl/libxl_conf.c
@@ -161,6 +161,115 @@ libxlBuildCapabilities(virArch hostarch,
 }
 
 static virCapsPtr
+libxlMakeNumaCapabilities(libxl_numainfo *numa_info,
+                          int nr_nodes,
+                          libxl_cputopology *cpu_topo,
+                          int nr_cpus,
+                          virCapsPtr caps)
+{
+    virCapsHostNUMACellCPUPtr *cpus = NULL;
+    int *nr_cpus_node = NULL;
+    bool numa_failed = false;
+    int i;
+
+    if (VIR_ALLOC_N(cpus, nr_nodes)) {
+        virReportOOMError();
+        return caps;
+    }
+
+    if (VIR_ALLOC_N(nr_cpus_node, nr_nodes)) {
+        VIR_FREE(cpus);
+        virReportOOMError();
+        return caps;
+    }
+
+    /* For each node, prepare a list of CPUs belonging to that node */
+    for (i = 0; i < nr_cpus; i++) {
+        int node = cpu_topo[i].node;
+
+        if (cpu_topo[i].core == LIBXL_CPUTOPOLOGY_INVALID_ENTRY)
+            continue;
+
+        nr_cpus_node[node]++;
+
+        if (nr_cpus_node[node] == 1) {
+            if (VIR_ALLOC(cpus[node]) < 0) {
+                virReportOOMError();
+                numa_failed = true;
+                goto cleanup;
+            }
+        }
+        else {
+            if (VIR_REALLOC_N(cpus[node], nr_cpus_node[node]) < 0) {
+                virReportOOMError();
+                numa_failed = true;
+                goto cleanup;
+            }
+        }
+
+        /* Mapping between what libxl tells and what libvirt wants */
+        cpus[node][nr_cpus_node[node]-1].id = i;
+        cpus[node][nr_cpus_node[node]-1].socket_id = cpu_topo[i].socket;
+        cpus[node][nr_cpus_node[node]-1].core_id = cpu_topo[i].core;
+        /* Allocate the siblings maps. We will be filling them later */
+        cpus[node][nr_cpus_node[node]-1].siblings = virBitmapNew(nr_cpus);
+        if (!cpus[node][nr_cpus_node[node]-1].siblings) {
+            virReportOOMError();
+            numa_failed = true;
+            goto cleanup;
+        }
+    }
+
+    /* Let's now populate the siblings bitmaps */
+    for (i = 0; i < nr_cpus; i++) {
+        int j, node = cpu_topo[i].node;
+
+        if (cpu_topo[i].core == LIBXL_CPUTOPOLOGY_INVALID_ENTRY)
+            continue;
+
+        for (j = 0; j < nr_cpus_node[node]; j++) {
+            if (cpus[node][j].core_id == cpu_topo[i].core)
+                ignore_value(virBitmapSetBit(cpus[node][j].siblings, i));
+        }
+    }
+
+    for (i = 0; i < nr_nodes; i++) {
+        if (numa_info[i].size == LIBXL_NUMAINFO_INVALID_ENTRY)
+            continue;
+
+        if (virCapabilitiesAddHostNUMACell(caps, i, nr_cpus_node[i],
+                                           numa_info[i].size / 1024,
+                                           cpus[i]) < 0) {
+            virCapabilitiesClearHostNUMACellCPUTopology(cpus[i],
+                                                        nr_cpus_node[i]);
+            numa_failed = true;
+            goto cleanup;
+        }
+
+        /* This is safe, as the CPU list is now stored in the NUMA cell */
+        cpus[i] = NULL;
+    }
+
+ cleanup:
+
+    if (numa_failed) {
+        /* Looks like something went wrong. Well, that's bad, but probably
+         * not enough to break the whole driver, so we log and carry on */
+        for (i = 0; i < nr_nodes; i++) {
+            VIR_FREE(cpus[i]);
+        }
+        VIR_WARN("Failed to retrieve and build host NUMA topology properly,\n"
+                 "disabling NUMA capabilities");
+        virCapabilitiesFreeNUMAInfo(caps);
+    }
+
+    VIR_FREE(cpus);
+    VIR_FREE(nr_cpus_node);
+
+    return caps;
+}
+
+static virCapsPtr
 libxlMakeCapabilitiesInternal(virArch hostarch,
                               libxl_physinfo *phy_info,
                               char *capabilities)
@@ -772,7 +881,11 @@ libxlMakeCapabilities(libxl_ctx *ctx)
 {
     int err;
     libxl_physinfo phy_info;
+    libxl_numainfo *numa_info = NULL;
+    libxl_cputopology *cpu_topo = NULL;
     const libxl_version_info *ver_info;
+    int nr_nodes = 0, nr_cpus = 0;
+    virCapsPtr caps;
 
     err = regcomp(&xen_cap_rec, xen_cap_re, REG_EXTENDED);
     if (err != 0) {
@@ -796,9 +909,35 @@ libxlMakeCapabilities(libxl_ctx *ctx)
         return NULL;
     }
 
-    return libxlMakeCapabilitiesInternal(virArchFromHost(),
+    /* Let's try to fetch NUMA info, but it is not critical if we fail */
+    numa_info = libxl_get_numainfo(ctx, &nr_nodes);
+    if (numa_info == NULL)
+        VIR_WARN("libxl_get_numainfo failed to retrieve NUMA data");
+    else {
+        /* If the above failed, we'd have no NUMA caps anyway! */
+        cpu_topo = libxl_get_cpu_topology(ctx, &nr_cpus);
+        if (cpu_topo == NULL) {
+            VIR_WARN("libxl_get_cpu_topology failed to retrieve topology");
+            libxl_numainfo_list_free(numa_info, nr_nodes);
+        }
+    }
+
+    caps = libxlMakeCapabilitiesInternal(virArchFromHost(),
                                          &phy_info,
                                          ver_info->capabilities);
+
+    /* Add topology information to caps, if available */
+    if (numa_info && cpu_topo)
+        caps = libxlMakeNumaCapabilities(numa_info,
+                                         nr_nodes,
+                                         cpu_topo,
+                                         nr_cpus,
+                                         caps);
+
+    libxl_cputopology_list_free(cpu_topo, nr_cpus);
+    libxl_numainfo_list_free(numa_info, nr_nodes);
+
+    return caps;
 }
 
 int

Dario Faggioli wrote:
Starting from Xen 4.2, libxl has all the bits and pieces in place for retrieving an adequate amount of information about the host NUMA topology. It is therefore possible, after a bit of shuffling, to arrange that information in the way libvirt wants to present it to the outside world.
Therefore, with this patch, the <topology> section of the host capabilities is properly populated, when running on Xen, so that we can figure out whether or not we're running on a NUMA host, and what its characteristics are.
[raistlin@Zhaman ~]$ sudo virsh --connect xen:/// capabilities
<capabilities>
  <host>
    <cpu>
    ....
    <topology>
      <cells num='2'>
        <cell id='0'>
          <memory unit='KiB'>6291456</memory>
          <cpus num='8'>
            <cpu id='0' socket_id='1' core_id='0' siblings='0-1'/>
            <cpu id='1' socket_id='1' core_id='0' siblings='0-1'/>
            <cpu id='2' socket_id='1' core_id='1' siblings='2-3'/>
            <cpu id='3' socket_id='1' core_id='1' siblings='2-3'/>
            <cpu id='4' socket_id='1' core_id='9' siblings='4-5'/>
            <cpu id='5' socket_id='1' core_id='9' siblings='4-5'/>
            <cpu id='6' socket_id='1' core_id='10' siblings='6-7'/>
            <cpu id='7' socket_id='1' core_id='10' siblings='6-7'/>
          </cpus>
        </cell>
        <cell id='1'>
          <memory unit='KiB'>6881280</memory>
          <cpus num='8'>
            <cpu id='8' socket_id='0' core_id='0' siblings='8-9'/>
            <cpu id='9' socket_id='0' core_id='0' siblings='8-9'/>
            <cpu id='10' socket_id='0' core_id='1' siblings='10-11'/>
            <cpu id='11' socket_id='0' core_id='1' siblings='10-11'/>
            <cpu id='12' socket_id='0' core_id='9' siblings='12-13'/>
            <cpu id='13' socket_id='0' core_id='9' siblings='12-13'/>
            <cpu id='14' socket_id='0' core_id='10' siblings='14-15'/>
            <cpu id='15' socket_id='0' core_id='10' siblings='14-15'/>
          </cpus>
        </cell>
      </cells>
    </topology>
  </host>
....
Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
---
Changes from v1:
 * fixed a typo in the commit message, as requested during review;
 * fixed coding style (one function parameter per line and no spaces between variable definitions), as requested during review;
 * avoid zero-filling memory after VIR_ALLOC_N(), since it does that already, as requested during review;
 * improved out of memory error reporting, as requested during review;
 * libxlMakeNumaCapabilities() created, accommodating all the NUMA related additions, instead of having them within libxlMakeCapabilitiesInternal(), as suggested during review.
---
 src/libxl/libxl_conf.c | 141 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 140 insertions(+), 1 deletion(-)
diff --git a/src/libxl/libxl_conf.c b/src/libxl/libxl_conf.c
index e170357..7515995 100644
--- a/src/libxl/libxl_conf.c
+++ b/src/libxl/libxl_conf.c
@@ -161,6 +161,115 @@ libxlBuildCapabilities(virArch hostarch,
 }
 static virCapsPtr
+libxlMakeNumaCapabilities(libxl_numainfo *numa_info,
+                          int nr_nodes,
+                          libxl_cputopology *cpu_topo,
+                          int nr_cpus,
+                          virCapsPtr caps)
I think this should return an int (0 success, -1 failure).
+{
+    virCapsHostNUMACellCPUPtr *cpus = NULL;
+    int *nr_cpus_node = NULL;
+    bool numa_failed = false;
+    int i;
+
+    if (VIR_ALLOC_N(cpus, nr_nodes)) {
+        virReportOOMError();
+        return caps;
+    }
+
+    if (VIR_ALLOC_N(nr_cpus_node, nr_nodes)) {
+        VIR_FREE(cpus);
+        virReportOOMError();
+        return caps;
+    }
+
+    /* For each node, prepare a list of CPUs belonging to that node */
+    for (i = 0; i < nr_cpus; i++) {
+        int node = cpu_topo[i].node;
+
+        if (cpu_topo[i].core == LIBXL_CPUTOPOLOGY_INVALID_ENTRY)
+            continue;
+
+        nr_cpus_node[node]++;
+
+        if (nr_cpus_node[node] == 1) {
+            if (VIR_ALLOC(cpus[node]) < 0) {
+                virReportOOMError();
+                numa_failed = true;
+                goto cleanup;
+            }
+        }
+        else {
+            if (VIR_REALLOC_N(cpus[node], nr_cpus_node[node]) < 0) {
+                virReportOOMError();
+                numa_failed = true;
+                goto cleanup;
+            }
+        }
+
+        /* Mapping between what libxl tells and what libvirt wants */
+        cpus[node][nr_cpus_node[node]-1].id = i;
+        cpus[node][nr_cpus_node[node]-1].socket_id = cpu_topo[i].socket;
+        cpus[node][nr_cpus_node[node]-1].core_id = cpu_topo[i].core;
+        /* Allocate the siblings maps. We will be filling them later */
+        cpus[node][nr_cpus_node[node]-1].siblings = virBitmapNew(nr_cpus);
+        if (!cpus[node][nr_cpus_node[node]-1].siblings) {
+            virReportOOMError();
+            numa_failed = true;
+            goto cleanup;
+        }
+    }
+
+    /* Let's now populate the siblings bitmaps */
+    for (i = 0; i < nr_cpus; i++) {
+        int j, node = cpu_topo[i].node;
+
+        if (cpu_topo[i].core == LIBXL_CPUTOPOLOGY_INVALID_ENTRY)
+            continue;
+
+        for (j = 0; j < nr_cpus_node[node]; j++) {
+            if (cpus[node][j].core_id == cpu_topo[i].core)
+                ignore_value(virBitmapSetBit(cpus[node][j].siblings, i));
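For illustration, the int-returning variant suggested above would look roughly like the following sketch, with the body staying as quoted and caps being filled in place (this is hypothetical, not code from the patch):

    static int
    libxlMakeNumaCapabilities(libxl_numainfo *numa_info,
                              int nr_nodes,
                              libxl_cputopology *cpu_topo,
                              int nr_cpus,
                              virCapsPtr caps)  /* modified in place */
    {
        bool numa_failed = false;

        /* ... body unchanged from the patch, populating caps ... */

        return numa_failed ? -1 : 0;
    }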
Noticed the following topology on an old 2 socket, quad core Intel machine I'm using to test this patch:

<topology>
  <cells num='1'>
    <cell id='0'>
      <memory unit='KiB'>9175040</memory>
      <cpus num='8'>
        <cpu id='0' socket_id='0' core_id='0' siblings='0,4'/>
        <cpu id='1' socket_id='0' core_id='1' siblings='1,5'/>
        <cpu id='2' socket_id='0' core_id='2' siblings='2,6'/>
        <cpu id='3' socket_id='0' core_id='3' siblings='3,7'/>
        <cpu id='4' socket_id='1' core_id='0' siblings='0,4'/>
        <cpu id='5' socket_id='1' core_id='1' siblings='1,5'/>
        <cpu id='6' socket_id='1' core_id='2' siblings='2,6'/>
        <cpu id='7' socket_id='1' core_id='3' siblings='3,7'/>
      </cpus>
    </cell>
  </cells>
</topology>

I'm not sure how cpus 0 and 4 can be siblings when they are on different sockets, but it seems xl reports similar info:

cpu_topology           :
cpu:    core    socket     node
  0:       0        0        0
  1:       1        0        0
  2:       2        0        0
  3:       3        0        0
  4:       0        1        0
  5:       1        1        0
  6:       2        1        0
  7:       3        1        0
numa_info              :
node:    memsize    memfree    distances
   0:       8960       7116      10

BTW, the qemu driver reports the following on the same machine:

<topology>
  <cells num='1'>
    <cell id='0'>
      <memory unit='KiB'>8387124</memory>
      <cpus num='8'>
        <cpu id='0' socket_id='0' core_id='0' siblings='0'/>
        <cpu id='1' socket_id='1' core_id='0' siblings='1'/>
        <cpu id='2' socket_id='0' core_id='1' siblings='2'/>
        <cpu id='3' socket_id='1' core_id='1' siblings='3'/>
        <cpu id='4' socket_id='0' core_id='2' siblings='4'/>
        <cpu id='5' socket_id='1' core_id='2' siblings='5'/>
        <cpu id='6' socket_id='0' core_id='3' siblings='6'/>
        <cpu id='7' socket_id='1' core_id='3' siblings='7'/>
      </cpus>
    </cell>
  </cells>
</topology>

which seems correct since hyperthreading is not supported.
+        }
+    }
+
+    for (i = 0; i < nr_nodes; i++) {
+        if (numa_info[i].size == LIBXL_NUMAINFO_INVALID_ENTRY)
+            continue;
+
+        if (virCapabilitiesAddHostNUMACell(caps, i, nr_cpus_node[i],
+                                           numa_info[i].size / 1024,
+                                           cpus[i]) < 0) {
+            virCapabilitiesClearHostNUMACellCPUTopology(cpus[i],
+                                                        nr_cpus_node[i]);
+            numa_failed = true;
+            goto cleanup;
+        }
+
+        /* This is safe, as the CPU list is now stored in the NUMA cell */
+        cpus[i] = NULL;
+    }
+
+ cleanup:
+
+    if (numa_failed) {
+        /* Looks like something went wrong. Well, that's bad, but probably
+         * not enough to break the whole driver, so we log and carry on */
+        for (i = 0; i < nr_nodes; i++) {
+            VIR_FREE(cpus[i]);
+        }
+        VIR_WARN("Failed to retrieve and build host NUMA topology properly,\n"
+                 "disabling NUMA capabilities");
+        virCapabilitiesFreeNUMAInfo(caps);
+    }
+
+    VIR_FREE(cpus);
+    VIR_FREE(nr_cpus_node);
+
+    return caps;
+}
+
+static virCapsPtr
 libxlMakeCapabilitiesInternal(virArch hostarch,
                               libxl_physinfo *phy_info,
                               char *capabilities)
@@ -772,7 +881,11 @@ libxlMakeCapabilities(libxl_ctx *ctx)
 {
     int err;
     libxl_physinfo phy_info;
+    libxl_numainfo *numa_info = NULL;
+    libxl_cputopology *cpu_topo = NULL;
     const libxl_version_info *ver_info;
+    int nr_nodes = 0, nr_cpus = 0;
+    virCapsPtr caps;
     err = regcomp(&xen_cap_rec, xen_cap_re, REG_EXTENDED);
     if (err != 0) {
@@ -796,9 +909,35 @@ libxlMakeCapabilities(libxl_ctx *ctx)
         return NULL;
     }
-    return libxlMakeCapabilitiesInternal(virArchFromHost(),
+    /* Let's try to fetch NUMA info, but it is not critical if we fail */
+    numa_info = libxl_get_numainfo(ctx, &nr_nodes);
+    if (numa_info == NULL)
+        VIR_WARN("libxl_get_numainfo failed to retrieve NUMA data");
+    else {
+        /* If the above failed, we'd have no NUMA caps anyway! */
+        cpu_topo = libxl_get_cpu_topology(ctx, &nr_cpus);
+        if (cpu_topo == NULL) {
+            VIR_WARN("libxl_get_cpu_topology failed to retrieve topology");
+            libxl_numainfo_list_free(numa_info, nr_nodes);
+        }
+    }
Move these after the call to libxlMakeCapabilitiesInternal, before calling libxlMakeNumaCapabilities.
+
+    caps = libxlMakeCapabilitiesInternal(virArchFromHost(),
                                          &phy_info,
                                          ver_info->capabilities);
Check that caps != NULL ?
+
+    /* Add topology information to caps, if available */
+    if (numa_info && cpu_topo)
+        caps = libxlMakeNumaCapabilities(numa_info,
+                                         nr_nodes,
+                                         cpu_topo,
+                                         nr_cpus,
+                                         caps);
Hmm, guess there is not much to check wrt errors at this point, so libxlMakeNumaCapabilities could return void. I suppose returning success or failure via int is more libvirt style.

Regards, Jim
+
+    libxl_cputopology_list_free(cpu_topo, nr_cpus);
+    libxl_numainfo_list_free(numa_info, nr_nodes);
+
+    return caps;
 }
int

On Mon, 2013-07-08 at 18:45 -0600, Jim Fehlig wrote:
Dario Faggioli wrote:
diff --git a/src/libxl/libxl_conf.c b/src/libxl/libxl_conf.c
index e170357..7515995 100644
--- a/src/libxl/libxl_conf.c
+++ b/src/libxl/libxl_conf.c
@@ -161,6 +161,115 @@ libxlBuildCapabilities(virArch hostarch,
 }
 static virCapsPtr
+libxlMakeNumaCapabilities(libxl_numainfo *numa_info,
+                          int nr_nodes,
+                          libxl_cputopology *cpu_topo,
+                          int nr_cpus,
+                          virCapsPtr caps)
I think this should return an int (0 success, -1 failure).
Well, of course I can do that (but see below)
Noticed the following topology on an old 2 socket, quad core Intel machine I'm using to test this patch
<topology>
  <cells num='1'>
    <cell id='0'>
      <memory unit='KiB'>9175040</memory>
      <cpus num='8'>
        <cpu id='0' socket_id='0' core_id='0' siblings='0,4'/>
        <cpu id='1' socket_id='0' core_id='1' siblings='1,5'/>
        <cpu id='2' socket_id='0' core_id='2' siblings='2,6'/>
        <cpu id='3' socket_id='0' core_id='3' siblings='3,7'/>
        <cpu id='4' socket_id='1' core_id='0' siblings='0,4'/>
        <cpu id='5' socket_id='1' core_id='1' siblings='1,5'/>
        <cpu id='6' socket_id='1' core_id='2' siblings='2,6'/>
        <cpu id='7' socket_id='1' core_id='3' siblings='3,7'/>
      </cpus>
    </cell>
  </cells>
</topology>
I'm not sure how cpus 0 and 4 can be siblings when they are on different sockets, but it seems xl reports similar info:
Oh, I see, my bad: same core ID but different socket ID means !siblings, _even_ if on the same node. :-P That makes sense; what I was not thinking of was the possibility of having different sockets _within_ the same node, which seems to be your case according to both libxl and numactl. Does that make sense? What machine is that? Do both sockets share the same memory bus? Again, it looks like so. Anyway, I will fix that.
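For reference, a minimal sketch of the corrected check, assuming the surrounding loops stay as posted (hypothetical, not the code from this patch): two CPUs on the same node count as siblings only when they share both the socket and the core.

    /* Hypothetical fix: same node and same core id is not enough,
     * the two CPUs must also sit on the same socket. */
    for (j = 0; j < nr_cpus_node[node]; j++) {
        if (cpus[node][j].socket_id == cpu_topo[i].socket &&
            cpus[node][j].core_id == cpu_topo[i].core)
            ignore_value(virBitmapSetBit(cpus[node][j].siblings, i));
    }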
cpu_topology           :
cpu:    core    socket     node
  0:       0        0        0
  1:       1        0        0
  2:       2        0        0
  3:       3        0        0
  4:       0        1        0
  5:       1        1        0
  6:       2        1        0
  7:       3        1        0
numa_info              :
node:    memsize    memfree    distances
   0:       8960       7116      10
BTW, the qemu driver reports the following on the same machine
<topology>
  <cells num='1'>
    <cell id='0'>
      <memory unit='KiB'>8387124</memory>
      <cpus num='8'>
        <cpu id='0' socket_id='0' core_id='0' siblings='0'/>
        <cpu id='1' socket_id='1' core_id='0' siblings='1'/>
        <cpu id='2' socket_id='0' core_id='1' siblings='2'/>
        <cpu id='3' socket_id='1' core_id='1' siblings='3'/>
        <cpu id='4' socket_id='0' core_id='2' siblings='4'/>
        <cpu id='5' socket_id='1' core_id='2' siblings='5'/>
        <cpu id='6' socket_id='0' core_id='3' siblings='6'/>
        <cpu id='7' socket_id='1' core_id='3' siblings='7'/>
      </cpus>
    </cell>
  </cells>
</topology>
which seems correct since hyperthreading is not supported.
Yes, and it is basically the same, apart from the ordering, as what libxl says (notice that all the cpus belong to node 0!). Again, I will fix the siblings part in my patch, in order to take the case of "more sockets per node" into better account.
+static virCapsPtr
 libxlMakeCapabilitiesInternal(virArch hostarch,
                               libxl_physinfo *phy_info,
                               char *capabilities)
@@ -772,7 +881,11 @@ libxlMakeCapabilities(libxl_ctx *ctx)
 {
     int err;
     libxl_physinfo phy_info;
+    libxl_numainfo *numa_info = NULL;
+    libxl_cputopology *cpu_topo = NULL;
     const libxl_version_info *ver_info;
+    int nr_nodes = 0, nr_cpus = 0;
+    virCapsPtr caps;
     err = regcomp(&xen_cap_rec, xen_cap_re, REG_EXTENDED);
     if (err != 0) {
@@ -796,9 +909,35 @@ libxlMakeCapabilities(libxl_ctx *ctx)
         return NULL;
     }
-    return libxlMakeCapabilitiesInternal(virArchFromHost(),
+    /* Let's try to fetch NUMA info, but it is not critical if we fail */
+    numa_info = libxl_get_numainfo(ctx, &nr_nodes);
+    if (numa_info == NULL)
+        VIR_WARN("libxl_get_numainfo failed to retrieve NUMA data");
+    else {
+        /* If the above failed, we'd have no NUMA caps anyway! */
+        cpu_topo = libxl_get_cpu_topology(ctx, &nr_cpus);
+        if (cpu_topo == NULL) {
+            VIR_WARN("libxl_get_cpu_topology failed to retrieve topology");
+            libxl_numainfo_list_free(numa_info, nr_nodes);
+        }
+    }
Move these after the call to libxlMakeCapabilitiesInternal, before calling libxlMakeNumaCapabilities.
Makes sense, will do.
+
+    caps = libxlMakeCapabilitiesInternal(virArchFromHost(),
                                          &phy_info,
                                          ver_info->capabilities);
Check that caps != NULL ?
Ok.
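Putting the two suggestions together, the tail of libxlMakeCapabilities() might end up looking roughly like this sketch (not the final committed code):

    caps = libxlMakeCapabilitiesInternal(virArchFromHost(),
                                         &phy_info,
                                         ver_info->capabilities);
    if (caps == NULL)
        return NULL;

    /* NUMA data is fetched only once the base capabilities exist;
     * failure here is not critical, we just skip the <topology> bits. */
    numa_info = libxl_get_numainfo(ctx, &nr_nodes);
    if (numa_info == NULL) {
        VIR_WARN("libxl_get_numainfo failed to retrieve NUMA data");
    } else {
        cpu_topo = libxl_get_cpu_topology(ctx, &nr_cpus);
        if (cpu_topo == NULL)
            VIR_WARN("libxl_get_cpu_topology failed to retrieve topology");
    }

    if (numa_info && cpu_topo)
        caps = libxlMakeNumaCapabilities(numa_info, nr_nodes,
                                         cpu_topo, nr_cpus, caps);

    /* Both list-free helpers cope with a NULL list and a zero count */
    libxl_cputopology_list_free(cpu_topo, nr_cpus);
    libxl_numainfo_list_free(numa_info, nr_nodes);

    return caps;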
+
+    /* Add topology information to caps, if available */
+    if (numa_info && cpu_topo)
+        caps = libxlMakeNumaCapabilities(numa_info,
+                                         nr_nodes,
+                                         cpu_topo,
+                                         nr_cpus,
+                                         caps);
Hmm, guess there is not much to check wrt errors at this point, so libxlMakeNumaCapabilities could return void. I suppose returning success or failure via int is more libvirt style.
And here's the return 0 or -1 point. The point is that we do not care much about errors: if something bad happens during the construction of the NUMA capabilities, we revert all we've done, with the effect of leaving caps unmodified with respect to what libxlMakeNumaCapabilities() was given.

That is why errors are reported and handled (as described above) inside the function itself, and why I don't think it would be very useful to propagate that information to the caller explicitly by returning 0 or -1.

Moreover, given that, as you said yourself, it could well return void, I figured it was worthwhile to use the return value for something useful. Yes, 0/-1 would be more libvirt style, but this is an internal function that, at least in my view, should look more like libxlMakeCapabilitiesInternal() than anything else (and libxlMakeCapabilitiesInternal() was returning a virCapsPtr before my patch :-) ).

So, in summary, I agree on the code motion above, but I think the signature and usage of libxlMakeNumaCapabilities() should stay as they are now.

That being said, I will of course make the change if you insist that this is what you want. So, your call; I was just trying to give some more --hopefully useful-- context and rationale to reason on... After all, that is what maintainership is about, isn't it! :-P

Thanks again and Regards,
Dario

--
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

Dario Faggioli wrote:
On Mon, 2013-07-08 at 18:45 -0600, Jim Fehlig wrote:
Dario Faggioli wrote:
diff --git a/src/libxl/libxl_conf.c b/src/libxl/libxl_conf.c
index e170357..7515995 100644
--- a/src/libxl/libxl_conf.c
+++ b/src/libxl/libxl_conf.c
@@ -161,6 +161,115 @@ libxlBuildCapabilities(virArch hostarch,
 }
 static virCapsPtr
+libxlMakeNumaCapabilities(libxl_numainfo *numa_info,
+                          int nr_nodes,
+                          libxl_cputopology *cpu_topo,
+                          int nr_cpus,
+                          virCapsPtr caps)
I think this should return an int (0 success, -1 failure).
Well, of course I can do that (but see below)
Noticed the following topology on an old 2 socket, quad core Intel machine I'm using to test this patch
<topology>
  <cells num='1'>
    <cell id='0'>
      <memory unit='KiB'>9175040</memory>
      <cpus num='8'>
        <cpu id='0' socket_id='0' core_id='0' siblings='0,4'/>
        <cpu id='1' socket_id='0' core_id='1' siblings='1,5'/>
        <cpu id='2' socket_id='0' core_id='2' siblings='2,6'/>
        <cpu id='3' socket_id='0' core_id='3' siblings='3,7'/>
        <cpu id='4' socket_id='1' core_id='0' siblings='0,4'/>
        <cpu id='5' socket_id='1' core_id='1' siblings='1,5'/>
        <cpu id='6' socket_id='1' core_id='2' siblings='2,6'/>
        <cpu id='7' socket_id='1' core_id='3' siblings='3,7'/>
      </cpus>
    </cell>
  </cells>
</topology>
I'm not sure how cpus 0 and 4 can be siblings when they are on different sockets, but it seems xl reports similar info:
Oh, I see, my bad: same core ID but different socket ID means !siblings, _even_ if on the same node. :-P
That makes sense; what I was not thinking of was the possibility of having different sockets _within_ the same node, which seems to be your case according to both libxl and numactl.
Does that make sense? What machine is that? Do both sockets share the same memory bus? Again, it looks like so.
Yep, makes sense. And yes, both sockets share the same memory bus. The machine is an Intel DP server with Clovertown processors.
Anyway, I will fix that.
cpu_topology           :
cpu:    core    socket     node
  0:       0        0        0
  1:       1        0        0
  2:       2        0        0
  3:       3        0        0
  4:       0        1        0
  5:       1        1        0
  6:       2        1        0
  7:       3        1        0
numa_info              :
node:    memsize    memfree    distances
   0:       8960       7116      10
BTW, the qemu driver reports the following on the same machine
<topology>
  <cells num='1'>
    <cell id='0'>
      <memory unit='KiB'>8387124</memory>
      <cpus num='8'>
        <cpu id='0' socket_id='0' core_id='0' siblings='0'/>
        <cpu id='1' socket_id='1' core_id='0' siblings='1'/>
        <cpu id='2' socket_id='0' core_id='1' siblings='2'/>
        <cpu id='3' socket_id='1' core_id='1' siblings='3'/>
        <cpu id='4' socket_id='0' core_id='2' siblings='4'/>
        <cpu id='5' socket_id='1' core_id='2' siblings='5'/>
        <cpu id='6' socket_id='0' core_id='3' siblings='6'/>
        <cpu id='7' socket_id='1' core_id='3' siblings='7'/>
      </cpus>
    </cell>
  </cells>
</topology>
which seems correct since hyperthreading is not supported.
Yes, and it is basically the same, apart from the ordering, as what libxl says (notice that all the cpus belong to node 0!). Again, I will fix the siblings part in my patch, in order to take the case of "more sockets per node" into better account.
+static virCapsPtr
 libxlMakeCapabilitiesInternal(virArch hostarch,
                               libxl_physinfo *phy_info,
                               char *capabilities)
@@ -772,7 +881,11 @@ libxlMakeCapabilities(libxl_ctx *ctx)
 {
     int err;
     libxl_physinfo phy_info;
+    libxl_numainfo *numa_info = NULL;
+    libxl_cputopology *cpu_topo = NULL;
     const libxl_version_info *ver_info;
+    int nr_nodes = 0, nr_cpus = 0;
+    virCapsPtr caps;
     err = regcomp(&xen_cap_rec, xen_cap_re, REG_EXTENDED);
     if (err != 0) {
@@ -796,9 +909,35 @@ libxlMakeCapabilities(libxl_ctx *ctx)
         return NULL;
     }
-    return libxlMakeCapabilitiesInternal(virArchFromHost(),
+    /* Let's try to fetch NUMA info, but it is not critical if we fail */
+    numa_info = libxl_get_numainfo(ctx, &nr_nodes);
+    if (numa_info == NULL)
+        VIR_WARN("libxl_get_numainfo failed to retrieve NUMA data");
+    else {
+        /* If the above failed, we'd have no NUMA caps anyway! */
+        cpu_topo = libxl_get_cpu_topology(ctx, &nr_cpus);
+        if (cpu_topo == NULL) {
+            VIR_WARN("libxl_get_cpu_topology failed to retrieve topology");
+            libxl_numainfo_list_free(numa_info, nr_nodes);
+        }
+    }
Move these after the call to libxlMakeCapabilitiesInternal, before calling libxlMakeNumaCapabilities.
Makes sense, will do.
+
+    caps = libxlMakeCapabilitiesInternal(virArchFromHost(),
                                          &phy_info,
                                          ver_info->capabilities);
Check that caps != NULL ?
Ok.
+
+    /* Add topology information to caps, if available */
+    if (numa_info && cpu_topo)
+        caps = libxlMakeNumaCapabilities(numa_info,
+                                         nr_nodes,
+                                         cpu_topo,
+                                         nr_cpus,
+                                         caps);
Hmm, guess there is not much to check wrt errors at this point, so libxlMakeNumaCapabilities could return void. I suppose returning success or failure via int is more libvirt style.
And here's the return 0 or -1 point. The point is that we do not care much about errors: if something bad happens during the construction of the NUMA capabilities, we revert all we've done, with the effect of leaving caps unmodified with respect to what libxlMakeNumaCapabilities() was given.
That is why errors are reported and handled (as described above) inside the function itself, and why I don't think it would be very useful to propagate that information to the caller explicitly by returning 0 or -1.
Moreover, given that, as you said yourself, it could well return void, I figured it was worthwhile to use the return value for something useful. Yes, 0/-1 would be more libvirt style, but this is an internal function that, at least in my view, should look more like libxlMakeCapabilitiesInternal() than anything else (and libxlMakeCapabilitiesInternal() was returning a virCapsPtr before my patch :-) ).
Nod.
So, in summary, I agree on the code motion above, but I think the signature and usage of libxlMakeNumaCapabilities() should stay as it is now.
Ok, no problem. As you said, it is an internal function and we can change it if some future code motion warrants such a change.

Regards, Jim

Dario Faggioli wrote:
Starting from Xen 4.2, libxl has all the bits and pieces in place for retrieving an adequate amount of information about the host NUMA topology. It is therefore possible, after a bit of shuffling, to arrange those information in the way libvirt wants to present them to the outside world.
Therefore, with this patch, the <topology> section of the host capabilities is properly populated, when running on Xen, so that we can figure out whether or not we're running on a NUMA host, and what its characteristics are.
[raistlin@Zhaman ~]$ sudo virsh --connect xen:/// capabilities
<capabilities>
  <host>
    <cpu>
    ....
    <topology>
      <cells num='2'>
        <cell id='0'>
          <memory unit='KiB'>6291456</memory>
          <cpus num='8'>
            <cpu id='0' socket_id='1' core_id='0' siblings='0-1'/>
            <cpu id='1' socket_id='1' core_id='0' siblings='0-1'/>
            <cpu id='2' socket_id='1' core_id='1' siblings='2-3'/>
            <cpu id='3' socket_id='1' core_id='1' siblings='2-3'/>
            <cpu id='4' socket_id='1' core_id='9' siblings='4-5'/>
            <cpu id='5' socket_id='1' core_id='9' siblings='4-5'/>
            <cpu id='6' socket_id='1' core_id='10' siblings='6-7'/>
            <cpu id='7' socket_id='1' core_id='10' siblings='6-7'/>
          </cpus>
        </cell>
        <cell id='1'>
          <memory unit='KiB'>6881280</memory>
          <cpus num='8'>
            <cpu id='8' socket_id='0' core_id='0' siblings='8-9'/>
            <cpu id='9' socket_id='0' core_id='0' siblings='8-9'/>
            <cpu id='10' socket_id='0' core_id='1' siblings='10-11'/>
            <cpu id='11' socket_id='0' core_id='1' siblings='10-11'/>
            <cpu id='12' socket_id='0' core_id='9' siblings='12-13'/>
            <cpu id='13' socket_id='0' core_id='9' siblings='12-13'/>
            <cpu id='14' socket_id='0' core_id='10' siblings='14-15'/>
            <cpu id='15' socket_id='0' core_id='10' siblings='14-15'/>
          </cpus>
        </cell>
      </cells>
    </topology>
  </host>
....
Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
---
Changes from v1:
 * fixed a typo in the commit message, as requested during review;
 * fixed coding style (one function parameter per line and no spaces between variable definitions), as requested during review;
 * avoid zero-filling memory after VIR_ALLOC_N(), since it does that already, as requested during review;
 * improved out of memory error reporting, as requested during review;
 * libxlMakeNumaCapabilities() created, accommodating all the NUMA related additions, instead of having them within libxlMakeCapabilitiesInternal(), as suggested during review.
---
 src/libxl/libxl_conf.c | 141 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 140 insertions(+), 1 deletion(-)
diff --git a/src/libxl/libxl_conf.c b/src/libxl/libxl_conf.c
index e170357..7515995 100644
--- a/src/libxl/libxl_conf.c
+++ b/src/libxl/libxl_conf.c
@@ -161,6 +161,115 @@ libxlBuildCapabilities(virArch hostarch,
 }
 static virCapsPtr
+libxlMakeNumaCapabilities(libxl_numainfo *numa_info,
+                          int nr_nodes,
+                          libxl_cputopology *cpu_topo,
+                          int nr_cpus,
+                          virCapsPtr caps)
+{
+    virCapsHostNUMACellCPUPtr *cpus = NULL;
+    int *nr_cpus_node = NULL;
+    bool numa_failed = false;
+    int i;
BTW, the 'i' and 'j' iterators should be changed to size_t. danpb has a patchset to sanitize iterators that will be committed any time now, so might as well make this change in your next version:

https://www.redhat.com/archives/libvir-list/2013-July/msg00397.html

Regards, Jim

On Tue, 2013-07-09 at 16:21 -0600, Jim Fehlig wrote:
 static virCapsPtr
+libxlMakeNumaCapabilities(libxl_numainfo *numa_info,
+                          int nr_nodes,
+                          libxl_cputopology *cpu_topo,
+                          int nr_cpus,
+                          virCapsPtr caps)
+{
+    virCapsHostNUMACellCPUPtr *cpus = NULL;
+    int *nr_cpus_node = NULL;
+    bool numa_failed = false;
+    int i;
BTW, the 'i' and 'j' iterators should be changed to size_t. danpb has a patchset to sanitize iterators that will be committed any time now, so might as well make this change in your next version
Yep, I saw that, and I will.

Thanks, Dario

--
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

domainGetNumaParameters has a string typed parameter, hence it is necessary for the libxl driver to support this. This change implements the connectSupportsFeature hook for the libxl driver, advertising that VIR_DRV_FEATURE_TYPED_PARAM_STRING is supported.

Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
Cc: Eric Blake <eblake@redhat.com>
---
Changes from v1:
 * commit message improved, as requested during review;
 * added VIR_TYPED_PARAM_STRING_OKAY handling code in libxlDomainGetSchedulerParametersFlags();
---
 src/libxl/libxl_driver.c | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/src/libxl/libxl_driver.c b/src/libxl/libxl_driver.c
index 1bae3d6..7555bc6 100644
--- a/src/libxl/libxl_driver.c
+++ b/src/libxl/libxl_driver.c
@@ -4364,7 +4364,10 @@ libxlDomainGetSchedulerParametersFlags(virDomainPtr dom,
     libxl_scheduler sched_id;
     int ret = -1;
 
-    virCheckFlags(0, -1);
+    virCheckFlags(VIR_TYPED_PARAM_STRING_OKAY, -1);
+
+    /* We don't return strings, and thus trivially support this flag. */
+    flags &= ~VIR_TYPED_PARAM_STRING_OKAY;
 
     libxlDriverLock(driver);
     vm = virDomainObjListFindByUUID(driver->domains, dom->uuid);
@@ -4665,6 +4668,20 @@ libxlConnectListAllDomains(virConnectPtr conn,
     return ret;
 }
 
+/* Which features are supported by this driver? */
+static int
+libxlConnectSupportsFeature(virConnectPtr conn, int feature)
+{
+    if (virConnectSupportsFeatureEnsureACL(conn) < 0)
+        return -1;
+
+    switch (feature) {
+    case VIR_DRV_FEATURE_TYPED_PARAM_STRING:
+        return 1;
+    default:
+        return 0;
+    }
+}
 
 static virDriver libxlDriver = {
@@ -4745,6 +4762,7 @@ static virDriver libxlDriver = {
     .connectDomainEventRegisterAny = libxlConnectDomainEventRegisterAny, /* 0.9.0 */
     .connectDomainEventDeregisterAny = libxlConnectDomainEventDeregisterAny, /* 0.9.0 */
     .connectIsAlive = libxlConnectIsAlive, /* 0.9.8 */
+    .connectSupportsFeature = libxlConnectSupportsFeature, /* 1.1.1 */
 };
 
 static virStateDriver libxlStateDriver = {

This is only done when building against Xen >= 4.3, though (by checking the proper libxl feature flag).

Xen currently implements a NUMA placement policy which is basically the same as the 'interleave' policy of `numactl', although it can be applied on a subset of the available nodes. We therefore hardcode "interleave" as 'numa_mode', and we use the newly introduced libxl interface to figure out what nodes a domain spans ('numa_nodeset').

With this change, it is now possible to query the NUMA node affinity of a running domain:

[raistlin@Zhaman ~]$ sudo virsh --connect xen:/// list
 Id    Name                           State
----------------------------------------------------
 23    F18_x64                        running

[raistlin@Zhaman ~]$ sudo virsh --connect xen:/// numatune 23
numa_mode      : interleave
numa_nodeset   : 1

Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
---
Changes from v1:
 * fixed coding style, as requested during review;
 * VIR_TYPED_PARAM_STRING_OKAY handled more properly;
---
 src/libxl/libxl_driver.c | 141 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 141 insertions(+)

diff --git a/src/libxl/libxl_driver.c b/src/libxl/libxl_driver.c
index 7555bc6..4ed231a 100644
--- a/src/libxl/libxl_driver.c
+++ b/src/libxl/libxl_driver.c
@@ -28,6 +28,7 @@
 #include <math.h>
 
 #include <libxl.h>
+#include <libxl_utils.h>
 #include <fcntl.h>
 #include <regex.h>
@@ -4521,6 +4522,143 @@ libxlDomainSetSchedulerParameters(virDomainPtr dom, virTypedParameterPtr params,
     return libxlDomainSetSchedulerParametersFlags(dom, params, nparams, 0);
 }
 
+/* NUMA node affinity information is available through libxl
+ * starting from Xen 4.3. */
+#ifdef LIBXL_HAVE_DOMAIN_NODEAFFINITY
+
+/* Number of Xen NUMA parameters */
+# define LIBXL_NUMA_NPARAM 2
+
+static int
+libxlDomainGetNumaParameters(virDomainPtr dom,
+                             virTypedParameterPtr params,
+                             int *nparams,
+                             unsigned int flags)
+{
+    libxlDriverPrivatePtr driver = dom->conn->privateData;
+    libxlDomainObjPrivatePtr priv;
+    virDomainObjPtr vm;
+    libxl_bitmap nodemap;
+    virBitmapPtr nodes = NULL;
+    char *nodeset = NULL;
+    int rc, ret = -1;
+    int i, j;
+
+    /* In Xen 4.3, it is possible to query the NUMA node affinity of a domain
+     * via libxl, but not to change it. We therefore only allow AFFECT_LIVE. */
+    virCheckFlags(VIR_DOMAIN_AFFECT_LIVE |
+                  VIR_TYPED_PARAM_STRING_OKAY, -1);
+
+    /* We blindly return a string, and let libvirt.c and remote_driver.c do
+     * the filtering on behalf of older clients that can't parse it. */
+    flags &= ~VIR_TYPED_PARAM_STRING_OKAY;
+
+    libxlDriverLock(driver);
+    vm = virDomainObjListFindByUUID(driver->domains, dom->uuid);
+    libxlDriverUnlock(driver);
+
+    if (!vm) {
+        virReportError(VIR_ERR_NO_DOMAIN, "%s",
+                       _("no domain with matching uuid"));
+        goto cleanup;
+    }
+
+    if (virDomainGetNumaParametersEnsureACL(dom->conn, vm->def) < 0)
+        goto cleanup;
+
+    if (!virDomainObjIsActive(vm)) {
+        virReportError(VIR_ERR_OPERATION_INVALID, "%s",
+                       _("Domain is not running"));
+        goto cleanup;
+    }
+
+    priv = vm->privateData;
+
+    libxl_bitmap_init(&nodemap);
+
+    if ((*nparams) == 0) {
+        *nparams = LIBXL_NUMA_NPARAM;
+        ret = 0;
+        goto cleanup;
+    }
+
+    for (i = 0; i < LIBXL_NUMA_NPARAM && i < *nparams; i++) {
+        virMemoryParameterPtr param = &params[i];
+
+        switch (i) {
+        case 0:
+            /* NUMA mode */
+
+            /* Xen implements something that is really close to numactl's
+             * 'interleave' policy (see `man 8 numactl' for details). */
+            if (virTypedParameterAssign(param, VIR_DOMAIN_NUMA_MODE,
+                                        VIR_TYPED_PARAM_INT,
+                                        VIR_DOMAIN_NUMATUNE_MEM_INTERLEAVE) < 0)
+                goto cleanup;
+
+            break;
+
+        case 1:
+            /* Node affinity */
+
+            /* Let's allocate both libxl and libvirt bitmaps */
+            if (libxl_node_bitmap_alloc(priv->ctx, &nodemap, 0) ||
+                !(nodes = virBitmapNew(libxl_get_max_nodes(priv->ctx)))) {
+                virReportOOMError();
+                goto cleanup;
+            }
+
+            rc = libxl_domain_get_nodeaffinity(priv->ctx,
+                                               vm->def->id,
+                                               &nodemap);
+            if (rc != 0) {
+                virReportSystemError(-rc, "%s",
+                                     _("unable to get numa affinity"));
+                goto cleanup;
+            }
+
+            /* First, we convert libxl_bitmap into virBitmap. After that,
+             * we format virBitmap as a string that can be returned. */
+            virBitmapClearAll(nodes);
+            libxl_for_each_set_bit(j, nodemap) {
+                if (virBitmapSetBit(nodes, j)) {
+                    virReportError(VIR_ERR_INTERNAL_ERROR,
+                                   _("Node %d out of range"), j);
+                    goto cleanup;
+                }
+            }
+
+            nodeset = virBitmapFormat(nodes);
+            if (!nodeset && VIR_STRDUP(nodeset, "") < 0)
+                goto cleanup;
+
+            if (virTypedParameterAssign(param, VIR_DOMAIN_NUMA_NODESET,
+                                        VIR_TYPED_PARAM_STRING, nodeset) < 0)
+                goto cleanup;
+
+            nodeset = NULL;
+
+            break;
+
+        default:
+            break;
+        }
+    }
+
+    if (*nparams > LIBXL_NUMA_NPARAM)
+        *nparams = LIBXL_NUMA_NPARAM;
+    ret = 0;
+
+cleanup:
+    VIR_FREE(nodeset);
+    virBitmapFree(nodes);
+    libxl_bitmap_dispose(&nodemap);
+    if (vm)
+        virObjectUnlock(vm);
+    return ret;
+}
+#endif
+
 static int
 libxlDomainIsActive(virDomainPtr dom)
 {
@@ -4749,6 +4887,9 @@ static virDriver libxlDriver = {
     .domainGetSchedulerParametersFlags = libxlDomainGetSchedulerParametersFlags, /* 0.9.2 */
     .domainSetSchedulerParameters = libxlDomainSetSchedulerParameters, /* 0.9.0 */
     .domainSetSchedulerParametersFlags = libxlDomainSetSchedulerParametersFlags, /* 0.9.2 */
+#ifdef LIBXL_HAVE_DOMAIN_NODEAFFINITY
+    .domainGetNumaParameters = libxlDomainGetNumaParameters, /* 1.1.1 */
+#endif
     .nodeGetFreeMemory = libxlNodeGetFreeMemory, /* 0.9.0 */
     .nodeGetCellsFreeMemory = libxlNodeGetCellsFreeMemory, /* 1.1.1 */
     .connectDomainEventRegister = libxlConnectDomainEventRegister, /* 0.9.0 */