[libvirt] [PATCH] nodedev: Export NUMA node locality for PCI devices

A PCI device can be associated with a specific NUMA node. Later, when a guest is pinned to one NUMA node the PCI device can be assigned on different NUMA node. This makes DMA transfers travel across nodes and thus results in suboptimal performance. We should expose the NUMA node locality for PCI devices so management applications can make better decisions. Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- The only difference to the previous round is, I'm sending it as standalone patch for easier review. docs/formatnode.html.in | 7 ++++ docs/schemas/nodedev.rng | 10 +++++ src/conf/node_device_conf.c | 44 ++++++++++++++++++++++ src/conf/node_device_conf.h | 1 + src/node_device/node_device_udev.c | 7 ++++ tests/nodedevschemadata/pci_1002_71c4.xml | 1 + tests/nodedevschemadata/pci_8086_10c9_sriov_pf.xml | 1 + 7 files changed, 71 insertions(+) diff --git a/docs/formatnode.html.in b/docs/formatnode.html.in index b424c96..76bf8af 100644 --- a/docs/formatnode.html.in +++ b/docs/formatnode.html.in @@ -110,6 +110,13 @@ have a list of <code>address</code> subelements, one for each VF on this PF. </dd> + <dt><code>numa</code></dt> + <dd> + This optional element contains information on the PCI device + with respect to NUMA. For example, the optional + <code>node</code> attribute tells which NUMA node is the PCI + device associated with. + </dd> </dl> </dd> <dt><code>usb_device</code></dt> diff --git a/docs/schemas/nodedev.rng b/docs/schemas/nodedev.rng index 81ab4d4..02d4106 100644 --- a/docs/schemas/nodedev.rng +++ b/docs/schemas/nodedev.rng @@ -158,6 +158,16 @@ </element> </optional> + <optional> + <element name='numa'> + <optional> + <attribute name='node'> + <data type='int'/> + </attribute> + </optional> + </element> + </optional> + </define> <define name='capusbdev'> diff --git a/src/conf/node_device_conf.c b/src/conf/node_device_conf.c index e65b5e4..99fa448 100644 --- a/src/conf/node_device_conf.c +++ b/src/conf/node_device_conf.c @@ -346,6 +346,9 @@ char *virNodeDeviceDefFormat(const virNodeDeviceDef *def) virBufferAdjustIndent(&buf, -2); virBufferAddLit(&buf, "</iommuGroup>\n"); } + if (data->pci_dev.numa_node >= 0) + virBufferAsprintf(&buf, "<numa node='%d'/>\n", + data->pci_dev.numa_node); break; case VIR_NODE_DEV_CAP_USB_DEV: virBufferAsprintf(&buf, "<bus>%d</bus>\n", data->usb_dev.bus); @@ -520,6 +523,41 @@ char *virNodeDeviceDefFormat(const virNodeDeviceDef *def) return NULL; } +/** + * virNodeDevCapsDefParseIntOptional: + * @xpath: XPath to evaluate + * @ctxt: Context + * @value: Where to store parsed value + * @def: Node device which is parsed + * @invalid_error_fmt: error message to print on invalid format + * + * Returns: -1 on error (invalid int format under @xpath) + * 0 if @xpath was not found (@value is untouched) + * 1 on success + */ +static int +virNodeDevCapsDefParseIntOptional(const char *xpath, + xmlXPathContextPtr ctxt, + int *value, + virNodeDeviceDefPtr def, + const char *invalid_error_fmt) +{ + int ret; + int val; + + ret = virXPathInt(xpath, ctxt, &val); + if (ret < -1) { + virReportError(VIR_ERR_INTERNAL_ERROR, + invalid_error_fmt, + def->name); + return -1; + } else if (ret == -1) { + return 0; + } + *value = val; + return 1; +} + static int virNodeDevCapsDefParseULong(const char *xpath, xmlXPathContextPtr ctxt, @@ -1101,6 +1139,12 @@ virNodeDevCapPCIDevParseXML(xmlXPathContextPtr ctxt, goto out; } } + + if (virNodeDevCapsDefParseIntOptional("number(./numa[1]/@node)", ctxt, + &data->pci_dev.numa_node, def, + _("invalid NUMA node ID supplied for '%s'")) < 0) + goto out; + ret = 0; out: ctxt->node = orignode; diff --git a/src/conf/node_device_conf.h b/src/conf/node_device_conf.h index 50e6805..50ce4b3 100644 --- a/src/conf/node_device_conf.h +++ b/src/conf/node_device_conf.h @@ -115,6 +115,7 @@ struct _virNodeDevCapsDef { virPCIDeviceAddressPtr *iommuGroupDevices; size_t nIommuGroupDevices; unsigned int iommuGroupNumber; + int numa_node; } pci_dev; struct { unsigned int bus; diff --git a/src/node_device/node_device_udev.c b/src/node_device/node_device_udev.c index 9a951d9..8e98ad2 100644 --- a/src/node_device/node_device_udev.c +++ b/src/node_device/node_device_udev.c @@ -493,6 +493,13 @@ static int udevProcessPCI(struct udev_device *device, goto out; } + if (udevGetIntSysfsAttr(device, + "numa_node", + &data->pci_dev.numa_node, + 10) == PROPERTY_ERROR) { + goto out; + } + if (!virPCIGetPhysicalFunction(syspath, &data->pci_dev.physical_function)) data->pci_dev.flags |= VIR_NODE_DEV_CAP_FLAG_PCI_PHYSICAL_FUNCTION; diff --git a/tests/nodedevschemadata/pci_1002_71c4.xml b/tests/nodedevschemadata/pci_1002_71c4.xml index 6de09c1..6d5d85b 100644 --- a/tests/nodedevschemadata/pci_1002_71c4.xml +++ b/tests/nodedevschemadata/pci_1002_71c4.xml @@ -8,5 +8,6 @@ <function>0</function> <product id='0x71c4'>M56GL [Mobility FireGL V5200]</product> <vendor id='0x1002'>ATI Technologies Inc</vendor> + <numa node='1'/> </capability> </device> diff --git a/tests/nodedevschemadata/pci_8086_10c9_sriov_pf.xml b/tests/nodedevschemadata/pci_8086_10c9_sriov_pf.xml index eff8932..6e1dc86 100644 --- a/tests/nodedevschemadata/pci_8086_10c9_sriov_pf.xml +++ b/tests/nodedevschemadata/pci_8086_10c9_sriov_pf.xml @@ -12,5 +12,6 @@ <address domain='0x0000' bus='0x02' slot='0x00' function='0x0'/> <address domain='0x0000' bus='0x02' slot='0x00' function='0x1'/> </iommuGroup> + <numa node='0'/> </capability> </device> -- 2.0.0

On Fri, Jun 06, 2014 at 01:09:58PM +0200, Michal Privoznik wrote:
A PCI device can be associated with a specific NUMA node. Later, when a guest is pinned to one NUMA node the PCI device can be assigned on different NUMA node. This makes DMA transfers travel across nodes and thus results in suboptimal performance. We should expose the NUMA node locality for PCI devices so management applications can make better decisions.
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> ---
diff --git a/src/node_device/node_device_udev.c b/src/node_device/node_device_udev.c index 9a951d9..8e98ad2 100644 --- a/src/node_device/node_device_udev.c +++ b/src/node_device/node_device_udev.c @@ -493,6 +493,13 @@ static int udevProcessPCI(struct udev_device *device, goto out; }
+ if (udevGetIntSysfsAttr(device, + "numa_node", + &data->pci_dev.numa_node, + 10) == PROPERTY_ERROR) { + goto out;
Will this result in an error if the 'numa_node' file does not exist in sysfs - I wouldn't be suprised if older kernels lack it. ACK, if that doesn't cause an error on missing numa_node. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 06.06.2014 14:02, Daniel P. Berrange wrote:
On Fri, Jun 06, 2014 at 01:09:58PM +0200, Michal Privoznik wrote:
A PCI device can be associated with a specific NUMA node. Later, when a guest is pinned to one NUMA node the PCI device can be assigned on different NUMA node. This makes DMA transfers travel across nodes and thus results in suboptimal performance. We should expose the NUMA node locality for PCI devices so management applications can make better decisions.
Signed-off-by: Michal Privoznik <mprivozn@redhat.com> ---
diff --git a/src/node_device/node_device_udev.c b/src/node_device/node_device_udev.c index 9a951d9..8e98ad2 100644 --- a/src/node_device/node_device_udev.c +++ b/src/node_device/node_device_udev.c @@ -493,6 +493,13 @@ static int udevProcessPCI(struct udev_device *device, goto out; }
+ if (udevGetIntSysfsAttr(device, + "numa_node", + &data->pci_dev.numa_node, + 10) == PROPERTY_ERROR) { + goto out;
Will this result in an error if the 'numa_node' file does not exist in sysfs - I wouldn't be suprised if older kernels lack it.
No, if a file doesn't exist PROPERTY_MISSING is returned. PROPERTY_ERROR is returned if the file content can't be parsed as int. BTW: the 'numa_node' file is around since ages: kernel.git $ git describe --contains 81bb0e198 v2.6.21-rc1~80^2 But in order to deal with even older kernels (or those which are not NUMA aware) I'm squashing in this: diff --git a/src/node_device/node_device_udev.c b/src/node_device/node_device_udev.c index 8e98ad2..91fc16f 100644 --- a/src/node_device/node_device_udev.c +++ b/src/node_device/node_device_udev.c @@ -493,11 +493,16 @@ static int udevProcessPCI(struct udev_device *device, goto out; } - if (udevGetIntSysfsAttr(device, + rc = udevGetIntSysfsAttr(device, "numa_node", &data->pci_dev.numa_node, - 10) == PROPERTY_ERROR) { + 10); + if (rc == PROPERTY_ERROR) { goto out; + } else if (rc == PROPERTY_MISSING) { + /* The default value is -1, because it can't be 0 + * as zero is valid node number. */ + data->pci_dev.numa_node = -1; } if (!virPCIGetPhysicalFunction(syspath, &data->pci_dev.physical_function)) and pushing. Thanks. Michal
participants (2)
-
Daniel P. Berrange
-
Michal Privoznik