A PCI device can be associated with a specific NUMA node. Later, when
a guest is pinned to one NUMA node the PCI device can be assigned on
different NUMA node. This makes DMA transfers travel across nodes and
thus results in suboptimal performance. We should expose the NUMA node
locality for PCI devices so management applications can make better
decisions.
Signed-off-by: Michal Privoznik <mprivozn(a)redhat.com>
---
Notes:
All the machines I have tried this on had only -1 in the
numa_node file. From the kernel sources it seems that this is the
default, so I'm not printing the <numa/> element into the XML in
this case. But I'd like to hear your opinion.
docs/formatnode.html.in | 6 +++
docs/schemas/nodedev.rng | 11 ++++++
src/conf/node_device_conf.c | 43 ++++++++++++++++++++++
src/conf/node_device_conf.h | 1 +
src/node_device/node_device_udev.c | 7 ++++
tests/nodedevschemadata/pci_1002_71c4.xml | 1 +
tests/nodedevschemadata/pci_8086_10c9_sriov_pf.xml | 1 +
7 files changed, 70 insertions(+)
diff --git a/docs/formatnode.html.in b/docs/formatnode.html.in
index 46ec2bc..3671c8b 100644
--- a/docs/formatnode.html.in
+++ b/docs/formatnode.html.in
@@ -124,6 +124,12 @@
GigaTransfers per second) and <code>width</code> for the
number
of lanes used. Since the port can't be negotiated, it's not
exposed in
<code>./pci-express/link/[@validity='sta']</code>.
+ <dt><code>numa</code></dt>
+ <dd>
+ This optional element contains information on the PCI device
+ with respect to NUMA. For example, the optional
+ <code>node</code> attribute tells which NUMA node is the PCI
+ device associated with.
</dd>
</dl>
</dd>
diff --git a/docs/schemas/nodedev.rng b/docs/schemas/nodedev.rng
index 79e8fd2..10b716f 100644
--- a/docs/schemas/nodedev.rng
+++ b/docs/schemas/nodedev.rng
@@ -184,6 +184,17 @@
</zeroOrMore>
</element>
</optional>
+
+ <optional>
+ <element name='numa'>
+ <optional>
+ <attribute name='node'>
+ <data type='int'/>
+ </attribute>
+ </optional>
+ </element>
+ </optional>
+
</define>
<define name='capusbdev'>
diff --git a/src/conf/node_device_conf.c b/src/conf/node_device_conf.c
index 70634cc..57dccc4 100644
--- a/src/conf/node_device_conf.c
+++ b/src/conf/node_device_conf.c
@@ -380,6 +380,9 @@ char *virNodeDeviceDefFormat(const virNodeDeviceDef *def)
}
if (data->pci_dev.flags & VIR_NODE_DEV_CAP_FLAG_PCIE)
virPCIEDeviceInfoFormat(&buf, data->pci_dev.pci_express);
+ if (data->pci_dev.numa_node >= 0)
+ virBufferAsprintf(&buf, "<numa
node='%d'/>\n",
+ data->pci_dev.numa_node);
break;
case VIR_NODE_DEV_CAP_USB_DEV:
virBufferAsprintf(&buf, "<bus>%d</bus>\n",
data->usb_dev.bus);
@@ -554,6 +557,41 @@ char *virNodeDeviceDefFormat(const virNodeDeviceDef *def)
return NULL;
}
+/**
+ * virNodeDevCapsDefParseIntOptional:
+ * @xpath: XPath to evaluate
+ * @ctxt: Context
+ * @value: Where to store parsed value
+ * @def: Node device which is parsed
+ * @invalid_error_fmt: error message to print on invalid format
+ *
+ * Returns: -1 on error (invalid int format under @xpath)
+ * 0 if @xpath was not found (@value is untouched)
+ * 1 on success
+ */
+static int
+virNodeDevCapsDefParseIntOptional(const char *xpath,
+ xmlXPathContextPtr ctxt,
+ int *value,
+ virNodeDeviceDefPtr def,
+ const char *invalid_error_fmt)
+{
+ int ret;
+ int val;
+
+ ret = virXPathInt(xpath, ctxt, &val);
+ if (ret < -1) {
+ virReportError(VIR_ERR_INTERNAL_ERROR,
+ invalid_error_fmt,
+ def->name);
+ return -1;
+ } else if (ret == -1) {
+ return 0;
+ }
+ *value = val;
+ return 1;
+}
+
static int
virNodeDevCapsDefParseULong(const char *xpath,
xmlXPathContextPtr ctxt,
@@ -1222,6 +1260,11 @@ virNodeDevCapPCIDevParseXML(xmlXPathContextPtr ctxt,
goto out;
}
+ if (virNodeDevCapsDefParseIntOptional("number(./numa[1]/@node)", ctxt,
+ &data->pci_dev.numa_node, def,
+ _("invalid NUMA node ID supplied for
'%s'")) < 0)
+ goto out;
+
ret = 0;
out:
ctxt->node = orignode;
diff --git a/src/conf/node_device_conf.h b/src/conf/node_device_conf.h
index 563bf6a..6abab5e 100644
--- a/src/conf/node_device_conf.h
+++ b/src/conf/node_device_conf.h
@@ -142,6 +142,7 @@ struct _virNodeDevCapsDef {
size_t nIommuGroupDevices;
unsigned int iommuGroupNumber;
virPCIEDeviceInfoPtr pci_express;
+ int numa_node;
} pci_dev;
struct {
unsigned int bus;
diff --git a/src/node_device/node_device_udev.c b/src/node_device/node_device_udev.c
index 9ed869b..1663edc 100644
--- a/src/node_device/node_device_udev.c
+++ b/src/node_device/node_device_udev.c
@@ -495,6 +495,13 @@ static int udevProcessPCI(struct udev_device *device,
goto out;
}
+ if (udevGetIntSysfsAttr(device,
+ "numa_node",
+ &data->pci_dev.numa_node,
+ 10) == PROPERTY_ERROR) {
+ goto out;
+ }
+
if (!virPCIGetPhysicalFunction(syspath, &data->pci_dev.physical_function))
data->pci_dev.flags |= VIR_NODE_DEV_CAP_FLAG_PCI_PHYSICAL_FUNCTION;
diff --git a/tests/nodedevschemadata/pci_1002_71c4.xml
b/tests/nodedevschemadata/pci_1002_71c4.xml
index 6de09c1..6d5d85b 100644
--- a/tests/nodedevschemadata/pci_1002_71c4.xml
+++ b/tests/nodedevschemadata/pci_1002_71c4.xml
@@ -8,5 +8,6 @@
<function>0</function>
<product id='0x71c4'>M56GL [Mobility FireGL V5200]</product>
<vendor id='0x1002'>ATI Technologies Inc</vendor>
+ <numa node='1'/>
</capability>
</device>
diff --git a/tests/nodedevschemadata/pci_8086_10c9_sriov_pf.xml
b/tests/nodedevschemadata/pci_8086_10c9_sriov_pf.xml
index eff8932..6e1dc86 100644
--- a/tests/nodedevschemadata/pci_8086_10c9_sriov_pf.xml
+++ b/tests/nodedevschemadata/pci_8086_10c9_sriov_pf.xml
@@ -12,5 +12,6 @@
<address domain='0x0000' bus='0x02' slot='0x00'
function='0x0'/>
<address domain='0x0000' bus='0x02' slot='0x00'
function='0x1'/>
</iommuGroup>
+ <numa node='0'/>
</capability>
</device>
--
1.9.3