On Fri, Dec 04, 2009 at 10:30:46AM -0500, Dave Allan wrote:
Daniel P. Berrange wrote:
>
>I don't much like including the raw BDF format, because it is effectively
>adding a 3rd way of specifying PCI addresses in libvirt XML. We already
>have 2 different ways, which is 1 too many
>
>Node dev XML
>
> <capability type='pci'>
> <domain>0</domain>
> <bus>0</bus>
> <slot>31</slot>
> <function>1</function>
> </capability>
>
>
>Domain XML for host devices & guest addresses
>
> <address domain='0x0000' bus='0x00' slot='0x1f'
function='0x5'/>
>
>Yes, my bad for screwing up the nodedev XML format when I designed
>it, stupidly choosing decimal instead of hex :-(
>
>I think we should make the virtual/physical function follow the domain
>XML style, with an <address/> element. Perhaps also use nested capabilities
>in the way we did for NPIV host/vport stuff
>
> <capability type='physical_function'>
> <address domain='0x0000' bus='0x00' slot='0x1f'
function='0x1'/>
> </capability>
>
> <capability type='virtual_functions'>
> <address domain='0x0000' bus='0x00' slot='0x1f'
function='0x2'/>
> <address domain='0x0000' bus='0x00' slot='0x1f'
function='0x3'/>
> <address domain='0x0000' bus='0x00' slot='0x1f'
function='0x4'/>
> </capability>
>
>
>I'd even be inclined to retro-fit the existing <capability
type='pci'>
>XML to duplicate the address info in this saner format eg
>
> <capability type='pci'>
> <address domain='0x0000' bus='0x00' slot='0x1f'
function='0x1' />
> <domain>0</domain>
> <bus>0</bus>
> <slot>31</slot>
> <function>1</function>
> </capability>
>
>
>So, a PCI card with SR-IOV would end up looking like
>
> <capability type='pci'>
> <domain>0</domain>
> <bus>0</bus>
> <slot>31</slot>
> <function>1</function>
> <address domain='0x0000' bus='0x00' slot='0x1f'
function='0x1' />
>
> <capability type='virtual_functions'>
> <address domain='0x0000' bus='0x00' slot='0x1f'
function='0x2'/>
> <address domain='0x0000' bus='0x00' slot='0x1f'
function='0x3'/>
> <address domain='0x0000' bus='0x00' slot='0x1f'
function='0x4'/>
> </capability>
> </capability>
>
>The nice thing about this kind of nesting is that it would let us show
>that a card is capable of exposing virtual functions, even if it does
>not currently have any enabled
>
>
>Regards,
>Daniel
Here's an updated patch reflecting the above suggestions.
Dave
From d11d00d93c4bfbfd1125560ce80abfdbf88de94b Mon Sep 17 00:00:00
2001
From: David Allan <dallan(a)redhat.com>
Date: Mon, 30 Nov 2009 15:58:47 -0500
Subject: [PATCH 1/1] Add SR IOV physical and virtual function relationships
---
src/conf/node_device_conf.c | 30 +++++
src/conf/node_device_conf.h | 16 +++
src/node_device/node_device_driver.h | 14 ++
src/node_device/node_device_hal.c | 6 +
src/node_device/node_device_linux_sysfs.c | 200 ++++++++++++++++++++++++++++-
5 files changed, 265 insertions(+), 1 deletions(-)
diff --git a/src/conf/node_device_conf.c b/src/conf/node_device_conf.c
index 6003ab1..a8ecfb0 100644
--- a/src/conf/node_device_conf.c
+++ b/src/conf/node_device_conf.c
@@ -246,6 +246,7 @@ char *virNodeDeviceDefFormat(virConnectPtr conn,
{
virBuffer buf = VIR_BUFFER_INITIALIZER;
virNodeDevCapsDefPtr caps;
+ unsigned int i = 0;
char *tmp;
virBufferAddLit(&buf, "<device>\n");
@@ -318,6 +319,30 @@ char *virNodeDeviceDefFormat(virConnectPtr conn,
data->pci_dev.vendor_name);
else
virBufferAddLit(&buf, " />\n");
+ if (data->pci_dev.flags &
VIR_NODE_DEV_CAP_FLAG_PCI_PHYSICAL_FUNCTION) {
+ virBufferAddLit(&buf, " <capability type='physical
function'>\n");
Can we change that to 'phys_function'
+ virBufferVSprintf(&buf,
+ " <address domain='0x%.4x'
bus='0x%.2x' "
+ "slot='0x%.2x'
function='0x%.1x'/>\n",
+ data->pci_dev.physical_function->domain,
+ data->pci_dev.physical_function->bus,
+ data->pci_dev.physical_function->slot,
+ data->pci_dev.physical_function->function);
+ virBufferAddLit(&buf, " </capability>\n");
+ }
+ if (data->pci_dev.flags & VIR_NODE_DEV_CAP_FLAG_PCI_VIRTUAL_FUNCTION)
{
+ virBufferAddLit(&buf, " <capability type='virtual
functions'>\n");
And 'virt_function', because its nice to not have whitespace in the values
+ for (i = 0 ; i <
data->pci_dev.num_virtual_functions ; i++) {
+ virBufferVSprintf(&buf,
+ " <address domain='0x%.4x'
bus='0x%.2x' "
+ "slot='0x%.2x'
function='0x%.1x'/>\n",
+ data->pci_dev.virtual_functions[i]->domain,
+ data->pci_dev.virtual_functions[i]->bus,
+ data->pci_dev.virtual_functions[i]->slot,
+
data->pci_dev.virtual_functions[i]->function);
+ }
+ virBufferAddLit(&buf, " </capability>\n");
+ }
break;
case VIR_NODE_DEV_CAP_USB_DEV:
virBufferVSprintf(&buf, " <bus>%d</bus>\n",
data->usb_dev.bus);
@@ -1387,6 +1412,7 @@ out:
void virNodeDevCapsDefFree(virNodeDevCapsDefPtr caps)
{
+ int i = 0;
union _virNodeDevCapData *data = &caps->data;
switch (caps->type) {
@@ -1402,6 +1428,10 @@ void virNodeDevCapsDefFree(virNodeDevCapsDefPtr caps)
case VIR_NODE_DEV_CAP_PCI_DEV:
VIR_FREE(data->pci_dev.product_name);
VIR_FREE(data->pci_dev.vendor_name);
+ VIR_FREE(data->pci_dev.physical_function);
+ for (i = 0 ; i < data->pci_dev.num_virtual_functions ; i++) {
+ VIR_FREE(data->pci_dev.virtual_functions[i]);
+ }
break;
case VIR_NODE_DEV_CAP_USB_DEV:
VIR_FREE(data->usb_dev.product_name);
diff --git a/src/conf/node_device_conf.h b/src/conf/node_device_conf.h
index 7a20bd6..4bfac90 100644
--- a/src/conf/node_device_conf.h
+++ b/src/conf/node_device_conf.h
@@ -76,6 +76,18 @@ enum virNodeDevScsiHostCapFlags {
VIR_NODE_DEV_CAP_FLAG_HBA_VPORT_OPS = (1 << 1),
};
+enum virNodeDevPCICapFlags {
+ VIR_NODE_DEV_CAP_FLAG_PCI_PHYSICAL_FUNCTION = (1 << 0),
+ VIR_NODE_DEV_CAP_FLAG_PCI_VIRTUAL_FUNCTION = (1 << 1),
+};
+
+struct pci_config_address {
+ unsigned domain;
+ unsigned bus;
+ unsigned slot;
+ unsigned function;
+};
+
typedef struct _virNodeDevCapsDef virNodeDevCapsDef;
typedef virNodeDevCapsDef *virNodeDevCapsDefPtr;
struct _virNodeDevCapsDef {
@@ -105,6 +117,10 @@ struct _virNodeDevCapsDef {
unsigned class;
char *product_name;
char *vendor_name;
+ struct pci_config_address *physical_function;
+ struct pci_config_address **virtual_functions;
+ unsigned num_virtual_functions;
+ unsigned flags;
} pci_dev;
struct {
unsigned bus;
diff --git a/src/node_device/node_device_driver.h b/src/node_device/node_device_driver.h
index 4f0822c..7b68f41 100644
--- a/src/node_device/node_device_driver.h
+++ b/src/node_device/node_device_driver.h
@@ -37,6 +37,10 @@
#define LINUX_SYSFS_VPORT_CREATE_POSTFIX "/vport_create"
#define LINUX_SYSFS_VPORT_DELETE_POSTFIX "/vport_delete"
+#define SRIOV_FOUND 0
+#define SRIOV_NOT_FOUND 1
+#define SRIOV_ERROR -1
+
#define LINUX_NEW_DEVICE_WAIT_TIME 60
#ifdef HAVE_HAL
@@ -61,6 +65,14 @@ int check_fc_host_linux(union _virNodeDevCapData *d);
#define check_vport_capable(d) check_vport_capable_linux(d)
int check_vport_capable_linux(union _virNodeDevCapData *d);
+#define get_physical_function(s,d) get_physical_function_linux(s,d)
+int get_physical_function_linux(const char *sysfs_path,
+ union _virNodeDevCapData *d);
+
+#define get_virtual_functions(s,d) get_virtual_functions_linux(s,d)
+int get_virtual_functions_linux(const char *sysfs_path,
+ union _virNodeDevCapData *d);
+
#define read_wwn(host, file, wwn) read_wwn_linux(host, file, wwn)
int read_wwn_linux(int host, const char *file, char **wwn);
@@ -68,6 +80,8 @@ int read_wwn_linux(int host, const char *file, char **wwn);
#define check_fc_host(d)
#define check_vport_capable(d)
+#define get_physical_function(sysfs_path, d)
+#define get_virtual_functions(sysfs_path, d)
#define read_wwn(host, file, wwn)
#endif /* __linux__ */
diff --git a/src/node_device/node_device_hal.c b/src/node_device/node_device_hal.c
index 6de7e3d..4496301 100644
--- a/src/node_device/node_device_hal.c
+++ b/src/node_device/node_device_hal.c
@@ -145,14 +145,20 @@ static int gather_pci_cap(LibHalContext *ctx, const char *udi,
(void)virStrToLong_ui(p+1, &p, 16, &d->pci_dev.slot);
(void)virStrToLong_ui(p+1, &p, 16, &d->pci_dev.function);
}
+
+ get_physical_function(sysfs_path, d);
+ get_virtual_functions(sysfs_path, d);
+
VIR_FREE(sysfs_path);
}
+
(void)get_int_prop(ctx, udi, "pci.vendor_id", (int
*)&d->pci_dev.vendor);
if (get_str_prop(ctx, udi, "pci.vendor", &d->pci_dev.vendor_name)
!= 0)
(void)get_str_prop(ctx, udi, "info.vendor",
&d->pci_dev.vendor_name);
(void)get_int_prop(ctx, udi, "pci.product_id", (int
*)&d->pci_dev.product);
if (get_str_prop(ctx, udi, "pci.product", &d->pci_dev.product_name)
!= 0)
(void)get_str_prop(ctx, udi, "info.product",
&d->pci_dev.product_name);
+
return 0;
}
diff --git a/src/node_device/node_device_linux_sysfs.c
b/src/node_device/node_device_linux_sysfs.c
index b7cf782..d2b10dd 100644
--- a/src/node_device/node_device_linux_sysfs.c
+++ b/src/node_device/node_device_linux_sysfs.c
@@ -29,6 +29,7 @@
#include "virterror_internal.h"
#include "memory.h"
#include "logging.h"
+#include <dirent.h>
#define VIR_FROM_THIS VIR_FROM_NODEDEV
@@ -70,7 +71,7 @@ int read_wwn_linux(int host, const char *file, char **wwn)
char buf[64];
if (open_wwn_file(LINUX_SYSFS_FC_HOST_PREFIX, host, file, &fd) < 0) {
- goto out;
+ goto out;
}
memset(buf, 0, sizeof(buf));
@@ -184,4 +185,201 @@ out:
return retval;
}
+
+static int logStrToLong_ui(char const *s,
+ char **end_ptr,
+ int base,
+ unsigned int *result)
+{
+ int ret = 0;
+
+ ret = virStrToLong_ui(s, end_ptr, base, result);
+ if (ret != 0) {
+ VIR_ERROR("Failed to convert '%s' to unsigned int\n", s);
+ } else {
+ VIR_DEBUG("Converted '%s' to unsigned int %u\n", s, *result);
+ }
There's no need for '\n' in log messages.
+
+ return ret;
+}
+
+
+static int parse_pci_config_address(char *address, struct pci_config_address *bdf)
+{
+ char *p = NULL;
+ int ret = -1;
+
+ if ((address == NULL) || (logStrToLong_ui(address,
+ &p,
+ 16,
+ &bdf->domain) == -1)) {
+ goto out;
+ }
+
+ if ((p == NULL) || (logStrToLong_ui(p+1,
+ &p,
+ 16,
+ &bdf->bus) == -1)) {
+ goto out;
+ }
+
+ if ((p == NULL) || (logStrToLong_ui(p+1,
+ &p,
+ 16,
+ &bdf->slot) == -1)) {
+ goto out;
+ }
+
+ if ((p == NULL) || (logStrToLong_ui(p+1,
+ &p,
+ 16,
+ &bdf->function) == -1)) {
+ goto out;
+ }
+
+ ret = 0;
+
+out:
+ return ret;
+}
+
+
+
+
+static int get_sriov_function(const char *device_link,
+ struct pci_config_address **bdf)
+{
+ char *device_path = NULL, *config_address = NULL;
+ char errbuf[64];
+ int ret = SRIOV_ERROR;
+
+ VIR_DEBUG("Attempting to resolve device path from device link
'%s'\n",
+ device_link);
+
+ if (!virFileExists(device_link)) {
+
+ VIR_DEBUG("SR IOV function link '%s' does not exist\n",
device_link);
+ /* Not an SR IOV device, not an error, either. */
+ ret = SRIOV_NOT_FOUND;
+
+ goto out;
+
+ }
+
+ device_path = realpath(device_link, device_path);
+ if (device_path == NULL) {
+ memset(errbuf, '\0', sizeof(errbuf));
+ VIR_ERROR("Failed to resolve device link '%s':
'%s'\n", device_link,
+ strerror_r(errno, errbuf, sizeof(errbuf)));
Can you replace that with virStrerror() because strerror_r() has an
annoying API difference on Linux compared to POSIX (returns char *
instead of int, or vica-verca).
+ goto out;
+ }
+
+ VIR_DEBUG("SR IOV device path is '%s'\n", device_path);
+ config_address = basename(device_path);
+ if (VIR_ALLOC(*bdf) != 0) {
+ VIR_ERROR0("Failed to allocate memory for PCI device name\n");
+ goto out;
+ }
+
+ if (parse_pci_config_address(config_address, *bdf) != 0) {
+ VIR_ERROR("Failed to parse PCI config address '%s'\n",
config_address);
+ goto out;
+ }
+
+ VIR_DEBUG("SR IOV function %.4x:%.2x:%.2x.%.1x/>\n",
+ (*bdf)->domain,
+ (*bdf)->bus,
+ (*bdf)->slot,
+ (*bdf)->function);
+
+ ret = SRIOV_FOUND;
+
+out:
+ VIR_FREE(device_path);
+ return ret;
+}
+
+
+int get_physical_function_linux(const char *sysfs_path,
+ union _virNodeDevCapData *d ATTRIBUTE_UNUSED)
+{
+ int ret = -1;
+ char *device_link = NULL;
+
+ VIR_DEBUG("Attempting to get SR IOV physical function for device "
+ "with sysfs path '%s'\n", sysfs_path);
+
+ if (virBuildPath(&device_link, sysfs_path, "physfn") == -1) {
+ virReportOOMError(NULL);
+ } else {
+ ret = get_sriov_function(device_link, &d->pci_dev.physical_function);
+ if (ret == SRIOV_FOUND) {
+ d->pci_dev.flags |= VIR_NODE_DEV_CAP_FLAG_PCI_PHYSICAL_FUNCTION;
+ }
+ }
+
+ VIR_FREE(device_link);
+ return ret;
+}
+
+
+int get_virtual_functions_linux(const char *sysfs_path,
+ union _virNodeDevCapData *d)
+{
+ int ret = -1;
+ DIR *dir = NULL;
+ struct dirent *entry = NULL;
+ char *device_link = NULL;
+
+ VIR_DEBUG("Attempting to get SR IOV virtual functions for device"
+ "with sysfs path '%s'\n", sysfs_path);
+
+ dir = opendir(sysfs_path);
+ if (dir == NULL) {
+ goto out;
+ }
+
+ while ((entry = readdir(dir))) {
+ if (STRPREFIX(entry->d_name, "virtfn")) {
+ /* This local is just to avoid lines of code much > 80 col. */
+ unsigned int *num_funcs = &d->pci_dev.num_virtual_functions;
+
+ if (virBuildPath(&device_link, sysfs_path, entry->d_name) == -1) {
+ virReportOOMError(NULL);
+ goto out;
+ }
+
+ VIR_DEBUG("Number of virtual functions: %d\n", *num_funcs);
+ if (VIR_REALLOC_N(d->pci_dev.virtual_functions, (*num_funcs) + 1) != 0)
{
+ virReportOOMError(NULL);
+ goto out;
+ }
+
+ if (get_sriov_function(device_link,
+ &d->pci_dev.virtual_functions[*num_funcs]) !=
+ SRIOV_FOUND) {
+
+ /* We should not get back SRIOV_NOT_FOUND in this
+ * case, so if we do, it's an error. */
+ VIR_ERROR("Failed to get SR IOV function from device link
'%s'\n",
+ device_link);
+ goto out;
+ } else {
+ (*num_funcs)++;
+ d->pci_dev.flags |= VIR_NODE_DEV_CAP_FLAG_PCI_VIRTUAL_FUNCTION;
+ }
+
+ VIR_FREE(device_link);
+ }
+ }
+
+ closedir(dir);
+
+ ret = 0;
+
+out:
+ VIR_FREE(device_link);
+ return 0;
+}
+
#endif /* __linux__ */
--
ACK aside from the minor logging fixes & attribute whitespace
Daniel
--
|: Red Hat, Engineering, London -o-
http://people.redhat.com/berrange/ :|
|:
http://libvirt.org -o-
http://virt-manager.org -o-
http://ovirt.org :|
|:
http://autobuild.org -o-
http://search.cpan.org/~danberr/ :|
|: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|