On 12/10/2009 03:16 PM, Daniel P. Berrange wrote:
On Fri, Dec 04, 2009 at 10:30:46AM -0500, Dave Allan wrote:
> Daniel P. Berrange wrote:
>>
>> I don't much like including the raw BDF format, because it is effectively
>> adding a 3rd way of specifying PCI addresses in libvirt XML. We already
>> have 2 different ways, which is 1 too many
>>
>> Node dev XML
>>
>> <capability type='pci'>
>> <domain>0</domain>
>> <bus>0</bus>
>> <slot>31</slot>
>> <function>1</function>
>> </capability>
>>
>>
>> Domain XML for host devices& guest addresses
>>
>> <address domain='0x0000' bus='0x00' slot='0x1f'
function='0x5'/>
>>
>> Yes, my bad for screwing up the nodedev XML format when I designed
>> it, stupidly choosing decimal instead of hex :-(
>>
>> I think we should make the virtual/physical function follow the domain
>> XML style, with an<address/> element. Perhaps also use nested
capabilities
>> in the way we did for NPIV host/vport stuff
>>
>> <capability type='physical_function'>
>> <address domain='0x0000' bus='0x00' slot='0x1f'
function='0x1'/>
>> </capability>
>>
>> <capability type='virtual_functions'>
>> <address domain='0x0000' bus='0x00' slot='0x1f'
function='0x2'/>
>> <address domain='0x0000' bus='0x00' slot='0x1f'
function='0x3'/>
>> <address domain='0x0000' bus='0x00' slot='0x1f'
function='0x4'/>
>> </capability>
>>
>>
>> I'd even be inclined to retro-fit the existing<capability
type='pci'>
>> XML to duplicate the address info in this saner format eg
>>
>> <capability type='pci'>
>> <address domain='0x0000' bus='0x00' slot='0x1f'
function='0x1' />
>> <domain>0</domain>
>> <bus>0</bus>
>> <slot>31</slot>
>> <function>1</function>
>> </capability>
>>
>>
>> So, a PCI card with SR-IOV would end up looking like
>>
>> <capability type='pci'>
>> <domain>0</domain>
>> <bus>0</bus>
>> <slot>31</slot>
>> <function>1</function>
>> <address domain='0x0000' bus='0x00' slot='0x1f'
function='0x1' />
>>
>> <capability type='virtual_functions'>
>> <address domain='0x0000' bus='0x00' slot='0x1f'
function='0x2'/>
>> <address domain='0x0000' bus='0x00' slot='0x1f'
function='0x3'/>
>> <address domain='0x0000' bus='0x00' slot='0x1f'
function='0x4'/>
>> </capability>
>> </capability>
>>
>> The nice thing about this kind of nesting is that it would let us show
>> that a card is capable of exposing virtual functions, even if it does
>> not currently have any enabled
>>
>>
>> Regards,
>> Daniel
>
> Here's an updated patch reflecting the above suggestions.
>
> Dave
>
> From d11d00d93c4bfbfd1125560ce80abfdbf88de94b Mon Sep 17 00:00:00 2001
> From: David Allan<dallan(a)redhat.com>
> Date: Mon, 30 Nov 2009 15:58:47 -0500
> Subject: [PATCH 1/1] Add SR IOV physical and virtual function relationships
>
> ---
> src/conf/node_device_conf.c | 30 +++++
> src/conf/node_device_conf.h | 16 +++
> src/node_device/node_device_driver.h | 14 ++
> src/node_device/node_device_hal.c | 6 +
> src/node_device/node_device_linux_sysfs.c | 200 ++++++++++++++++++++++++++++-
> 5 files changed, 265 insertions(+), 1 deletions(-)
>
> diff --git a/src/conf/node_device_conf.c b/src/conf/node_device_conf.c
> index 6003ab1..a8ecfb0 100644
> --- a/src/conf/node_device_conf.c
> +++ b/src/conf/node_device_conf.c
> @@ -246,6 +246,7 @@ char *virNodeDeviceDefFormat(virConnectPtr conn,
> {
> virBuffer buf = VIR_BUFFER_INITIALIZER;
> virNodeDevCapsDefPtr caps;
> + unsigned int i = 0;
> char *tmp;
>
> virBufferAddLit(&buf, "<device>\n");
> @@ -318,6 +319,30 @@ char *virNodeDeviceDefFormat(virConnectPtr conn,
> data->pci_dev.vendor_name);
> else
> virBufferAddLit(&buf, " />\n");
> + if (data->pci_dev.flags&
VIR_NODE_DEV_CAP_FLAG_PCI_PHYSICAL_FUNCTION) {
> + virBufferAddLit(&buf, "<capability type='physical
function'>\n");
Can we change that to 'phys_function'
> + virBufferVSprintf(&buf,
> + "<address domain='0x%.4x'
bus='0x%.2x' "
> + "slot='0x%.2x'
function='0x%.1x'/>\n",
> + data->pci_dev.physical_function->domain,
> + data->pci_dev.physical_function->bus,
> + data->pci_dev.physical_function->slot,
> + data->pci_dev.physical_function->function);
> + virBufferAddLit(&buf, "</capability>\n");
> + }
> + if (data->pci_dev.flags&
VIR_NODE_DEV_CAP_FLAG_PCI_VIRTUAL_FUNCTION) {
> + virBufferAddLit(&buf, "<capability type='virtual
functions'>\n");
And 'virt_function', because its nice to not have whitespace in the values
> + for (i = 0 ; i< data->pci_dev.num_virtual_functions ; i++) {
> + virBufferVSprintf(&buf,
> + "<address domain='0x%.4x'
bus='0x%.2x' "
> + "slot='0x%.2x'
function='0x%.1x'/>\n",
> +
data->pci_dev.virtual_functions[i]->domain,
> +
data->pci_dev.virtual_functions[i]->bus,
> +
data->pci_dev.virtual_functions[i]->slot,
> +
data->pci_dev.virtual_functions[i]->function);
> + }
> + virBufferAddLit(&buf, "</capability>\n");
> + }
> break;
> case VIR_NODE_DEV_CAP_USB_DEV:
> virBufferVSprintf(&buf, "<bus>%d</bus>\n",
data->usb_dev.bus);
> @@ -1387,6 +1412,7 @@ out:
>
> void virNodeDevCapsDefFree(virNodeDevCapsDefPtr caps)
> {
> + int i = 0;
> union _virNodeDevCapData *data =&caps->data;
>
> switch (caps->type) {
> @@ -1402,6 +1428,10 @@ void virNodeDevCapsDefFree(virNodeDevCapsDefPtr caps)
> case VIR_NODE_DEV_CAP_PCI_DEV:
> VIR_FREE(data->pci_dev.product_name);
> VIR_FREE(data->pci_dev.vendor_name);
> + VIR_FREE(data->pci_dev.physical_function);
> + for (i = 0 ; i< data->pci_dev.num_virtual_functions ; i++) {
> + VIR_FREE(data->pci_dev.virtual_functions[i]);
> + }
> break;
> case VIR_NODE_DEV_CAP_USB_DEV:
> VIR_FREE(data->usb_dev.product_name);
> diff --git a/src/conf/node_device_conf.h b/src/conf/node_device_conf.h
> index 7a20bd6..4bfac90 100644
> --- a/src/conf/node_device_conf.h
> +++ b/src/conf/node_device_conf.h
> @@ -76,6 +76,18 @@ enum virNodeDevScsiHostCapFlags {
> VIR_NODE_DEV_CAP_FLAG_HBA_VPORT_OPS = (1<< 1),
> };
>
> +enum virNodeDevPCICapFlags {
> + VIR_NODE_DEV_CAP_FLAG_PCI_PHYSICAL_FUNCTION = (1<< 0),
> + VIR_NODE_DEV_CAP_FLAG_PCI_VIRTUAL_FUNCTION = (1<< 1),
> +};
> +
> +struct pci_config_address {
> + unsigned domain;
> + unsigned bus;
> + unsigned slot;
> + unsigned function;
> +};
> +
> typedef struct _virNodeDevCapsDef virNodeDevCapsDef;
> typedef virNodeDevCapsDef *virNodeDevCapsDefPtr;
> struct _virNodeDevCapsDef {
> @@ -105,6 +117,10 @@ struct _virNodeDevCapsDef {
> unsigned class;
> char *product_name;
> char *vendor_name;
> + struct pci_config_address *physical_function;
> + struct pci_config_address **virtual_functions;
> + unsigned num_virtual_functions;
> + unsigned flags;
> } pci_dev;
> struct {
> unsigned bus;
> diff --git a/src/node_device/node_device_driver.h
b/src/node_device/node_device_driver.h
> index 4f0822c..7b68f41 100644
> --- a/src/node_device/node_device_driver.h
> +++ b/src/node_device/node_device_driver.h
> @@ -37,6 +37,10 @@
> #define LINUX_SYSFS_VPORT_CREATE_POSTFIX "/vport_create"
> #define LINUX_SYSFS_VPORT_DELETE_POSTFIX "/vport_delete"
>
> +#define SRIOV_FOUND 0
> +#define SRIOV_NOT_FOUND 1
> +#define SRIOV_ERROR -1
> +
> #define LINUX_NEW_DEVICE_WAIT_TIME 60
>
> #ifdef HAVE_HAL
> @@ -61,6 +65,14 @@ int check_fc_host_linux(union _virNodeDevCapData *d);
> #define check_vport_capable(d) check_vport_capable_linux(d)
> int check_vport_capable_linux(union _virNodeDevCapData *d);
>
> +#define get_physical_function(s,d) get_physical_function_linux(s,d)
> +int get_physical_function_linux(const char *sysfs_path,
> + union _virNodeDevCapData *d);
> +
> +#define get_virtual_functions(s,d) get_virtual_functions_linux(s,d)
> +int get_virtual_functions_linux(const char *sysfs_path,
> + union _virNodeDevCapData *d);
> +
> #define read_wwn(host, file, wwn) read_wwn_linux(host, file, wwn)
> int read_wwn_linux(int host, const char *file, char **wwn);
>
> @@ -68,6 +80,8 @@ int read_wwn_linux(int host, const char *file, char **wwn);
>
> #define check_fc_host(d)
> #define check_vport_capable(d)
> +#define get_physical_function(sysfs_path, d)
> +#define get_virtual_functions(sysfs_path, d)
> #define read_wwn(host, file, wwn)
>
> #endif /* __linux__ */
> diff --git a/src/node_device/node_device_hal.c b/src/node_device/node_device_hal.c
> index 6de7e3d..4496301 100644
> --- a/src/node_device/node_device_hal.c
> +++ b/src/node_device/node_device_hal.c
> @@ -145,14 +145,20 @@ static int gather_pci_cap(LibHalContext *ctx, const char *udi,
> (void)virStrToLong_ui(p+1,&p, 16,&d->pci_dev.slot);
> (void)virStrToLong_ui(p+1,&p, 16,&d->pci_dev.function);
> }
> +
> + get_physical_function(sysfs_path, d);
> + get_virtual_functions(sysfs_path, d);
> +
> VIR_FREE(sysfs_path);
> }
> +
> (void)get_int_prop(ctx, udi, "pci.vendor_id", (int
*)&d->pci_dev.vendor);
> if (get_str_prop(ctx, udi,
"pci.vendor",&d->pci_dev.vendor_name) != 0)
> (void)get_str_prop(ctx, udi,
"info.vendor",&d->pci_dev.vendor_name);
> (void)get_int_prop(ctx, udi, "pci.product_id", (int
*)&d->pci_dev.product);
> if (get_str_prop(ctx, udi,
"pci.product",&d->pci_dev.product_name) != 0)
> (void)get_str_prop(ctx, udi,
"info.product",&d->pci_dev.product_name);
> +
> return 0;
> }
>
> diff --git a/src/node_device/node_device_linux_sysfs.c
b/src/node_device/node_device_linux_sysfs.c
> index b7cf782..d2b10dd 100644
> --- a/src/node_device/node_device_linux_sysfs.c
> +++ b/src/node_device/node_device_linux_sysfs.c
> @@ -29,6 +29,7 @@
> #include "virterror_internal.h"
> #include "memory.h"
> #include "logging.h"
> +#include<dirent.h>
>
> #define VIR_FROM_THIS VIR_FROM_NODEDEV
>
> @@ -70,7 +71,7 @@ int read_wwn_linux(int host, const char *file, char **wwn)
> char buf[64];
>
> if (open_wwn_file(LINUX_SYSFS_FC_HOST_PREFIX, host, file,&fd)< 0) {
> - goto out;
> + goto out;
> }
>
> memset(buf, 0, sizeof(buf));
> @@ -184,4 +185,201 @@ out:
> return retval;
> }
>
> +
> +static int logStrToLong_ui(char const *s,
> + char **end_ptr,
> + int base,
> + unsigned int *result)
> +{
> + int ret = 0;
> +
> + ret = virStrToLong_ui(s, end_ptr, base, result);
> + if (ret != 0) {
> + VIR_ERROR("Failed to convert '%s' to unsigned int\n", s);
> + } else {
> + VIR_DEBUG("Converted '%s' to unsigned int %u\n", s,
*result);
> + }
There's no need for '\n' in log messages.
> +
> + return ret;
> +}
> +
> +
> +static int parse_pci_config_address(char *address, struct pci_config_address *bdf)
> +{
> + char *p = NULL;
> + int ret = -1;
> +
> + if ((address == NULL) || (logStrToLong_ui(address,
> +&p,
> + 16,
> +&bdf->domain) == -1)) {
> + goto out;
> + }
> +
> + if ((p == NULL) || (logStrToLong_ui(p+1,
> +&p,
> + 16,
> +&bdf->bus) == -1)) {
> + goto out;
> + }
> +
> + if ((p == NULL) || (logStrToLong_ui(p+1,
> +&p,
> + 16,
> +&bdf->slot) == -1)) {
> + goto out;
> + }
> +
> + if ((p == NULL) || (logStrToLong_ui(p+1,
> +&p,
> + 16,
> +&bdf->function) == -1)) {
> + goto out;
> + }
> +
> + ret = 0;
> +
> +out:
> + return ret;
> +}
> +
> +
> +
> +
> +static int get_sriov_function(const char *device_link,
> + struct pci_config_address **bdf)
> +{
> + char *device_path = NULL, *config_address = NULL;
> + char errbuf[64];
> + int ret = SRIOV_ERROR;
> +
> + VIR_DEBUG("Attempting to resolve device path from device link
'%s'\n",
> + device_link);
> +
> + if (!virFileExists(device_link)) {
> +
> + VIR_DEBUG("SR IOV function link '%s' does not exist\n",
device_link);
> + /* Not an SR IOV device, not an error, either. */
> + ret = SRIOV_NOT_FOUND;
> +
> + goto out;
> +
> + }
> +
> + device_path = realpath(device_link, device_path);
> + if (device_path == NULL) {
> + memset(errbuf, '\0', sizeof(errbuf));
> + VIR_ERROR("Failed to resolve device link '%s':
'%s'\n", device_link,
> + strerror_r(errno, errbuf, sizeof(errbuf)));
Can you replace that with virStrerror() because strerror_r() has an
annoying API difference on Linux compared to POSIX (returns char *
instead of int, or vica-verca).
> + goto out;
> + }
> +
> + VIR_DEBUG("SR IOV device path is '%s'\n", device_path);
> + config_address = basename(device_path);
> + if (VIR_ALLOC(*bdf) != 0) {
> + VIR_ERROR0("Failed to allocate memory for PCI device name\n");
> + goto out;
> + }
> +
> + if (parse_pci_config_address(config_address, *bdf) != 0) {
> + VIR_ERROR("Failed to parse PCI config address '%s'\n",
config_address);
> + goto out;
> + }
> +
> + VIR_DEBUG("SR IOV function %.4x:%.2x:%.2x.%.1x/>\n",
> + (*bdf)->domain,
> + (*bdf)->bus,
> + (*bdf)->slot,
> + (*bdf)->function);
> +
> + ret = SRIOV_FOUND;
> +
> +out:
> + VIR_FREE(device_path);
> + return ret;
> +}
> +
> +
> +int get_physical_function_linux(const char *sysfs_path,
> + union _virNodeDevCapData *d ATTRIBUTE_UNUSED)
> +{
> + int ret = -1;
> + char *device_link = NULL;
> +
> + VIR_DEBUG("Attempting to get SR IOV physical function for device "
> + "with sysfs path '%s'\n", sysfs_path);
> +
> + if (virBuildPath(&device_link, sysfs_path, "physfn") == -1) {
> + virReportOOMError(NULL);
> + } else {
> + ret = get_sriov_function(device_link,&d->pci_dev.physical_function);
> + if (ret == SRIOV_FOUND) {
> + d->pci_dev.flags |= VIR_NODE_DEV_CAP_FLAG_PCI_PHYSICAL_FUNCTION;
> + }
> + }
> +
> + VIR_FREE(device_link);
> + return ret;
> +}
> +
> +
> +int get_virtual_functions_linux(const char *sysfs_path,
> + union _virNodeDevCapData *d)
> +{
> + int ret = -1;
> + DIR *dir = NULL;
> + struct dirent *entry = NULL;
> + char *device_link = NULL;
> +
> + VIR_DEBUG("Attempting to get SR IOV virtual functions for device"
> + "with sysfs path '%s'\n", sysfs_path);
> +
> + dir = opendir(sysfs_path);
> + if (dir == NULL) {
> + goto out;
> + }
> +
> + while ((entry = readdir(dir))) {
> + if (STRPREFIX(entry->d_name, "virtfn")) {
> + /* This local is just to avoid lines of code much> 80 col. */
> + unsigned int *num_funcs =&d->pci_dev.num_virtual_functions;
> +
> + if (virBuildPath(&device_link, sysfs_path, entry->d_name) == -1)
{
> + virReportOOMError(NULL);
> + goto out;
> + }
> +
> + VIR_DEBUG("Number of virtual functions: %d\n", *num_funcs);
> + if (VIR_REALLOC_N(d->pci_dev.virtual_functions, (*num_funcs) + 1) !=
0) {
> + virReportOOMError(NULL);
> + goto out;
> + }
> +
> + if (get_sriov_function(device_link,
> +&d->pci_dev.virtual_functions[*num_funcs]) !=
> + SRIOV_FOUND) {
> +
> + /* We should not get back SRIOV_NOT_FOUND in this
> + * case, so if we do, it's an error. */
> + VIR_ERROR("Failed to get SR IOV function from device link
'%s'\n",
> + device_link);
> + goto out;
> + } else {
> + (*num_funcs)++;
> + d->pci_dev.flags |= VIR_NODE_DEV_CAP_FLAG_PCI_VIRTUAL_FUNCTION;
> + }
> +
> + VIR_FREE(device_link);
> + }
> + }
> +
> + closedir(dir);
> +
> + ret = 0;
> +
> +out:
> + VIR_FREE(device_link);
> + return 0;
> +}
> +
> #endif /* __linux__ */
> --
ACK aside from the minor logging fixes& attribute whitespace
Daniel
Ok, thanks. Final version attached.
Dave