Mellanox ConnectX-3 dual port SRIOV NICs present a bit of a challenge
when assigning one of their VFs to a guest using VFIO device
assignment.
These NICs have only a single PCI PF device, and that single PF has
two netdevs sharing the single PCI address - one for port 1 and one
for port 2. When a VF is created it can also have 2 netdevs, or it can
be setup in "single port" mode, where the VF has only a single netdev,
and that netdev is connected either to port 1 or to port 2.
When the VF is created in dual port mode, you get/set the MAC
address/vlan tag for the port 1 VF by sending a netlink message to the
PF's port1 netdev, and you get/set the MAC address/vlan tag for the
port 2 VF by sending a netlink message to the PF's port 2 netdev. (Of
course libvirt doesn't have any way to describe MAC/vlan info for 2
ports in a single hostdev interface, so that's a bit of a moot point)
When the VF is created in single port mode, you can *set* the MAC/vlan
info by sending a netlink message to *either* PF netdev - the driver
is smart enough to understand that there's only a single netdev, and
set the MAC/vlan for that netdev. When you want to *get* it, however,
the driver is more accurate - it will return 00:00:00:00:00:00 for the
MAC if you request it from the port 1 PF netdev when the VF was
configured to be single port on port 2, or if you request if from the
port 2 PF netdev when the VF was configured to be single port on port
1.
Based on this information, when *getting* the MAC/vlan info (to save
the original setting prior to assignment), we determine the correct PF
netdev by matching phys_port_id between VF and PF.
(IMPORTANT NOTE: this implies that to do PCI device assignment of the
VFs on dual port Mellanox cards using <interface type='hostdev'>
(i.e. if you want the MAC address/vlan tag to be set), not only must
the VFs be configured in single port mode, but also the VFs *must* be
bound to the host VF net driver, and libvirt must use managed='yes')
By the time libvirt is ready to set the new MAC/vlan tag, the VF has
already been unbound from the host net driver and bound to
vfio-pci. This isn't problematic though because, as stated earlier,
when a VF is created in single port mode, commands to configure it can
be sent to either the port 1 PF netdev or the port 2 PF netdev.
When it is time to restore the original MAC/vlan tag, again the VF
will *not* be bound to a host net driver, so it won't be possible to
learn from sysfs whether to use the port 1 or port 2 PF netdev for the
netlink commands. And again, it doesn't matter which netdev you
use. However, we must keep in mind that we saved the original settings
to a file called "${PF}_${VFNUM}". To solve this problem, we just
check for the existence of ${PF1}_${VFNUM} and ${PF2}_${VFNUM}, and
use whichever one we find (since we know that only one can be there)
---
New in V2
src/util/virhostdev.c | 27 +++++++++++++++++++++------
src/util/virpci.c | 31 +++++++++++++++++++++++++++++--
src/util/virpci.h | 4 +++-
3 files changed, 53 insertions(+), 9 deletions(-)
diff --git a/src/util/virhostdev.c b/src/util/virhostdev.c
index 580f0fac0..102fd85c1 100644
--- a/src/util/virhostdev.c
+++ b/src/util/virhostdev.c
@@ -307,7 +307,9 @@ virHostdevIsVirtualFunction(virDomainHostdevDefPtr hostdev)
static int
-virHostdevNetDevice(virDomainHostdevDefPtr hostdev, char **linkdev,
+virHostdevNetDevice(virDomainHostdevDefPtr hostdev,
+ int pfNetDevIdx,
+ char **linkdev,
int *vf)
{
int ret = -1;
@@ -317,9 +319,10 @@ virHostdevNetDevice(virDomainHostdevDefPtr hostdev, char **linkdev,
return ret;
if (virPCIIsVirtualFunction(sysfs_path) == 1) {
- if (virPCIGetVirtualFunctionInfo(sysfs_path, linkdev,
- vf) < 0)
+ if (virPCIGetVirtualFunctionInfo(sysfs_path, pfNetDevIdx,
+ linkdev, vf) < 0) {
goto cleanup;
+ }
} else {
/* In practice this should never happen, since we currently
* only support assigning SRIOV VFs via <interface
@@ -444,7 +447,7 @@ virHostdevSaveNetConfig(virDomainHostdevDefPtr hostdev,
goto cleanup;
}
- if (virHostdevNetDevice(hostdev, &linkdev, &vf) < 0)
+ if (virHostdevNetDevice(hostdev, -1, &linkdev, &vf) < 0)
goto cleanup;
if (virNetDevSaveNetConfig(linkdev, vf, stateDir, true) < 0)
@@ -482,7 +485,7 @@ virHostdevSetNetConfig(virDomainHostdevDefPtr hostdev,
if (!virHostdevIsPCINetDevice(hostdev))
return 0;
- if (virHostdevNetDevice(hostdev, &linkdev, &vf) < 0)
+ if (virHostdevNetDevice(hostdev, -1, &linkdev, &vf) < 0)
goto cleanup;
vlan = virDomainNetGetActualVlan(hostdev->parent.data.net);
@@ -545,7 +548,7 @@ virHostdevRestoreNetConfig(virDomainHostdevDefPtr hostdev,
return ret;
}
- if (virHostdevNetDevice(hostdev, &linkdev, &vf) < 0)
+ if (virHostdevNetDevice(hostdev, 0, &linkdev, &vf) < 0)
return ret;
virtPort = virDomainNetGetActualVirtPortProfile(
@@ -565,6 +568,18 @@ virHostdevRestoreNetConfig(virDomainHostdevDefPtr hostdev,
ret = virNetDevReadNetConfig(linkdev, vf, oldStateDir,
&adminMAC, &vlan, &MAC);
+ if (ret < 0) {
+ /* see if the config was saved using the PF's "port 2"
+ * netdev for the file name.
+ */
+ VIR_FREE(linkdev);
+
+ if (virHostdevNetDevice(hostdev, 1, &linkdev, &vf) >= 0) {
+ ret = virNetDevReadNetConfig(linkdev, vf, stateDir,
+ &adminMAC, &vlan, &MAC);
+ }
+ }
+
if (ret == 0) {
/* if a MAC was stored for the VF, we should now restore
* that as the adminMAC. We have to do it this way because
diff --git a/src/util/virpci.c b/src/util/virpci.c
index 62a36b380..5ded77087 100644
--- a/src/util/virpci.c
+++ b/src/util/virpci.c
@@ -2935,10 +2935,14 @@ virPCIGetNetName(const char *device_link_sysfs_path,
int
virPCIGetVirtualFunctionInfo(const char *vf_sysfs_device_path,
- char **pfname, int *vf_index)
+ int pfNetDevIdx,
+ char **pfname,
+ int *vf_index)
{
virPCIDeviceAddressPtr pf_config_address = NULL;
char *pf_sysfs_device_path = NULL;
+ char *vfname = NULL;
+ char *vfPhysPortID = NULL;
int ret = -1;
if (virPCIGetPhysicalFunction(vf_sysfs_device_path, &pf_config_address) < 0)
@@ -2957,8 +2961,28 @@ virPCIGetVirtualFunctionInfo(const char *vf_sysfs_device_path,
goto cleanup;
}
- if (virPCIGetNetName(pf_sysfs_device_path, 0, NULL, pfname) < 0)
+ /* If the caller hasn't asked for a specific pfNetDevIdx, and VF
+ * is bound to a netdev, learn that netdev's phys_port_id (if
+ * available). This can be used to disambiguate when the PF has
+ * multiple netdevs. If the VF isn't bound to a netdev, then we
+ * return netdev[pfNetDevIdx] on the PF, which may or may not be
+ * correct.
+ */
+ if (pfNetDevIdx == -1) {
+ if (virPCIGetNetName(vf_sysfs_device_path, 0, NULL, &vfname) < 0)
+ goto cleanup;
+
+ if (vfname) {
+ if (virNetDevGetPhysPortID(vfname, &vfPhysPortID) < 0)
+ goto cleanup;
+ }
+ pfNetDevIdx = 0;
+ }
+
+ if (virPCIGetNetName(pf_sysfs_device_path,
+ pfNetDevIdx, vfPhysPortID, pfname) < 0) {
goto cleanup;
+ }
if (!*pfname) {
/* this shouldn't be possible. A VF can't exist unless its
@@ -2974,6 +2998,8 @@ virPCIGetVirtualFunctionInfo(const char *vf_sysfs_device_path,
cleanup:
VIR_FREE(pf_config_address);
VIR_FREE(pf_sysfs_device_path);
+ VIR_FREE(vfname);
+ VIR_FREE(vfPhysPortID);
return ret;
}
@@ -3044,6 +3070,7 @@ virPCIGetNetName(const char *device_link_sysfs_path
ATTRIBUTE_UNUSED,
int
virPCIGetVirtualFunctionInfo(const char *vf_sysfs_device_path ATTRIBUTE_UNUSED,
+ int pfNetDevIdx ATTRIBUTE_UNUSED,
char **pfname ATTRIBUTE_UNUSED,
int *vf_index ATTRIBUTE_UNUSED)
{
diff --git a/src/util/virpci.h b/src/util/virpci.h
index adf336706..f1fbe39e6 100644
--- a/src/util/virpci.h
+++ b/src/util/virpci.h
@@ -226,7 +226,9 @@ int virPCIGetAddrString(unsigned int domain,
int virPCIDeviceAddressParse(char *address, virPCIDeviceAddressPtr bdf);
int virPCIGetVirtualFunctionInfo(const char *vf_sysfs_device_path,
- char **pfname, int *vf_index);
+ int pfNetDevIdx,
+ char **pfname,
+ int *vf_index);
int virPCIDeviceUnbind(virPCIDevicePtr dev);
int virPCIDeviceRebind(virPCIDevicePtr dev);
--
2.13.3