Adding functionality to libvirt that will allow querying the interface
for the availability of switchdev Offloading NIC capabilities.
The switchdev mode was introduced in kernel 4.8, the iproute2-devlink
command to retrieve the swtichdev NIC feature,
Command example: devlink dev eswitch show pci/0000:03:00.0
This feature is needed for Openstack so we can do a scheduling decision
if the NIC is in Hardware Offload (switchdev) or regular SR-IOV (legacy) mode.
And select the appropriate hypervisors with the requested capability see [1].
[1] -
https://specs.openstack.org/openstack/nova-specs/specs/pike/approved/enab...
---
configure.ac | 13 ++
docs/formatnode.html.in | 1 +
src/util/virnetdev.c | 187 +++++++++++++++++++++-
src/util/virnetdev.h | 1 +
tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml | 1 +
tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml | 1 +
6 files changed, 203 insertions(+), 1 deletion(-)
diff --git a/configure.ac b/configure.ac
index b12b7fa..c089798 100644
--- a/configure.ac
+++ b/configure.ac
@@ -627,6 +627,19 @@ if test "$with_linux" = "yes"; then
AC_CHECK_HEADERS([linux/btrfs.h])
fi
+dnl
+dnl check for kernel headers required by devlink
+dnl
+if test "$with_linux" = "yes"; then
+ AC_CHECK_HEADERS([linux/devlink.h])
+ AC_CHECK_DECLS([DEVLINK_GENL_VERSION, DEVLINK_GENL_NAME, DEVLINK_ATTR_MAX,
DEVLINK_CMD_ESWITCH_GET, DEVLINK_ATTR_BUS_NAME, DEVLINK_ATTR_DEV_NAME,
DEVLINK_ATTR_ESWITCH_MODE, DEVLINK_ESWITCH_MODE_SWITCHDEV],
+ [AC_DEFINE([HAVE_DECL_DEVLINK],
+ [1],
+ [whether devlink declarations is available])],
+ [],
+ [[#include <linux/devlink.h>]])
+fi
+
dnl Allow perl/python overrides
AC_PATH_PROGS([PYTHON], [python2 python])
if test -z "$PYTHON"; then
diff --git a/docs/formatnode.html.in b/docs/formatnode.html.in
index 4d935b5..29244a8 100644
--- a/docs/formatnode.html.in
+++ b/docs/formatnode.html.in
@@ -227,6 +227,7 @@
<dt><code>rxhash</code></dt><dd>receive-hashing</dd>
<dt><code>rdma</code></dt><dd>remote-direct-memory-access</dd>
<dt><code>txudptnl</code></dt><dd>tx-udp-tunnel-segmentation</dd>
+
<dt><code>switchdev</code></dt><dd>kernel-forward-plane-offload</dd>
</dl>
</dd>
<dt><code>capability</code></dt>
diff --git a/src/util/virnetdev.c b/src/util/virnetdev.c
index 51a6e42..fc7c961 100644
--- a/src/util/virnetdev.c
+++ b/src/util/virnetdev.c
@@ -59,6 +59,10 @@
# include <net/if_dl.h>
#endif
+#if HAVE_DECL_DEVLINK
+# include <linux/devlink.h>
+#endif
+
#ifndef IFNAMSIZ
# define IFNAMSIZ 16
#endif
@@ -2481,7 +2485,8 @@ VIR_ENUM_IMPL(virNetDevFeature,
"ntuple",
"rxhash",
"rdma",
- "txudptnl")
+ "txudptnl",
+ "switchdev")
#ifdef __linux__
int
@@ -2936,6 +2941,7 @@ int virNetDevGetRxFilter(const char *ifname,
return ret;
}
+
#if defined(SIOCETHTOOL) && defined(HAVE_STRUCT_IFREQ)
/**
@@ -3115,6 +3121,182 @@ virNetDevGetEthtoolFeatures(virBitmapPtr bitmap,
}
+#if HAVE_DECL_DEVLINK
+/**
+ * virNetDevPutExtraHeader
+ * reserve and prepare room for an extra header
+ * This function sets to zero the room that is required to put the extra
+ * header after the initial Netlink header. This function also increases
+ * the nlmsg_len field.
+ *
+ * @nlh: pointer to Netlink header
+ * @size: size of the extra header that we want to put
+ *
+ * Returns pointer to the start of the extended header
+ */
+static void *
+virNetDevPutExtraHeader(struct nlmsghdr *nlh,
+ size_t size)
+{
+ char *ptr = (char *)nlh + nlh->nlmsg_len;
+ size_t len = NLMSG_ALIGN(size);
+ nlh->nlmsg_len += len;
+ memset(ptr, 0, len);
+ return ptr;
+}
+
+
+/**
+ * virNetDevGetFamilyId:
+ * This function supplies the devlink family id
+ *
+ * @family_name: the name of the family to query
+ *
+ * Returns family id or 0 on failure.
+ */
+static uint32_t
+virNetDevGetFamilyId(const char *family_name)
+{
+ struct nl_msg *nl_msg = NULL;
+ struct nlmsghdr *resp = NULL;
+ struct genlmsghdr* gmsgh = NULL;
+ struct nlattr *tb[CTRL_ATTR_MAX + 1] = {NULL, };
+ unsigned int recvbuflen;
+ uint32_t family_id = 0;
+
+ if (!(nl_msg = nlmsg_alloc_simple(GENL_ID_CTRL,
+ NLM_F_REQUEST | NLM_F_ACK))) {
+ virReportOOMError();
+ goto cleanup;
+ }
+
+ if (!(gmsgh = virNetDevPutExtraHeader(nlmsg_hdr(nl_msg), sizeof(struct
genlmsghdr))))
+ goto cleanup;
+
+ gmsgh->cmd = CTRL_CMD_GETFAMILY;
+ gmsgh->version = DEVLINK_GENL_VERSION;
+
+ if (nla_put_string(nl_msg, CTRL_ATTR_FAMILY_NAME, family_name) < 0) {
+ virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+ _("allocated netlink buffer is too small"));
+ goto cleanup;
+ }
+
+ if (virNetlinkCommand(nl_msg, &resp, &recvbuflen, 0, 0, NETLINK_GENERIC, 0)
< 0)
+ goto cleanup;
+
+ if (nlmsg_parse(resp, sizeof(struct nlmsghdr), tb, CTRL_CMD_MAX, NULL) < 0) {
+ virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+ _("malformed netlink response message"));
+ goto cleanup;
+ }
+
+ if (tb[CTRL_ATTR_FAMILY_ID] == NULL)
+ goto cleanup;
+
+ family_id = *(uint32_t *)RTA_DATA(tb[CTRL_ATTR_FAMILY_ID]);
+
+ cleanup:
+ nlmsg_free(nl_msg);
+ VIR_FREE(resp);
+ return family_id;
+}
+
+
+/**
+ * virNetDevSwitchdevFeature
+ * This function checks for the availability of Switchdev feature
+ * and add it to bitmap
+ *
+ * @ifname: name of the interface
+ * @out: add Switchdev feature if exist to bitmap
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int
+virNetDevSwitchdevFeature(const char *ifname,
+ virBitmapPtr *out)
+{
+ struct nl_msg *nl_msg = NULL;
+ struct nlmsghdr *resp = NULL;
+ unsigned int recvbuflen;
+ struct nlattr *tb[DEVLINK_ATTR_MAX + 1] = {NULL, };
+ virPCIDevicePtr pci_device_ptr = NULL;
+ struct genlmsghdr* gmsgh = NULL;
+ const char *pci_name;
+ char *pfname = NULL;
+ int is_vf = -1;
+ int ret = -1;
+ uint32_t family_id;
+
+ if ((family_id = virNetDevGetFamilyId(DEVLINK_GENL_NAME)) <= 0)
+ return ret;
+
+ if ((is_vf = virNetDevIsVirtualFunction(ifname)) < 0)
+ return ret;
+
+ if (is_vf == 1 && virNetDevGetPhysicalFunction(ifname, &pfname) < 0)
+ goto cleanup;
+
+ if (!(nl_msg = nlmsg_alloc_simple(family_id,
+ NLM_F_REQUEST | NLM_F_ACK))) {
+ virReportOOMError();
+ goto cleanup;
+ }
+
+ if (!(gmsgh = virNetDevPutExtraHeader(nlmsg_hdr(nl_msg), sizeof(struct
genlmsghdr))))
+ goto cleanup;
+
+ gmsgh->cmd = DEVLINK_CMD_ESWITCH_GET;
+ gmsgh->version = DEVLINK_GENL_VERSION;
+
+ pci_device_ptr = pfname ? virNetDevGetPCIDevice(pfname) :
+ virNetDevGetPCIDevice(ifname);
+ if (pci_device_ptr == NULL)
+ goto cleanup;
+
+ pci_name = virPCIDeviceGetName(pci_device_ptr);
+
+ if (nla_put(nl_msg, DEVLINK_ATTR_BUS_NAME, strlen("pci")+1,
"pci") < 0 ||
+ nla_put(nl_msg, DEVLINK_ATTR_DEV_NAME, strlen(pci_name)+1, pci_name) < 0) {
+ virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+ _("allocated netlink buffer is too small"));
+ goto cleanup;
+ }
+
+ if (virNetlinkCommand(nl_msg, &resp, &recvbuflen, 0, 0, NETLINK_GENERIC, 0)
< 0)
+ goto cleanup;
+
+ if (nlmsg_parse(resp, sizeof(struct genlmsghdr), tb, DEVLINK_ATTR_MAX, NULL) < 0)
{
+ virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+ _("malformed netlink response message"));
+ goto cleanup;
+ }
+
+ if (tb[DEVLINK_ATTR_ESWITCH_MODE] &&
+ *(int *)RTA_DATA(tb[DEVLINK_ATTR_ESWITCH_MODE]) ==
DEVLINK_ESWITCH_MODE_SWITCHDEV) {
+ ignore_value(virBitmapSetBit(*out, VIR_NET_DEV_FEAT_SWITCHDEV));
+ }
+
+ ret = 0;
+
+ cleanup:
+ nlmsg_free(nl_msg);
+ virPCIDeviceFree(pci_device_ptr);
+ VIR_FREE(resp);
+ VIR_FREE(pfname);
+ return ret;
+}
+#else
+static int
+virNetDevSwitchdevFeature(const char *ifname ATTRIBUTE_UNUSED,
+ virBitmapPtr *out ATTRIBUTE_UNUSED)
+{
+ return 0;
+}
+#endif
+
+
# if HAVE_DECL_ETHTOOL_GFEATURES
/**
* virNetDevGFeatureAvailable
@@ -3315,6 +3497,9 @@ virNetDevGetFeatures(const char *ifname,
if (virNetDevRDMAFeature(ifname, out) < 0)
goto cleanup;
+ if (virNetDevSwitchdevFeature(ifname, out) < 0)
+ goto cleanup;
+
ret = 0;
cleanup:
VIR_FORCE_CLOSE(fd);
diff --git a/src/util/virnetdev.h b/src/util/virnetdev.h
index 9205c0e..71eaf45 100644
--- a/src/util/virnetdev.h
+++ b/src/util/virnetdev.h
@@ -112,6 +112,7 @@ typedef enum {
VIR_NET_DEV_FEAT_RXHASH,
VIR_NET_DEV_FEAT_RDMA,
VIR_NET_DEV_FEAT_TXUDPTNL,
+ VIR_NET_DEV_FEAT_SWITCHDEV,
VIR_NET_DEV_FEAT_LAST
} virNetDevFeature;
diff --git a/tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml
b/tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml
index d4c96e8..88252e6 100644
--- a/tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml
+++ b/tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml
@@ -15,6 +15,7 @@
<feature name='rxhash'/>
<feature name='rdma'/>
<feature name='txudptnl'/>
+ <feature name='switchdev'/>
<capability type='80211'/>
</capability>
</device>
diff --git a/tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml
b/tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml
index 71bf90e..f77dfcc 100644
--- a/tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml
+++ b/tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml
@@ -15,6 +15,7 @@
<feature name='rxhash'/>
<feature name='rdma'/>
<feature name='txudptnl'/>
+ <feature name='switchdev'/>
<capability type='80203'/>
</capability>
</device>
--
2.1.4