[libvirt] [RFC PATCH] Set and reset MAC for PASSTHROUGH mode

there is a problem present in libvirt that when a PASSTHROUGH mode DIRECT NIC connection is made the MAC address of the NIC is not automatically set and reset to the configured VM MAC and back again. The attached patch fixes this problem by setting and resetting the MAC while remembering the previous setting while the VM is running. This also works if libvirtd is restarted while the VM is running. The storing of the previous MAC address is done in a link ( currently in /tmp, but should probably go into /var/run somewhere ). Best regards Dirk

--- src/libvirt_macvtap.syms | 2 + src/qemu/qemu_command.c | 48 +++++++++++++++ src/qemu/qemu_process.c | 45 ++++++++++++++ src/util/macvtap.c | 149 ++++++++++++++++++++++++++++++++++++++++++++++ src/util/macvtap.h | 6 ++ 5 files changed, 250 insertions(+), 0 deletions(-) diff --git a/src/libvirt_macvtap.syms b/src/libvirt_macvtap.syms index b48565b..0520cb5 100644 --- a/src/libvirt_macvtap.syms +++ b/src/libvirt_macvtap.syms @@ -6,5 +6,7 @@ # macvtap.h delMacvtap; openMacvtapTap; +get_macaddr; +set_macaddr; vpAssociatePortProfileId; vpDisassociatePortProfileId; diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index 4e68241..41972fa 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -118,6 +118,7 @@ qemuPhysIfaceConnect(virDomainDefPtr def, int rc; #if WITH_MACVTAP char *res_ifname = NULL; + unsigned char old_macaddr[6]; int vnet_hdr = 0; int err; @@ -125,6 +126,53 @@ qemuPhysIfaceConnect(virDomainDefPtr def, net->model && STREQ(net->model, "virtio")) vnet_hdr = 1; + /** Note: When using PASSTHROUGH mode with MACVTAP devices the link + * devices's MAC address must be set to the VMs MAC address. In + * order to not confuse the first switch or bridge in line this MAC + * address must be reset when the VM is shut down. + * This is especially important when using SRIOV capable cards that + * emulate their switch in firmware. + */ + if ( net->data.direct.mode == VIR_DOMAIN_NETDEV_MACVTAP_MODE_PASSTHRU ) { + rc = get_macaddr(&old_macaddr, 6, net->data.direct.linkdev); + if (rc) { + virReportSystemError(rc, + _("Getting MAC address from '%s' " + "to '%02x:%02x:%02x:%02x:%02x:%02x' failed."), + net->data.direct.linkdev, + old_macaddr[0],old_macaddr[1],old_macaddr[2], + old_macaddr[3],old_macaddr[4],old_macaddr[5]); + } else { + char oldmacname[256], newmacname[256]; + + sprintf(oldmacname,"%02x:%02x:%02x:%02x:%02x:%02x", + old_macaddr[0],old_macaddr[1],old_macaddr[2], + old_macaddr[3],old_macaddr[4],old_macaddr[5]); + + sprintf(newmacname,"/tmp/%s@%02x:%02x:%02x:%02x:%02x:%02x", + net->data.direct.linkdev, + net->mac[0],net->mac[1],net->mac[2], + net->mac[3],net->mac[4],net->mac[5]); + + rc = symlink (oldmacname, newmacname); + if (rc) { + virReportSystemError(rc, + _("MAC link file creation failed for %s."), + net->data.direct.linkdev); + } + } + + rc = set_macaddr(net->mac, 6, net->data.direct.linkdev); + if (rc) { + virReportSystemError(rc, + _("Setting MAC address on '%s' to " + "'%02x:%02x:%02x:%02x:%02x:%02x' failed."), + net->data.direct.linkdev, + net->mac[0],net->mac[1],net->mac[2], + net->mac[3],net->mac[4],net->mac[5]); + } + } + rc = openMacvtapTap(net->ifname, net->mac, net->data.direct.linkdev, net->data.direct.mode, vnet_hdr, def->uuid, &net->data.direct.virtPortProfile, &res_ifname, diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 1efe024..127acb8 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -2663,6 +2663,51 @@ void qemuProcessStop(struct qemud_driver *driver, if (net->type == VIR_DOMAIN_NET_TYPE_DIRECT) { delMacvtap(net->ifname, net->mac, net->data.direct.linkdev, &net->data.direct.virtPortProfile); + + if ( net->data.direct.mode == VIR_DOMAIN_NETDEV_MACVTAP_MODE_PASSTHRU ) { + char newmacname[256], linkdata[64]; + unsigned int old_macaddr[6]; + + sprintf(newmacname,"/tmp/%s@%02x:%02x:%02x:%02x:%02x:%02x", + net->data.direct.linkdev, + net->mac[0],net->mac[1],net->mac[2], + net->mac[3],net->mac[4],net->mac[5]); + + ret = readlink (newmacname, linkdata, 64); + if ( ret == 17 ) { + linkdata[17] = 0; + + ret = sscanf(linkdata,"%x:%x:%x:%x:%x:%x", + &old_macaddr[0],&old_macaddr[1],&old_macaddr[2], + &old_macaddr[3],&old_macaddr[4],&old_macaddr[5]); + if ( ret == 6 ) { + unsigned char oldmac[6]; + oldmac[0] = (unsigned char)(old_macaddr[0] & 0xFF) ; + oldmac[1] = (unsigned char)(old_macaddr[1] & 0xFF) ; + oldmac[2] = (unsigned char)(old_macaddr[2] & 0xFF) ; + oldmac[3] = (unsigned char)(old_macaddr[3] & 0xFF) ; + oldmac[4] = (unsigned char)(old_macaddr[4] & 0xFF) ; + oldmac[5] = (unsigned char)(old_macaddr[5] & 0xFF) ; + + VIR_WARN(("Resetting MAC address on '%s' " + "to '%02x:%02x:%02x:%02x:%02x:%02x'."), + net->data.direct.linkdev, + oldmac[0],oldmac[1],oldmac[2], + oldmac[3],oldmac[4],oldmac[5]); + /*reset mac and remove link file-ignore restults*/ + ret = set_macaddr(oldmac, 6, net->data.direct.linkdev); + ret = unlink(newmacname); + } else { + VIR_WARN(("Scanning MAC address from '%s' " + "failed ! Got %i values."), + linkdata, ret); + } + } else { + VIR_WARN(("Reading MAC address from '%s' failed ! " + "Got %i bytes."), + newmacname, ret); + } + } VIR_FREE(net->ifname); } } diff --git a/src/util/macvtap.c b/src/util/macvtap.c index 068638e..1026a84 100644 --- a/src/util/macvtap.c +++ b/src/util/macvtap.c @@ -191,6 +191,155 @@ err_exit: # if WITH_MACVTAP +/** + * get_macaddr: + * Get the MAC address of a network device + * + * @macaddress: Pointer where the MAC address will be stored + * @macaddrsize: Size of the MAC address. + * @srcdev: The interface name of the NIC to get the MAC from + * + * Returns zero in case of success, + * negative value otherwise with error reported. + * + */ +int +get_macaddr(const unsigned char *macaddress, int macaddrsize, + const char *srcdev ) +{ + int sockfd; + int io; + char buffer[1024]; + struct ifreq ifr; + + strcpy(ifr.ifr_name, srcdev); + + sockfd = socket(AF_INET, SOCK_STREAM, 0); + if(sockfd < 0){ + return -1; + } + + io = ioctl(sockfd, SIOCGIFHWADDR, (char *)&ifr); + if(io < 0){ + return -1; + } + + bcopy( ifr.ifr_ifru.ifru_hwaddr.sa_data, macaddress, macaddrsize); + + return 0; +} + +/** + * Set_macaddr: + * Set the MAC address of a network device + * + * @macaddress: MAC address to assign to the NIC + * @macaddrsize: Size of the MAC address. + * @srcdev: The interface name of the NIC + * + * Returns zero in case of success, + * negative value otherwise with error reported. + * + */ +int +set_macaddr(const unsigned char *macaddress, int macaddrsize, + const char *srcdev ) +{ + int rc = 0; + struct nlmsghdr *resp; + struct nlmsgerr *err; + struct ifinfomsg ifinfo = { .ifi_family = AF_UNSPEC }; + int ifindex; + unsigned char *recvbuf = NULL; + unsigned int recvbuflen; + struct nl_msg *nl_msg; + struct nlattr *linkinfo; + + if (ifaceGetIndex(true, srcdev, &ifindex) != 0) + return -1; + + nl_msg = nlmsg_alloc_simple(RTM_SETLINK, NLM_F_REQUEST); + + if (!nl_msg) { + virReportOOMError(); + return -1; + } + + if (nlmsg_append(nl_msg, &ifinfo, sizeof(ifinfo), NLMSG_ALIGNTO) < 0) + goto buffer_too_small; + + if (nla_put_u32(nl_msg, IFLA_LINK, ifindex) < 0) + goto buffer_too_small; + + if (nla_put(nl_msg, IFLA_ADDRESS, macaddrsize, macaddress) < 0) + goto buffer_too_small; + + if (srcdev && + nla_put(nl_msg, IFLA_IFNAME, strlen(srcdev)+1, srcdev) < 0) + goto buffer_too_small; + + if (nlComm(nl_msg, &recvbuf, &recvbuflen, 0) < 0) { + rc = -1; + goto err_exit; + } + + if (recvbuflen < NLMSG_LENGTH(0) || recvbuf == NULL) + goto malformed_resp; + + resp = (struct nlmsghdr *)recvbuf; + + switch (resp->nlmsg_type) { + case NLMSG_ERROR: + err = (struct nlmsgerr *)NLMSG_DATA(resp); + if (resp->nlmsg_len < NLMSG_LENGTH(sizeof(*err))) + goto malformed_resp; + + switch (err->error) { + + case 0: + break; + + case -EEXIST: + rc = -1; + break; + + default: + macvtapError(VIR_ERR_INTERNAL_ERROR, "%s", + _("error setting device mac address")); + rc = -1; + } + break; + + case NLMSG_DONE: + break; + + default: + goto malformed_resp; + } + +err_exit: + nlmsg_free(nl_msg); + + VIR_FREE(recvbuf); + + return rc; + +malformed_resp: + nlmsg_free(nl_msg); + + macvtapError(VIR_ERR_INTERNAL_ERROR, "%s", + _("malformed netlink response message")); + VIR_FREE(recvbuf); + return -1; + +buffer_too_small: + nlmsg_free(nl_msg); + + macvtapError(VIR_ERR_INTERNAL_ERROR, "%s", + _("allocated netlink buffer is too small")); + return -1; +} + static int link_add(const char *type, const unsigned char *macaddress, int macaddrsize, diff --git a/src/util/macvtap.h b/src/util/macvtap.h index a1383c4..d71546a 100644 --- a/src/util/macvtap.h +++ b/src/util/macvtap.h @@ -75,6 +75,12 @@ enum virVMOperationType { # include "internal.h" +int get_macaddr(const unsigned char *macaddress, int macaddrsize, + const char *srcdev ); + +int set_macaddr(const unsigned char *macaddress, int macaddrsize, + const char *srcdev ); + int openMacvtapTap(const char *ifname, const unsigned char *macaddress, const char *linkdev, -- 1.7.5.2

On 06/10/2011 05:40 AM, Dirk Herrendoerfer wrote:
---
...see my earlier mail about the cover letter containing useful commit information. Also, your subject line is way long; typically the subject should be < 60 characters, then a blank line, then you can give detailed information about the patch. 'git shortlog -30' will give you an idea of why short subject lines are useful.
+++ b/src/libvirt_macvtap.syms @@ -6,5 +6,7 @@ # macvtap.h delMacvtap; openMacvtapTap; +get_macaddr; +set_macaddr;
Let's keep this list sorted, and stick with camelCase (that is, getMacaddr comes before openMacvtapTap). Also, you're not the first, but this file isn't very namespace clean; I'm wondering if we should first clean things up to use a virFunction prefix style for all the internally-exported functions in our macvtap wrapper file.
@@ -125,6 +126,53 @@ qemuPhysIfaceConnect(virDomainDefPtr def, net->model && STREQ(net->model, "virtio")) vnet_hdr = 1;
+ /** Note: When using PASSTHROUGH mode with MACVTAP devices the link + * devices's MAC address must be set to the VMs MAC address. In
s/devices's/device's/ trailing space; please make sure your patch passes 'make syntax-check' first.
+ * order to not confuse the first switch or bridge in line this MAC + * address must be reset when the VM is shut down. + * This is especially important when using SRIOV capable cards that + * emulate their switch in firmware. + */ + if ( net->data.direct.mode == VIR_DOMAIN_NETDEV_MACVTAP_MODE_PASSTHRU ) {
Style: we usually don't put space immediately inside the (): if (net->data.direct.mode == VIR_DOMAIN_NETDEV_MACVTAP_MODE_PASSTHRU) {
+ rc = get_macaddr(&old_macaddr, 6, net->data.direct.linkdev); + if (rc) { + virReportSystemError(rc, + _("Getting MAC address from '%s' " + "to '%02x:%02x:%02x:%02x:%02x:%02x' failed."), + net->data.direct.linkdev, + old_macaddr[0],old_macaddr[1],old_macaddr[2],
Style - space after comma: old_macaddr[0], old_macaddr[1], ...
+ old_macaddr[3],old_macaddr[4],old_macaddr[5]); + } else { + char oldmacname[256], newmacname[256];
Why so big, when you know that you have a fixed-width address that only takes sizeof("00:00:00:00:00:00") bytes?
+ + sprintf(oldmacname,"%02x:%02x:%02x:%02x:%02x:%02x", + old_macaddr[0],old_macaddr[1],old_macaddr[2], + old_macaddr[3],old_macaddr[4],old_macaddr[5]); + + sprintf(newmacname,"/tmp/%s@%02x:%02x:%02x:%02x:%02x:%02x", + net->data.direct.linkdev, + net->mac[0],net->mac[1],net->mac[2], + net->mac[3],net->mac[4],net->mac[5]);
For that matter, since it appears that you are trying to create a file name, it would be more appropriate to use virAsprintf (to malloc the name, rather than doing it on the stack), so that the directory prefix can then be variable sized according to configure arguments. For example, see how daemon/libvirtd.c computes /var/run/libvirt if base_dir_prefix is "/var", but is flexible to being run by non-root ($HOME/.libvirt/run instead of /var/run).
+ + rc = symlink (oldmacname, newmacname); + if (rc) { + virReportSystemError(rc,
Here, and in several other places after a failed syscall, you want to report the value of errno (which is a positive value such as EACCES), not rc (which is often -1, and which makes no sense in virReportSystemError).
+ _("MAC link file creation failed for %s."), + net->data.direct.linkdev); + } + } + + rc = set_macaddr(net->mac, 6, net->data.direct.linkdev);
Why the hardcoded 6? MAC addresses are fixed length; there's no need to pass that length as a magic number parameter.
+++ b/src/qemu/qemu_process.c @@ -2663,6 +2663,51 @@ void qemuProcessStop(struct qemud_driver *driver, if (net->type == VIR_DOMAIN_NET_TYPE_DIRECT) { delMacvtap(net->ifname, net->mac, net->data.direct.linkdev, &net->data.direct.virtPortProfile); + + if ( net->data.direct.mode == VIR_DOMAIN_NETDEV_MACVTAP_MODE_PASSTHRU ) { + char newmacname[256], linkdata[64]; + unsigned int old_macaddr[6]; + + sprintf(newmacname,"/tmp/%s@%02x:%02x:%02x:%02x:%02x:%02x",
Same style issues as above. Same question about newmacname being stack-allocated. Another reason to use virAsprintf is to avoid potential stack overflow issues if net->data.direct.linkdev is really large.
+ net->data.direct.linkdev, + net->mac[0],net->mac[1],net->mac[2], + net->mac[3],net->mac[4],net->mac[5]); + + ret = readlink (newmacname, linkdata, 64);
'make syntax-check' will call you on this one. Use virFileResolveLink instead.
+ if ( ret == 17 ) { + linkdata[17] = 0; + + ret = sscanf(linkdata,"%x:%x:%x:%x:%x:%x", + &old_macaddr[0],&old_macaddr[1],&old_macaddr[2], + &old_macaddr[3],&old_macaddr[4],&old_macaddr[5]);
I'm not necessarily a fan of sscanf (it misses out on some integer overflow issues), although I think this particular usage is probably safe enough.
+ if ( ret == 6 ) { + unsigned char oldmac[6]; + oldmac[0] = (unsigned char)(old_macaddr[0] & 0xFF) ;
Odd indentation, and poor style with space before ;. Unnecessary mask and cast since oldmac is already the right type. You can simply use: oldmac[0] = old_macaddr[0]; for properly converting the int (necessary for sscanf) down to an unsigned char. Even nicer would be to use the POSIX-mandated sscanf("%hhx", &oldmac[0]) in the first place, if that were portable (unfortunately, it isn't portable, and gnulib intentionally doesn't provide a fixed sscanf implementation because sscanf suffers from silent overflow issues in general). It would probably be nicer to rewrite this whole thing to avoid sscanf, and parse directly into unsigned char without going through intermediate ints. In fact, it seems like the conversion from a string into a MAC address ought to be an already supported common routine... </me searches> Yep - virParseMacAddr looks like your friend.
+ oldmac[1] = (unsigned char)(old_macaddr[1] & 0xFF) ; + oldmac[2] = (unsigned char)(old_macaddr[2] & 0xFF) ; + oldmac[3] = (unsigned char)(old_macaddr[3] & 0xFF) ; + oldmac[4] = (unsigned char)(old_macaddr[4] & 0xFF) ; + oldmac[5] = (unsigned char)(old_macaddr[5] & 0xFF) ; + + VIR_WARN(("Resetting MAC address on '%s' " + "to '%02x:%02x:%02x:%02x:%02x:%02x'."), + net->data.direct.linkdev, + oldmac[0],oldmac[1],oldmac[2], + oldmac[3],oldmac[4],oldmac[5]); + /*reset mac and remove link file-ignore restults*/
s/restults/results/
+int +get_macaddr(const unsigned char *macaddress, int macaddrsize, + const char *srcdev ) +{ + int sockfd; + int io; + char buffer[1024]; + struct ifreq ifr; + + strcpy(ifr.ifr_name, srcdev);
Another 'make syntax-check' violation.
+ + sockfd = socket(AF_INET, SOCK_STREAM, 0); + if(sockfd < 0){ + return -1; + } + + io = ioctl(sockfd, SIOCGIFHWADDR, (char *)&ifr); + if(io < 0){ + return -1; + } + + bcopy( ifr.ifr_ifru.ifru_hwaddr.sa_data, macaddress, macaddrsize);
bcopy is dead. Use memcpy. I ran out of time to review the rest of this; the overall idea has merit, but there are a lot of changes needed in v2 before anything can be applied. -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org

On Mon, 2011-06-13 at 12:34 -0600, Eric Blake wrote:
On 06/10/2011 05:40 AM, Dirk Herrendoerfer wrote:
--- .
I ran out of time to review the rest of this; the overall idea has merit, but there are a lot of changes needed in v2 before anything can be applied.
thanks for the review and the comments. I am temporarily taking over from Dirk and just sent out a V2 patch which should address most of the comments -- Best regards, Gerhard Stenzel, ----------------------------------------------------------------------------------------------------------------------------------- IBM Deutschland Research & Development GmbH Vorsitzender des Aufsichtsrats: Martin Jetter Geschäftsführung: Dirk Wittkopp Sitz der Gesellschaft: Böblingen Registergericht: Amtsgericht Stuttgart, HRB 243294

On 06/10/2011 05:40 AM, Dirk Herrendoerfer wrote:
there is a problem present in libvirt that when a PASSTHROUGH mode DIRECT NIC connection is made the MAC address of the NIC is not automatically set and reset to the configured VM MAC and back again.
The attached patch fixes this problem by setting and resetting the MAC while remembering the previous setting while the VM is running. This also works if libvirtd is restarted while the VM is running.
The storing of the previous MAC address is done in a link ( currently in /tmp, but should probably go into /var/run somewhere ).
/tmp is probably not the best place; we already have a hierarchy under /var/run/libvirt/ and this information should live somewhere in that hierarchy. Meanwhile, this information is useful in the commit message itself, rather than in a cover letter... -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org

On Mon, 2011-06-13 at 12:09 -0600, Eric Blake wrote:
/tmp is probably not the best place; we already have a hierarchy under /var/run/libvirt/ and this information should live somewhere in that hierarchy.
How about "/var/run/libvirt/network" ? -- Best regards, Gerhard Stenzel, ----------------------------------------------------------------------------------------------------------------------------------- IBM Deutschland Research & Development GmbH Vorsitzender des Aufsichtsrats: Martin Jetter Geschäftsführung: Dirk Wittkopp Sitz der Gesellschaft: Böblingen Registergericht: Amtsgericht Stuttgart, HRB 243294
participants (3)
-
Dirk Herrendoerfer
-
Eric Blake
-
Gerhard Stenzel