[libvirt] [PATCHv3 0/9] Let libvirt manage a bridge's MAC table

The idea behind these patches is the following: 1) most virtual machines only have a single MAC address behind each interface, and that MAC address is known by libvirt. 2) If we (i.e. libvirt) manually add an entry to the bridge's forwarding database (fdb) for the MAC address associated with a port on the bridge, we can turn off learning and unicast_flooding for that port. 3) kernels starting with 3.15 (and actually working correctly starting in kernel 3.17) will notice that all of a bridge's ports have flood and learning turned off, and in that case will turn off promiscuous mode on all ports. If all but one of the ports have flood/learning turned off, then promiscuous will be turned off on that port (and left on for all the other ports) 4) When (4) can be done, there is a measurable performance advantage. It can also *kind of* help security, as it will prevent a guest from doing anything useful if it changes its MAC address (but won't prevent the guest from *sending* packets with a spoofed MAC address). NB: These only work with a fixed MAC address, and no vlan tags set in the guest. Support for both of those will be coming. This series is the same as V2, which was previously ACK (pending final determination of attribute name): https://www.redhat.com/archives/libvir-list/2014-December/msg00173.html but with the name of the attribute changed - in V2 it was: fdb="learnWithFlooding|managed" and it is now: macTableManager="kernel|libvirt" which more accurately reflects what is being controlled with the attribute. Laine Stump (9): util: new functions for setting bridge and bridge port attributes util: functions to manage bridge fdb (forwarding database) conf: new network bridge device attribute macTableManager network: save bridge name in ActualNetDef when actualType==network too network: store network macTableManager setting in NetDef actual object network: setup bridge devices for macTableManager='libvirt' qemu: setup tap devices for macTableManager='libvirt' qemu: always use virDomainNetGetActualBridgeName to get interface's bridge lxc: always use virDomainNetGetActualBridgeName to get interface's bridge docs/formatnetwork.html.in | 50 ++- docs/schemas/network.rng | 9 + src/conf/domain_conf.c | 130 ++++--- src/conf/domain_conf.h | 2 + src/conf/network_conf.c | 51 ++- src/conf/network_conf.h | 11 + src/libvirt_private.syms | 11 + src/lxc/lxc_driver.c | 26 +- src/lxc/lxc_process.c | 26 +- src/network/bridge_driver.c | 78 +++++ src/qemu/qemu_command.c | 53 +-- src/qemu/qemu_hotplug.c | 54 +-- src/util/virnetdevbridge.c | 382 ++++++++++++++++++++- src/util/virnetdevbridge.h | 44 ++- tests/networkxml2xmlin/host-bridge-no-flood.xml | 6 + .../nat-network-explicit-flood.xml | 21 ++ tests/networkxml2xmlout/host-bridge-no-flood.xml | 6 + .../nat-network-explicit-flood.xml | 23 ++ tests/networkxml2xmltest.c | 2 + 19 files changed, 796 insertions(+), 189 deletions(-) create mode 100644 tests/networkxml2xmlin/host-bridge-no-flood.xml create mode 100644 tests/networkxml2xmlin/nat-network-explicit-flood.xml create mode 100644 tests/networkxml2xmlout/host-bridge-no-flood.xml create mode 100644 tests/networkxml2xmlout/nat-network-explicit-flood.xml -- 1.9.3

These functions all set/get items in the sysfs for a bridge device. --- No change since V1. src/libvirt_private.syms | 6 ++ src/util/virnetdevbridge.c | 235 ++++++++++++++++++++++++++++++++++++++++++++- src/util/virnetdevbridge.h | 28 +++++- 3 files changed, 266 insertions(+), 3 deletions(-) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 1853a9c..7cb57a9 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1663,9 +1663,15 @@ virNetDevBridgeCreate; virNetDevBridgeDelete; virNetDevBridgeGetSTP; virNetDevBridgeGetSTPDelay; +virNetDevBridgeGetVlanFiltering; +virNetDevBridgePortGetLearning; +virNetDevBridgePortGetUnicastFlood; +virNetDevBridgePortSetLearning; +virNetDevBridgePortSetUnicastFlood; virNetDevBridgeRemovePort; virNetDevBridgeSetSTP; virNetDevBridgeSetSTPDelay; +virNetDevBridgeSetVlanFiltering; # util/virnetdevmacvlan.h diff --git a/src/util/virnetdevbridge.c b/src/util/virnetdevbridge.c index 15434de..c6a3e8e 100644 --- a/src/util/virnetdevbridge.c +++ b/src/util/virnetdevbridge.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2013 Red Hat, Inc. + * Copyright (C) 2007-2014 Red Hat, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -219,6 +219,170 @@ static int virNetDevBridgeGet(const char *brname, } #endif /* __linux__ */ +#if defined(__linux__) +static int +virNetDevBridgePortSet(const char *brname, + const char *ifname, + const char *paramname, + unsigned long value) +{ + char *path = NULL; + char valuestr[INT_BUFSIZE_BOUND(value)]; + int ret = -1; + + snprintf(valuestr, sizeof(valuestr), "%lu", value); + + if (virAsprintf(&path, "%s/%s/brif/%s/%s", + SYSFS_NET_DIR, brname, ifname, paramname) < 0) + return -1; + + if (!virFileExists(path)) + errno = EINVAL; + else + ret = virFileWriteStr(path, valuestr, 0); + + if (ret < 0) { + virReportSystemError(errno, + _("Unable to set bridge %s port %s %s to %s"), + brname, ifname, paramname, valuestr); + } + + VIR_FREE(path); + return ret; +} + + +static int +virNetDevBridgePortGet(const char *brname, + const char *ifname, + const char *paramname, + unsigned long *value) +{ + char *path = NULL; + char *valuestr = NULL; + int ret = -1; + + if (virAsprintf(&path, "%s/%s/brif/%s/%s", + SYSFS_NET_DIR, brname, ifname, paramname) < 0) + return -1; + + if (virFileReadAll(path, INT_BUFSIZE_BOUND(unsigned long), &valuestr) < 0) + goto cleanup; + + if (virStrToLong_ul(valuestr, NULL, 10, value) < 0) { + virReportSystemError(EINVAL, + _("Unable to get bridge %s port %s %s"), + brname, ifname, paramname); + goto cleanup; + } + + ret = 0; + cleanup: + VIR_FREE(path); + VIR_FREE(valuestr); + return ret; +} + + +int +virNetDevBridgePortGetLearning(const char *brname, + const char *ifname, + bool *enable) +{ + int ret = -1; + unsigned long value; + + if (virNetDevBridgePortGet(brname, ifname, "learning", &value) < 0) + goto cleanup; + + *enable = !!value; + ret = 0; + cleanup: + return ret; +} + + +int +virNetDevBridgePortSetLearning(const char *brname, + const char *ifname, + bool enable) +{ + return virNetDevBridgePortSet(brname, ifname, "learning", enable ? 1 : 0); +} + + +int +virNetDevBridgePortGetUnicastFlood(const char *brname, + const char *ifname, + bool *enable) +{ + int ret = -1; + unsigned long value; + + if (virNetDevBridgePortGet(brname, ifname, "unicast_flood", &value) < 0) + goto cleanup; + + *enable = !!value; + ret = 0; + cleanup: + return ret; +} + + +int +virNetDevBridgePortSetUnicastFlood(const char *brname, + const char *ifname, + bool enable) +{ + return virNetDevBridgePortSet(brname, ifname, "unicast_flood", enable ? 1 : 0); +} + + +#else +int +virNetDevBridgePortGetLearning(const char *brname ATTRIBUTE_UNUSED, + const char *ifname ATTRIBUTE_UNUSED, + bool *enable ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("Unable to get bridge port learning on this platform")); + return -1; +} + + +int +virNetDevBridgePortSetLearning(const char *brname ATTRIBUTE_UNUSED, + const char *ifname ATTRIBUTE_UNUSED, + bool enable) +{ + virReportSystemError(ENOSYS, "%s", + _("Unable to set bridge port learning on this platform")); + return -1; +} + + +int +virNetDevBridgePortGetUnicastFlood(const char *brname ATTRIBUTE_UNUSED, + const char *ifname ATTRIBUTE_UNUSED, + bool *enable ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("Unable to get bridge port unicast_flood on this platform")); + return -1; +} + + +int +virNetDevBridgePortSetUnicastFlood(const char *brname ATTRIBUTE_UNUSED, + const char *ifname ATTRIBUTE_UNUSED, + bool enable ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("Unable to set bridge port unicast_flood on this platform")); + return -1; +} +#endif + /** * virNetDevBridgeCreate: @@ -517,7 +681,7 @@ int virNetDevBridgeSetSTPDelay(const char *brname, * @brname: the bridge device name * @delayms: the forward delay in milliseconds * - * Retrives the forward delay for the bridge device @brname + * Retrieves the forward delay for the bridge device @brname * storing it in @delayms. The forward delay is only meaningful * if STP is enabled * @@ -682,3 +846,70 @@ int virNetDevBridgeGetSTP(const char *brname, return -1; } #endif + +#if defined(HAVE_STRUCT_IFREQ) && defined(__linux__) +/** + * virNetDevBridgeGetVlanFiltering: + * @brname: the bridge device name + * @enable: true or false + * + * Retrieves the vlan_filtering setting for the bridge device @brname + * storing it in @enable. + * + * Returns 0 on success, -1 on error + */ +int +virNetDevBridgeGetVlanFiltering(const char *brname, + bool *enable) +{ + int ret = -1; + unsigned long value; + + if (virNetDevBridgeGet(brname, "vlan_filtering", &value, -1, NULL) < 0) + goto cleanup; + + *enable = !!value; + ret = 0; + cleanup: + return ret; +} + + +/** + * virNetDevBridgeSetVlanFiltering: + * @brname: the bridge name + * @enable: true or false + * + * Set the bridge vlan_filtering mode + * + * Returns 0 in case of success or -1 on failure + */ + +int +virNetDevBridgeSetVlanFiltering(const char *brname, + bool enable) +{ + return virNetDevBridgeSet(brname, "vlan_filtering", enable ? 1 : 0, -1, NULL); +} + + +#else +int +virNetDevBridgeGetVlanFiltering(const char *brname ATTRIBUTE_UNUSED, + bool *enable ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("Unable to get bridge vlan_filtering on this platform")); + return -1; +} + + +int +virNetDevBridgeSetVlanFiltering(const char *brname ATTRIBUTE_UNUSED, + bool enable ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("Unable to set bridge vlan_filtering on this platform")); + return -1; +} +#endif diff --git a/src/util/virnetdevbridge.h b/src/util/virnetdevbridge.h index 13776d2..8188a2f 100644 --- a/src/util/virnetdevbridge.h +++ b/src/util/virnetdevbridge.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2011 Red Hat, Inc. + * Copyright (C) 2007-2012, 2014 Red Hat, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -51,4 +51,30 @@ int virNetDevBridgeGetSTP(const char *brname, bool *enable) ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) ATTRIBUTE_RETURN_CHECK; +int virNetDevBridgeSetVlanFiltering(const char *brname, + bool enable) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_RETURN_CHECK; +int virNetDevBridgeGetVlanFiltering(const char *brname, + bool *enable) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) ATTRIBUTE_RETURN_CHECK; + +int virNetDevBridgePortGetLearning(const char *brname, + const char *ifname, + bool *enable) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) ATTRIBUTE_NONNULL(3) + ATTRIBUTE_RETURN_CHECK; +int virNetDevBridgePortSetLearning(const char *brname, + const char *ifname, + bool enable) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) ATTRIBUTE_RETURN_CHECK; +int virNetDevBridgePortGetUnicastFlood(const char *brname, + const char *ifname, + bool *enable) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) ATTRIBUTE_NONNULL(3) + ATTRIBUTE_RETURN_CHECK; +int virNetDevBridgePortSetUnicastFlood(const char *brname, + const char *ifname, + bool enable) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) ATTRIBUTE_RETURN_CHECK; + #endif /* __VIR_NETDEV_BRIDGE_H__ */ -- 1.9.3

These two functions use netlink RTM_NEWNEIGH and RTM_DELNEIGH messages to add and delete entries from a bridge's fdb. The bridge itself is not referenced in the arguments to the functions, only the name of the device that is attached to the bridge (since a device can only be attached to one bridge at a time, and must be attached for this function to make sense, the kernel easily infers which bridge's fdb is being modified by looking at the device name/index). --- No change from V2. src/libvirt_private.syms | 2 + src/util/virnetdevbridge.c | 147 +++++++++++++++++++++++++++++++++++++++++++++ src/util/virnetdevbridge.h | 16 +++++ 3 files changed, 165 insertions(+) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 7cb57a9..3e445fc 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1661,6 +1661,8 @@ virNetDevBandwidthUpdateRate; virNetDevBridgeAddPort; virNetDevBridgeCreate; virNetDevBridgeDelete; +virNetDevBridgeFDBAdd; +virNetDevBridgeFDBDel; virNetDevBridgeGetSTP; virNetDevBridgeGetSTPDelay; virNetDevBridgeGetVlanFiltering; diff --git a/src/util/virnetdevbridge.c b/src/util/virnetdevbridge.c index c6a3e8e..78c4a25 100644 --- a/src/util/virnetdevbridge.c +++ b/src/util/virnetdevbridge.c @@ -37,6 +37,9 @@ #include <netinet/in.h> #ifdef __linux__ +# if defined(HAVE_LIBNL) +# include "virnetlink.h" +# endif # include <linux/sockios.h> # include <linux/param.h> /* HZ */ # if NETINET_LINUX_WORKAROUND @@ -913,3 +916,147 @@ virNetDevBridgeSetVlanFiltering(const char *brname ATTRIBUTE_UNUSED, return -1; } #endif + + +#if defined(__linux__) && defined(HAVE_LIBNL) +/* virNetDevBridgeFDBAddDel: + * @mac: the MAC address being added to the table + * @ifname: name of the port (interface) of the bridge that wants this MAC + * @flags: any of virNetDevBridgeFDBFlags ORed together. + * @isAdd: true if adding the entry, fals if deleting + * + * Use netlink RTM_NEWNEIGH and RTM_DELNEIGH messages to add and + * delete entries from a bridge's fdb. The bridge itself is not + * referenced in the arguments to the function, only the name of the + * device that is attached to the bridge (since a device can only be + * attached to one bridge at a time, and must be attached for this + * function to make sense, the kernel easily infers which bridge's fdb + * is being modified by looking at the device name/index). + * + * Attempting to add an existing entry, or delete a non-existing entry + * *is* an error. + * + * returns 0 on success, -1 on failure. + */ +static int +virNetDevBridgeFDBAddDel(const virMacAddr *mac, const char *ifname, + unsigned int flags, bool isAdd) +{ + int ret = -1; + struct nlmsghdr *resp = NULL; + struct nlmsgerr *err; + unsigned int recvbuflen; + struct nl_msg *nl_msg; + struct ndmsg ndm = { .ndm_family = PF_BRIDGE, .ndm_state = NUD_NOARP }; + + if (virNetDevGetIndex(ifname, &ndm.ndm_ifindex) < 0) + return -1; + + if (flags & VIR_NETDEVBRIDGE_FDB_FLAG_ROUTER) + ndm.ndm_flags |= NTF_ROUTER; + if (flags & VIR_NETDEVBRIDGE_FDB_FLAG_SELF) + ndm.ndm_flags |= NTF_SELF; + if (flags & VIR_NETDEVBRIDGE_FDB_FLAG_MASTER) + ndm.ndm_flags |= NTF_MASTER; + /* default self (same as iproute2's bridge command */ + if (!(ndm.ndm_flags & (NTF_MASTER | NTF_SELF))) + ndm.ndm_flags |= NTF_SELF; + + if (flags & VIR_NETDEVBRIDGE_FDB_FLAG_PERMANENT) + ndm.ndm_state |= NUD_PERMANENT; + if (flags & VIR_NETDEVBRIDGE_FDB_FLAG_TEMP) + ndm.ndm_state |= NUD_REACHABLE; + /* default permanent, same as iproute2's bridge command */ + if (!(ndm.ndm_state & (NUD_PERMANENT | NUD_REACHABLE))) + ndm.ndm_state |= NUD_PERMANENT; + + nl_msg = nlmsg_alloc_simple(isAdd ? RTM_NEWNEIGH : RTM_DELNEIGH, + NLM_F_REQUEST | + (isAdd ? (NLM_F_CREATE | NLM_F_EXCL) : 0)); + if (!nl_msg) { + virReportOOMError(); + return -1; + } + + if (nlmsg_append(nl_msg, &ndm, sizeof(ndm), NLMSG_ALIGNTO) < 0) + goto buffer_too_small; + if (nla_put(nl_msg, NDA_LLADDR, VIR_MAC_BUFLEN, mac) < 0) + goto buffer_too_small; + + /* NB: this message can also accept a Destination IP, a port, a + * vlan tag, and a via (see iproute2/bridge/fdb.c:fdb_modify()), + * but those aren't required for our application + */ + + if (virNetlinkCommand(nl_msg, &resp, &recvbuflen, 0, 0, + NETLINK_ROUTE, 0) < 0) { + goto cleanup; + } + if (recvbuflen < NLMSG_LENGTH(0) || resp == NULL) + goto malformed_resp; + + switch (resp->nlmsg_type) { + case NLMSG_ERROR: + err = (struct nlmsgerr *)NLMSG_DATA(resp); + if (resp->nlmsg_len < NLMSG_LENGTH(sizeof(*err))) + goto malformed_resp; + if (err->error) { + virReportSystemError(-err->error, + _("error adding fdb entry for %s"), ifname); + goto cleanup; + } + break; + case NLMSG_DONE: + break; + + default: + goto malformed_resp; + } + + ret = 0; + cleanup: + nlmsg_free(nl_msg); + VIR_FREE(resp); + return ret; + + malformed_resp: + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("malformed netlink response message")); + goto cleanup; + + buffer_too_small: + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("allocated netlink buffer is too small")); + goto cleanup; +} + + +#else +static int +virNetDevBridgeFDBAddDel(const virMacAddr *mac ATTRIBUTE_UNUSED, + const char *ifname ATTRIBUTE_UNUSED, + unsigned int fdbFlags ATTRIBUTE_UNUSED, + bool isAdd ATTRIBUTE_UNUSED) +{ + virReportSystemError(ENOSYS, "%s", + _("Unable to add/delete fdb entries on this platform")); + return -1; +} + + +#endif + +int +virNetDevBridgeFDBAdd(const virMacAddr *mac, const char *ifname, + unsigned int flags) +{ + return virNetDevBridgeFDBAddDel(mac, ifname, flags, true); +} + + +int +virNetDevBridgeFDBDel(const virMacAddr *mac, const char *ifname, + unsigned int flags) +{ + return virNetDevBridgeFDBAddDel(mac, ifname, flags, false); +} diff --git a/src/util/virnetdevbridge.h b/src/util/virnetdevbridge.h index 8188a2f..141f231 100644 --- a/src/util/virnetdevbridge.h +++ b/src/util/virnetdevbridge.h @@ -24,6 +24,7 @@ # define __VIR_NETDEV_BRIDGE_H__ # include "internal.h" +# include "virmacaddr.h" int virNetDevBridgeCreate(const char *brname) ATTRIBUTE_NONNULL(1) ATTRIBUTE_RETURN_CHECK; @@ -77,4 +78,19 @@ int virNetDevBridgePortSetUnicastFlood(const char *brname, bool enable) ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) ATTRIBUTE_RETURN_CHECK; +typedef enum { + VIR_NETDEVBRIDGE_FDB_FLAG_ROUTER = (1 << 0), + VIR_NETDEVBRIDGE_FDB_FLAG_SELF = (1 << 1), + VIR_NETDEVBRIDGE_FDB_FLAG_MASTER = (1 << 2), + + VIR_NETDEVBRIDGE_FDB_FLAG_PERMANENT = (1 << 3), + VIR_NETDEVBRIDGE_FDB_FLAG_TEMP = (1 << 4), +} virNetDevBridgeFDBFlags; + +int virNetDevBridgeFDBAdd(const virMacAddr *mac, const char *ifname, + unsigned int flags) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) ATTRIBUTE_RETURN_CHECK; +int virNetDevBridgeFDBDel(const virMacAddr *mac, const char *ifname, + unsigned int flags) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) ATTRIBUTE_RETURN_CHECK; #endif /* __VIR_NETDEV_BRIDGE_H__ */ -- 1.9.3

The macTableManager attribute of a network's bridge subelement tells libvirt how the bridge's MAC address table (used to determine the egress port for packets) is managed. In the default mode, "kernel", management is left to the kernel, which usually determines entries in part by turning on promiscuous mode on all ports of the bridge, flooding packets to all ports when the correct destination is unknown, and adding/removing entries to the fdb as it sees incoming traffic from particular MAC addresses. In "libvirt" mode, libvirt turns off learning and flooding on all the bridge ports connected to guest domain interfaces, and adds/removes entries according to the MAC addresses in the domain interface configurations. A side effect of turning off learning and unicast_flood on the ports of a bridge is that (with Linux kernel 3.17 and newer), the kernel can automatically turn off promiscuous mode on one or more of the bridge's ports (usually only the one interface that is used to connect the bridge to the physical network). The result is better performance (because packets aren't being flooded to all ports, and can be dropped earlier when they are of no interest) and slightly better security (a guest can still send out packets with a spoofed source MAC address, but will only receive traffic intended for the guest interface's configured MAC address). The attribute looks like this in the configuration: <network> <name>test</name> <bridge name='br0' macTableManager='libvirt'/> ... This patch only adds the config knob, documentation, and test cases. The functionality behind this knob is added in later patches. --- Change from V2 - changed attribute name from "fdb" to "macTableManager", and values from "learnWithFlooding" and "managed" to "kernel" and "libvirt". docs/formatnetwork.html.in | 50 ++++++++++++++++++--- docs/schemas/network.rng | 9 ++++ src/conf/network_conf.c | 51 +++++++++++++++++----- src/conf/network_conf.h | 11 +++++ src/libvirt_private.syms | 2 + tests/networkxml2xmlin/host-bridge-no-flood.xml | 6 +++ .../nat-network-explicit-flood.xml | 21 +++++++++ tests/networkxml2xmlout/host-bridge-no-flood.xml | 6 +++ .../nat-network-explicit-flood.xml | 23 ++++++++++ tests/networkxml2xmltest.c | 2 + 10 files changed, 164 insertions(+), 17 deletions(-) create mode 100644 tests/networkxml2xmlin/host-bridge-no-flood.xml create mode 100644 tests/networkxml2xmlin/nat-network-explicit-flood.xml create mode 100644 tests/networkxml2xmlout/host-bridge-no-flood.xml create mode 100644 tests/networkxml2xmlout/nat-network-explicit-flood.xml diff --git a/docs/formatnetwork.html.in b/docs/formatnetwork.html.in index 7bcf316..ca0e207 100644 --- a/docs/formatnetwork.html.in +++ b/docs/formatnetwork.html.in @@ -81,7 +81,7 @@ <pre> ... - <bridge name="virbr0" stp="on" delay="5"/> + <bridge name="virbr0" stp="on" delay="5" macTableManager="libvirt"/> <domain name="example.com"/> <forward mode="nat" dev="eth0"/> ...</pre> @@ -92,18 +92,56 @@ defines the name of a bridge device which will be used to construct the virtual network. The virtual machines will be connected to this bridge device allowing them to talk to each other. The bridge device - may also be connected to the LAN. It is recommended that bridge - device names started with the prefix <code>vir</code>, but the name - <code>virbr0</code> is reserved for the "default" virtual - network. This element should always be provided when defining + may also be connected to the LAN. When defining a new network with a <code><forward></code> mode of + "nat" or "route" (or an isolated network with - no <code><forward></code> element). + no <code><forward></code> element), libvirt will + automatically generate a unique name for the bridge device if + none is given, and this name will be permanently stored in the + network configuration so that that the same name will be used + every time the network is started. For these types of networks + (nat, routed, and isolated), a bridge name beginning with the + prefix "virbr" is recommended (and that is what is + auto-generated), but not enforced. Attribute <code>stp</code> specifies if Spanning Tree Protocol is 'on' or 'off' (default is 'on'). Attribute <code>delay</code> sets the bridge's forward delay value in seconds (default is 0). <span class="since">Since 0.3.0</span> + + <p> + The <code>macTableManager</code> attribute of the bridge + element is used to tell libvirt how the bridge's MAC address + table (used to determine the correct egress port for packets + based on destination MAC address) will be managed. In the + default <code>kernel</code> setting, the kernel + automatically adds and removes entries, typically using + learning, flooding, and promiscuous mode on the bridge's + ports in order to determine the proper egress port for + packets. When <code>macTableManager</code> is set + to <code>libvirt</code>, libvirt disables kernel management + of the MAC table (in the case of the Linux host bridge, this + means enabling vlan_filtering on the bridge, and disabling + learning and unicast_filter for all bridge ports), and + explicitly adds/removes entries to the table according to + the MAC addresses in the domain interface configurations. + Allowing libvirt to manage the MAC table can improve + performance - with a Linux host bridge, for example, turning + off learning and unicast_flood on ports has its own + performance advantage, and can also lead to an additional + boost by permitting the kernel to automatically turn off + promiscuous mode on some ports of the bridge (in particular, + the port attaching the bridge to the physical + network). However, it can also cause some networking setups + to stop working (e.g. vlan tagging, multicast, + guest-initiated changes to MAC address) and is not supported + by older kernels. + <span class="since">Since 1.2.11, requires kernel 3.17 or + newer</span> + </p> + + </dd> <dt><code>domain</code></dt> <dd> diff --git a/docs/schemas/network.rng b/docs/schemas/network.rng index 4546f80..9a7d156 100644 --- a/docs/schemas/network.rng +++ b/docs/schemas/network.rng @@ -65,6 +65,15 @@ </attribute> </optional> + <optional> + <attribute name="macTableManager"> + <choice> + <value>kernel</value> + <value>libvirt</value> + </choice> + </attribute> + </optional> + </element> </optional> diff --git a/src/conf/network_conf.c b/src/conf/network_conf.c index 97719ed..0d09def 100644 --- a/src/conf/network_conf.c +++ b/src/conf/network_conf.c @@ -55,6 +55,10 @@ VIR_ENUM_IMPL(virNetworkForward, VIR_NETWORK_FORWARD_LAST, "none", "nat", "route", "bridge", "private", "vepa", "passthrough", "hostdev") +VIR_ENUM_IMPL(virNetworkBridgeMACTableManager, + VIR_NETWORK_BRIDGE_MAC_TABLE_MANAGER_LAST, + "default", "kernel", "libvirt") + VIR_ENUM_DECL(virNetworkForwardHostdevDevice) VIR_ENUM_IMPL(virNetworkForwardHostdevDevice, VIR_NETWORK_FORWARD_HOSTDEV_DEVICE_LAST, @@ -2108,6 +2112,18 @@ virNetworkDefParseXML(xmlXPathContextPtr ctxt) } VIR_FREE(tmp); + tmp = virXPathString("string(./bridge[1]/@macTableManager)", ctxt); + if (tmp) { + if ((def->macTableManager + = virNetworkBridgeMACTableManagerTypeFromString(tmp)) <= 0) { + virReportError(VIR_ERR_XML_ERROR, + _("Invalid macTableManager setting '%s' " + "in network '%s'"), tmp, def->name); + goto error; + } + VIR_FREE(tmp); + } + tmp = virXPathString("string(./mac[1]/@address)", ctxt); if (tmp) { if (virMacAddrParse(tmp, &def->mac) < 0) { @@ -2290,6 +2306,14 @@ virNetworkDefParseXML(xmlXPathContextPtr ctxt) def->name); goto error; } + if (def->macTableManager) { + virReportError(VIR_ERR_XML_ERROR, + _("bridge macTableManager setting not allowed " + "in %s mode (network '%s')"), + virNetworkForwardTypeToString(def->forward.type), + def->name); + goto error; + } /* fall through to next case */ case VIR_NETWORK_FORWARD_BRIDGE: if (def->delay || stp) { @@ -2783,22 +2807,27 @@ virNetworkDefFormatBuf(virBufferPtr buf, virBufferAddLit(buf, "</forward>\n"); } + if (def->forward.type == VIR_NETWORK_FORWARD_NONE || - def->forward.type == VIR_NETWORK_FORWARD_NAT || - def->forward.type == VIR_NETWORK_FORWARD_ROUTE) { + def->forward.type == VIR_NETWORK_FORWARD_NAT || + def->forward.type == VIR_NETWORK_FORWARD_ROUTE || + def->bridge || def->macTableManager) { virBufferAddLit(buf, "<bridge"); - if (def->bridge) - virBufferEscapeString(buf, " name='%s'", def->bridge); - virBufferAsprintf(buf, " stp='%s' delay='%ld'/>\n", - def->stp ? "on" : "off", - def->delay); - } else if (def->forward.type == VIR_NETWORK_FORWARD_BRIDGE && - def->bridge) { - virBufferEscapeString(buf, "<bridge name='%s'/>\n", def->bridge); + virBufferEscapeString(buf, " name='%s'", def->bridge); + if (def->forward.type == VIR_NETWORK_FORWARD_NONE || + def->forward.type == VIR_NETWORK_FORWARD_NAT || + def->forward.type == VIR_NETWORK_FORWARD_ROUTE) { + virBufferAsprintf(buf, " stp='%s' delay='%ld'", + def->stp ? "on" : "off", def->delay); + } + if (def->macTableManager) { + virBufferAsprintf(buf, " macTableManager='%s'", + virNetworkBridgeMACTableManagerTypeToString(def->macTableManager)); + } + virBufferAddLit(buf, "/>\n"); } - if (def->mac_specified) { char macaddr[VIR_MAC_STRING_BUFLEN]; virMacAddrFormat(&def->mac, macaddr); diff --git a/src/conf/network_conf.h b/src/conf/network_conf.h index 660cd2d..8110028 100644 --- a/src/conf/network_conf.h +++ b/src/conf/network_conf.h @@ -54,6 +54,16 @@ typedef enum { } virNetworkForwardType; typedef enum { + VIR_NETWORK_BRIDGE_MAC_TABLE_MANAGER_DEFAULT = 0, + VIR_NETWORK_BRIDGE_MAC_TABLE_MANAGER_KERNEL, + VIR_NETWORK_BRIDGE_MAC_TABLE_MANAGER_LIBVIRT, + + VIR_NETWORK_BRIDGE_MAC_TABLE_MANAGER_LAST, +} virNetworkBridgeMACTableManagerType; + +VIR_ENUM_DECL(virNetworkBridgeMACTableManager) + +typedef enum { VIR_NETWORK_FORWARD_HOSTDEV_DEVICE_NONE = 0, VIR_NETWORK_FORWARD_HOSTDEV_DEVICE_PCI, VIR_NETWORK_FORWARD_HOSTDEV_DEVICE_NETDEV, @@ -231,6 +241,7 @@ struct _virNetworkDef { int connections; /* # of guest interfaces connected to this network */ char *bridge; /* Name of bridge device */ + int macTableManager; /* enum virNetworkBridgeMACTableManager */ char *domain; unsigned long delay; /* Bridge forward delay (ms) */ bool stp; /* Spanning tree protocol */ diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 3e445fc..645d516 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -529,6 +529,8 @@ virNetDevVPortTypeToString; # conf/network_conf.h virNetworkAssignDef; +virNetworkBridgeMACTableManagerTypeFromString; +virNetworkBridgeMACTableManagerTypeToString; virNetworkConfigChangeSetup; virNetworkConfigFile; virNetworkDefCopy; diff --git a/tests/networkxml2xmlin/host-bridge-no-flood.xml b/tests/networkxml2xmlin/host-bridge-no-flood.xml new file mode 100644 index 0000000..562e697 --- /dev/null +++ b/tests/networkxml2xmlin/host-bridge-no-flood.xml @@ -0,0 +1,6 @@ +<network> + <name>host-bridge-net</name> + <uuid>81ff0d90-c91e-6742-64da-4a736edb9a8e</uuid> + <forward mode="bridge"/> + <bridge name="br0" macTableManager="libvirt"/> +</network> diff --git a/tests/networkxml2xmlin/nat-network-explicit-flood.xml b/tests/networkxml2xmlin/nat-network-explicit-flood.xml new file mode 100644 index 0000000..9738b81 --- /dev/null +++ b/tests/networkxml2xmlin/nat-network-explicit-flood.xml @@ -0,0 +1,21 @@ +<network> + <name>default</name> + <uuid>81ff0d90-c91e-6742-64da-4a736edb9a9b</uuid> + <bridge name="virbr0" macTableManager="kernel"/> + <forward mode="nat" dev="eth1"/> + <ip address="192.168.122.1" netmask="255.255.255.0"> + <dhcp> + <range start="192.168.122.2" end="192.168.122.254"/> + <host mac="00:16:3e:77:e2:ed" name="a.example.com" ip="192.168.122.10"/> + <host mac="00:16:3e:3e:a9:1a" name="b.example.com" ip="192.168.122.11"/> + </dhcp> + </ip> + <ip family="ipv4" address="192.168.123.1" netmask="255.255.255.0"> + </ip> + <ip family="ipv6" address="2001:db8:ac10:fe01::1" prefix="64"> + </ip> + <ip family="ipv6" address="2001:db8:ac10:fd01::1" prefix="64"> + </ip> + <ip family="ipv4" address="10.24.10.1"> + </ip> +</network> diff --git a/tests/networkxml2xmlout/host-bridge-no-flood.xml b/tests/networkxml2xmlout/host-bridge-no-flood.xml new file mode 100644 index 0000000..bdbbf3b --- /dev/null +++ b/tests/networkxml2xmlout/host-bridge-no-flood.xml @@ -0,0 +1,6 @@ +<network> + <name>host-bridge-net</name> + <uuid>81ff0d90-c91e-6742-64da-4a736edb9a8e</uuid> + <forward mode='bridge'/> + <bridge name='br0' macTableManager='libvirt'/> +</network> diff --git a/tests/networkxml2xmlout/nat-network-explicit-flood.xml b/tests/networkxml2xmlout/nat-network-explicit-flood.xml new file mode 100644 index 0000000..305c3b7 --- /dev/null +++ b/tests/networkxml2xmlout/nat-network-explicit-flood.xml @@ -0,0 +1,23 @@ +<network> + <name>default</name> + <uuid>81ff0d90-c91e-6742-64da-4a736edb9a9b</uuid> + <forward dev='eth1' mode='nat'> + <interface dev='eth1'/> + </forward> + <bridge name='virbr0' stp='on' delay='0' macTableManager='kernel'/> + <ip address='192.168.122.1' netmask='255.255.255.0'> + <dhcp> + <range start='192.168.122.2' end='192.168.122.254'/> + <host mac='00:16:3e:77:e2:ed' name='a.example.com' ip='192.168.122.10'/> + <host mac='00:16:3e:3e:a9:1a' name='b.example.com' ip='192.168.122.11'/> + </dhcp> + </ip> + <ip family='ipv4' address='192.168.123.1' netmask='255.255.255.0'> + </ip> + <ip family='ipv6' address='2001:db8:ac10:fe01::1' prefix='64'> + </ip> + <ip family='ipv6' address='2001:db8:ac10:fd01::1' prefix='64'> + </ip> + <ip family='ipv4' address='10.24.10.1'> + </ip> +</network> diff --git a/tests/networkxml2xmltest.c b/tests/networkxml2xmltest.c index 65ac591..34a5211 100644 --- a/tests/networkxml2xmltest.c +++ b/tests/networkxml2xmltest.c @@ -120,6 +120,8 @@ mymain(void) DO_TEST("hostdev"); DO_TEST_FULL("hostdev-pf", VIR_NETWORK_XML_INACTIVE); DO_TEST("passthrough-address-crash"); + DO_TEST("nat-network-explicit-flood"); + DO_TEST("host-bridge-no-flood"); return ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE; } -- 1.9.3

On Mon, Dec 08, 2014 at 11:00:03AM -0500, Laine Stump wrote:
The macTableManager attribute of a network's bridge subelement tells libvirt how the bridge's MAC address table (used to determine the egress port for packets) is managed. In the default mode, "kernel", management is left to the kernel, which usually determines entries in part by turning on promiscuous mode on all ports of the bridge, flooding packets to all ports when the correct destination is unknown, and adding/removing entries to the fdb as it sees incoming traffic from particular MAC addresses. In "libvirt" mode, libvirt turns off learning and flooding on all the bridge ports connected to guest domain interfaces, and adds/removes entries according to the MAC addresses in the domain interface configurations. A side effect of turning off learning and unicast_flood on the ports of a bridge is that (with Linux kernel 3.17 and newer), the kernel can automatically turn off promiscuous mode on one or more of the bridge's ports (usually only the one interface that is used to connect the bridge to the physical network). The result is better performance (because packets aren't being flooded to all ports, and can be dropped earlier when they are of no interest) and slightly better security (a guest can still send out packets with a spoofed source MAC address, but will only receive traffic intended for the guest interface's configured MAC address).
The attribute looks like this in the configuration:
<network> <name>test</name> <bridge name='br0' macTableManager='libvirt'/> ...
This patch only adds the config knob, documentation, and test cases. The functionality behind this knob is added in later patches.
ACK, design looks good now. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

When the actualType of a virDomainNetDef is "network", it means that we are connecting to a libvirt-managed network (routed, natted, or isolated) which does use a bridge device (created by libvirt). In the past we have required drivers such as qemu to call the public API to retrieve the bridge name in this case (even though it is available in the NetDef's ActualNetDef if the actualType is "bridge" (i.e., an externally-created bridge that isn't managed by libvirt). There is no real reason for this difference, and as a matter of fact it complicates things for qemu. Also, there is another bridge-related attribute (macTableManager) that will need to be available in both cases, so this makes things consistent. In order to avoid problems when restarting libvirtd after an update from an older version that *doesn't* store the network's bridgename in the ActualNetDef, we also need to put it in place during networkNotifyActualDevice() (this function is run for each interface of each domain whenever libvirtd is restarted). Along with making the bridge name available in the internal object, it is also now reported in the <source> element of the <interface> state XML (or the <actual> subelement in the internally-stored format). The one oddity about this change is that usually there is a separate union for every different "type" in a higher level object (e.g. in the case of a virDomainNetDef there are separate "network" and "bridge" members of the union that pivots on the type), but in this case network and bridge types both have exactly the same attributes, so the "bridge" member is used for both type==network and type==bridge. --- No change from V2. src/conf/domain_conf.c | 102 +++++++++++++++++++++++--------------------- src/network/bridge_driver.c | 20 +++++++++ 2 files changed, 73 insertions(+), 49 deletions(-) diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 2d81c37..f45969f 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -1354,6 +1354,7 @@ virDomainActualNetDefFree(virDomainActualNetDefPtr def) switch (def->type) { case VIR_DOMAIN_NET_TYPE_BRIDGE: + case VIR_DOMAIN_NET_TYPE_NETWORK: VIR_FREE(def->data.bridge.brname); break; case VIR_DOMAIN_NET_TYPE_DIRECT: @@ -7106,9 +7107,12 @@ virDomainActualNetDefParseXML(xmlNodePtr node, goto error; } VIR_FREE(class_id); - } else if (actual->type == VIR_DOMAIN_NET_TYPE_BRIDGE) { + } + if (actual->type == VIR_DOMAIN_NET_TYPE_BRIDGE || + actual->type == VIR_DOMAIN_NET_TYPE_NETWORK) { char *brname = virXPathString("string(./source/@bridge)", ctxt); - if (!brname) { + + if (!brname && actual->type == VIR_DOMAIN_NET_TYPE_BRIDGE) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("Missing <source> element with bridge name in " "interface's <actual> element")); @@ -17126,60 +17130,59 @@ virDomainHostdevDefFormatCaps(virBufferPtr buf, static int virDomainActualNetDefContentsFormat(virBufferPtr buf, virDomainNetDefPtr def, - const char *typeStr, bool inSubelement, unsigned int flags) { - const char *mode; - - switch (virDomainNetGetActualType(def)) { - case VIR_DOMAIN_NET_TYPE_BRIDGE: - virBufferEscapeString(buf, "<source bridge='%s'/>\n", - virDomainNetGetActualBridgeName(def)); - break; - - case VIR_DOMAIN_NET_TYPE_DIRECT: - virBufferAddLit(buf, "<source"); - virBufferEscapeString(buf, " dev='%s'", - virDomainNetGetActualDirectDev(def)); + int actualType = virDomainNetGetActualType(def); - mode = virNetDevMacVLanModeTypeToString(virDomainNetGetActualDirectMode(def)); - if (!mode) { - virReportError(VIR_ERR_INTERNAL_ERROR, - _("unexpected source mode %d"), - virDomainNetGetActualDirectMode(def)); - return -1; - } - virBufferAsprintf(buf, " mode='%s'/>\n", mode); - break; - - case VIR_DOMAIN_NET_TYPE_HOSTDEV: + if (actualType == VIR_DOMAIN_NET_TYPE_HOSTDEV) { if (virDomainHostdevDefFormatSubsys(buf, virDomainNetGetActualHostdev(def), flags, true) < 0) { return -1; } - break; - - case VIR_DOMAIN_NET_TYPE_NETWORK: - if (!inSubelement) { - /* the <source> element isn't included in <actual> - * (i.e. when we're putting our output into a subelement - * rather than inline) because the main element has the - * same info already. If we're outputting inline, though, - * we *do* need to output <source>, because the caller - * won't have done it. + } else { + virBufferAddLit(buf, "<source"); + if (def->type == VIR_DOMAIN_NET_TYPE_NETWORK && !inSubelement) { + /* When we're putting our output into the <actual> + * subelement rather than the main <interface>, the + * network name isn't included in the <source> because the + * main interface element's <source> has the same info + * already. If we've been called to output directly into + * the main element's <source> though (the case here - + * "!inSubElement"), we *do* need to output the network + * name, because the caller won't have done it). */ - virBufferEscapeString(buf, "<source network='%s'/>\n", - def->data.network.name); + virBufferEscapeString(buf, " network='%s'", def->data.network.name); } - if (def->data.network.actual && def->data.network.actual->class_id) - virBufferAsprintf(buf, "<class id='%u'/>\n", - def->data.network.actual->class_id); - break; - default: - virReportError(VIR_ERR_INTERNAL_ERROR, - _("unexpected actual net type %s"), typeStr); - return -1; + if (actualType == VIR_DOMAIN_NET_TYPE_BRIDGE || + actualType == VIR_DOMAIN_NET_TYPE_NETWORK) { + /* actualType == NETWORK includes the name of the bridge + * that is used by the network, whether we are + * "inSubElement" or not. + */ + virBufferEscapeString(buf, " bridge='%s'", + virDomainNetGetActualBridgeName(def)); + } else if (actualType == VIR_DOMAIN_NET_TYPE_DIRECT) { + const char *mode; + + virBufferEscapeString(buf, " dev='%s'", + virDomainNetGetActualDirectDev(def)); + mode = virNetDevMacVLanModeTypeToString(virDomainNetGetActualDirectMode(def)); + if (!mode) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("unexpected source mode %d"), + virDomainNetGetActualDirectMode(def)); + return -1; + } + virBufferAsprintf(buf, " mode='%s'", mode); + } + + virBufferAddLit(buf, "/>\n"); + } + if (actualType == VIR_DOMAIN_NET_TYPE_DIRECT && + def->data.network.actual && def->data.network.actual->class_id) { + virBufferAsprintf(buf, "<class id='%u'/>\n", + def->data.network.actual->class_id); } if (virNetDevVlanFormat(virDomainNetGetActualVlan(def), buf) < 0) @@ -17227,7 +17230,7 @@ virDomainActualNetDefFormat(virBufferPtr buf, virBufferAddLit(buf, ">\n"); virBufferAdjustIndent(buf, 2); - if (virDomainActualNetDefContentsFormat(buf, def, typeStr, true, flags) < 0) + if (virDomainActualNetDefContentsFormat(buf, def, true, flags) < 0) return -1; virBufferAdjustIndent(buf, -2); virBufferAddLit(buf, "</actual>\n"); @@ -17398,7 +17401,7 @@ virDomainNetDefFormat(virBufferPtr buf, * the standard place... (this is for public reporting of * interface status) */ - if (virDomainActualNetDefContentsFormat(buf, def, typeStr, false, flags) < 0) + if (virDomainActualNetDefContentsFormat(buf, def, false, flags) < 0) return -1; } else { /* ...but if we've asked for the inactive XML (rather than @@ -20751,7 +20754,8 @@ virDomainNetGetActualBridgeName(virDomainNetDefPtr iface) return iface->data.bridge.brname; if (iface->type == VIR_DOMAIN_NET_TYPE_NETWORK && iface->data.network.actual && - iface->data.network.actual->type == VIR_DOMAIN_NET_TYPE_BRIDGE) + (iface->data.network.actual->type == VIR_DOMAIN_NET_TYPE_BRIDGE || + iface->data.network.actual->type == VIR_DOMAIN_NET_TYPE_NETWORK)) return iface->data.network.actual->data.bridge.brname; return NULL; } diff --git a/src/network/bridge_driver.c b/src/network/bridge_driver.c index 1459f04..a41510b 100644 --- a/src/network/bridge_driver.c +++ b/src/network/bridge_driver.c @@ -3795,6 +3795,15 @@ networkAllocateActualDevice(virDomainDefPtr dom, */ iface->data.network.actual->type = VIR_DOMAIN_NET_TYPE_NETWORK; + /* we also store the bridge device + * in iface->data.network.actual->data.bridge for later use + * after the domain's tap device is created (to attach to the + * bridge and set flood/learning mode on the tap device) + */ + if (VIR_STRDUP(iface->data.network.actual->data.bridge.brname, + netdef->bridge) < 0) + goto error; + if (networkPlugBandwidth(network, iface) < 0) goto error; @@ -4131,6 +4140,17 @@ networkNotifyActualDevice(virDomainDefPtr dom, } netdef = network->def; + /* if we're restarting libvirtd after an upgrade from a version + * that didn't save bridge name in actualNetDef for + * actualType==network, we need to copy it in so that it will be + * available in all cases + */ + if (actualType == VIR_DOMAIN_NET_TYPE_NETWORK && + !iface->data.network.actual->data.bridge.brname && + (VIR_STRDUP(iface->data.network.actual->data.bridge.brname, + netdef->bridge) < 0)) + goto error; + if (!iface->data.network.actual || (actualType != VIR_DOMAIN_NET_TYPE_DIRECT && actualType != VIR_DOMAIN_NET_TYPE_HOSTDEV)) { -- 1.9.3

At the time that the network driver allocates a connection to a network, the tap device that will be used hasn't yet been created - that will be done later by qemu (or lxc or whoever) - but if the network has macTableManager='libvirt', then when we do get around to creating the tap device, we will need to add an entry for it to the network bridge's fdb (forwarding database) *and* turn off learning and unicast_flood for that tap device in the bridge's sysfs settings. This means that qemu needs to know both the bridge name as well as the setting of macTableManager, so we either need to create a new API to retrieve that info, or just pass it back in the ActualNetDef that is created during networkAllocateActualDevice. We choose the latter method, since it's already done for the bridge device, and it has the side effect of making the information available in domain status. (NB: in the future, I think that the tap device should actually be created by networkAllocateActualDevice(), as that will solve several other problems, but that is a battle for another day, and this information will still be useful outside the network driver) --- Change from V2: attribute name. src/conf/domain_conf.c | 30 ++++++++++++++++++++++++++++++ src/conf/domain_conf.h | 2 ++ src/libvirt_private.syms | 1 + src/network/bridge_driver.c | 6 +++++- 4 files changed, 38 insertions(+), 1 deletion(-) diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index f45969f..843cdec 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -51,6 +51,7 @@ #include "netdev_bandwidth_conf.h" #include "netdev_vlan_conf.h" #include "device_conf.h" +#include "network_conf.h" #include "virtpm.h" #include "virstring.h" @@ -7003,6 +7004,7 @@ virDomainActualNetDefParseXML(xmlNodePtr node, char *mode = NULL; char *addrtype = NULL; char *trustGuestRxFilters = NULL; + char *macTableManager = NULL; if (VIR_ALLOC(actual) < 0) return -1; @@ -7119,6 +7121,16 @@ virDomainActualNetDefParseXML(xmlNodePtr node, goto error; } actual->data.bridge.brname = brname; + macTableManager = virXPathString("string(./source/@macTableManager)", ctxt); + if (macTableManager && + (actual->data.bridge.macTableManager + = virNetworkBridgeMACTableManagerTypeFromString(macTableManager)) <= 0) { + virReportError(VIR_ERR_XML_ERROR, + _("Invalid macTableManager setting '%s' " + "in domain interface's <actual> element"), + macTableManager); + goto error; + } } bandwidth_node = virXPathNode("./bandwidth", ctxt); @@ -7139,6 +7151,7 @@ virDomainActualNetDefParseXML(xmlNodePtr node, VIR_FREE(mode); VIR_FREE(addrtype); VIR_FREE(trustGuestRxFilters); + VIR_FREE(macTableManager); virDomainActualNetDefFree(actual); ctxt->node = save_ctxt; @@ -17156,12 +17169,18 @@ virDomainActualNetDefContentsFormat(virBufferPtr buf, } if (actualType == VIR_DOMAIN_NET_TYPE_BRIDGE || actualType == VIR_DOMAIN_NET_TYPE_NETWORK) { + int macTableManager = virDomainNetGetActualBridgeMACTableManager(def); + /* actualType == NETWORK includes the name of the bridge * that is used by the network, whether we are * "inSubElement" or not. */ virBufferEscapeString(buf, " bridge='%s'", virDomainNetGetActualBridgeName(def)); + if (macTableManager) { + virBufferAsprintf(buf, " macTableManager='%s'", + virNetworkBridgeMACTableManagerTypeToString(macTableManager)); + } } else if (actualType == VIR_DOMAIN_NET_TYPE_DIRECT) { const char *mode; @@ -20760,6 +20779,17 @@ virDomainNetGetActualBridgeName(virDomainNetDefPtr iface) return NULL; } +int +virDomainNetGetActualBridgeMACTableManager(virDomainNetDefPtr iface) +{ + if (iface->type == VIR_DOMAIN_NET_TYPE_NETWORK && + iface->data.network.actual && + (iface->data.network.actual->type == VIR_DOMAIN_NET_TYPE_BRIDGE || + iface->data.network.actual->type == VIR_DOMAIN_NET_TYPE_NETWORK)) + return iface->data.network.actual->data.bridge.macTableManager; + return 0; +} + const char * virDomainNetGetActualDirectDev(virDomainNetDefPtr iface) { diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index 439f3c0..e10b3c5 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -888,6 +888,7 @@ struct _virDomainActualNetDef { union { struct { char *brname; + int macTableManager; /* enum virNetworkBridgeMACTableManagerType */ } bridge; struct { char *linkdev; @@ -2540,6 +2541,7 @@ int virDomainGraphicsListenSetNetwork(virDomainGraphicsDefPtr def, int virDomainNetGetActualType(virDomainNetDefPtr iface); const char *virDomainNetGetActualBridgeName(virDomainNetDefPtr iface); +int virDomainNetGetActualBridgeMACTableManager(virDomainNetDefPtr iface); const char *virDomainNetGetActualDirectDev(virDomainNetDefPtr iface); int virDomainNetGetActualDirectMode(virDomainNetDefPtr iface); virDomainHostdevDefPtr virDomainNetGetActualHostdev(virDomainNetDefPtr iface); diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 645d516..6df2784 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -328,6 +328,7 @@ virDomainNetFind; virDomainNetFindIdx; virDomainNetGenerateMAC; virDomainNetGetActualBandwidth; +virDomainNetGetActualBridgeMACTableManager; virDomainNetGetActualBridgeName; virDomainNetGetActualDirectDev; virDomainNetGetActualDirectMode; diff --git a/src/network/bridge_driver.c b/src/network/bridge_driver.c index a41510b..93d36ab 100644 --- a/src/network/bridge_driver.c +++ b/src/network/bridge_driver.c @@ -3795,7 +3795,7 @@ networkAllocateActualDevice(virDomainDefPtr dom, */ iface->data.network.actual->type = VIR_DOMAIN_NET_TYPE_NETWORK; - /* we also store the bridge device + /* we also store the bridge device and macTableManager settings * in iface->data.network.actual->data.bridge for later use * after the domain's tap device is created (to attach to the * bridge and set flood/learning mode on the tap device) @@ -3803,6 +3803,8 @@ networkAllocateActualDevice(virDomainDefPtr dom, if (VIR_STRDUP(iface->data.network.actual->data.bridge.brname, netdef->bridge) < 0) goto error; + iface->data.network.actual->data.bridge.macTableManager + = netdef->macTableManager; if (networkPlugBandwidth(network, iface) < 0) goto error; @@ -3818,6 +3820,8 @@ networkAllocateActualDevice(virDomainDefPtr dom, if (VIR_STRDUP(iface->data.network.actual->data.bridge.brname, netdef->bridge) < 0) goto error; + iface->data.network.actual->data.bridge.macTableManager + = netdef->macTableManager; /* merge virtualports from interface, network, and portgroup to * arrive at actual virtualport to use -- 1.9.3

When the bridge device for a network has macTableManager='libvirt' the intent is that all kernel management of the bridge's MAC table (Forwarding Database, or fdb, in the case of a Linux Host Bridge) be disabled, with libvirt handling updates to the table instead. The setup required for the bridge itself is: 1) set the "vlan_filtering" property of the bridge device to 1. 2) If the bridge has a "Dummy" tap device used to set a fixed MAC address on the bridge (which is always the case for a bridge created by libvirt, and never the case for a bridge created by the host system network config), turn off learning and unicast_flood on this tap (this is needed even though this tap is never IFF_UP, because the kernel ignores the IFF_UP flag of devices when using their settings to automatically decide whether or not to turn off promiscuous mode for any attached device). (1) is done both for libvirt-created/managed bridges, and for bridges that are created by the host system config, while (2) is done only for bridges created by libvirt (i.e. for forward modes of nat, routed, and isolated bridges) There is no attempt to turn vlan_filtering off when destroying the network because in the case of a libvirt-created bridge, the bridge is about to be destroyed anyway, and in the case of a system bridge, if the other devices attached to the bridge could operate properly before destroying libvirt's network object, they will continue to operate properly (this is similar to the way that libvirt will enable ip_forwarding whenever a routed/natted network is started, but will never attempt to disable it if they are stopped). --- Change from V2: attribute name. src/network/bridge_driver.c | 54 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/src/network/bridge_driver.c b/src/network/bridge_driver.c index 93d36ab..29222d0 100644 --- a/src/network/bridge_driver.c +++ b/src/network/bridge_driver.c @@ -1906,6 +1906,29 @@ networkAddAddrToBridge(virNetworkObjPtr network, return 0; } + +static int +networkStartHandleMACTableManagerMode(virNetworkObjPtr network, + const char *macTapIfName) +{ + const char *brname = network->def->bridge; + + if (brname && + network->def->macTableManager + == VIR_NETWORK_BRIDGE_MAC_TABLE_MANAGER_LIBVIRT) { + if (virNetDevBridgeSetVlanFiltering(brname, true) < 0) + return -1; + if (macTapIfName) { + if (virNetDevBridgePortSetLearning(brname, macTapIfName, false) < 0) + return -1; + if (virNetDevBridgePortSetUnicastFlood(brname, macTapIfName, false) < 0) + return -1; + } + } + return 0; +} + + /* add an IP (static) route to a bridge */ static int networkAddRouteToBridge(virNetworkObjPtr network, @@ -2032,6 +2055,9 @@ networkStartNetworkVirtual(virNetworkObjPtr network) goto err2; } + if (networkStartHandleMACTableManagerMode(network, macTapIfName) < 0) + goto err2; + /* Bring up the bridge interface */ if (virNetDevSetOnline(network->def->bridge, 1) < 0) goto err2; @@ -2176,6 +2202,27 @@ static int networkShutdownNetworkVirtual(virNetworkObjPtr network) } +static int +networkStartNetworkBridge(virNetworkObjPtr network) +{ + /* put anything here that needs to be done each time a network of + * type BRIDGE, is started. On failure, undo anything you've done, + * and return -1. On success return 0. + */ + return networkStartHandleMACTableManagerMode(network, NULL); +} + +static int +networkShutdownNetworkBridge(virNetworkObjPtr network ATTRIBUTE_UNUSED) +{ + /* put anything here that needs to be done each time a network of + * type BRIDGE is shutdown. On failure, undo anything you've done, + * and return -1. On success return 0. + */ + return 0; +} + + /* networkCreateInterfacePool: * @netdef: the original NetDef from the network * @@ -2339,6 +2386,10 @@ networkStartNetwork(virNetworkObjPtr network) break; case VIR_NETWORK_FORWARD_BRIDGE: + if (networkStartNetworkBridge(network) < 0) + goto cleanup; + break; + case VIR_NETWORK_FORWARD_PRIVATE: case VIR_NETWORK_FORWARD_VEPA: case VIR_NETWORK_FORWARD_PASSTHROUGH: @@ -2405,6 +2456,9 @@ static int networkShutdownNetwork(virNetworkObjPtr network) break; case VIR_NETWORK_FORWARD_BRIDGE: + ret = networkShutdownNetworkBridge(network); + break; + case VIR_NETWORK_FORWARD_PRIVATE: case VIR_NETWORK_FORWARD_VEPA: case VIR_NETWORK_FORWARD_PASSTHROUGH: -- 1.9.3

When libvirt is managing the MAC table of a Linux host bridge, it must turn off learning and unicast_flood for each tap device attached to that bridge, then add a Forwarding Database (fdb) entry for the tap device using the MAC address from the domain interface config. Once we have disabled learning and flooding, any packet that has a destination MAC address not present in the fdb will be dropped by the bridge. This, along with the opportunistic disabling of promiscuous mode[*], can result in enhanced network performance. and a potential slight security improvement. [*] If there is only one device on the bridge with learning/unicast_flood enabled, then that device will automatically have promiscuous mode disabled. If there are *no* devices with learning/unicast_flood enabled (e.g. for a libvirt "route", "nat", or isolated network that has no physical device attached), then all non-tap devices will have promiscuous mode disabled (tap devices always have promiscuous mode enabled, which may be a bug in the kernel, but in practice has 0 effect). None of this has any effect for kernels prior to 3.15 (upstream kernel commit 2796d0c648c940b4796f84384fbcfb0a2399db84 "bridge: Automatically manage port promiscuous mode"). Even after that, until kernel 3.17 (upstream commit 5be5a2df40f005ea7fb7e280e87bbbcfcf1c2fc0 "bridge: Add filtering support for default_pvid") traffic will not be properly forwarded without manually adding vlan table entries. Unfortunately, although the presence of the first patch is signalled by existence of the "learning" and "unicast_flood" options in sysfs, there is no reliable way to query whether or not the system's kernel has the second of those patches installed, the only thing that can be done is to try the setting and see if traffic continues to pass. --- Change from V2: attribute name and commit log explanation. src/qemu/qemu_command.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index 1831323..11ed58b 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -35,6 +35,7 @@ #include "virerror.h" #include "virfile.h" #include "virnetdev.h" +#include "virnetdevbridge.h" #include "virstring.h" #include "virtime.h" #include "viruuid.h" @@ -344,6 +345,24 @@ qemuNetworkIfaceConnect(virDomainDefPtr def, virDomainAuditNetDevice(def, net, tunpath, false); goto cleanup; } + if (virDomainNetGetActualBridgeMACTableManager(net) + == VIR_NETWORK_BRIDGE_MAC_TABLE_MANAGER_LIBVIRT) { + /* libvirt is managing the FDB of the bridge this device + * is attaching to, so we need to turn off learning and + * unicast_flood on the device to prevent the kernel from + * adding any FDB entries for it, then add an fdb entry + * outselves, using the MAC address from the interface + * config. + */ + if (virNetDevBridgePortSetLearning(brname, net->ifname, false) < 0) + goto cleanup; + if (virNetDevBridgePortSetUnicastFlood(brname, net->ifname, false) < 0) + goto cleanup; + if (virNetDevBridgeFDBAdd(&net->mac, net->ifname, + VIR_NETDEVBRIDGE_FDB_FLAG_MASTER | + VIR_NETDEVBRIDGE_FDB_FLAG_TEMP) < 0) + goto cleanup; + } } else { if (qemuCreateInBridgePortWithHelper(cfg, brname, &net->ifname, -- 1.9.3

qemuNetworkIfaceConnect() used to have a special case for actualType='network' (a network with forward mode of route, nat, or isolated) to call the libvirt public API to retrieve the bridge being used by a network. That is no longer necessary - since all network types that use a bridge and tap device now get the bridge name stored in the ActualNetDef, we can just always use virDomainNetGetActualBridgeName() instead. (an audit of the two callers to qemuNetworkIfaceConnect() confirms that it is never called for any other type of network, so the dead code in the else statement (logging an internal error if it is called for any other type of network) is eliminated in the process.) --- No change from V2. src/qemu/qemu_command.c | 34 +++++++++---------------------- src/qemu/qemu_hotplug.c | 54 +++++++------------------------------------------ 2 files changed, 16 insertions(+), 72 deletions(-) diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index 11ed58b..48bdf4e 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -277,6 +277,11 @@ static int qemuCreateInBridgePortWithHelper(virQEMUDriverConfigPtr cfg, return *tapfd < 0 ? -1 : 0; } +/* qemuNetworkIfaceConnect - *only* called if actualType is + * VIR_DOMAIN_NET_TYPE_NETWORK or VIR_DOMAIN_NET_TYPE_BRIDGE (i.e. if + * the connection is made with a tap device connecting to a bridge + * device) + */ int qemuNetworkIfaceConnect(virDomainDefPtr def, virConnectPtr conn, @@ -286,39 +291,19 @@ qemuNetworkIfaceConnect(virDomainDefPtr def, int *tapfd, int *tapfdSize) { - char *brname = NULL; + const char *brname; int ret = -1; unsigned int tap_create_flags = VIR_NETDEV_TAP_CREATE_IFUP; bool template_ifname = false; - int actualType = virDomainNetGetActualType(net); virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver); const char *tunpath = "/dev/net/tun"; if (net->backend.tap) tunpath = net->backend.tap; - if (actualType == VIR_DOMAIN_NET_TYPE_NETWORK) { - bool fail = false; - virNetworkPtr network = virNetworkLookupByName(conn, - net->data.network.name); - if (!network) - return ret; - - if (!(brname = virNetworkGetBridgeName(network))) - fail = true; - - virObjectUnref(network); - if (fail) - return ret; - - } else if (actualType == VIR_DOMAIN_NET_TYPE_BRIDGE) { - if (VIR_STRDUP(brname, virDomainNetGetActualBridgeName(net)) < 0) - return ret; - } else { - virReportError(VIR_ERR_INTERNAL_ERROR, - _("Network type %d is not supported"), - virDomainNetGetActualType(net)); - return ret; + if (!(brname = virDomainNetGetActualBridgeName(net))) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("Missing bridge name")); + goto cleanup; } if (!net->ifname || @@ -400,7 +385,6 @@ qemuNetworkIfaceConnect(virDomainDefPtr def, if (template_ifname) VIR_FREE(net->ifname); } - VIR_FREE(brname); virObjectUnref(cfg); return ret; diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c index 9467d7d..0d97b57 100644 --- a/src/qemu/qemu_hotplug.c +++ b/src/qemu/qemu_hotplug.c @@ -1746,58 +1746,20 @@ static virDomainNetDefPtr *qemuDomainFindNet(virDomainObjPtr vm, return NULL; } -static char * -qemuDomainNetGetBridgeName(virConnectPtr conn, virDomainNetDefPtr net) -{ - char *brname = NULL; - int actualType = virDomainNetGetActualType(net); - - if (actualType == VIR_DOMAIN_NET_TYPE_BRIDGE) { - const char *tmpbr = virDomainNetGetActualBridgeName(net); - if (!tmpbr) { - virReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("interface is missing bridge name")); - goto cleanup; - } - /* we need a copy, not just a pointer to the original */ - if (VIR_STRDUP(brname, tmpbr) < 0) - goto cleanup; - } else if (actualType == VIR_DOMAIN_NET_TYPE_NETWORK) { - virNetworkPtr network; - - if (!(network = virNetworkLookupByName(conn, net->data.network.name))) { - virReportError(VIR_ERR_INTERNAL_ERROR, - _("Couldn't find network '%s'"), - net->data.network.name); - goto cleanup; - } - brname = virNetworkGetBridgeName(network); - - virObjectUnref(network); - } else { - virReportError(VIR_ERR_INTERNAL_ERROR, - _("Interface type %d has no bridge name"), - virDomainNetGetActualType(net)); - } - - cleanup: - return brname; -} static int -qemuDomainChangeNetBridge(virConnectPtr conn, - virDomainObjPtr vm, +qemuDomainChangeNetBridge(virDomainObjPtr vm, virDomainNetDefPtr olddev, virDomainNetDefPtr newdev) { int ret = -1; - char *oldbridge = NULL, *newbridge = NULL; + const char *oldbridge = virDomainNetGetActualBridgeName(olddev); + const char *newbridge = virDomainNetGetActualBridgeName(newdev); - if (!(oldbridge = qemuDomainNetGetBridgeName(conn, olddev))) - goto cleanup; - - if (!(newbridge = qemuDomainNetGetBridgeName(conn, newdev))) + if (!oldbridge || !newbridge) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("Missing bridge name")); goto cleanup; + } VIR_DEBUG("Change bridge for interface %s: %s -> %s", olddev->ifname, oldbridge, newbridge); @@ -1836,8 +1798,6 @@ qemuDomainChangeNetBridge(virConnectPtr conn, /* caller will replace entire olddev with newdev in domain nets list */ ret = 0; cleanup: - VIR_FREE(oldbridge); - VIR_FREE(newbridge); return ret; } @@ -2231,7 +2191,7 @@ qemuDomainChangeNet(virQEMUDriverPtr driver, } if (needBridgeChange) { - if (qemuDomainChangeNetBridge(dom->conn, vm, olddev, newdev) < 0) + if (qemuDomainChangeNetBridge(vm, olddev, newdev) < 0) goto cleanup; /* we successfully switched to the new bridge, and we've * determined that the rest of newdev is equivalent to olddev, -- 1.9.3

lxcProcessSetupInterfaces() used to have a special case for actualType='network' (a network with forward mode of route, nat, or isolated) to call the libvirt public API to retrieve the bridge being used by a network. That is no longer necessary - since all network types that use a bridge and tap device now get the bridge name stored in the ActualNetDef, we can just always use virDomainNetGetActualBridgeName() instead. --- Change from V2: change attribute name and fix commit log to reference lxc rather than qemu function. src/lxc/lxc_driver.c | 26 ++------------------------ src/lxc/lxc_process.c | 26 +------------------------- 2 files changed, 3 insertions(+), 49 deletions(-) diff --git a/src/lxc/lxc_driver.c b/src/lxc/lxc_driver.c index 97caee3..51c664f 100644 --- a/src/lxc/lxc_driver.c +++ b/src/lxc/lxc_driver.c @@ -4161,7 +4161,8 @@ lxcDomainAttachDeviceNetLive(virConnectPtr conn, actualType = virDomainNetGetActualType(net); switch (actualType) { - case VIR_DOMAIN_NET_TYPE_BRIDGE: { + case VIR_DOMAIN_NET_TYPE_BRIDGE: + case VIR_DOMAIN_NET_TYPE_NETWORK: { const char *brname = virDomainNetGetActualBridgeName(net); if (!brname) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", @@ -4174,29 +4175,6 @@ lxcDomainAttachDeviceNetLive(virConnectPtr conn, brname))) goto cleanup; } break; - case VIR_DOMAIN_NET_TYPE_NETWORK: { - virNetworkPtr network; - char *brname = NULL; - bool fail = false; - - if (!(network = virNetworkLookupByName(conn, net->data.network.name))) - goto cleanup; - if (!(brname = virNetworkGetBridgeName(network))) - fail = true; - - virObjectUnref(network); - if (fail) - goto cleanup; - - if (!(veth = virLXCProcessSetupInterfaceBridged(conn, - vm->def, - net, - brname))) { - VIR_FREE(brname); - goto cleanup; - } - VIR_FREE(brname); - } break; case VIR_DOMAIN_NET_TYPE_DIRECT: { if (!(veth = virLXCProcessSetupInterfaceDirect(conn, vm->def, diff --git a/src/lxc/lxc_process.c b/src/lxc/lxc_process.c index de574a9..632fc17 100644 --- a/src/lxc/lxc_process.c +++ b/src/lxc/lxc_process.c @@ -382,31 +382,7 @@ static int virLXCProcessSetupInterfaces(virConnectPtr conn, type = virDomainNetGetActualType(net); switch (type) { - case VIR_DOMAIN_NET_TYPE_NETWORK: { - virNetworkPtr network; - char *brname = NULL; - bool fail = false; - - if (!(network = virNetworkLookupByName(conn, - net->data.network.name))) - goto cleanup; - if (!(brname = virNetworkGetBridgeName(network))) - fail = true; - - virObjectUnref(network); - if (fail) - goto cleanup; - - if (!(veth = virLXCProcessSetupInterfaceBridged(conn, - def, - net, - brname))) { - VIR_FREE(brname); - goto cleanup; - } - VIR_FREE(brname); - break; - } + case VIR_DOMAIN_NET_TYPE_NETWORK: case VIR_DOMAIN_NET_TYPE_BRIDGE: { const char *brname = virDomainNetGetActualBridgeName(net); if (!brname) { -- 1.9.3
participants (2)
-
Daniel P. Berrange
-
Laine Stump