[libvirt] [PATCHv4 0/2] Added waiting for DAD to finish for bridge address.

This is a fix for commit db488c79173b240459c7754f38c3c6af9b432970 dnsmasq main process which is relied on when waiting for DAD to complete exits without actually waiting for DAD. This is dnsmasq daemon's task. It seems to be a race that DAD finished before dnsmasq main process exited. The above commit needs the execution to block until DAD finishes for bridge IPv6 address because then it closes dummy tap device. Thus we need to ensure this ourselves. So we periodically poll the kernel using netlink and check whether there are any IPv6 addresses assigned to bridge which have 'tentative' state. After DAD is finished, execution continues. I guess that is what dnsmasq was assumed to do. We use netlink to dump information about existing IPv6 addresses. Netlink's response is a multi-part message. Unfortunately, the current implementation of virNetlink treats such messages as faulty and throws an error. So the patch 2/2 adds multi-part nelink response support. Update v2: fixed syntax. Update v3: moved to virnetdev. Update v4: added DAD timeout, minor fixes. Maxim Perevedentsev (2): network: added waiting for DAD to finish for bridge address. netlink: add support for multi-part netlink messages. src/libvirt_private.syms | 1 + src/network/bridge_driver.c | 34 +++++++++++++- src/util/virnetdev.c | 110 ++++++++++++++++++++++++++++++++++++++++++++ src/util/virnetdev.h | 2 + src/util/virnetlink.c | 4 +- 5 files changed, 149 insertions(+), 2 deletions(-) -- 1.8.3.1

This is a fix for commit db488c79173b240459c7754f38c3c6af9b432970 dnsmasq main process exits without waiting for DAD, this is dnsmasq daemon's task. So we periodically poll the kernel using netlink and check whether there are any IPv6 addresses assigned to bridge which have 'tentative' state. After DAD is finished, execution continues. I guess that is what dnsmasq was assumed to do. --- Difference to v1: fixed syntax (comments and alignment). Difference to v2: moved to virnetdev. Difference to v3: added timeout. src/libvirt_private.syms | 1 + src/network/bridge_driver.c | 33 +++++++++++++- src/util/virnetdev.c | 109 ++++++++++++++++++++++++++++++++++++++++++++ src/util/virnetdev.h | 2 + 4 files changed, 144 insertions(+), 1 deletion(-) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index be6ee19..578c6fe 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1792,6 +1792,7 @@ virNetDevSetRcvMulti; virNetDevSetupControl; virNetDevSysfsFile; virNetDevValidateConfig; +virNetDevWaitDadFinish; # util/virnetdevbandwidth.h diff --git a/src/network/bridge_driver.c b/src/network/bridge_driver.c index cd9a51e..63efffa 100644 --- a/src/network/bridge_driver.c +++ b/src/network/bridge_driver.c @@ -2026,6 +2026,31 @@ networkAddRouteToBridge(virNetworkObjPtr network, } static int +networkWaitDadFinish(virNetworkObjPtr network) +{ + virNetworkIpDefPtr ipdef; + virSocketAddrPtr *addrs = NULL, addr = NULL; + size_t naddrs = 0; + int ret = -1; + + VIR_DEBUG("Started waiting for IPv6 DAD"); + + while ((ipdef = virNetworkDefGetIpByIndex( + network->def, AF_INET6, naddrs))) { + addr = &ipdef->address; + if (VIR_APPEND_ELEMENT_COPY(addrs, naddrs, addr) < 0) + goto cleanup; + } + + ret = (naddrs == 0) ? 0 : virNetDevWaitDadFinish(addrs, naddrs); + + cleanup: + VIR_FREE(addrs); + VIR_DEBUG("IPv6 DAD finished with status %d", ret); + return ret; +} + +static int networkStartNetworkVirtual(virNetworkDriverStatePtr driver, virNetworkObjPtr network) { @@ -2159,7 +2184,13 @@ networkStartNetworkVirtual(virNetworkDriverStatePtr driver, if (v6present && networkStartRadvd(driver, network) < 0) goto err4; - /* DAD has happened (dnsmasq waits for it), dnsmasq is now bound to the + /* dnsmasq main process does not wait for DAD to complete, + * so we need to wait for it ourselves. + */ + if (v6present && networkWaitDadFinish(network) < 0) + goto err4; + + /* DAD has happened, dnsmasq is now bound to the * bridge's IPv6 address, so we can now set the dummy tun down. */ if (tapfd >= 0) { diff --git a/src/util/virnetdev.c b/src/util/virnetdev.c index 579ff3b..6b6bba0 100644 --- a/src/util/virnetdev.c +++ b/src/util/virnetdev.c @@ -96,6 +96,7 @@ VIR_LOG_INIT("util.netdev"); # define FEATURE_BIT_IS_SET(blocks, index, field) \ (FEATURE_WORD(blocks, index, field) & FEATURE_FIELD_FLAG(index)) #endif +# define DAD_WAIT_TIMEOUT 5 /* seconds */ typedef enum { VIR_MCAST_TYPE_INDEX_TOKEN, @@ -1305,6 +1306,105 @@ int virNetDevClearIPAddress(const char *ifname, return ret; } +/* return whether there is a known address with 'tentative' flag set */ +static int +virNetDevParseDadStatus(struct nlmsghdr *nlh, int len, + virSocketAddrPtr *addrs, size_t count) +{ + struct ifaddrmsg *ifaddrmsg_ptr; + unsigned int ifaddrmsg_len; + struct rtattr *rtattr_ptr; + size_t i; + struct in6_addr *addr; + for (; NLMSG_OK(nlh, len); nlh = NLMSG_NEXT(nlh, len)) { + if (NLMSG_PAYLOAD(nlh, 0) < sizeof(struct ifaddrmsg)) { + /* Message without payload is the last one. */ + break; + } + + ifaddrmsg_ptr = (struct ifaddrmsg *)NLMSG_DATA(nlh); + if (!(ifaddrmsg_ptr->ifa_flags & IFA_F_TENTATIVE)) { + /* Not tentative: we are not interested in this entry. */ + continue; + } + + ifaddrmsg_len = IFA_PAYLOAD(nlh); + rtattr_ptr = (struct rtattr *) IFA_RTA(ifaddrmsg_ptr); + for (; RTA_OK(rtattr_ptr, ifaddrmsg_len); + rtattr_ptr = RTA_NEXT(rtattr_ptr, ifaddrmsg_len)) { + if (RTA_PAYLOAD(rtattr_ptr) != sizeof(struct in6_addr)) { + /* No address: ignore. */ + continue; + } + + /* We check only known addresses. */ + for (i = 0; i < count; i++) { + addr = &addrs[i]->data.inet6.sin6_addr; + if (!memcmp(addr, RTA_DATA(rtattr_ptr), + sizeof(struct in6_addr))) { + /* We found matching tentative address. */ + return 1; + } + } + } + } + return 0; +} + +/* return after DAD finishes for all known IPv6 addresses or an error */ +int +virNetDevWaitDadFinish(virSocketAddrPtr *addrs, size_t count) +{ + struct nl_msg *nlmsg = NULL; + struct ifaddrmsg ifa; + struct nlmsghdr *resp = NULL; + unsigned int recvbuflen; + int ret = -1, dad = 1; + time_t max_time = time(NULL) + DAD_WAIT_TIMEOUT; + + if (!(nlmsg = nlmsg_alloc_simple(RTM_GETADDR, + NLM_F_REQUEST | NLM_F_DUMP))) { + virReportOOMError(); + return -1; + } + + memset(&ifa, 0, sizeof(ifa)); + /* DAD is for IPv6 adresses only. */ + ifa.ifa_family = AF_INET6; + if (nlmsg_append(nlmsg, &ifa, sizeof(ifa), NLMSG_ALIGNTO) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("allocated netlink buffer is too small")); + goto cleanup; + } + + /* Periodically query netlink until DAD finishes on all known addresses. */ + while (dad && time(NULL) < max_time) { + if (virNetlinkCommand(nlmsg, &resp, &recvbuflen, 0, 0, + NETLINK_ROUTE, 0) < 0) + goto cleanup; + + if (virNetlinkGetErrorCode(resp, recvbuflen) < 0) { + virReportError(VIR_ERR_SYSTEM_ERROR, "%s", + _("error reading DAD state information")); + goto cleanup; + } + + /* Parse response. */ + dad = virNetDevParseDadStatus(resp, recvbuflen, addrs, count); + if (dad) + usleep(1000 * 10); + + VIR_FREE(resp); + } + /* Check timeout. */ + ret = dad ? -1 : 0; + + cleanup: + VIR_FREE(resp); + nlmsg_free(nlmsg); + return ret; +} + #else /* defined(__linux__) && defined(HAVE_LIBNL) */ int virNetDevSetIPAddress(const char *ifname, @@ -1424,6 +1524,15 @@ int virNetDevClearIPAddress(const char *ifname, return ret; } +/* return after DAD finishes for all known IPv6 addresses or an error */ +int +virNetDevWaitDadFinish(virSocketAddrPtr *addrs, size_t count) +{ + virReportSystemError(ENOSYS, "%s", + _("Unable to wait for IPv6 DAD on this platform")); + return -1; +} + #endif /* defined(__linux__) && defined(HAVE_LIBNL) */ /** diff --git a/src/util/virnetdev.h b/src/util/virnetdev.h index a27504b..9108be6 100644 --- a/src/util/virnetdev.h +++ b/src/util/virnetdev.h @@ -105,6 +105,8 @@ int virNetDevClearIPAddress(const char *ifname, ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) ATTRIBUTE_RETURN_CHECK; int virNetDevGetIPAddress(const char *ifname, virSocketAddrPtr addr) ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) ATTRIBUTE_RETURN_CHECK; +int virNetDevWaitDadFinish(virSocketAddrPtr *addrs, size_t count) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2); int virNetDevSetMAC(const char *ifname, -- 1.8.3.1

On 10/20/2015 11:44 AM, Maxim Perevedentsev wrote:
This is a fix for commit db488c79173b240459c7754f38c3c6af9b432970 dnsmasq main process exits without waiting for DAD, this is dnsmasq daemon's task. So we periodically poll the kernel using netlink and check whether there are any IPv6 addresses assigned to bridge which have 'tentative' state. After DAD is finished, execution continues. I guess that is what dnsmasq was assumed to do.
ACK and pushed. There are a few comments below of some minor things I tweaked before pushing. Thanks for persevering with this patch! I reworded the commit message a bit (get the latest from git to see the difference).
---
Difference to v1: fixed syntax (comments and alignment). Difference to v2: moved to virnetdev. Difference to v3: added timeout.
src/libvirt_private.syms | 1 + src/network/bridge_driver.c | 33 +++++++++++++- src/util/virnetdev.c | 109 ++++++++++++++++++++++++++++++++++++++++++++ src/util/virnetdev.h | 2 + 4 files changed, 144 insertions(+), 1 deletion(-)
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index be6ee19..578c6fe 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1792,6 +1792,7 @@ virNetDevSetRcvMulti; virNetDevSetupControl; virNetDevSysfsFile; virNetDevValidateConfig; +virNetDevWaitDadFinish;
# util/virnetdevbandwidth.h diff --git a/src/network/bridge_driver.c b/src/network/bridge_driver.c index cd9a51e..63efffa 100644 --- a/src/network/bridge_driver.c +++ b/src/network/bridge_driver.c @@ -2026,6 +2026,31 @@ networkAddRouteToBridge(virNetworkObjPtr network, }
static int +networkWaitDadFinish(virNetworkObjPtr network) +{ + virNetworkIpDefPtr ipdef; + virSocketAddrPtr *addrs = NULL, addr = NULL; + size_t naddrs = 0; + int ret = -1; + + VIR_DEBUG("Started waiting for IPv6 DAD");
I added the name of the network to this debug message
+ + while ((ipdef = virNetworkDefGetIpByIndex( + network->def, AF_INET6, naddrs))) { + addr = &ipdef->address; + if (VIR_APPEND_ELEMENT_COPY(addrs, naddrs, addr) < 0) + goto cleanup; + } + + ret = (naddrs == 0) ? 0 : virNetDevWaitDadFinish(addrs, naddrs); + + cleanup: + VIR_FREE(addrs); + VIR_DEBUG("IPv6 DAD finished with status %d", ret);
Also added network name to this debug log message.
+ return ret; +} + +static int networkStartNetworkVirtual(virNetworkDriverStatePtr driver, virNetworkObjPtr network) { @@ -2159,7 +2184,13 @@ networkStartNetworkVirtual(virNetworkDriverStatePtr driver, if (v6present && networkStartRadvd(driver, network) < 0) goto err4;
- /* DAD has happened (dnsmasq waits for it), dnsmasq is now bound to the + /* dnsmasq main process does not wait for DAD to complete, + * so we need to wait for it ourselves. + */ + if (v6present && networkWaitDadFinish(network) < 0) + goto err4; + + /* DAD has happened, dnsmasq is now bound to the * bridge's IPv6 address, so we can now set the dummy tun down. */ if (tapfd >= 0) { diff --git a/src/util/virnetdev.c b/src/util/virnetdev.c index 579ff3b..6b6bba0 100644 --- a/src/util/virnetdev.c +++ b/src/util/virnetdev.c @@ -96,6 +96,7 @@ VIR_LOG_INIT("util.netdev"); # define FEATURE_BIT_IS_SET(blocks, index, field) \ (FEATURE_WORD(blocks, index, field) & FEATURE_FIELD_FLAG(index)) #endif +# define DAD_WAIT_TIMEOUT 5 /* seconds */
I changed this name to VIR_DAD_WAIT_TIMEOUT to avoid any potential future clash with someone else's name.
typedef enum { VIR_MCAST_TYPE_INDEX_TOKEN, @@ -1305,6 +1306,105 @@ int virNetDevClearIPAddress(const char *ifname, return ret; }
+/* return whether there is a known address with 'tentative' flag set */ +static int
I changed this to return bool (and updated the comment accordingly)
+virNetDevParseDadStatus(struct nlmsghdr *nlh, int len, + virSocketAddrPtr *addrs, size_t count) +{ + struct ifaddrmsg *ifaddrmsg_ptr; + unsigned int ifaddrmsg_len; + struct rtattr *rtattr_ptr; + size_t i; + struct in6_addr *addr; + for (; NLMSG_OK(nlh, len); nlh = NLMSG_NEXT(nlh, len)) { + if (NLMSG_PAYLOAD(nlh, 0) < sizeof(struct ifaddrmsg)) { + /* Message without payload is the last one. */ + break; + } + + ifaddrmsg_ptr = (struct ifaddrmsg *)NLMSG_DATA(nlh); + if (!(ifaddrmsg_ptr->ifa_flags & IFA_F_TENTATIVE)) { + /* Not tentative: we are not interested in this entry. */ + continue; + } + + ifaddrmsg_len = IFA_PAYLOAD(nlh); + rtattr_ptr = (struct rtattr *) IFA_RTA(ifaddrmsg_ptr); + for (; RTA_OK(rtattr_ptr, ifaddrmsg_len); + rtattr_ptr = RTA_NEXT(rtattr_ptr, ifaddrmsg_len)) { + if (RTA_PAYLOAD(rtattr_ptr) != sizeof(struct in6_addr)) { + /* No address: ignore. */ + continue; + } + + /* We check only known addresses. */ + for (i = 0; i < count; i++) { + addr = &addrs[i]->data.inet6.sin6_addr; + if (!memcmp(addr, RTA_DATA(rtattr_ptr), + sizeof(struct in6_addr))) { + /* We found matching tentative address. */ + return 1; + } + } + } + } + return 0; +} + +/* return after DAD finishes for all known IPv6 addresses or an error */ +int +virNetDevWaitDadFinish(virSocketAddrPtr *addrs, size_t count) +{ + struct nl_msg *nlmsg = NULL; + struct ifaddrmsg ifa; + struct nlmsghdr *resp = NULL; + unsigned int recvbuflen; + int ret = -1, dad = 1;
changed dad to a bool.
+ time_t max_time = time(NULL) + DAD_WAIT_TIMEOUT; + + if (!(nlmsg = nlmsg_alloc_simple(RTM_GETADDR, + NLM_F_REQUEST | NLM_F_DUMP))) { + virReportOOMError(); + return -1; + } + + memset(&ifa, 0, sizeof(ifa)); + /* DAD is for IPv6 adresses only. */ + ifa.ifa_family = AF_INET6; + if (nlmsg_append(nlmsg, &ifa, sizeof(ifa), NLMSG_ALIGNTO) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("allocated netlink buffer is too small")); + goto cleanup; + } + + /* Periodically query netlink until DAD finishes on all known addresses. */ + while (dad && time(NULL) < max_time) { + if (virNetlinkCommand(nlmsg, &resp, &recvbuflen, 0, 0, + NETLINK_ROUTE, 0) < 0) + goto cleanup; + + if (virNetlinkGetErrorCode(resp, recvbuflen) < 0) { + virReportError(VIR_ERR_SYSTEM_ERROR, "%s", + _("error reading DAD state information")); + goto cleanup; + } + + /* Parse response. */ + dad = virNetDevParseDadStatus(resp, recvbuflen, addrs, count); + if (dad) + usleep(1000 * 10); + + VIR_FREE(resp); + } + /* Check timeout. */ + ret = dad ? -1 : 0; + + cleanup: + VIR_FREE(resp); + nlmsg_free(nlmsg); + return ret; +} + #else /* defined(__linux__) && defined(HAVE_LIBNL) */
int virNetDevSetIPAddress(const char *ifname, @@ -1424,6 +1524,15 @@ int virNetDevClearIPAddress(const char *ifname, return ret; }
+/* return after DAD finishes for all known IPv6 addresses or an error */ +int +virNetDevWaitDadFinish(virSocketAddrPtr *addrs, size_t count) +{ + virReportSystemError(ENOSYS, "%s", + _("Unable to wait for IPv6 DAD on this platform")); + return -1;
I was wondering if it might be better to have this do a VIR_INFO and return 0, but decided that currently the only caller (bridge_driver.c) is only compiled on Linux anyway, and if somebody starts building that on non-Linux in the future, they would probably prefer to get an error rather than a warning.
+} + #endif /* defined(__linux__) && defined(HAVE_LIBNL) */
/** diff --git a/src/util/virnetdev.h b/src/util/virnetdev.h index a27504b..9108be6 100644 --- a/src/util/virnetdev.h +++ b/src/util/virnetdev.h @@ -105,6 +105,8 @@ int virNetDevClearIPAddress(const char *ifname, ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) ATTRIBUTE_RETURN_CHECK; int virNetDevGetIPAddress(const char *ifname, virSocketAddrPtr addr) ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) ATTRIBUTE_RETURN_CHECK; +int virNetDevWaitDadFinish(virSocketAddrPtr *addrs, size_t count) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2);
int virNetDevSetMAC(const char *ifname, -- 1.8.3.1
-- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list

Such messages do not have NLMSG_ERROR or NLMSG_DONE type but they are valid responses. We test 'multi-partness' by looking for NLM_F_MULTI flag. --- Difference to v1: fixed comment style. src/util/virnetlink.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/util/virnetlink.c b/src/util/virnetlink.c index 0276522..679b48e 100644 --- a/src/util/virnetlink.c +++ b/src/util/virnetlink.c @@ -394,7 +394,9 @@ virNetlinkGetErrorCode(struct nlmsghdr *resp, unsigned int recvbuflen) break; default: - goto malformed_resp; + /* We allow multipart messages. */ + if (!(resp->nlmsg_flags & NLM_F_MULTI)) + goto malformed_resp; } return result; -- 1.8.3.1

On 10/20/2015 11:44 AM, Maxim Perevedentsev wrote:
Such messages do not have NLMSG_ERROR or NLMSG_DONE type but they are valid responses. We test 'multi-partness' by looking for NLM_F_MULTI flag.
ACK. As I understand, this is required for the other patch to work, so I pushed it first.
--- Difference to v1: fixed comment style.
src/util/virnetlink.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/src/util/virnetlink.c b/src/util/virnetlink.c index 0276522..679b48e 100644 --- a/src/util/virnetlink.c +++ b/src/util/virnetlink.c @@ -394,7 +394,9 @@ virNetlinkGetErrorCode(struct nlmsghdr *resp, unsigned int recvbuflen) break;
default: - goto malformed_resp; + /* We allow multipart messages. */ + if (!(resp->nlmsg_flags & NLM_F_MULTI)) + goto malformed_resp; }
return result; -- 1.8.3.1
-- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
participants (2)
-
Laine Stump
-
Maxim Perevedentsev