[libvirt] [PATCHv3 0/2] Added waiting for DAD to finish for bridge address.

This is a fix for commit db488c79173b240459c7754f38c3c6af9b432970 dnsmasq main process which is relied on when waiting for DAD to complete exits without actually waiting for DAD. This is dnsmasq daemon's task. It seems to be a race that DAD finished before dnsmasq main process exited. The above commit needs the execution to block until DAD finishes for bridge IPv6 address because then it closes dummy tap device. Thus we need to ensure this ourselves. So we periodically poll the kernel using netlink and check whether there are any IPv6 addresses assigned to bridge which have 'tentative' state. After DAD is finished, execution continues. I guess that is what dnsmasq was assumed to do. We use netlink to dump information about existing IPv6 addresses. Netlink's response is a multi-part message. Unfortunately, the current implementation of virNetlink treats such messages as faulty and throws an error. So the patch 2/2 adds multi-part nelink response support. Update v2: fixed syntax. Update v3: moved to virnetdev. Resend: These patches were ignored and buried long ago :-( Maxim Perevedentsev (2): network: added waiting for DAD to finish for bridge address. netlink: add support for multi-part netlink messages. src/libvirt_private.syms | 1 + src/network/bridge_driver.c | 35 +++++++++- src/util/virnetdev.c | 160 ++++++++++++++++++++++++++++++++++++++++++++ src/util/virnetdev.h | 2 + src/util/virnetlink.c | 4 +- 5 files changed, 200 insertions(+), 2 deletions(-) -- 1.8.3.1

This is a fix for commit db488c79173b240459c7754f38c3c6af9b432970 dnsmasq main process exits without waiting for DAD, this is dnsmasq daemon's task. So we periodically poll the kernel using netlink and check whether there are any IPv6 addresses assigned to bridge which have 'tentative' state. After DAD is finished, execution continues. I guess that is what dnsmasq was assumed to do. --- Difference to v1: fixed syntax (comments and alignment). Difference to v2: moved to virnetdev. src/libvirt_private.syms | 1 + src/network/bridge_driver.c | 35 +++++++++- src/util/virnetdev.c | 160 ++++++++++++++++++++++++++++++++++++++++++++ src/util/virnetdev.h | 2 + 4 files changed, 197 insertions(+), 1 deletion(-) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index c87efa1..514560b 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1791,6 +1791,7 @@ virNetDevSetRcvMulti; virNetDevSetupControl; virNetDevSysfsFile; virNetDevValidateConfig; +virNetDevWaitDadFinish; # util/virnetdevbandwidth.h diff --git a/src/network/bridge_driver.c b/src/network/bridge_driver.c index c343e5b..0a93678 100644 --- a/src/network/bridge_driver.c +++ b/src/network/bridge_driver.c @@ -2026,6 +2026,33 @@ networkAddRouteToBridge(virNetworkObjPtr network, } static int +networkWaitDadFinish(virNetworkObjPtr network) +{ + virNetworkIpDefPtr ipdef; + virSocketAddrPtr *addrs = NULL; + size_t i; + int ret; + for (i = 0; + (ipdef = virNetworkDefGetIpByIndex(network->def, AF_INET6, i)); + i++) {} + + if (i == 0) + return 0; + if (VIR_ALLOC_N(addrs, i)) + return -1; + + for (i = 0; + (ipdef = virNetworkDefGetIpByIndex(network->def, AF_INET6, i)); + i++) { + addrs[i] = &ipdef->address; + } + + ret = virNetDevWaitDadFinish(addrs, i); + VIR_FREE(addrs); + return ret; +} + +static int networkStartNetworkVirtual(virNetworkDriverStatePtr driver, virNetworkObjPtr network) { @@ -2159,7 +2186,13 @@ networkStartNetworkVirtual(virNetworkDriverStatePtr driver, if (v6present && networkStartRadvd(driver, network) < 0) goto err4; - /* DAD has happened (dnsmasq waits for it), dnsmasq is now bound to the + /* dnsmasq main process does not wait for DAD to complete, + * so we need to wait for it ourselves. + */ + if (v6present && networkWaitDadFinish(network) < 0) + goto err4; + + /* DAD has happened, dnsmasq is now bound to the * bridge's IPv6 address, so we can now set the dummy tun down. */ if (tapfd >= 0) { diff --git a/src/util/virnetdev.c b/src/util/virnetdev.c index 579ff3b..3d77b37 100644 --- a/src/util/virnetdev.c +++ b/src/util/virnetdev.c @@ -96,6 +96,7 @@ VIR_LOG_INIT("util.netdev"); # define FEATURE_BIT_IS_SET(blocks, index, field) \ (FEATURE_WORD(blocks, index, field) & FEATURE_FIELD_FLAG(index)) #endif +# define IP_BUF_SIZE 4096 typedef enum { VIR_MCAST_TYPE_INDEX_TOKEN, @@ -1305,6 +1306,103 @@ int virNetDevClearIPAddress(const char *ifname, return ret; } +/* return whether there is a known address with 'tentative' flag set */ +static int +virNetDevParseDadStatus(struct nlmsghdr *nlh, int len, + virSocketAddrPtr *addrs, size_t count) +{ + struct ifaddrmsg *ifaddrmsg_ptr; + unsigned int ifaddrmsg_len; + struct rtattr *rtattr_ptr; + size_t i; + struct in6_addr *addr; + for (; NLMSG_OK(nlh, len); nlh = NLMSG_NEXT(nlh, len)) { + if (NLMSG_PAYLOAD(nlh, 0) < sizeof(struct ifaddrmsg)) { + /* Message without payload is the last one. */ + break; + } + + ifaddrmsg_ptr = (struct ifaddrmsg *)NLMSG_DATA(nlh); + if (!(ifaddrmsg_ptr->ifa_flags & IFA_F_TENTATIVE)) { + /* Not tentative: we are not interested in this entry. */ + continue; + } + + ifaddrmsg_len = IFA_PAYLOAD(nlh); + rtattr_ptr = (struct rtattr *) IFA_RTA(ifaddrmsg_ptr); + for (; RTA_OK(rtattr_ptr, ifaddrmsg_len); + rtattr_ptr = RTA_NEXT(rtattr_ptr, ifaddrmsg_len)) { + if (RTA_PAYLOAD(rtattr_ptr) != sizeof(struct in6_addr)) { + /* No address: ignore. */ + continue; + } + + /* We check only known addresses. */ + for (i = 0; i < count; i++) { + addr = &addrs[i]->data.inet6.sin6_addr; + if (!memcmp(addr, RTA_DATA(rtattr_ptr), + sizeof(struct in6_addr))) { + /* We found matching tentative address. */ + return 1; + } + } + } + } + return 0; +} + +/* return after DAD finishes for all known IPv6 addresses or an error */ +int +virNetDevWaitDadFinish(virSocketAddrPtr *addrs, size_t count) +{ + struct nl_msg *nlmsg = NULL; + struct ifaddrmsg ifa; + struct nlmsghdr *resp = NULL; + unsigned int recvbuflen; + int ret = -1, dad = 1; + + if (!(nlmsg = nlmsg_alloc_simple(RTM_GETADDR, + NLM_F_REQUEST | NLM_F_DUMP))) { + virReportOOMError(); + return -1; + } + + memset(&ifa, 0, sizeof(ifa)); + /* DAD is for IPv6 adresses only. */ + ifa.ifa_family = AF_INET6; + if (nlmsg_append(nlmsg, &ifa, sizeof(ifa), NLMSG_ALIGNTO) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("allocated netlink buffer is too small")); + goto cleanup; + } + + /* Periodically query netlink until DAD finishes on all known addresses. */ + while (dad) { + if (virNetlinkCommand(nlmsg, &resp, &recvbuflen, 0, 0, + NETLINK_ROUTE, 0) < 0) + goto cleanup; + + if (virNetlinkGetErrorCode(resp, recvbuflen) < 0) { + virReportError(VIR_ERR_SYSTEM_ERROR, "%s", + _("error reading DAD state information")); + goto cleanup; + } + + /* Parse response. */ + dad = virNetDevParseDadStatus(resp, recvbuflen, addrs, count); + if (dad) + usleep(1000 * 10); + + VIR_FREE(resp); + } + ret = 0; + + cleanup: + VIR_FREE(resp); + nlmsg_free(nlmsg); + return ret; +} + #else /* defined(__linux__) && defined(HAVE_LIBNL) */ int virNetDevSetIPAddress(const char *ifname, @@ -1424,6 +1522,68 @@ int virNetDevClearIPAddress(const char *ifname, return ret; } +/* return whether there is a known address with 'tentative' flag set */ +static int +virNetDevParseDadStatus(char *outbuf, + virSocketAddrPtr *addrs, size_t count) +{ + virSocketAddr sockaddr; + size_t i, j; + int ret = 0; + char **addr_strings = virStringSplit(outbuf, "\n", 0); + for (j = 0; addr_strings[j] != NULL; ++j) { + if (virSocketParseAddr(strings[j], &sockaddr, AF_INET6) < 0) + continue; + for (i = 0; i < count; i++) { + if (virSocketAddrEqual(addrs[i], &sockaddr)) { + ret = 1; + goto cleanup; + } + } + } + + cleanup: + virStringFreeList(addr_strings); + return ret; +} + +/* return after DAD finishes for all known IPv6 addresses or an error */ +int +virNetDevWaitDadFinish(virSocketAddrPtr *addrs, size_t count) +{ + int ret = -1, dad = 1; + char *outbuf = NULL; + virCommandPtr cmd = NULL; + +# ifdef IP_PATH + while (dad) { + cmd = virCommandNew(POSIX_SHELL); + virCommandAddArgList(cmd, "-c", + IP_PATH "-6 addr show tentative | \ + awk '/inet6/{split(\\$2, a, \\\"/\\\"); \ + print a[1]}'", NULL); + virCommandSetOutputBuffer(cmd, &outbuf); + if (virCommandRun(cmd, NULL)) + goto cleanup; + virCommandFree(cmd); + + dad = virNetDevParseDadStatus(outbuf, addrs, count); + if (dad) + usleep(1000 * 10); + + VIR_FREE(outbuf); + } + ret = 0; +# else + virReportSystemError(ENOSYS, "%s", _("Unable to check IPv6 DAD")); +# endif + + cleanup: + virCommandFree(cmd); + VIR_FREE(outbuf); + return ret; +} + #endif /* defined(__linux__) && defined(HAVE_LIBNL) */ /** diff --git a/src/util/virnetdev.h b/src/util/virnetdev.h index a27504b..9108be6 100644 --- a/src/util/virnetdev.h +++ b/src/util/virnetdev.h @@ -105,6 +105,8 @@ int virNetDevClearIPAddress(const char *ifname, ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) ATTRIBUTE_RETURN_CHECK; int virNetDevGetIPAddress(const char *ifname, virSocketAddrPtr addr) ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) ATTRIBUTE_RETURN_CHECK; +int virNetDevWaitDadFinish(virSocketAddrPtr *addrs, size_t count) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2); int virNetDevSetMAC(const char *ifname, -- 1.8.3.1

Sorry I keep delaying replying about this patch. Whenever I start, I run into a problem with it, then while I'm trying to decide how to respond I get distracted by something else and forget to come back. On 09/29/2015 06:16 AM, Maxim Perevedentsev wrote:
This is a fix for commit db488c79173b240459c7754f38c3c6af9b432970 dnsmasq main process exits without waiting for DAD, this is dnsmasq daemon's task. So we periodically poll the kernel using netlink and check whether there are any IPv6 addresses assigned to bridge which have 'tentative' state. After DAD is finished, execution continues. I guess that is what dnsmasq was assumed to do.
Correct. The author of that patch assumed that DAD would be finished by the time dnsmasq daemonized, but as we learned from Simon Kelley on the dnsmasq mailing list, this isn't the case. We can go ahead and state this as fact in the commit message.
---
Difference to v1: fixed syntax (comments and alignment). Difference to v2: moved to virnetdev.
src/libvirt_private.syms | 1 + src/network/bridge_driver.c | 35 +++++++++- src/util/virnetdev.c | 160 ++++++++++++++++++++++++++++++++++++++++++++ src/util/virnetdev.h | 2 + 4 files changed, 197 insertions(+), 1 deletion(-)
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index c87efa1..514560b 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1791,6 +1791,7 @@ virNetDevSetRcvMulti; virNetDevSetupControl; virNetDevSysfsFile; virNetDevValidateConfig; +virNetDevWaitDadFinish;
# util/virnetdevbandwidth.h diff --git a/src/network/bridge_driver.c b/src/network/bridge_driver.c index c343e5b..0a93678 100644 --- a/src/network/bridge_driver.c +++ b/src/network/bridge_driver.c @@ -2026,6 +2026,33 @@ networkAddRouteToBridge(virNetworkObjPtr network, }
static int +networkWaitDadFinish(virNetworkObjPtr network) +{ + virNetworkIpDefPtr ipdef; + virSocketAddrPtr *addrs = NULL; + size_t i; + int ret; + for (i = 0; + (ipdef = virNetworkDefGetIpByIndex(network->def, AF_INET6, i)); + i++) {} + + if (i == 0) + return 0; + if (VIR_ALLOC_N(addrs, i)) + return -1; + + for (i = 0; + (ipdef = virNetworkDefGetIpByIndex(network->def, AF_INET6, i)); + i++) { + addrs[i] = &ipdef->address; + }
This code could be much more compact (and only very slightly less efficient) by using something like: virNetworkIpDefPtr ipdef; virSocketAddrPtr addrs = NULL; size_t i, naddrs = 0; for (i = 0; (ipdef == virNetworkDefGetIpByIndex(...); i++) { if (VIR_APPEND_ELEMENT_COPY(addrs, naddrs, ipdef->address) < 0) return -1; } if (naddrs == 0) return 0; addrs would then be a pointer to an array of addresses rather than a pointer to an array of pointers to addresses, so the lower functions would need to be adjusted accordingly. (NB: due to the switch from stuff** to stuff*, there would be fewer mallocs than your existing code, although at each malloc you could potentially incur the penalty of a copy of all existing elements. For the size of array we're talking about though, this is inconsequential (especially since any good malloc implementation will carve out memory in larger chunks, and so will not need to do a copy each time the region is realloced).
+ + ret = virNetDevWaitDadFinish(addrs, i);
s/i/naddrs/ after the above change.
+ VIR_FREE(addrs);
The way you have this now, the above leaks the memory pointed to by each element in the array. (once you switch from virSocketAddrPtr* to virSocketAddr* this would no longer be a problem).
+ return ret; +} + +static int networkStartNetworkVirtual(virNetworkDriverStatePtr driver, virNetworkObjPtr network) { @@ -2159,7 +2186,13 @@ networkStartNetworkVirtual(virNetworkDriverStatePtr driver, if (v6present && networkStartRadvd(driver, network) < 0) goto err4;
- /* DAD has happened (dnsmasq waits for it), dnsmasq is now bound to the + /* dnsmasq main process does not wait for DAD to complete, + * so we need to wait for it ourselves. + */ + if (v6present && networkWaitDadFinish(network) < 0) + goto err4; + + /* DAD has happened, dnsmasq is now bound to the * bridge's IPv6 address, so we can now set the dummy tun down. */ if (tapfd >= 0) { diff --git a/src/util/virnetdev.c b/src/util/virnetdev.c index 579ff3b..3d77b37 100644 --- a/src/util/virnetdev.c +++ b/src/util/virnetdev.c @@ -96,6 +96,7 @@ VIR_LOG_INIT("util.netdev"); # define FEATURE_BIT_IS_SET(blocks, index, field) \ (FEATURE_WORD(blocks, index, field) & FEATURE_FIELD_FLAG(index)) #endif +# define IP_BUF_SIZE 4096
typedef enum { VIR_MCAST_TYPE_INDEX_TOKEN, @@ -1305,6 +1306,103 @@ int virNetDevClearIPAddress(const char *ifname, return ret; }
+/* return whether there is a known address with 'tentative' flag set */ +static int +virNetDevParseDadStatus(struct nlmsghdr *nlh, int len, + virSocketAddrPtr *addrs, size_t count) +{ + struct ifaddrmsg *ifaddrmsg_ptr; + unsigned int ifaddrmsg_len; + struct rtattr *rtattr_ptr; + size_t i; + struct in6_addr *addr; + for (; NLMSG_OK(nlh, len); nlh = NLMSG_NEXT(nlh, len)) { + if (NLMSG_PAYLOAD(nlh, 0) < sizeof(struct ifaddrmsg)) { + /* Message without payload is the last one. */ + break; + } + + ifaddrmsg_ptr = (struct ifaddrmsg *)NLMSG_DATA(nlh); + if (!(ifaddrmsg_ptr->ifa_flags & IFA_F_TENTATIVE)) { + /* Not tentative: we are not interested in this entry. */ + continue; + } + + ifaddrmsg_len = IFA_PAYLOAD(nlh); + rtattr_ptr = (struct rtattr *) IFA_RTA(ifaddrmsg_ptr); + for (; RTA_OK(rtattr_ptr, ifaddrmsg_len); + rtattr_ptr = RTA_NEXT(rtattr_ptr, ifaddrmsg_len)) { + if (RTA_PAYLOAD(rtattr_ptr) != sizeof(struct in6_addr)) { + /* No address: ignore. */ + continue; + } + + /* We check only known addresses. */ + for (i = 0; i < count; i++) { + addr = &addrs[i]->data.inet6.sin6_addr; + if (!memcmp(addr, RTA_DATA(rtattr_ptr), + sizeof(struct in6_addr))) { + /* We found matching tentative address. */ + return 1; + } + } + } + } + return 0; +} + +/* return after DAD finishes for all known IPv6 addresses or an error */ +int +virNetDevWaitDadFinish(virSocketAddrPtr *addrs, size_t count) +{ + struct nl_msg *nlmsg = NULL; + struct ifaddrmsg ifa; + struct nlmsghdr *resp = NULL; + unsigned int recvbuflen; + int ret = -1, dad = 1; + + if (!(nlmsg = nlmsg_alloc_simple(RTM_GETADDR, + NLM_F_REQUEST | NLM_F_DUMP))) { + virReportOOMError(); + return -1; + } + + memset(&ifa, 0, sizeof(ifa)); + /* DAD is for IPv6 adresses only. */ + ifa.ifa_family = AF_INET6; + if (nlmsg_append(nlmsg, &ifa, sizeof(ifa), NLMSG_ALIGNTO) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("allocated netlink buffer is too small")); + goto cleanup; + } + + /* Periodically query netlink until DAD finishes on all known addresses. */ + while (dad) { + if (virNetlinkCommand(nlmsg, &resp, &recvbuflen, 0, 0, + NETLINK_ROUTE, 0) < 0) + goto cleanup; + + if (virNetlinkGetErrorCode(resp, recvbuflen) < 0) { + virReportError(VIR_ERR_SYSTEM_ERROR, "%s", + _("error reading DAD state information")); + goto cleanup; + } + + /* Parse response. */ + dad = virNetDevParseDadStatus(resp, recvbuflen, addrs, count); + if (dad) + usleep(1000 * 10); + + VIR_FREE(resp); + }
This loop *really* bothers me, because there is no failsafe to terminate it if we never get positive notification that DAD has completed. This would lock up the network driver startup, which would lock up libvirtd startup. I think we need to decide on what is the maximum time this could possibly take to complete (maybe it is somehow based on the number of interfaces? or maybe it doesn't matter...) and timeout from the loop after the appropriate iterations.
+ ret = 0; + + cleanup: + VIR_FREE(resp); + nlmsg_free(nlmsg); + return ret; +} + #else /* defined(__linux__) && defined(HAVE_LIBNL) */
int virNetDevSetIPAddress(const char *ifname, @@ -1424,6 +1522,68 @@ int virNetDevClearIPAddress(const char *ifname, return ret; }
+/* return whether there is a known address with 'tentative' flag set */ +static int +virNetDevParseDadStatus(char *outbuf, + virSocketAddrPtr *addrs, size_t count) +{ + virSocketAddr sockaddr; + size_t i, j; + int ret = 0; + char **addr_strings = virStringSplit(outbuf, "\n", 0); + for (j = 0; addr_strings[j] != NULL; ++j) { + if (virSocketParseAddr(strings[j], &sockaddr, AF_INET6) < 0) + continue; + for (i = 0; i < count; i++) { + if (virSocketAddrEqual(addrs[i], &sockaddr)) { + ret = 1; + goto cleanup; + } + } + } + + cleanup: + virStringFreeList(addr_strings); + return ret; +} + +/* return after DAD finishes for all known IPv6 addresses or an error */ +int +virNetDevWaitDadFinish(virSocketAddrPtr *addrs, size_t count) +{ + int ret = -1, dad = 1; + char *outbuf = NULL; + virCommandPtr cmd = NULL; + +# ifdef IP_PATH + while (dad) { + cmd = virCommandNew(POSIX_SHELL); + virCommandAddArgList(cmd, "-c", + IP_PATH "-6 addr show tentative | \ + awk '/inet6/{split(\\$2, a, \\\"/\\\"); \ + print a[1]}'", NULL); + virCommandSetOutputBuffer(cmd, &outbuf); + if (virCommandRun(cmd, NULL)) + goto cleanup; + virCommandFree(cmd); + + dad = virNetDevParseDadStatus(outbuf, addrs, count); + if (dad) + usleep(1000 * 10); + + VIR_FREE(outbuf); + } + ret = 0; +# else + virReportSystemError(ENOSYS, "%s", _("Unable to check IPv6 DAD")); +# endif
Is the "ip" command available on non-linux platforms? I know there is *a lot* of netlink stuff in there). Aside from that, POSIX_SHELL isn't even defined, so this code would fail to compile on non-Linux platforms with IP_PATH defined (and it would fail to compile on non-Linux with IP_PATH *not* defined, since addrs and count would then need ATTRIBUTE_UNUSED). Anyway we try to avoid execing external binaries unless absolutely necessary, so I think it's reasonable to only have a stub function that says "not supported" in place of this (very polite but non-working) replacement. Once these 3 issues are dealt with, I think we can (finally) push this.
+ + cleanup: + virCommandFree(cmd); + VIR_FREE(outbuf); + return ret; +} + #endif /* defined(__linux__) && defined(HAVE_LIBNL) */
/** diff --git a/src/util/virnetdev.h b/src/util/virnetdev.h index a27504b..9108be6 100644 --- a/src/util/virnetdev.h +++ b/src/util/virnetdev.h @@ -105,6 +105,8 @@ int virNetDevClearIPAddress(const char *ifname, ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) ATTRIBUTE_RETURN_CHECK; int virNetDevGetIPAddress(const char *ifname, virSocketAddrPtr addr) ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) ATTRIBUTE_RETURN_CHECK; +int virNetDevWaitDadFinish(virSocketAddrPtr *addrs, size_t count) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2);
int virNetDevSetMAC(const char *ifname, -- 1.8.3.1
-- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list

On 10/15/2015 09:03 PM, Laine Stump wrote:
This loop *really* bothers me, because there is no failsafe to terminate it if we never get positive notification that DAD has completed. This would lock up the network driver startup, which would lock up libvirtd startup. I think we need to decide on what is the maximum time this could possibly take to complete (maybe it is somehow based on the number of interfaces? or maybe it doesn't matter...) and timeout from the loop after the appropriate iterations. I did not hear about general timeout for this operation. Maybe 5 min, for example? Is the "ip" command available on non-linux platforms? I know there is *a lot* of netlink stuff in there). Aside from that, POSIX_SHELL isn't even defined, so this code would fail to compile on non-Linux platforms with IP_PATH defined (and it would fail to compile on non-Linux with IP_PATH *not* defined, since addrs and count would then need ATTRIBUTE_UNUSED). Anyway we try to avoid execing external binaries unless absolutely necessary, so I think it's reasonable to only have a stub function that says "not supported" in place of this (very polite but non-working) replacement. Ok. I wanted to work-around the case if we do not have libnl installed. Once these 3 issues are dealt with, I think we can (finally) push this. I'll try to fix it ASAP. Thank you.
-- Your sincerely, Maxim Perevedentsev

On 10/16/2015 12:51 PM, Maxim Perevedentsev wrote:
On 10/15/2015 09:03 PM, Laine Stump wrote:
This loop *really* bothers me, because there is no failsafe to terminate it if we never get positive notification that DAD has completed. This would lock up the network driver startup, which would lock up libvirtd startup. I think we need to decide on what is the maximum time this could possibly take to complete (maybe it is somehow based on the number of interfaces? or maybe it doesn't matter...) and timeout from the loop after the appropriate iterations. I did not hear about general timeout for this operation. Maybe 5 min, for example?
5 minutes is a very long time when everything else is being held up. If the wait for each network is much beyond a couple seconds, we'll need to think about spawning off a thread to wait for DAD. (if you're curious where the single-threadedness is, look at virStateInitialize() - it calls the stateAutoStart() function for each subsystem driver in sequence, and within the stateAutoStart() for the network driver, each network with autostart set is called in sequence).
Is the "ip" command available on non-linux platforms? I know there is *a lot* of netlink stuff in there). Aside from that, POSIX_SHELL isn't even defined, so this code would fail to compile on non-Linux platforms with IP_PATH defined (and it would fail to compile on non-Linux with IP_PATH *not* defined, since addrs and count would then need ATTRIBUTE_UNUSED). Anyway we try to avoid execing external binaries unless absolutely necessary, so I think it's reasonable to only have a stub function that says "not supported" in place of this (very polite but non-working) replacement. Ok. I wanted to work-around the case if we do not have libnl installed.
There are other things that will fail if libnl isn't installed. Are you actually building libvirt without libnl enabled and having any amount of success? (since the only way we interact with netlink is via libnl, anything that requires netlink will not work, e.g. macvtap interfaces)
Once these 3 issues are dealt with, I think we can (finally) push this. I'll try to fix it ASAP. Thank you.

On 10/16/2015 08:13 PM, Laine Stump wrote:
On 10/16/2015 12:51 PM, Maxim Perevedentsev wrote:
On 10/15/2015 09:03 PM, Laine Stump wrote:
This loop *really* bothers me, because there is no failsafe to terminate it if we never get positive notification that DAD has completed. This would lock up the network driver startup, which would lock up libvirtd startup. I think we need to decide on what is the maximum time this could possibly take to complete (maybe it is somehow based on the number of interfaces? or maybe it doesn't matter...) and timeout from the loop after the appropriate iterations. I did not hear about general timeout for this operation. Maybe 5 min, for example?
5 minutes is a very long time when everything else is being held up. If the wait for each network is much beyond a couple seconds, we'll need to think about spawning off a thread to wait for DAD.
(if you're curious where the single-threadedness is, look at virStateInitialize() - it calls the stateAutoStart() function for each subsystem driver in sequence, and within the stateAutoStart() for the network driver, each network with autostart set is called in sequence). I looked at kernel sources. It looks like DAD time is configurable: rand() % net.ipv6.conf.default.router_solicitation_delay [1s] + net.ipv6.conf.default.dad_transmits [1] * net.ipv6.neigh.default.retrans_time_ms [1000ms]
By default on my machine, it gives 0s + 1 * 1000ms = 1s maximum. Kernel keeps track of DAD success (on timeout) / fail. We could take these variables from sysctl, but this is cumbersome. I suggest taking 5s as maximum timeout and hope this is not too long. Yes, I *really* do not want to write this thread-related stuff :)
Is the "ip" command available on non-linux platforms? I know there is *a lot* of netlink stuff in there). Aside from that, POSIX_SHELL isn't even defined, so this code would fail to compile on non-Linux platforms with IP_PATH defined (and it would fail to compile on non-Linux with IP_PATH *not* defined, since addrs and count would then need ATTRIBUTE_UNUSED). Anyway we try to avoid execing external binaries unless absolutely necessary, so I think it's reasonable to only have a stub function that says "not supported" in place of this (very polite but non-working) replacement. Ok. I wanted to work-around the case if we do not have libnl installed.
There are other things that will fail if libnl isn't installed. Are you actually building libvirt without libnl enabled and having any amount of success? (since the only way we interact with netlink is via libnl, anything that requires netlink will not work, e.g. macvtap interfaces)
No, I don't. Thank you for explanation. -- Your sincerely, Maxim Perevedentsev

On 10/19/2015 08:36 AM, Maxim Perevedentsev wrote:
On 10/16/2015 08:13 PM, Laine Stump wrote:
On 10/16/2015 12:51 PM, Maxim Perevedentsev wrote:
On 10/15/2015 09:03 PM, Laine Stump wrote:
This loop *really* bothers me, because there is no failsafe to terminate it if we never get positive notification that DAD has completed. This would lock up the network driver startup, which would lock up libvirtd startup. I think we need to decide on what is the maximum time this could possibly take to complete (maybe it is somehow based on the number of interfaces? or maybe it doesn't matter...) and timeout from the loop after the appropriate iterations. I did not hear about general timeout for this operation. Maybe 5 min, for example?
5 minutes is a very long time when everything else is being held up. If the wait for each network is much beyond a couple seconds, we'll need to think about spawning off a thread to wait for DAD.
(if you're curious where the single-threadedness is, look at virStateInitialize() - it calls the stateAutoStart() function for each subsystem driver in sequence, and within the stateAutoStart() for the network driver, each network with autostart set is called in sequence). I looked at kernel sources. It looks like DAD time is configurable: rand() % net.ipv6.conf.default.router_solicitation_delay [1s] + net.ipv6.conf.default.dad_transmits [1] * net.ipv6.neigh.default.retrans_time_ms [1000ms]
By default on my machine, it gives 0s + 1 * 1000ms = 1s maximum. Kernel keeps track of DAD success (on timeout) / fail. We could take these variables from sysctl, but this is cumbersome. I suggest taking 5s as maximum timeout and hope this is not too long.
Yes, I *really* do not want to write this thread-related stuff :)
I agree. I think it would be okay to put a 5s timeout on the loop. We should probably have a VIR_DEBUG before we start waiting for DAD, and another when we're finished, so that someone experiencing long delays in startup will be able to determine the reason.

Some more comments. On 10/15/2015 09:03 PM, Laine Stump wrote:
static int +networkWaitDadFinish(virNetworkObjPtr network) +{ + virNetworkIpDefPtr ipdef; + virSocketAddrPtr *addrs = NULL; + size_t i; + int ret; + for (i = 0; + (ipdef = virNetworkDefGetIpByIndex(network->def, AF_INET6, i)); + i++) {} + + if (i == 0) + return 0; + if (VIR_ALLOC_N(addrs, i)) + return -1; + + for (i = 0; + (ipdef = virNetworkDefGetIpByIndex(network->def, AF_INET6, i)); + i++) { + addrs[i] = &ipdef->address; + }
This code could be much more compact (and only very slightly less efficient) by using something like:
virNetworkIpDefPtr ipdef; virSocketAddrPtr addrs = NULL; size_t i, naddrs = 0; for (i = 0; (ipdef == virNetworkDefGetIpByIndex(...); i++) { if (VIR_APPEND_ELEMENT_COPY(addrs, naddrs, ipdef->address) < 0) return -1; } if (naddrs == 0) return 0;
addrs would then be a pointer to an array of addresses rather than a pointer to an array of pointers to addresses, so the lower functions would need to be adjusted accordingly.
(NB: due to the switch from stuff** to stuff*, there would be fewer mallocs than your existing code, although at each malloc you could potentially incur the penalty of a copy of all existing elements. For the size of array we're talking about though, this is inconsequential (especially since any good malloc implementation will carve out memory in larger chunks, and so will not need to do a copy each time the region is realloced). The only malloc is allocating the array of pointers. I don't actually copy virSocketAddr's with in6_addr inside, I just store pointers to IPv6 addresses in external structures.
In your snippet, we copy virSocketAddr on each iteration and, possibly, everything copied as far on relocation. Isn't it better to copy pointers? virSocketAddrPtr *addrs = NULL, addr = NULL; for (i = 0; (ipdef == virNetworkDefGetIpByIndex(...); i++) { addr = &ipdef->address; if (VIR_APPEND_ELEMENT_COPY(addrs, naddrs, addr) < 0) return -1; } ...
+ VIR_FREE(addrs);
The way you have this now, the above leaks the memory pointed to by each element in the array. (once you switch from virSocketAddrPtr* to virSocketAddr* this would no longer be a problem). The addresses themselves are external structures. We do not want to destroy them, just remove pointers.
-- Your sincerely, Maxim Perevedentsev

On 10/19/2015 09:15 AM, Maxim Perevedentsev wrote:
Some more comments.
On 10/15/2015 09:03 PM, Laine Stump wrote:
static int +networkWaitDadFinish(virNetworkObjPtr network) +{ + virNetworkIpDefPtr ipdef; + virSocketAddrPtr *addrs = NULL; + size_t i; + int ret; + for (i = 0; + (ipdef = virNetworkDefGetIpByIndex(network->def, AF_INET6, i)); + i++) {} + + if (i == 0) + return 0; + if (VIR_ALLOC_N(addrs, i)) + return -1; + + for (i = 0; + (ipdef = virNetworkDefGetIpByIndex(network->def, AF_INET6, i)); + i++) { + addrs[i] = &ipdef->address; + }
This code could be much more compact (and only very slightly less efficient) by using something like:
virNetworkIpDefPtr ipdef; virSocketAddrPtr addrs = NULL; size_t i, naddrs = 0; for (i = 0; (ipdef == virNetworkDefGetIpByIndex(...); i++) { if (VIR_APPEND_ELEMENT_COPY(addrs, naddrs, ipdef->address) < 0) return -1; } if (naddrs == 0) return 0;
addrs would then be a pointer to an array of addresses rather than a pointer to an array of pointers to addresses, so the lower functions would need to be adjusted accordingly.
(NB: due to the switch from stuff** to stuff*, there would be fewer mallocs than your existing code, although at each malloc you could potentially incur the penalty of a copy of all existing elements. For the size of array we're talking about though, this is inconsequential (especially since any good malloc implementation will carve out memory in larger chunks, and so will not need to do a copy each time the region is realloced). The only malloc is allocating the array of pointers. I don't actually copy virSocketAddr's with in6_addr inside, I just store pointers to IPv6 addresses in external structures.
Derp. You're right. I wasn't thinking straight when I wrote that, just looking at what the data structure was.
In your snippet, we copy virSocketAddr on each iteration and, possibly, everything copied as far on relocation. Isn't it better to copy pointers?
I still prefer the simpler code though, because there are less lines for a future maintainer to understand, and it's not going to add much to execution time unless there are hundreds of IPv6 addresses set for a single network. As it seems unlikely there will be more than one or two, the extra code complexity doesn't seem worthwhile. (If we thought it would be common to have hundreds or thousands of IP addresses for each network, then I would rethink my position).
virSocketAddrPtr *addrs = NULL, addr = NULL;
for (i = 0; (ipdef == virNetworkDefGetIpByIndex(...); i++) { addr = &ipdef->address; if (VIR_APPEND_ELEMENT_COPY(addrs, naddrs, addr) < 0) return -1; } ...
+ VIR_FREE(addrs);
The way you have this now, the above leaks the memory pointed to by each element in the array. (once you switch from virSocketAddrPtr* to virSocketAddr* this would no longer be a problem). The addresses themselves are external structures. We do not want to destroy them, just remove pointers.
Yep. See above.

Such messages do not have NLMSG_ERROR or NLMSG_DONE type but they are valid responses. We test 'multi-partness' by looking for NLM_F_MULTI flag. --- Difference to v1: fixed comment style. src/util/virnetlink.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/util/virnetlink.c b/src/util/virnetlink.c index 0276522..679b48e 100644 --- a/src/util/virnetlink.c +++ b/src/util/virnetlink.c @@ -394,7 +394,9 @@ virNetlinkGetErrorCode(struct nlmsghdr *resp, unsigned int recvbuflen) break; default: - goto malformed_resp; + /* We allow multipart messages. */ + if (!(resp->nlmsg_flags & NLM_F_MULTI)) + goto malformed_resp; } return result; -- 1.8.3.1

On 09/29/2015 01:16 PM, Maxim Perevedentsev wrote:
This is a fix for commit db488c79173b240459c7754f38c3c6af9b432970 dnsmasq main process which is relied on when waiting for DAD to complete exits without actually waiting for DAD. This is dnsmasq daemon's task.
It seems to be a race that DAD finished before dnsmasq main process exited. The above commit needs the execution to block until DAD finishes for bridge IPv6 address because then it closes dummy tap device. Thus we need to ensure this ourselves.
So we periodically poll the kernel using netlink and check whether there are any IPv6 addresses assigned to bridge which have 'tentative' state. After DAD is finished, execution continues. I guess that is what dnsmasq was assumed to do.
We use netlink to dump information about existing IPv6 addresses. Netlink's response is a multi-part message. Unfortunately, the current implementation of virNetlink treats such messages as faulty and throws an error. So the patch 2/2 adds multi-part nelink response support.
Update v2: fixed syntax. Update v3: moved to virnetdev. Resend: These patches were ignored and buried long ago :-(
Maxim Perevedentsev (2): network: added waiting for DAD to finish for bridge address. netlink: add support for multi-part netlink messages.
Could someone, please, look at these patches? There was no objections against general idea on previous series, syntax looks good to me. Our autotests really catch this problem, and patch fixes it.
src/libvirt_private.syms | 1 + src/network/bridge_driver.c | 35 +++++++++- src/util/virnetdev.c | 160 ++++++++++++++++++++++++++++++++++++++++++++ src/util/virnetdev.h | 2 + src/util/virnetlink.c | 4 +- 5 files changed, 200 insertions(+), 2 deletions(-)
-- 1.8.3.1
-- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
participants (3)
-
Dmitry Guryanov
-
Laine Stump
-
Maxim Perevedentsev