[libvirt] [PATCH 0/3] qemu: handle connect hangs in virDomainOpenChannel

The problem is described in [1] in more details. From libvirt perspective we have whole driver became unresponsive if connect to channel unix socket hangs during virDomainOpenChannel. This series address this by first using job for the API call and second by introducing connect timeout so that domain itself don't became unresponsive indefinetily. [1] http://lists.nongnu.org/archive/html/qemu-devel/2017-11/msg03013.html Nikolay Shirokovskiy (3): util: virFDStreamConnectUNIX: add missing error report util: add 30s connection timeout to virFDStreamConnectUNIX qemu: use job to keep driver responsive on qemuDomainOpenChannel src/qemu/qemu_driver.c | 11 ++++++++++ src/util/virfdstream.c | 18 +++++++++++++++ src/util/virutil.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/util/virutil.h | 2 ++ 4 files changed, 90 insertions(+) -- 1.8.3.1

--- src/util/virfdstream.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/util/virfdstream.c b/src/util/virfdstream.c index be40379..1d064a7 100644 --- a/src/util/virfdstream.c +++ b/src/util/virfdstream.c @@ -1189,6 +1189,7 @@ int virFDStreamConnectUNIX(virStreamPtr st, continue; } + virReportSystemError(errno, "%s", _("Unable to connect to UNIX socket")); goto error; } -- 1.8.3.1

There is a bug in qemu currently on hanling agent channels. As a result if you open agent channel for guest without running an agent several times in raw you get hang on connect syscall eventually. Qemu will be fixed eventually but it is nice to handle hanging connect condition in libvirt. Let's add 30s connect timeout. --- src/util/virfdstream.c | 17 +++++++++++++++ src/util/virutil.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/util/virutil.h | 2 ++ 3 files changed, 78 insertions(+) diff --git a/src/util/virfdstream.c b/src/util/virfdstream.c index 1d064a7..029cc4e 100644 --- a/src/util/virfdstream.c +++ b/src/util/virfdstream.c @@ -1176,6 +1176,11 @@ int virFDStreamConnectUNIX(virStreamPtr st, goto error; } + if (virSetNonBlock(fd) < 0) { + virReportSystemError(errno, "%s", _("Unable to set non-blocking mode")); + goto error; + } + if (virTimeBackOffStart(&timeout, 1, 3*1000 /* ms */) < 0) goto error; while (virTimeBackOffWait(&timeout)) { @@ -1189,10 +1194,22 @@ int virFDStreamConnectUNIX(virStreamPtr st, continue; } + if (errno == EINPROGRESS || errno == EAGAIN) { + if (virConnectWait(fd, 30 * 1000) < 0) + goto error; + + break; + } + virReportSystemError(errno, "%s", _("Unable to connect to UNIX socket")); goto error; } + if (virSetBlocking(fd, true) < 0) { + virReportSystemError(errno, "%s", _("Unable to set blocking mode")); + goto error; + } + if (virFDStreamOpenInternal(st, fd, NULL, 0) < 0) goto error; return 0; diff --git a/src/util/virutil.c b/src/util/virutil.c index 8bdcb02..75abd4f 100644 --- a/src/util/virutil.c +++ b/src/util/virutil.c @@ -80,6 +80,7 @@ #include "nonblocking.h" #include "virprocess.h" #include "virstring.h" +#include "virtime.h" #include "virutil.h" verify(sizeof(gid_t) <= sizeof(unsigned int) && @@ -195,6 +196,64 @@ int virSetSockReuseAddr(int fd, bool fatal) #endif int +virConnectWait(int fd, unsigned long long timeout) +{ + struct pollfd fds[1]; + unsigned long long then; + unsigned long long now; + int rc; + int optval; + socklen_t optlen = sizeof(optval); + + fds[0].fd = fd; + fds[0].events = POLLOUT; + fds[0].revents = 0; + + if (virTimeMillisNow(&now) < 0) + return -1; + + then = now + timeout; + + retry: + rc = poll(fds, 1, then - now); + + if (rc < 0 && (errno == EAGAIN || errno == EINTR)) { + if (virTimeMillisNow(&now) < 0) + return -1; + + if (now >= then) { + virReportError(VIR_ERR_OPERATION_TIMEOUT, "%s", + _("Connection timeout expired")); + return -1; + } + + goto retry; + } + + if (rc < 0) { + virReportSystemError(errno, "%s", _("Unable to connect to socket")); + return -1; + } else if (rc == 0) { + virReportError(VIR_ERR_OPERATION_TIMEOUT, "%s", + _("Connection timeout expired")); + return -1; + } + + if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &optval, &optlen) < 0) { + virReportSystemError(errno, "%s", + _("Cannot check socket connection status")); + return -1; + } + + if (optval != 0) { + virReportSystemError(optval, "%s", _("Unable to connect to socket")); + return -1; + } + + return 0; +} + +int virPipeReadUntilEOF(int outfd, int errfd, char **outbuf, char **errbuf) { diff --git a/src/util/virutil.h b/src/util/virutil.h index ff89d1a..b0c9672 100644 --- a/src/util/virutil.h +++ b/src/util/virutil.h @@ -43,6 +43,8 @@ int virSetInherit(int fd, bool inherit) ATTRIBUTE_RETURN_CHECK; int virSetCloseExec(int fd) ATTRIBUTE_RETURN_CHECK; int virSetSockReuseAddr(int fd, bool fatal) ATTRIBUTE_RETURN_CHECK; +int virConnectWait(int fd, unsigned long long timeout); + int virPipeReadUntilEOF(int outfd, int errfd, char **outbuf, char **errbuf); -- 1.8.3.1

As we now assume that virChrdevOpen can take a significant time we should take a job and drop vm lock during operation to keep qemu driver responsive. We drop vm lock but access vm structures after it because we are in exlusive modify job so vm structures can only be read concurrently. --- src/qemu/qemu_driver.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 6132bc4..9401695 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -16386,6 +16386,8 @@ qemuDomainOpenChannel(virDomainPtr dom, size_t i; virDomainChrDefPtr chr = NULL; qemuDomainObjPrivatePtr priv; + virQEMUDriverPtr driver = dom->conn->privateData; + bool job = false; virCheckFlags(VIR_DOMAIN_CHANNEL_FORCE, -1); @@ -16395,6 +16397,10 @@ qemuDomainOpenChannel(virDomainPtr dom, if (virDomainOpenChannelEnsureACL(dom->conn, vm->def) < 0) goto cleanup; + if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_MODIFY) < 0) + goto cleanup; + job = true; + if (!virDomainObjIsActive(vm)) { virReportError(VIR_ERR_OPERATION_INVALID, "%s", _("domain is not running")); @@ -16432,11 +16438,13 @@ qemuDomainOpenChannel(virDomainPtr dom, goto cleanup; } + virObjectUnlock(vm); /* handle mutually exclusive access to channel devices */ ret = virChrdevOpen(priv->devs, chr->source, st, (flags & VIR_DOMAIN_CHANNEL_FORCE) != 0); + virObjectLock(vm); if (ret == 1) { virReportError(VIR_ERR_OPERATION_FAILED, "%s", @@ -16445,6 +16453,9 @@ qemuDomainOpenChannel(virDomainPtr dom, } cleanup: + if (job) + qemuDomainObjEndJob(driver, vm); + virDomainObjEndAPI(&vm); return ret; } -- 1.8.3.1
participants (1)
-
Nikolay Shirokovskiy