[libvirt] [PATCH 0/3] Improve keepalive timeout handling in clients

Jiri Denemark (3): client rpc: Report proper error for keepalive disconnections client rpc: Process pending data on error virsh: Notify users about disconnects src/rpc/virkeepalive.c | 10 +++++----- src/rpc/virnetclient.c | 30 +++++++++++++++++++++++++----- tools/virsh.c | 35 ++++++++++++++++++++++++++++++++--- 3 files changed, 62 insertions(+), 13 deletions(-) -- 2.5.2

Whenever a connection was closed due to keepalive timeout, we would log a warning but the interrupted API would return rather useless generic error: internal error: received hangup / error event on socket Let's report a proper keepalive timeout error and make sure it is propagated to all pending APIs. The error should be better now: internal error: connection closed due to keepalive timeout Based on an old patch from Martin Kletzander. Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- src/rpc/virkeepalive.c | 10 +++++----- src/rpc/virnetclient.c | 12 ++++++++++++ 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/rpc/virkeepalive.c b/src/rpc/virkeepalive.c index c882313..c9faf88 100644 --- a/src/rpc/virkeepalive.c +++ b/src/rpc/virkeepalive.c @@ -136,11 +136,11 @@ virKeepAliveTimerInternal(virKeepAlivePtr ka, ka, ka->client, ka->countToDeath, timeval); if (ka->countToDeath == 0) { - VIR_WARN("No response from client %p after %d keepalive messages in" - " %d seconds", - ka->client, - ka->count, - timeval); + VIR_DEBUG("No response from client %p after %d keepalive messages " + "in %d seconds", + ka->client, ka->count, timeval); + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("connection closed due to keepalive timeout")); return true; } else { ka->countToDeath--; diff --git a/src/rpc/virnetclient.c b/src/rpc/virnetclient.c index d0b96b6..6e59ea6 100644 --- a/src/rpc/virnetclient.c +++ b/src/rpc/virnetclient.c @@ -107,6 +107,7 @@ struct _virNetClient { virKeepAlivePtr keepalive; bool wantClose; int closeReason; + virErrorPtr error; virNetClientCloseFunc closeCb; void *closeOpaque; @@ -636,10 +637,14 @@ virNetClientMarkClose(virNetClientPtr client, int reason) { VIR_DEBUG("client=%p, reason=%d", client, reason); + if (client->sock) virNetSocketRemoveIOCallback(client->sock); + /* Don't override reason that's already set. */ if (!client->wantClose) { + if (!client->error) + client->error = virSaveLastError(); client->wantClose = true; client->closeReason = reason; } @@ -670,6 +675,9 @@ virNetClientCloseLocked(virNetClientPtr client) client->keepalive = NULL; client->wantClose = false; + virFreeError(client->error); + client->error = NULL; + if (ka || client->closeCb) { virNetClientCloseFunc closeCb = client->closeCb; void *closeOpaque = client->closeOpaque; @@ -1602,6 +1610,10 @@ static int virNetClientIOEventLoop(virNetClientPtr client, } error: + if (client->error) { + VIR_DEBUG("error on socket: %s", client->error->message); + virSetError(client->error); + } virNetClientCallRemove(&client->waitDispatch, thiscall); virNetClientIOEventLoopPassTheBuck(client, thiscall); return -1; -- 2.5.2

Even though we hit an error in client's IO loop, we still want to process any pending data. So instead of reporting the error right away, we can finish the current iteration and report the error once we're done with it. Note that the error is stored in client->error by virNetClientMarkClose so we don't need to worry about it being reset or rewritten by any API we call in the meantime. Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- src/rpc/virnetclient.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/rpc/virnetclient.c b/src/rpc/virnetclient.c index 6e59ea6..c68da6d 100644 --- a/src/rpc/virnetclient.c +++ b/src/rpc/virnetclient.c @@ -1460,6 +1460,7 @@ static int virNetClientIOEventLoop(virNetClientPtr client, virNetClientCallPtr thiscall) { struct pollfd fds[2]; + bool error = false; int ret; fds[0].fd = virNetSocketGetFD(client->sock); @@ -1551,10 +1552,11 @@ static int virNetClientIOEventLoop(virNetClientPtr client, if (virNetSocketHasCachedData(client->sock)) fds[0].revents |= POLLIN; - /* If wantClose flag is set, pretend there was an error on the socket + /* If wantClose flag is set, pretend there was an error on the socket, + * but still read and process any data we received so far. */ if (client->wantClose) - fds[0].revents = POLLERR; + error = true; if (fds[1].revents) { VIR_DEBUG("Woken up from poll by other thread"); @@ -1562,21 +1564,24 @@ static int virNetClientIOEventLoop(virNetClientPtr client, virReportSystemError(errno, "%s", _("read on wakeup fd failed")); virNetClientMarkClose(client, VIR_CONNECT_CLOSE_REASON_ERROR); - goto error; + error = true; + /* Fall through to process any pending data. */ } } if (fds[0].revents & POLLOUT) { if (virNetClientIOHandleOutput(client) < 0) { virNetClientMarkClose(client, VIR_CONNECT_CLOSE_REASON_ERROR); - goto error; + error = true; + /* Fall through to process any pending data. */ } } if (fds[0].revents & POLLIN) { if (virNetClientIOHandleInput(client) < 0) { virNetClientMarkClose(client, VIR_CONNECT_CLOSE_REASON_ERROR); - goto error; + error = true; + /* Fall through to process any pending data. */ } } @@ -1601,6 +1606,9 @@ static int virNetClientIOEventLoop(virNetClientPtr client, return 1; } + if (error) + goto error; + if (fds[0].revents & (POLLHUP | POLLERR)) { virNetClientMarkClose(client, VIR_CONNECT_CLOSE_REASON_EOF); virReportError(VIR_ERR_INTERNAL_ERROR, "%s", -- 2.5.2

After my "client rpc: Report proper error for keepalive disconnections" patch, virsh would no long print a warning when it closes a connection to a daemon after a keepalive timeout. Although the warning virsh # 2015-09-15 10:59:26.729+0000: 642080: info : libvirt version: 1.2.19 2015-09-15 10:59:26.729+0000: 642080: warning : virKeepAliveTimerInternal:143 : No response from client 0x7efdc0a46730 after 1 keepalive messages in 2 seconds was pretty ugly, it was still useful. This patch brings the useful part back while making it much nicer: virsh # error: Disconnected from qemu:///system due to keepalive timeout Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- tools/virsh.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/tools/virsh.c b/tools/virsh.c index bb12dec..23436ea 100644 --- a/tools/virsh.c +++ b/tools/virsh.c @@ -95,12 +95,41 @@ static int disconnected; /* we may have been disconnected */ * handler, just save the fact it was raised. */ static void -virshCatchDisconnect(virConnectPtr conn ATTRIBUTE_UNUSED, +virshCatchDisconnect(virConnectPtr conn, int reason, - void *opaque ATTRIBUTE_UNUSED) + void *opaque) { - if (reason != VIR_CONNECT_CLOSE_REASON_CLIENT) + if (reason != VIR_CONNECT_CLOSE_REASON_CLIENT) { + vshControl *ctl = opaque; + const char *str = "unknown reason"; + virErrorPtr error; + char *uri; + + error = virSaveLastError(); + uri = virConnectGetURI(conn); + + switch ((virConnectCloseReason) reason) { + case VIR_CONNECT_CLOSE_REASON_ERROR: + str = N_("Disconnected from %s due to I/O error"); + break; + case VIR_CONNECT_CLOSE_REASON_EOF: + str = N_("Disconnected from %s due to end of file"); + break; + case VIR_CONNECT_CLOSE_REASON_KEEPALIVE: + str = N_("Disconnected from %s due to keepalive timeout"); + break; + case VIR_CONNECT_CLOSE_REASON_CLIENT: + case VIR_CONNECT_CLOSE_REASON_LAST: + break; + } + vshError(ctl, _(str), NULLSTR(uri)); + + if (error) { + virSetError(error); + virFreeError(error); + } disconnected++; + } } /* Main Function which should be used for connecting. -- 2.5.2

On 09/17/2015 08:23 AM, Jiri Denemark wrote:
After my "client rpc: Report proper error for keepalive disconnections" patch, virsh would no long print a warning when it closes a connection to a daemon after a keepalive timeout. Although the warning
virsh # 2015-09-15 10:59:26.729+0000: 642080: info : libvirt version: 1.2.19 2015-09-15 10:59:26.729+0000: 642080: warning : virKeepAliveTimerInternal:143 : No response from client 0x7efdc0a46730 after 1 keepalive messages in 2 seconds
was pretty ugly, it was still useful. This patch brings the useful part back while making it much nicer:
virsh # error: Disconnected from qemu:///system due to keepalive timeout
Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- tools/virsh.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-)
Patch series seems reasonable to me, except for one minor thing.... Found by my coverity checker... Naturally
diff --git a/tools/virsh.c b/tools/virsh.c index bb12dec..23436ea 100644 --- a/tools/virsh.c +++ b/tools/virsh.c @@ -95,12 +95,41 @@ static int disconnected; /* we may have been disconnected */ * handler, just save the fact it was raised. */ static void -virshCatchDisconnect(virConnectPtr conn ATTRIBUTE_UNUSED, +virshCatchDisconnect(virConnectPtr conn, int reason, - void *opaque ATTRIBUTE_UNUSED) + void *opaque) { - if (reason != VIR_CONNECT_CLOSE_REASON_CLIENT) + if (reason != VIR_CONNECT_CLOSE_REASON_CLIENT) {
[1]
+ vshControl *ctl = opaque; + const char *str = "unknown reason"; + virErrorPtr error; + char *uri; + + error = virSaveLastError(); + uri = virConnectGetURI(conn); + + switch ((virConnectCloseReason) reason) { + case VIR_CONNECT_CLOSE_REASON_ERROR: + str = N_("Disconnected from %s due to I/O error"); + break; + case VIR_CONNECT_CLOSE_REASON_EOF: + str = N_("Disconnected from %s due to end of file"); + break; + case VIR_CONNECT_CLOSE_REASON_KEEPALIVE: + str = N_("Disconnected from %s due to keepalive timeout"); + break; + case VIR_CONNECT_CLOSE_REASON_CLIENT:
[1] ^^^ Coverity complains about DEADCODE Other than setting str = NULL and testing for it later, I'm not exactly sure of the 'best' course of action. e.g., if (str) { vshError(); disconnected++; } I think if this is handled, then the series is ACK-able. John
+ case VIR_CONNECT_CLOSE_REASON_LAST: + break; + } + vshError(ctl, _(str), NULLSTR(uri)); + + if (error) { + virSetError(error); + virFreeError(error); + } disconnected++; + } }
/* Main Function which should be used for connecting.

On Mon, Sep 21, 2015 at 17:32:41 -0400, John Ferlan wrote:
On 09/17/2015 08:23 AM, Jiri Denemark wrote:
After my "client rpc: Report proper error for keepalive disconnections" patch, virsh would no long print a warning when it closes a connection to a daemon after a keepalive timeout. Although the warning
virsh # 2015-09-15 10:59:26.729+0000: 642080: info : libvirt version: 1.2.19 2015-09-15 10:59:26.729+0000: 642080: warning : virKeepAliveTimerInternal:143 : No response from client 0x7efdc0a46730 after 1 keepalive messages in 2 seconds
was pretty ugly, it was still useful. This patch brings the useful part back while making it much nicer:
virsh # error: Disconnected from qemu:///system due to keepalive timeout
Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- tools/virsh.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-)
Patch series seems reasonable to me, except for one minor thing.... Found by my coverity checker... Naturally
diff --git a/tools/virsh.c b/tools/virsh.c index bb12dec..23436ea 100644 --- a/tools/virsh.c +++ b/tools/virsh.c @@ -95,12 +95,41 @@ static int disconnected; /* we may have been disconnected */ * handler, just save the fact it was raised. */ static void -virshCatchDisconnect(virConnectPtr conn ATTRIBUTE_UNUSED, +virshCatchDisconnect(virConnectPtr conn, int reason, - void *opaque ATTRIBUTE_UNUSED) + void *opaque) { - if (reason != VIR_CONNECT_CLOSE_REASON_CLIENT) + if (reason != VIR_CONNECT_CLOSE_REASON_CLIENT) {
[1]
+ vshControl *ctl = opaque; + const char *str = "unknown reason"; + virErrorPtr error; + char *uri; + + error = virSaveLastError(); + uri = virConnectGetURI(conn); + + switch ((virConnectCloseReason) reason) { + case VIR_CONNECT_CLOSE_REASON_ERROR: + str = N_("Disconnected from %s due to I/O error"); + break; + case VIR_CONNECT_CLOSE_REASON_EOF: + str = N_("Disconnected from %s due to end of file"); + break; + case VIR_CONNECT_CLOSE_REASON_KEEPALIVE: + str = N_("Disconnected from %s due to keepalive timeout"); + break; + case VIR_CONNECT_CLOSE_REASON_CLIENT:
[1] ^^^ Coverity complains about DEADCODE
Other than setting str = NULL and testing for it later, I'm not exactly sure of the 'best' course of action. e.g., if (str) { vshError(); disconnected++; }
I think if this is handled, then the series is ACK-able.
I think /* coverity[dead_error_begin] */ is the right solution in this case. Jirka

On Tue, Sep 22, 2015 at 13:21:18 +0200, Jiri Denemark wrote:
On Mon, Sep 21, 2015 at 17:32:41 -0400, John Ferlan wrote:
On 09/17/2015 08:23 AM, Jiri Denemark wrote:
After my "client rpc: Report proper error for keepalive disconnections" patch, virsh would no long print a warning when it closes a connection to a daemon after a keepalive timeout. Although the warning
virsh # 2015-09-15 10:59:26.729+0000: 642080: info : libvirt version: 1.2.19 2015-09-15 10:59:26.729+0000: 642080: warning : virKeepAliveTimerInternal:143 : No response from client 0x7efdc0a46730 after 1 keepalive messages in 2 seconds
was pretty ugly, it was still useful. This patch brings the useful part back while making it much nicer:
virsh # error: Disconnected from qemu:///system due to keepalive timeout
Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- tools/virsh.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-)
Patch series seems reasonable to me, except for one minor thing.... Found by my coverity checker... Naturally
diff --git a/tools/virsh.c b/tools/virsh.c index bb12dec..23436ea 100644 --- a/tools/virsh.c +++ b/tools/virsh.c @@ -95,12 +95,41 @@ static int disconnected; /* we may have been disconnected */ * handler, just save the fact it was raised. */ static void -virshCatchDisconnect(virConnectPtr conn ATTRIBUTE_UNUSED, +virshCatchDisconnect(virConnectPtr conn, int reason, - void *opaque ATTRIBUTE_UNUSED) + void *opaque) { - if (reason != VIR_CONNECT_CLOSE_REASON_CLIENT) + if (reason != VIR_CONNECT_CLOSE_REASON_CLIENT) {
[1]
+ vshControl *ctl = opaque; + const char *str = "unknown reason"; + virErrorPtr error; + char *uri; + + error = virSaveLastError(); + uri = virConnectGetURI(conn); + + switch ((virConnectCloseReason) reason) { + case VIR_CONNECT_CLOSE_REASON_ERROR: + str = N_("Disconnected from %s due to I/O error"); + break; + case VIR_CONNECT_CLOSE_REASON_EOF: + str = N_("Disconnected from %s due to end of file"); + break; + case VIR_CONNECT_CLOSE_REASON_KEEPALIVE: + str = N_("Disconnected from %s due to keepalive timeout"); + break; + case VIR_CONNECT_CLOSE_REASON_CLIENT:
[1] ^^^ Coverity complains about DEADCODE
Other than setting str = NULL and testing for it later, I'm not exactly sure of the 'best' course of action. e.g., if (str) { vshError(); disconnected++; }
I think if this is handled, then the series is ACK-able.
I think /* coverity[dead_error_begin] */ is the right solution in this case.
I pushed the series with this comment added above case VIR_CONNECT_CLOSE_REASON_CLIENT. Thanks, Jirka
participants (2)
-
Jiri Denemark
-
John Ferlan