[PATCH 0/2] qemu: agent: sync once if qemu has serial port event

The second patch is unrelated through. Nikolay Shirokovskiy (2): qemu: agent: sync once if qemu has serial port event qemu: remove redundant needReply argument of qemuAgentCommand src/qemu/qemu_agent.c | 60 ++++++++++++++++++++---------------- src/qemu/qemu_agent.h | 3 +- src/qemu/qemu_process.c | 3 +- tests/qemumonitortestutils.c | 3 +- 4 files changed, 40 insertions(+), 29 deletions(-) -- 2.23.0

Sync was introduced in [1] to check for ga presence. This check is racy but in the era before serial events are available there was not better solution I guess. In case we have the events the sync function is different. It allows us to flush stateless ga channel from remnants of previous communications. But we need to do it only once. Until we get timeout on issued command channel state is ok. [1] qemu_agent: Issue guest-sync prior to every command Signed-off-by: Nikolay Shirokovskiy <nshirokovskiy@virtuozzo.com> --- src/qemu/qemu_agent.c | 13 ++++++++++++- src/qemu/qemu_agent.h | 3 ++- src/qemu/qemu_process.c | 3 ++- tests/qemumonitortestutils.c | 3 ++- 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/qemu/qemu_agent.c b/src/qemu/qemu_agent.c index cd25ef6cd3..2de53efb7a 100644 --- a/src/qemu/qemu_agent.c +++ b/src/qemu/qemu_agent.c @@ -104,6 +104,8 @@ struct _qemuAgent { int watch; bool running; + bool singleSync; + bool inSync; virDomainObjPtr vm; @@ -679,7 +681,8 @@ qemuAgentIO(int watch, int fd, int events, void *opaque) qemuAgentPtr qemuAgentOpen(virDomainObjPtr vm, const virDomainChrSourceDef *config, - qemuAgentCallbacksPtr cb) + qemuAgentCallbacksPtr cb, + bool singleSync) { qemuAgentPtr mon; @@ -705,6 +708,7 @@ qemuAgentOpen(virDomainObjPtr vm, } mon->vm = vm; mon->cb = cb; + mon->singleSync = singleSync; switch (config->type) { case VIR_DOMAIN_CHR_TYPE_UNIX: @@ -869,6 +873,7 @@ static int qemuAgentSend(qemuAgentPtr mon, _("Unable to wait on agent monitor " "condition")); } + mon->inSync = false; goto cleanup; } } @@ -910,6 +915,9 @@ qemuAgentGuestSync(qemuAgentPtr mon) qemuAgentMessage sync_msg; int timeout = VIR_DOMAIN_QEMU_AGENT_COMMAND_DEFAULT; + if (mon->singleSync && mon->inSync) + return 0; + /* if user specified a custom agent timeout that is lower than the * default timeout, use the shorter timeout instead */ if ((mon->timeout >= 0) && (mon->timeout < timeout)) @@ -955,6 +963,9 @@ qemuAgentGuestSync(qemuAgentPtr mon) } } + if (mon->singleSync) + mon->inSync = true; + ret = 0; cleanup: diff --git a/src/qemu/qemu_agent.h b/src/qemu/qemu_agent.h index 5656fe60ff..f9f75cee25 100644 --- a/src/qemu/qemu_agent.h +++ b/src/qemu/qemu_agent.h @@ -41,7 +41,8 @@ struct _qemuAgentCallbacks { qemuAgentPtr qemuAgentOpen(virDomainObjPtr vm, const virDomainChrSourceDef *config, - qemuAgentCallbacksPtr cb); + qemuAgentCallbacksPtr cb, + bool singleSync); void qemuAgentClose(qemuAgentPtr mon); diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 36fbffd7b8..a910afc307 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -237,7 +237,8 @@ qemuConnectAgent(virQEMUDriverPtr driver, virDomainObjPtr vm) agent = qemuAgentOpen(vm, config->source, - &agentCallbacks); + &agentCallbacks, + virQEMUCapsGet(priv->qemuCaps, QEMU_CAPS_VSERPORT_CHANGE)); virObjectLock(vm); diff --git a/tests/qemumonitortestutils.c b/tests/qemumonitortestutils.c index 00f5b49439..c3e4d86dd6 100644 --- a/tests/qemumonitortestutils.c +++ b/tests/qemumonitortestutils.c @@ -1393,7 +1393,8 @@ qemuMonitorTestNewAgent(virDomainXMLOptionPtr xmlopt) if (!(test->agent = qemuAgentOpen(test->vm, &src, - &qemuMonitorTestAgentCallbacks))) + &qemuMonitorTestAgentCallbacks, + false))) goto error; virObjectLock(test->agent); -- 2.23.0

On 3/5/20 3:47 PM, Nikolay Shirokovskiy wrote:
Sync was introduced in [1] to check for ga presence. This check is racy but in the era before serial events are available there was not better solution I guess.
Can you elaborate more on the raciness? There should never be more than one thread talking on the agent monitor.
In case we have the events the sync function is different. It allows us to flush stateless ga channel from remnants of previous communications. But we need to do it only once. Until we get timeout on issued command channel state is ok.
[1] qemu_agent: Issue guest-sync prior to every command
Signed-off-by: Nikolay Shirokovskiy <nshirokovskiy@virtuozzo.com> --- src/qemu/qemu_agent.c | 13 ++++++++++++- src/qemu/qemu_agent.h | 3 ++- src/qemu/qemu_process.c | 3 ++- tests/qemumonitortestutils.c | 3 ++- 4 files changed, 18 insertions(+), 4 deletions(-)
diff --git a/src/qemu/qemu_agent.c b/src/qemu/qemu_agent.c index cd25ef6cd3..2de53efb7a 100644 --- a/src/qemu/qemu_agent.c +++ b/src/qemu/qemu_agent.c @@ -104,6 +104,8 @@ struct _qemuAgent { int watch;
bool running; + bool singleSync; + bool inSync;
I wonder if we can do this with just inSync and not have singleSync. I mean, it looks like at all places where singleSync is used we have access to qemuCaps so it should be as easy as: qemuAgentOpen(); if (virQEMUCapsGet(priv->qemuCaps, QEMU_CAPS_VSERPORT_CHANGE)) { if *qemuAgentGuestSync(mon) < 0) goto error; mon->inSync = true; and then qemuagentGuestSync() would be a NOP if inSync is set. But it would also not set it. But maybe I'm pushing it too far, it's just that I'm confused by the name 'singleSync'. Probably 'hasSingleSync' would be better? The boolean reflect whether qemu has the VSERPORT_CHANGE capability and thus libvirt knows when GA connects/disconnects. 'singleSync' just doesn't ring that bell at first. Michal

On 11.03.2020 12:38, Michal Privoznik wrote:
On 3/5/20 3:47 PM, Nikolay Shirokovskiy wrote:
Sync was introduced in [1] to check for ga presence. This check is racy but in the era before serial events are available there was not better solution I guess.
Can you elaborate more on the raciness? There should never be more than one thread talking on the agent monitor.
The race is in another dimension) So sync checks for GA presence, it only waits for 5s so if there is no GA the actual command with no timeout won't block. But GA can go down right after successful sync so command can block. This is true if there are no serial events in the latter case agent monitor will be closed when GA go down and command will unblock.
In case we have the events the sync function is different. It allows us to flush stateless ga channel from remnants of previous communications. But we need to do it only once. Until we get timeout on issued command channel state is ok.
[1] qemu_agent: Issue guest-sync prior to every command
Signed-off-by: Nikolay Shirokovskiy <nshirokovskiy@virtuozzo.com> --- src/qemu/qemu_agent.c | 13 ++++++++++++- src/qemu/qemu_agent.h | 3 ++- src/qemu/qemu_process.c | 3 ++- tests/qemumonitortestutils.c | 3 ++- 4 files changed, 18 insertions(+), 4 deletions(-)
diff --git a/src/qemu/qemu_agent.c b/src/qemu/qemu_agent.c index cd25ef6cd3..2de53efb7a 100644 --- a/src/qemu/qemu_agent.c +++ b/src/qemu/qemu_agent.c @@ -104,6 +104,8 @@ struct _qemuAgent { int watch; bool running; + bool singleSync; + bool inSync;
I wonder if we can do this with just inSync and not have singleSync. I mean, it looks like at all places where singleSync is used we have access to qemuCaps so it should be as easy as:
qemuAgentOpen(); if (virQEMUCapsGet(priv->qemuCaps, QEMU_CAPS_VSERPORT_CHANGE)) { if *qemuAgentGuestSync(mon) < 0) goto error; mon->inSync = true;
and then qemuagentGuestSync() would be a NOP if inSync is set. But it would also not set it. But maybe I'm pushing it too far, it's just that I'm confused by the name 'singleSync'. Probably 'hasSingleSync' would be better? The boolean reflect whether qemu has the VSERPORT_CHANGE capability and thus libvirt knows when GA connects/disconnects. 'singleSync' just doesn't ring that bell at first.
If we try to sync only on monitor opening then we can't sync if for example command timeouts and GA stays connect (thus no serial event and monitor won't be closed). With current patch we just try to sync when we need to send next command to GA. I guess singleSync is better. It just reflects that we don't need to sync before every command (in order to avoid hangs) in case there is serial events. Strictly speaking it is not correct to use sync to check for GA presence because of the race and as to using it as a means to flush channel it works well without sereal events as well. Nikolay

On 3/11/20 12:12 PM, Nikolay Shirokovskiy wrote:
On 11.03.2020 12:38, Michal Privoznik wrote:
On 3/5/20 3:47 PM, Nikolay Shirokovskiy wrote:
Sync was introduced in [1] to check for ga presence. This check is racy but in the era before serial events are available there was not better solution I guess.
Can you elaborate more on the raciness? There should never be more than one thread talking on the agent monitor.
The race is in another dimension) So sync checks for GA presence, it only waits for 5s so if there is no GA the actual command with no timeout won't block. But GA can go down right after successful sync so command can block. This is true if there are no serial events in the latter case agent monitor will be closed when GA go down and command will unblock.
In case we have the events the sync function is different. It allows us to flush stateless ga channel from remnants of previous communications. But we need to do it only once. Until we get timeout on issued command channel state is ok.
[1] qemu_agent: Issue guest-sync prior to every command
Signed-off-by: Nikolay Shirokovskiy <nshirokovskiy@virtuozzo.com> --- src/qemu/qemu_agent.c | 13 ++++++++++++- src/qemu/qemu_agent.h | 3 ++- src/qemu/qemu_process.c | 3 ++- tests/qemumonitortestutils.c | 3 ++- 4 files changed, 18 insertions(+), 4 deletions(-)
diff --git a/src/qemu/qemu_agent.c b/src/qemu/qemu_agent.c index cd25ef6cd3..2de53efb7a 100644 --- a/src/qemu/qemu_agent.c +++ b/src/qemu/qemu_agent.c @@ -104,6 +104,8 @@ struct _qemuAgent { int watch; bool running; + bool singleSync; + bool inSync;
I wonder if we can do this with just inSync and not have singleSync. I mean, it looks like at all places where singleSync is used we have access to qemuCaps so it should be as easy as:
qemuAgentOpen(); if (virQEMUCapsGet(priv->qemuCaps, QEMU_CAPS_VSERPORT_CHANGE)) { if *qemuAgentGuestSync(mon) < 0) goto error; mon->inSync = true;
and then qemuagentGuestSync() would be a NOP if inSync is set. But it would also not set it. But maybe I'm pushing it too far, it's just that I'm confused by the name 'singleSync'. Probably 'hasSingleSync' would be better? The boolean reflect whether qemu has the VSERPORT_CHANGE capability and thus libvirt knows when GA connects/disconnects. 'singleSync' just doesn't ring that bell at first.
If we try to sync only on monitor opening then we can't sync if for example command timeouts and GA stays connect (thus no serial event and monitor won't be closed). With current patch we just try to sync when we need to send next command to GA.
I guess singleSync is better. It just reflects that we don't need to sync before every command (in order to avoid hangs) in case there is serial events. Strictly speaking it is not correct to use sync to check for GA presence because of the race and as to using it as a means to flush channel it works well without sereal events as well.
Nikolay
Alright, you've persuaded me on both. Reviewed-by: Michal Privoznik <mprivozn@redhat.com> and resolved the merge commit on both patches that appeared meanwhile and pushed. Michal

needReply added in [1] looks redundant. Indeed it is set to false only when mon->await_event is set too (the only exception qemuAgentFSTrim which is mistaken). However it fixes the issue when qemuAgentCommand exits on error path and mon->await_event is not reset. Let's instead reset mon->await_event properly. Also remove "Woken up by event" debug message as it can be misleading. We can get it also if monitor is closed due to serial changed event currently. Anyway both qemuAgentClose and qemuAgentNotifyEvent log itself. [1] qemu: make sure agent returns error when required data are missing Signed-off-by: Nikolay Shirokovskiy <nshirokovskiy@virtuozzo.com> --- src/qemu/qemu_agent.c | 47 ++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/src/qemu/qemu_agent.c b/src/qemu/qemu_agent.c index 2de53efb7a..605c9f563e 100644 --- a/src/qemu/qemu_agent.c +++ b/src/qemu/qemu_agent.c @@ -1112,7 +1112,6 @@ static int qemuAgentCommand(qemuAgentPtr mon, virJSONValuePtr cmd, virJSONValuePtr *reply, - bool needReply, int seconds) { int ret = -1; @@ -1121,17 +1120,16 @@ qemuAgentCommand(qemuAgentPtr mon, int await_event = mon->await_event; *reply = NULL; + memset(&msg, 0, sizeof(msg)); if (!mon->running) { virReportError(VIR_ERR_AGENT_UNRESPONSIVE, "%s", _("Guest agent disappeared while executing command")); - return -1; + goto cleanup; } if (qemuAgentGuestSync(mon) < 0) - return -1; - - memset(&msg, 0, sizeof(msg)); + goto cleanup; if (!(cmdstr = virJSONValueToString(cmd, false))) goto cleanup; @@ -1149,9 +1147,7 @@ qemuAgentCommand(qemuAgentPtr mon, /* If we haven't obtained any reply but we wait for an * event, then don't report this as error */ if (!msg.rxObject) { - if (await_event && !needReply) { - VIR_DEBUG("Woken up by event %d", await_event); - } else { + if (!await_event) { if (mon->running) virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("Missing monitor reply object")); @@ -1169,6 +1165,7 @@ qemuAgentCommand(qemuAgentPtr mon, cleanup: VIR_FREE(cmdstr); VIR_FREE(msg.txBuffer); + mon->await_event = QEMU_AGENT_EVENT_NONE; return ret; } @@ -1269,7 +1266,7 @@ int qemuAgentShutdown(qemuAgentPtr mon, mon->await_event = QEMU_AGENT_EVENT_RESET; else mon->await_event = QEMU_AGENT_EVENT_SHUTDOWN; - ret = qemuAgentCommand(mon, cmd, &reply, false, + ret = qemuAgentCommand(mon, cmd, &reply, VIR_DOMAIN_QEMU_AGENT_COMMAND_SHUTDOWN); virJSONValueFree(cmd); @@ -1312,7 +1309,7 @@ int qemuAgentFSFreeze(qemuAgentPtr mon, const char **mountpoints, if (!cmd) goto cleanup; - if (qemuAgentCommand(mon, cmd, &reply, true, mon->timeout) < 0) + if (qemuAgentCommand(mon, cmd, &reply, mon->timeout) < 0) goto cleanup; if (virJSONValueObjectGetNumberInt(reply, "return", &ret) < 0) { @@ -1349,7 +1346,7 @@ int qemuAgentFSThaw(qemuAgentPtr mon) if (!cmd) return -1; - if (qemuAgentCommand(mon, cmd, &reply, true, mon->timeout) < 0) + if (qemuAgentCommand(mon, cmd, &reply, mon->timeout) < 0) goto cleanup; if (virJSONValueObjectGetNumberInt(reply, "return", &ret) < 0) { @@ -1386,7 +1383,7 @@ qemuAgentSuspend(qemuAgentPtr mon, return -1; mon->await_event = QEMU_AGENT_EVENT_SUSPEND; - ret = qemuAgentCommand(mon, cmd, &reply, false, mon->timeout); + ret = qemuAgentCommand(mon, cmd, &reply, mon->timeout); virJSONValueFree(cmd); virJSONValueFree(reply); @@ -1415,7 +1412,7 @@ qemuAgentArbitraryCommand(qemuAgentPtr mon, if (!(cmd = virJSONValueFromString(cmd_str))) goto cleanup; - if ((ret = qemuAgentCommand(mon, cmd, &reply, true, timeout)) < 0) + if ((ret = qemuAgentCommand(mon, cmd, &reply, timeout)) < 0) goto cleanup; if (!(*result = virJSONValueToString(reply, false))) @@ -1442,7 +1439,7 @@ qemuAgentFSTrim(qemuAgentPtr mon, if (!cmd) return ret; - ret = qemuAgentCommand(mon, cmd, &reply, false, mon->timeout); + ret = qemuAgentCommand(mon, cmd, &reply, mon->timeout); virJSONValueFree(cmd); virJSONValueFree(reply); @@ -1463,7 +1460,7 @@ qemuAgentGetVCPUs(qemuAgentPtr mon, if (!(cmd = qemuAgentMakeCommand("guest-get-vcpus", NULL))) return -1; - if (qemuAgentCommand(mon, cmd, &reply, true, mon->timeout) < 0) + if (qemuAgentCommand(mon, cmd, &reply, mon->timeout) < 0) goto cleanup; if (!(data = virJSONValueObjectGetArray(reply, "return"))) { @@ -1576,7 +1573,7 @@ qemuAgentSetVCPUsCommand(qemuAgentPtr mon, NULL))) goto cleanup; - if (qemuAgentCommand(mon, cmd, &reply, true, mon->timeout) < 0) + if (qemuAgentCommand(mon, cmd, &reply, mon->timeout) < 0) goto cleanup; /* All negative values are invalid. Return of 0 is bogus since we wouldn't @@ -1731,7 +1728,7 @@ qemuAgentGetHostname(qemuAgentPtr mon, if (!cmd) return ret; - if (qemuAgentCommand(mon, cmd, &reply, true, mon->timeout) < 0) { + if (qemuAgentCommand(mon, cmd, &reply, mon->timeout) < 0) { if (qemuAgentErrorCommandUnsupported(reply)) ret = -2; goto cleanup; @@ -1775,7 +1772,7 @@ qemuAgentGetTime(qemuAgentPtr mon, if (!cmd) return ret; - if (qemuAgentCommand(mon, cmd, &reply, true, mon->timeout) < 0) + if (qemuAgentCommand(mon, cmd, &reply, mon->timeout) < 0) goto cleanup; if (virJSONValueObjectGetNumberUlong(reply, "return", &json_time) < 0) { @@ -1840,7 +1837,7 @@ qemuAgentSetTime(qemuAgentPtr mon, if (!cmd) return ret; - if (qemuAgentCommand(mon, cmd, &reply, true, mon->timeout) < 0) + if (qemuAgentCommand(mon, cmd, &reply, mon->timeout) < 0) goto cleanup; ret = 0; @@ -1977,7 +1974,7 @@ qemuAgentGetFSInfo(qemuAgentPtr mon, if (!cmd) return ret; - if (qemuAgentCommand(mon, cmd, &reply, true, mon->timeout) < 0) { + if (qemuAgentCommand(mon, cmd, &reply, mon->timeout) < 0) { if (qemuAgentErrorCommandUnsupported(reply)) ret = -2; goto cleanup; @@ -2128,7 +2125,7 @@ qemuAgentGetInterfaces(qemuAgentPtr mon, if (!(cmd = qemuAgentMakeCommand("guest-network-get-interfaces", NULL))) goto cleanup; - if (qemuAgentCommand(mon, cmd, &reply, true, mon->timeout) < 0) + if (qemuAgentCommand(mon, cmd, &reply, mon->timeout) < 0) goto cleanup; if (!(ret_array = virJSONValueObjectGet(reply, "return"))) { @@ -2305,7 +2302,7 @@ qemuAgentSetUserPassword(qemuAgentPtr mon, NULL))) goto cleanup; - if (qemuAgentCommand(mon, cmd, &reply, true, mon->timeout) < 0) + if (qemuAgentCommand(mon, cmd, &reply, mon->timeout) < 0) goto cleanup; ret = 0; @@ -2336,7 +2333,7 @@ qemuAgentGetUsers(qemuAgentPtr mon, if (!(cmd = qemuAgentMakeCommand("guest-get-users", NULL))) return -1; - if (qemuAgentCommand(mon, cmd, &reply, true, mon->timeout) < 0) { + if (qemuAgentCommand(mon, cmd, &reply, mon->timeout) < 0) { if (qemuAgentErrorCommandUnsupported(reply)) return -2; return -1; @@ -2425,7 +2422,7 @@ qemuAgentGetOSInfo(qemuAgentPtr mon, if (!(cmd = qemuAgentMakeCommand("guest-get-osinfo", NULL))) return -1; - if (qemuAgentCommand(mon, cmd, &reply, true, mon->timeout) < 0) { + if (qemuAgentCommand(mon, cmd, &reply, mon->timeout) < 0) { if (qemuAgentErrorCommandUnsupported(reply)) return -2; return -1; @@ -2480,7 +2477,7 @@ qemuAgentGetTimezone(qemuAgentPtr mon, if (!(cmd = qemuAgentMakeCommand("guest-get-timezone", NULL))) return -1; - if (qemuAgentCommand(mon, cmd, &reply, true, mon->timeout) < 0) { + if (qemuAgentCommand(mon, cmd, &reply, mon->timeout) < 0) { if (qemuAgentErrorCommandUnsupported(reply)) return -2; return -1; -- 2.23.0

On 3/5/20 3:47 PM, Nikolay Shirokovskiy wrote:
needReply added in [1] looks redundant. Indeed it is set to false only when mon->await_event is set too (the only exception qemuAgentFSTrim which is mistaken).
However it fixes the issue when qemuAgentCommand exits on error path and mon->await_event is not reset. Let's instead reset mon->await_event properly.
Also remove "Woken up by event" debug message as it can be misleading. We can get it also if monitor is closed due to serial changed event currently. Anyway both qemuAgentClose and qemuAgentNotifyEvent log itself.
[1] qemu: make sure agent returns error when required data are missing
Signed-off-by: Nikolay Shirokovskiy <nshirokovskiy@virtuozzo.com> --- src/qemu/qemu_agent.c | 47 ++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 25 deletions(-)
diff --git a/src/qemu/qemu_agent.c b/src/qemu/qemu_agent.c index 2de53efb7a..605c9f563e 100644 --- a/src/qemu/qemu_agent.c +++ b/src/qemu/qemu_agent.c @@ -1112,7 +1112,6 @@ static int qemuAgentCommand(qemuAgentPtr mon, virJSONValuePtr cmd, virJSONValuePtr *reply, - bool needReply, int seconds) { int ret = -1; @@ -1121,17 +1120,16 @@ qemuAgentCommand(qemuAgentPtr mon, int await_event = mon->await_event;
*reply = NULL; + memset(&msg, 0, sizeof(msg));
if (!mon->running) { virReportError(VIR_ERR_AGENT_UNRESPONSIVE, "%s", _("Guest agent disappeared while executing command")); - return -1; + goto cleanup; }
if (qemuAgentGuestSync(mon) < 0) - return -1; - - memset(&msg, 0, sizeof(msg)); + goto cleanup;
if (!(cmdstr = virJSONValueToString(cmd, false))) goto cleanup; @@ -1149,9 +1147,7 @@ qemuAgentCommand(qemuAgentPtr mon, /* If we haven't obtained any reply but we wait for an * event, then don't report this as error */ if (!msg.rxObject) { - if (await_event && !needReply) { - VIR_DEBUG("Woken up by event %d", await_event); - } else {
Please keep this debug printing. It doesn't hurt anything and has potential of helping us to debug.
+ if (!await_event) { if (mon->running) virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("Missing monitor reply object")); @@ -1169,6 +1165,7 @@ qemuAgentCommand(qemuAgentPtr mon, cleanup: VIR_FREE(cmdstr); VIR_FREE(msg.txBuffer); + mon->await_event = QEMU_AGENT_EVENT_NONE;
This is the part I don't like about this patch. The rest looks fine. Why is this needed? Either the mon->await_event is not set by the caller, or if set the the event handler will clear it out by calling qemuAgentNotifyEvent(). Or am I missing something?
return ret; } @@ -1269,7 +1266,7 @@ int qemuAgentShutdown(qemuAgentPtr mon, mon->await_event = QEMU_AGENT_EVENT_RESET; else mon->await_event = QEMU_AGENT_EVENT_SHUTDOWN; - ret = qemuAgentCommand(mon, cmd, &reply, false, + ret = qemuAgentCommand(mon, cmd, &reply, VIR_DOMAIN_QEMU_AGENT_COMMAND_SHUTDOWN);
virJSONValueFree(cmd); @@ -1312,7 +1309,7 @@ int qemuAgentFSFreeze(qemuAgentPtr mon, const char **mountpoints, if (!cmd) goto cleanup;
- if (qemuAgentCommand(mon, cmd, &reply, true, mon->timeout) < 0) + if (qemuAgentCommand(mon, cmd, &reply, mon->timeout) < 0) goto cleanup;
if (virJSONValueObjectGetNumberInt(reply, "return", &ret) < 0) { @@ -1349,7 +1346,7 @@ int qemuAgentFSThaw(qemuAgentPtr mon) if (!cmd) return -1;
- if (qemuAgentCommand(mon, cmd, &reply, true, mon->timeout) < 0) + if (qemuAgentCommand(mon, cmd, &reply, mon->timeout) < 0) goto cleanup;
if (virJSONValueObjectGetNumberInt(reply, "return", &ret) < 0) { @@ -1386,7 +1383,7 @@ qemuAgentSuspend(qemuAgentPtr mon, return -1;
mon->await_event = QEMU_AGENT_EVENT_SUSPEND; - ret = qemuAgentCommand(mon, cmd, &reply, false, mon->timeout); + ret = qemuAgentCommand(mon, cmd, &reply, mon->timeout);
virJSONValueFree(cmd); virJSONValueFree(reply); @@ -1415,7 +1412,7 @@ qemuAgentArbitraryCommand(qemuAgentPtr mon, if (!(cmd = virJSONValueFromString(cmd_str))) goto cleanup;
- if ((ret = qemuAgentCommand(mon, cmd, &reply, true, timeout)) < 0) + if ((ret = qemuAgentCommand(mon, cmd, &reply, timeout)) < 0) goto cleanup;
if (!(*result = virJSONValueToString(reply, false))) @@ -1442,7 +1439,7 @@ qemuAgentFSTrim(qemuAgentPtr mon, if (!cmd) return ret;
- ret = qemuAgentCommand(mon, cmd, &reply, false, mon->timeout); + ret = qemuAgentCommand(mon, cmd, &reply, mon->timeout);
Oh this is fun. This 'false' you are removing should have been true as the command does have a reply that we need to wait for.
virJSONValueFree(cmd); virJSONValueFree(reply);
Michal

On 11.03.2020 12:38, Michal Privoznik wrote:
On 3/5/20 3:47 PM, Nikolay Shirokovskiy wrote:
needReply added in [1] looks redundant. Indeed it is set to false only when mon->await_event is set too (the only exception qemuAgentFSTrim which is mistaken).
However it fixes the issue when qemuAgentCommand exits on error path and mon->await_event is not reset. Let's instead reset mon->await_event properly.
Also remove "Woken up by event" debug message as it can be misleading. We can get it also if monitor is closed due to serial changed event currently. Anyway both qemuAgentClose and qemuAgentNotifyEvent log itself.
[1] qemu: make sure agent returns error when required data are missing
Signed-off-by: Nikolay Shirokovskiy <nshirokovskiy@virtuozzo.com> --- src/qemu/qemu_agent.c | 47 ++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 25 deletions(-)
diff --git a/src/qemu/qemu_agent.c b/src/qemu/qemu_agent.c index 2de53efb7a..605c9f563e 100644 --- a/src/qemu/qemu_agent.c +++ b/src/qemu/qemu_agent.c @@ -1112,7 +1112,6 @@ static int qemuAgentCommand(qemuAgentPtr mon, virJSONValuePtr cmd, virJSONValuePtr *reply, - bool needReply, int seconds) { int ret = -1; @@ -1121,17 +1120,16 @@ qemuAgentCommand(qemuAgentPtr mon, int await_event = mon->await_event; *reply = NULL; + memset(&msg, 0, sizeof(msg)); if (!mon->running) { virReportError(VIR_ERR_AGENT_UNRESPONSIVE, "%s", _("Guest agent disappeared while executing command")); - return -1; + goto cleanup; } if (qemuAgentGuestSync(mon) < 0) - return -1; - - memset(&msg, 0, sizeof(msg)); + goto cleanup; if (!(cmdstr = virJSONValueToString(cmd, false))) goto cleanup; @@ -1149,9 +1147,7 @@ qemuAgentCommand(qemuAgentPtr mon, /* If we haven't obtained any reply but we wait for an * event, then don't report this as error */ if (!msg.rxObject) { - if (await_event && !needReply) { - VIR_DEBUG("Woken up by event %d", await_event); - } else {
Please keep this debug printing. It doesn't hurt anything and has potential of helping us to debug.
This debug line is not correct. I reflected it in the commit message. I actually get this line during debug session when agent monitor is closed during VM shutdown on serial event and not shutdown event. Serial event just comes first.
+ if (!await_event) { if (mon->running) virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("Missing monitor reply object")); @@ -1169,6 +1165,7 @@ qemuAgentCommand(qemuAgentPtr mon, cleanup: VIR_FREE(cmdstr); VIR_FREE(msg.txBuffer); + mon->await_event = QEMU_AGENT_EVENT_NONE;
This is the part I don't like about this patch. The rest looks fine. Why is this needed? Either the mon->await_event is not set by the caller, or if set the the event handler will clear it out by calling qemuAgentNotifyEvent(). Or am I missing something?
The whole point of the patch is in this line) Yeah, you are talking of success paths. But on error paths mon->await_event is not cleared currently so next command which need reply can be finish on await event without needReply which lead to SIGSGEV. This is what patch [1] fixes by introducing needReply which don't need to be cleared as it is on stack. I guess scenario is next given the bug [2] mentioned in [1]: - vm shutdown command is sent to GA - it is finished with timeout (60 s), mon->await_event is not cleared - guest ping is sent - guest shutdowned and ping command get awakend by this event as mon->await_event is set (bug) So needReply argument is helpful but it is cleaner just to reset mon->await_event properly. [1] qemu: make sure agent returns error when required data are missing [2] https://bugzilla.redhat.com/show_bug.cgi?id=1058149 Nikolay
return ret; } @@ -1269,7 +1266,7 @@ int qemuAgentShutdown(qemuAgentPtr mon, mon->await_event = QEMU_AGENT_EVENT_RESET; else mon->await_event = QEMU_AGENT_EVENT_SHUTDOWN; - ret = qemuAgentCommand(mon, cmd, &reply, false, + ret = qemuAgentCommand(mon, cmd, &reply, VIR_DOMAIN_QEMU_AGENT_COMMAND_SHUTDOWN); virJSONValueFree(cmd); @@ -1312,7 +1309,7 @@ int qemuAgentFSFreeze(qemuAgentPtr mon, const char **mountpoints, if (!cmd) goto cleanup; - if (qemuAgentCommand(mon, cmd, &reply, true, mon->timeout) < 0) + if (qemuAgentCommand(mon, cmd, &reply, mon->timeout) < 0) goto cleanup; if (virJSONValueObjectGetNumberInt(reply, "return", &ret) < 0) { @@ -1349,7 +1346,7 @@ int qemuAgentFSThaw(qemuAgentPtr mon) if (!cmd) return -1; - if (qemuAgentCommand(mon, cmd, &reply, true, mon->timeout) < 0) + if (qemuAgentCommand(mon, cmd, &reply, mon->timeout) < 0) goto cleanup; if (virJSONValueObjectGetNumberInt(reply, "return", &ret) < 0) { @@ -1386,7 +1383,7 @@ qemuAgentSuspend(qemuAgentPtr mon, return -1; mon->await_event = QEMU_AGENT_EVENT_SUSPEND; - ret = qemuAgentCommand(mon, cmd, &reply, false, mon->timeout); + ret = qemuAgentCommand(mon, cmd, &reply, mon->timeout); virJSONValueFree(cmd); virJSONValueFree(reply); @@ -1415,7 +1412,7 @@ qemuAgentArbitraryCommand(qemuAgentPtr mon, if (!(cmd = virJSONValueFromString(cmd_str))) goto cleanup; - if ((ret = qemuAgentCommand(mon, cmd, &reply, true, timeout)) < 0) + if ((ret = qemuAgentCommand(mon, cmd, &reply, timeout)) < 0) goto cleanup; if (!(*result = virJSONValueToString(reply, false))) @@ -1442,7 +1439,7 @@ qemuAgentFSTrim(qemuAgentPtr mon, if (!cmd) return ret; - ret = qemuAgentCommand(mon, cmd, &reply, false, mon->timeout); + ret = qemuAgentCommand(mon, cmd, &reply, mon->timeout);
Oh this is fun. This 'false' you are removing should have been true as the command does have a reply that we need to wait for.
virJSONValueFree(cmd); virJSONValueFree(reply);
Michal
participants (2)
-
Michal Privoznik
-
Nikolay Shirokovskiy