[PATCH 0/3] Add event for hypervisor faults and implement it for qemu

Peter Krempa (3): qemuMonitorJSONGetStatus: Refactor cleanup API: Add VIR_DOMAIN_PAUSED_HYPERVISOR_FAULT enum value qemu: Update paused reason on unexpected 'STOP' event examples/c/misc/event-test.c | 3 +++ include/libvirt/libvirt-domain.h | 2 ++ src/conf/domain_conf.c | 1 + src/qemu/qemu_domain.c | 4 ++++ src/qemu/qemu_domain.h | 1 + src/qemu/qemu_driver.c | 37 ++++++++++++++++++++++++++++++++ src/qemu/qemu_monitor.c | 4 +++- src/qemu/qemu_monitor_json.c | 18 ++++++---------- src/qemu/qemu_process.c | 22 ++++++++++++++++--- tools/virsh-domain-monitor.c | 1 + tools/virsh-domain.c | 3 ++- 11 files changed, 79 insertions(+), 17 deletions(-) -- 2.29.2

Use g_autofree for the JSON values to remove cleanup label and ret variable. Signed-off-by: Peter Krempa <pkrempa@redhat.com> --- src/qemu/qemu_monitor_json.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/src/qemu/qemu_monitor_json.c b/src/qemu/qemu_monitor_json.c index b669630bc8..e62adcb5b5 100644 --- a/src/qemu/qemu_monitor_json.c +++ b/src/qemu/qemu_monitor_json.c @@ -1729,10 +1729,9 @@ qemuMonitorJSONGetStatus(qemuMonitorPtr mon, bool *running, virDomainPausedReason *reason) { - int ret = -1; const char *status; - virJSONValuePtr cmd; - virJSONValuePtr reply = NULL; + g_autoptr(virJSONValue) cmd = NULL; + g_autoptr(virJSONValue) reply = NULL; virJSONValuePtr data; if (reason) @@ -1742,17 +1741,17 @@ qemuMonitorJSONGetStatus(qemuMonitorPtr mon, return -1; if (qemuMonitorJSONCommand(mon, cmd, &reply) < 0) - goto cleanup; + return -1; if (qemuMonitorJSONCheckReply(cmd, reply, VIR_JSON_TYPE_OBJECT) < 0) - goto cleanup; + return -1; data = virJSONValueObjectGetObject(reply, "return"); if (virJSONValueObjectGetBoolean(data, "running", running) < 0) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("query-status reply was missing running state")); - goto cleanup; + return -1; } if ((status = virJSONValueObjectGetString(data, "status"))) { @@ -1762,12 +1761,7 @@ qemuMonitorJSONGetStatus(qemuMonitorPtr mon, VIR_DEBUG("query-status reply was missing status details"); } - ret = 0; - - cleanup: - virJSONValueFree(cmd); - virJSONValueFree(reply); - return ret; + return 0; } -- 2.29.2

On 3/18/21 6:40 PM, Peter Krempa wrote:
Use g_autofree for the JSON values to remove cleanup label and ret variable.
Signed-off-by: Peter Krempa <pkrempa@redhat.com> --- src/qemu/qemu_monitor_json.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-)
Reviewed-by: Michal Privoznik <mprivozn@redhat.com> Michal

In certain cases hypervisors on encountering an internal fault pause the domain to allow gathering useful debugging information. Libvirt for now would report 'VIR_DOMAIN_PAUSED_UNKNOWN' which isn't entirely helpful. Add a new paused reason and the corresponding value for the suspended event, so that hypervisor drivers can report this state properly. In qemu this will be mapped to the 'internal-error' paused reason. Signed-off-by: Peter Krempa <pkrempa@redhat.com> --- examples/c/misc/event-test.c | 3 +++ include/libvirt/libvirt-domain.h | 2 ++ src/conf/domain_conf.c | 1 + src/qemu/qemu_domain.c | 3 +++ src/qemu/qemu_monitor.c | 4 +++- tools/virsh-domain-monitor.c | 1 + tools/virsh-domain.c | 3 ++- 7 files changed, 15 insertions(+), 2 deletions(-) diff --git a/examples/c/misc/event-test.c b/examples/c/misc/event-test.c index 76d4f3f6e8..c7d60d4c61 100644 --- a/examples/c/misc/event-test.c +++ b/examples/c/misc/event-test.c @@ -187,6 +187,9 @@ eventDetailToString(int event, case VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY_FAILED: return "Post-copy Error"; + case VIR_DOMAIN_EVENT_SUSPENDED_HYPERVISOR_FAULT: + return "Hypervisor fault"; + case VIR_DOMAIN_EVENT_SUSPENDED_LAST: break; } diff --git a/include/libvirt/libvirt-domain.h b/include/libvirt/libvirt-domain.h index 03c119fe26..f1c0e960f9 100644 --- a/include/libvirt/libvirt-domain.h +++ b/include/libvirt/libvirt-domain.h @@ -119,6 +119,7 @@ typedef enum { VIR_DOMAIN_PAUSED_STARTING_UP = 11, /* the domain is being started */ VIR_DOMAIN_PAUSED_POSTCOPY = 12, /* paused for post-copy migration */ VIR_DOMAIN_PAUSED_POSTCOPY_FAILED = 13, /* paused after failed post-copy */ + VIR_DOMAIN_PAUSED_HYPERVISOR_FAULT = 14, /* paused after a hypervisor failure */ # ifdef VIR_ENUM_SENTINELS VIR_DOMAIN_PAUSED_LAST @@ -3108,6 +3109,7 @@ typedef enum { VIR_DOMAIN_EVENT_SUSPENDED_API_ERROR = 6, /* suspended after failure during libvirt API call */ VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY = 7, /* suspended for post-copy migration */ VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY_FAILED = 8, /* suspended after failed post-copy */ + VIR_DOMAIN_EVENT_SUSPENDED_HYPERVISOR_FAULT = 9, /* suspended after a hypervisor fault */ # ifdef VIR_ENUM_SENTINELS VIR_DOMAIN_EVENT_SUSPENDED_LAST diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 47756ff0be..f49db930ca 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -1082,6 +1082,7 @@ VIR_ENUM_IMPL(virDomainPausedReason, "starting up", "post-copy", "post-copy failed", + "hypervisor fault", ); VIR_ENUM_IMPL(virDomainShutdownReason, diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index ed2a1481d4..dde9ba92c6 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -11180,6 +11180,9 @@ qemuDomainPausedReasonToSuspendedEvent(virDomainPausedReason reason) case VIR_DOMAIN_PAUSED_POSTCOPY: return VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY; + case VIR_DOMAIN_PAUSED_HYPERVISOR_FAULT: + return VIR_DOMAIN_EVENT_SUSPENDED_HYPERVISOR_FAULT; + case VIR_DOMAIN_PAUSED_UNKNOWN: case VIR_DOMAIN_PAUSED_USER: case VIR_DOMAIN_PAUSED_SAVE: diff --git a/src/qemu/qemu_monitor.c b/src/qemu/qemu_monitor.c index c43c6f180e..ca6bd79516 100644 --- a/src/qemu/qemu_monitor.c +++ b/src/qemu/qemu_monitor.c @@ -3508,10 +3508,12 @@ qemuMonitorVMStatusToPausedReason(const char *status) switch ((qemuMonitorVMStatus) st) { case QEMU_MONITOR_VM_STATUS_DEBUG: - case QEMU_MONITOR_VM_STATUS_INTERNAL_ERROR: case QEMU_MONITOR_VM_STATUS_RESTORE_VM: return VIR_DOMAIN_PAUSED_UNKNOWN; + case QEMU_MONITOR_VM_STATUS_INTERNAL_ERROR: + return VIR_DOMAIN_PAUSED_HYPERVISOR_FAULT; + case QEMU_MONITOR_VM_STATUS_INMIGRATE: case QEMU_MONITOR_VM_STATUS_POSTMIGRATE: case QEMU_MONITOR_VM_STATUS_FINISH_MIGRATE: diff --git a/tools/virsh-domain-monitor.c b/tools/virsh-domain-monitor.c index c4d7464695..169fd5931b 100644 --- a/tools/virsh-domain-monitor.c +++ b/tools/virsh-domain-monitor.c @@ -195,6 +195,7 @@ VIR_ENUM_IMPL(virshDomainPausedReason, N_("starting up"), N_("post-copy"), N_("post-copy failed"), + N_("hypervisor fault"), ); VIR_ENUM_DECL(virshDomainShutdownReason); diff --git a/tools/virsh-domain.c b/tools/virsh-domain.c index 7db88f700a..05b75edcda 100644 --- a/tools/virsh-domain.c +++ b/tools/virsh-domain.c @@ -12837,7 +12837,8 @@ VIR_ENUM_IMPL(virshDomainEventSuspended, N_("Snapshot"), N_("API error"), N_("Post-copy"), - N_("Post-copy Error")); + N_("Post-copy Error"), + N_("Hypervisor fault")); VIR_ENUM_DECL(virshDomainEventResumed); VIR_ENUM_IMPL(virshDomainEventResumed, -- 2.29.2

On 3/18/21 6:40 PM, Peter Krempa wrote:
In certain cases hypervisors on encountering an internal fault pause the domain to allow gathering useful debugging information. Libvirt for now would report 'VIR_DOMAIN_PAUSED_UNKNOWN' which isn't entirely helpful.
Add a new paused reason and the corresponding value for the suspended event, so that hypervisor drivers can report this state properly.
In qemu this will be mapped to the 'internal-error' paused reason.
Signed-off-by: Peter Krempa <pkrempa@redhat.com> --- examples/c/misc/event-test.c | 3 +++ include/libvirt/libvirt-domain.h | 2 ++ src/conf/domain_conf.c | 1 + src/qemu/qemu_domain.c | 3 +++ src/qemu/qemu_monitor.c | 4 +++- tools/virsh-domain-monitor.c | 1 + tools/virsh-domain.c | 3 ++- 7 files changed, 15 insertions(+), 2 deletions(-)
Reviewed-by: Michal Privoznik <mprivozn@redhat.com> Michal

The qemu STOP event doesn't really report why the VM was stopped. In certain cases we do expect this by storing the expectation in the private data. In cases we've encountered an unexpected STOP event we can offload to the event thread a job to refresh the state using 'query-status'. For all other paused reasons which are expected we keep the original logic. Signed-off-by: Peter Krempa <pkrempa@redhat.com> --- src/qemu/qemu_domain.c | 1 + src/qemu/qemu_domain.h | 1 + src/qemu/qemu_driver.c | 37 +++++++++++++++++++++++++++++++++++++ src/qemu/qemu_process.c | 22 +++++++++++++++++++--- 4 files changed, 58 insertions(+), 3 deletions(-) diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index dde9ba92c6..106a292ef3 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -11049,6 +11049,7 @@ qemuProcessEventFree(struct qemuProcessEvent *event) virObjectUnref(event->data); break; case QEMU_PROCESS_EVENT_PR_DISCONNECT: + case QEMU_PROCESS_EVENT_STOP: case QEMU_PROCESS_EVENT_LAST: break; } diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index 949307229b..b057e94df0 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -441,6 +441,7 @@ typedef enum { QEMU_PROCESS_EVENT_PR_DISCONNECT, QEMU_PROCESS_EVENT_RDMA_GID_STATUS_CHANGED, QEMU_PROCESS_EVENT_GUEST_CRASHLOADED, + QEMU_PROCESS_EVENT_STOP, QEMU_PROCESS_EVENT_LAST } qemuProcessEventType; diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index f3f8caab44..53be363338 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -4316,6 +4316,40 @@ processGuestCrashloadedEvent(virQEMUDriverPtr driver, } +static void +processGuestStopEvent(virDomainObjPtr vm) +{ + qemuDomainObjPrivatePtr priv = vm->privateData; + virObjectEventPtr event = NULL; + virDomainPausedReason reason = VIR_DOMAIN_PAUSED_UNKNOWN; + + if (qemuDomainObjBeginJob(priv->driver, vm, QEMU_JOB_MODIFY) == 0) { + bool running; + int rc; + + qemuDomainObjEnterMonitor(priv->driver, vm); + + rc = qemuMonitorGetStatus(priv->mon, &running, &reason); + + /* update reason only on successful state update */ + if (qemuDomainObjExitMonitor(priv->driver, vm) < 0) + rc = -1; + + qemuDomainObjEndJob(priv->driver, vm); + + if (rc < 0) + return; + } + + virDomainObjSetState(vm, VIR_DOMAIN_PAUSED, reason); + + event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_SUSPENDED, + qemuDomainPausedReasonToSuspendedEvent(reason)); + + virObjectEventStateQueue(priv->driver->domainEventState, event); +} + + static void qemuProcessEventHandler(void *data, void *opaque) { struct qemuProcessEvent *processEvent = data; @@ -4365,6 +4399,9 @@ static void qemuProcessEventHandler(void *data, void *opaque) case QEMU_PROCESS_EVENT_GUEST_CRASHLOADED: processGuestCrashloadedEvent(driver, vm); break; + case QEMU_PROCESS_EVENT_STOP: + processGuestStopEvent(vm); + break; case QEMU_PROCESS_EVENT_LAST: break; } diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 5f31260221..7003c92046 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -689,10 +689,26 @@ qemuProcessHandleStop(qemuMonitorPtr mon G_GNUC_UNUSED, if (priv->signalStop) virDomainObjBroadcast(vm); + /* In case of VIR_DOMAIN_PAUSED_UNKNOWN we'll update it later, but we + * must set the state to PAUSED right away. We delay the event until + * the update. */ virDomainObjSetState(vm, VIR_DOMAIN_PAUSED, reason); - event = virDomainEventLifecycleNewFromObj(vm, - VIR_DOMAIN_EVENT_SUSPENDED, - detail); + if (reason == VIR_DOMAIN_PAUSED_UNKNOWN) { + /* offload unknown state to thread updating state */ + struct qemuProcessEvent *processEvent = g_new0(struct qemuProcessEvent, 1); + + processEvent->eventType = QEMU_PROCESS_EVENT_STOP; + processEvent->vm = virObjectRef(vm); + + if (virThreadPoolSendJob(driver->workerPool, 0, processEvent) < 0) { + virObjectUnref(vm); + qemuProcessEventFree(processEvent); + } + } else { + event = virDomainEventLifecycleNewFromObj(vm, + VIR_DOMAIN_EVENT_SUSPENDED, + detail); + } VIR_FREE(priv->lockState); if (virDomainLockProcessPause(driver->lockManager, vm, &priv->lockState) < 0) -- 2.29.2

On 3/18/21 6:40 PM, Peter Krempa wrote:
The qemu STOP event doesn't really report why the VM was stopped. In certain cases we do expect this by storing the expectation in the private data.
In cases we've encountered an unexpected STOP event we can offload to the event thread a job to refresh the state using 'query-status'.
For all other paused reasons which are expected we keep the original logic.
Signed-off-by: Peter Krempa <pkrempa@redhat.com> --- src/qemu/qemu_domain.c | 1 + src/qemu/qemu_domain.h | 1 + src/qemu/qemu_driver.c | 37 +++++++++++++++++++++++++++++++++++++ src/qemu/qemu_process.c | 22 +++++++++++++++++++--- 4 files changed, 58 insertions(+), 3 deletions(-)
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index dde9ba92c6..106a292ef3 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -11049,6 +11049,7 @@ qemuProcessEventFree(struct qemuProcessEvent *event) virObjectUnref(event->data); break; case QEMU_PROCESS_EVENT_PR_DISCONNECT: + case QEMU_PROCESS_EVENT_STOP: case QEMU_PROCESS_EVENT_LAST: break; } diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index 949307229b..b057e94df0 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -441,6 +441,7 @@ typedef enum { QEMU_PROCESS_EVENT_PR_DISCONNECT, QEMU_PROCESS_EVENT_RDMA_GID_STATUS_CHANGED, QEMU_PROCESS_EVENT_GUEST_CRASHLOADED, + QEMU_PROCESS_EVENT_STOP,
QEMU_PROCESS_EVENT_LAST } qemuProcessEventType; diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index f3f8caab44..53be363338 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -4316,6 +4316,40 @@ processGuestCrashloadedEvent(virQEMUDriverPtr driver, }
+static void +processGuestStopEvent(virDomainObjPtr vm) +{ + qemuDomainObjPrivatePtr priv = vm->privateData; + virObjectEventPtr event = NULL; + virDomainPausedReason reason = VIR_DOMAIN_PAUSED_UNKNOWN; + + if (qemuDomainObjBeginJob(priv->driver, vm, QEMU_JOB_MODIFY) == 0) { + bool running; + int rc;
While @vm is locked when this function is called it was unlocked for a brief moment. Therefore, another thread might have changed its state. Hence, virDomainObjIsActive() is a must here.
+ + qemuDomainObjEnterMonitor(priv->driver, vm); + + rc = qemuMonitorGetStatus(priv->mon, &running, &reason); + + /* update reason only on successful state update */ + if (qemuDomainObjExitMonitor(priv->driver, vm) < 0) + rc = -1; + + qemuDomainObjEndJob(priv->driver, vm); + + if (rc < 0) + return; + } + + virDomainObjSetState(vm, VIR_DOMAIN_PAUSED, reason); +
Again, another thread might have just resumed vCPUS. Setting this here might lead to wrong state. What if we called SetState() iff !running && reason? I mean, SetState() was called from the in-loop handler already and this is here only to report reason (if any).
+ event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_SUSPENDED, + qemuDomainPausedReasonToSuspendedEvent(reason)); + + virObjectEventStateQueue(priv->driver->domainEventState, event); +} + + static void qemuProcessEventHandler(void *data, void *opaque) { struct qemuProcessEvent *processEvent = data; @@ -4365,6 +4399,9 @@ static void qemuProcessEventHandler(void *data, void *opaque) case QEMU_PROCESS_EVENT_GUEST_CRASHLOADED: processGuestCrashloadedEvent(driver, vm); break; + case QEMU_PROCESS_EVENT_STOP: + processGuestStopEvent(vm); + break; case QEMU_PROCESS_EVENT_LAST: break; } diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 5f31260221..7003c92046 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -689,10 +689,26 @@ qemuProcessHandleStop(qemuMonitorPtr mon G_GNUC_UNUSED, if (priv->signalStop) virDomainObjBroadcast(vm);
+ /* In case of VIR_DOMAIN_PAUSED_UNKNOWN we'll update it later, but we + * must set the state to PAUSED right away. We delay the event until + * the update. */ virDomainObjSetState(vm, VIR_DOMAIN_PAUSED, reason); - event = virDomainEventLifecycleNewFromObj(vm, - VIR_DOMAIN_EVENT_SUSPENDED, - detail); + if (reason == VIR_DOMAIN_PAUSED_UNKNOWN) { + /* offload unknown state to thread updating state */ + struct qemuProcessEvent *processEvent = g_new0(struct qemuProcessEvent, 1); + + processEvent->eventType = QEMU_PROCESS_EVENT_STOP; + processEvent->vm = virObjectRef(vm); + + if (virThreadPoolSendJob(driver->workerPool, 0, processEvent) < 0) { + virObjectUnref(vm); + qemuProcessEventFree(processEvent); + } + } else { + event = virDomainEventLifecycleNewFromObj(vm, + VIR_DOMAIN_EVENT_SUSPENDED, + detail); + }
VIR_FREE(priv->lockState); if (virDomainLockProcessPause(driver->lockManager, vm, &priv->lockState) < 0)
Michal
participants (2)
-
Michal Privoznik
-
Peter Krempa