[PATCH 0/3] qemu: Fix waiting for domain condition when qemu crashes

See 3/4.

Peter Krempa (4):
  qemuProcessBeginStopJob: Add debug log when waking up all threads
    waiting on domain condition
  qemu: Replace virDomainObjWait with qemuDomainObjWait
  qemuDomainObjWait: Report error when VM is being destroyed
  DO_NOT_APPLY_UPSTREAM: reproducer

 src/qemu/qemu_block.c     |  2 +-
 src/qemu/qemu_domain.c    | 17 +++++++++++++++++
 src/qemu/qemu_domain.h    |  3 +++
 src/qemu/qemu_driver.c    |  5 +++--
 src/qemu/qemu_migration.c | 12 ++++++------
 src/qemu/qemu_process.c   |  1 +
 6 files changed, 31 insertions(+), 9 deletions(-)

-- 
2.37.1

qemuProcessBeginStopJob: Add debug log when waking up all threads
waiting on domain condition

Aid in debugging of potentially stuck threads.

Signed-off-by: Peter Krempa <pkrempa@redhat.com>
---
 src/qemu/qemu_process.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index 478a46bff6..0f30247817 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -8145,6 +8145,7 @@ qemuProcessBeginStopJob(virQEMUDriver *driver,
         goto cleanup;
 
     /* Wake up anything waiting on domain condition */
+    VIR_DEBUG("waking up all jobs waiting on the domain condition");
     virDomainObjBroadcast(vm);
 
     if (qemuDomainObjBeginJob(driver, vm, job) < 0)
-- 
2.37.1
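[Editorial aside, not part of the patch: the "domain condition" above
is a condition variable that job threads sleep on, and the stop path
wakes all of them at once so each can re-check the domain's state.
Below is a minimal standalone sketch of that wait/broadcast pattern;
all names in it (struct dom, dom_wait, dom_broadcast, stopping) are
illustrative, not libvirt's actual API.]

    /* Sketch of the wait/broadcast pattern behind the new debug line. */
    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct dom {
        pthread_mutex_t lock;
        pthread_cond_t cond;
        bool stopping;              /* the state waiters re-check */
    };

    /* Waiter side: caller must hold 'lock'; sleeps until signalled. */
    static void
    dom_wait(struct dom *vm)
    {
        while (!vm->stopping)
            pthread_cond_wait(&vm->cond, &vm->lock);
    }

    /* Stop path: wake *every* waiter at once so each can re-check the
     * state; this is the moment the new VIR_DEBUG line makes visible. */
    static void
    dom_broadcast(struct dom *vm)
    {
        fprintf(stderr, "waking up all jobs waiting on the domain condition\n");
        pthread_cond_broadcast(&vm->cond);
    }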

qemu: Replace virDomainObjWait with qemuDomainObjWait

The qemu code will need to check other qemu-private conditions when
reporting success for waiting. Thus we must replace all uses of
virDomainObjWait with a qemu-specific helper. For now the helper
forwards directly to virDomainObjWait.

Signed-off-by: Peter Krempa <pkrempa@redhat.com>
---
 src/qemu/qemu_block.c     |  2 +-
 src/qemu/qemu_domain.c    |  7 +++++++
 src/qemu/qemu_domain.h    |  3 +++
 src/qemu/qemu_driver.c    |  4 ++--
 src/qemu/qemu_migration.c | 12 ++++++------
 5 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/src/qemu/qemu_block.c b/src/qemu/qemu_block.c
index 9fe22f18f2..bd95fe8a1f 100644
--- a/src/qemu/qemu_block.c
+++ b/src/qemu/qemu_block.c
@@ -2713,7 +2713,7 @@ qemuBlockStorageSourceCreateGeneric(virDomainObj *vm,
     qemuBlockJobUpdate(vm, job, asyncJob);
     while (qemuBlockJobIsRunning(job)) {
-        if (virDomainObjWait(vm) < 0)
+        if (qemuDomainObjWait(vm) < 0)
             goto cleanup;
         qemuBlockJobUpdate(vm, job, asyncJob);
     }
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index bc5961a09f..f68b7030c5 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -11780,3 +11780,10 @@ qemuDomainRemoveLogs(virQEMUDriver *driver,
 
     return 0;
 }
+
+
+int
+qemuDomainObjWait(virDomainObj *vm)
+{
+    return virDomainObjWait(vm);
+}
diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h
index 4680df1098..ce59c3e766 100644
--- a/src/qemu/qemu_domain.h
+++ b/src/qemu/qemu_domain.h
@@ -1108,3 +1108,6 @@ qemuDomainDeviceBackendChardevForeach(virDomainDef *def,
 int
 qemuDomainRemoveLogs(virQEMUDriver *driver,
                      const char *name);
+
+int
+qemuDomainObjWait(virDomainObj *vm);
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
index 019ec4a035..254957deba 100644
--- a/src/qemu/qemu_driver.c
+++ b/src/qemu/qemu_driver.c
@@ -3033,7 +3033,7 @@ qemuDumpWaitForCompletion(virDomainObj *vm)
     VIR_DEBUG("Waiting for dump completion");
     while (!jobPriv->dumpCompleted && !priv->job.abortJob) {
-        if (virDomainObjWait(vm) < 0)
+        if (qemuDomainObjWait(vm) < 0)
             return -1;
     }
 
@@ -14707,7 +14707,7 @@ qemuDomainBlockJobAbort(virDomainPtr dom,
     if (!async) {
         qemuBlockJobUpdate(vm, job, VIR_ASYNC_JOB_NONE);
         while (qemuBlockJobIsRunning(job)) {
-            if (virDomainObjWait(vm) < 0) {
+            if (qemuDomainObjWait(vm) < 0) {
                 ret = -1;
                 goto endjob;
             }
diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c
index 9eda279a84..b05bbce910 100644
--- a/src/qemu/qemu_migration.c
+++ b/src/qemu/qemu_migration.c
@@ -933,7 +933,7 @@ qemuMigrationSrcNBDCopyCancel(virQEMUDriver *driver,
         if (failed && !err)
             virErrorPreserveLast(&err);
 
-        if (virDomainObjWait(vm) < 0)
+        if (qemuDomainObjWait(vm) < 0)
             goto cleanup;
     }
 
@@ -1321,7 +1321,7 @@ qemuMigrationSrcNBDStorageCopy(virQEMUDriver *driver,
             return -1;
         }
 
-        if (virDomainObjWait(vm) < 0)
+        if (qemuDomainObjWait(vm) < 0)
             return -1;
     }
 
@@ -1798,7 +1798,7 @@ qemuMigrationSrcWaitForSpice(virDomainObj *vm)
     VIR_DEBUG("Waiting for SPICE to finish migration");
     while (!jobPriv->spiceMigrated && !priv->job.abortJob) {
-        if (virDomainObjWait(vm) < 0)
+        if (qemuDomainObjWait(vm) < 0)
             return -1;
     }
     return 0;
@@ -2096,7 +2096,7 @@ qemuMigrationSrcWaitForCompletion(virQEMUDriver *driver,
         if (rv < 0)
             return rv;
 
-        if (virDomainObjWait(vm) < 0) {
+        if (qemuDomainObjWait(vm) < 0) {
             if (virDomainObjIsActive(vm))
                 jobData->status = VIR_DOMAIN_JOB_STATUS_FAILED;
             return -2;
@@ -2135,7 +2135,7 @@ qemuMigrationDstWaitForCompletion(virQEMUDriver *driver,
     while ((rv = qemuMigrationAnyCompleted(driver, vm, asyncJob,
                                            NULL, flags)) != 1) {
-        if (rv < 0 || virDomainObjWait(vm) < 0)
+        if (rv < 0 || qemuDomainObjWait(vm) < 0)
             return -1;
     }
 
@@ -4983,7 +4983,7 @@ qemuMigrationSrcRun(virQEMUDriver *driver,
      */
     while (virDomainObjGetState(vm, NULL) == VIR_DOMAIN_RUNNING) {
         priv->signalStop = true;
-        rc = virDomainObjWait(vm);
+        rc = qemuDomainObjWait(vm);
         priv->signalStop = false;
         if (rc < 0)
             goto error;
-- 
2.37.1
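[Editorial aside, not part of the patch: routing every wait through a
driver-private helper is a common two-phase refactor. This patch lands
the mechanical rename as a no-op forwarder, so the follow-up can change
the wait semantics in exactly one place. A self-contained sketch of the
shape, with made-up generic names (obj_t, generic_wait, driver_wait):]

    typedef struct obj obj_t;

    int generic_wait(obj_t *o);     /* the existing shared helper */

    /* Phase 1 (this patch's approach): a pure forwarder, so the
     * tree-wide rename changes no behaviour and is easy to review. */
    int
    driver_wait(obj_t *o)
    {
        return generic_wait(o);
    }

    /* Phase 2 (the next patch in the series) adds driver-private
     * checks inside driver_wait() without touching any caller. */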

qemuDomainObjWait: Report error when VM is being destroyed

Since we started handling the monitor EOF event inside a job, any code
which uses virDomainObjWait would no longer properly abort when the VM
crashed during the wait. This is because virDomainObjWait uses
virDomainObjIsActive, which checks 'vm->def->id' to see whether the VM
is still active, and unfortunately the domain id is cleared in
qemuProcessStop, which is run only inside the job.

To fix this we can use the 'beingDestroyed' flag stored in the VM
private data, which is set to true around the time the condition is
signalled.

Reported-by: Pavel Hrdina <phrdina@redhat.com>
Fixes: 8c9ff9960b29d4703a99efdd1cadcf6f48799cc0
Signed-off-by: Peter Krempa <pkrempa@redhat.com>
---
 src/qemu/qemu_domain.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index f68b7030c5..d3b7fcc933 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -11785,5 +11785,15 @@ qemuDomainRemoveLogs(virQEMUDriver *driver,
 int
 qemuDomainObjWait(virDomainObj *vm)
 {
-    return virDomainObjWait(vm);
+    qemuDomainObjPrivate *priv = vm->privateData;
+
+    if (virDomainObjWait(vm) < 0)
+        return -1;
+
+    if (priv->beingDestroyed) {
+        virReportError(VIR_ERR_OPERATION_FAILED, "%s", _("domain is not running"));
+        return -1;
+    }
+
+    return 0;
 }
-- 
2.37.1
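[Editorial aside: the hang described above is easiest to see as a
timeline. The flow below is a simplified reconstruction from the commit
message, written as a C comment, not a trace of the actual sources.]

    /*
     * thread A (e.g. block job abort)   thread B (qemu died, monitor EOF)
     * --------------------------------  ---------------------------------
     * holds the job                     qemuProcessBeginStopJob():
     * qemuDomainObjWait(vm)                 priv->beingDestroyed = true;
     *   -> sleeps on the condition          virDomainObjBroadcast(vm);
     *   -> woken by the broadcast           waits to acquire the job,
     *   -> virDomainObjIsActive()           which thread A still holds
     *      still sees vm->def->id set,
     *      because only qemuProcessStop qemuProcessStop(), which clears
     *      clears it, and that runs     vm->def->id, cannot run until
     *      inside the job               thread A's job ends
     *   -> without this patch: sleeps again, forever;
     *      with it: sees priv->beingDestroyed and returns an error
     */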

DO_NOT_APPLY_UPSTREAM: reproducer

To reproduce the problem apply this patch and run a block copy job:

 virsh blockcopy cd hda /tmp/ble.img --transient-job --pivot

Prior to the fix the virsh call will hang and the VM will not be
usable any more.
---
 src/qemu/qemu_driver.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
index 254957deba..f51c70eb21 100644
--- a/src/qemu/qemu_driver.c
+++ b/src/qemu/qemu_driver.c
@@ -14705,6 +14705,7 @@ qemuDomainBlockJobAbort(virDomainPtr dom,
     qemuDomainSaveStatus(vm);
 
     if (!async) {
+        kill(vm->pid, 11);
         qemuBlockJobUpdate(vm, job, VIR_ASYNC_JOB_NONE);
         while (qemuBlockJobIsRunning(job)) {
             if (qemuDomainObjWait(vm) < 0) {
-- 
2.37.1
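[Editorial aside: signal 11 is SIGSEGV, so the injected kill() makes
the qemu process crash at exactly the point where the abort path is
about to wait on the domain condition. A hypothetical, more
self-documenting spelling of the injected line would be:]

    #include <signal.h>

    kill(vm->pid, SIGSEGV);     /* simulate qemu segfaulting mid-job */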

On a Wednesday in 2022, Peter Krempa wrote:
> See 3/4.

3/4 out of a 3-patch series, interesting.

> Peter Krempa (4):
>   qemuProcessBeginStopJob: Add debug log when waking up all threads
>     waiting on domain condition
>   qemu: Replace virDomainObjWait with qemuDomainObjWait
>   qemuDomainObjWait: Report error when VM is being destroyed

Reviewed-by: Ján Tomko <jtomko@redhat.com>

Jano

>   DO_NOT_APPLY_UPSTREAM: reproducer
>
>  src/qemu/qemu_block.c     |  2 +-
>  src/qemu/qemu_domain.c    | 17 +++++++++++++++++
>  src/qemu/qemu_domain.h    |  3 +++
>  src/qemu/qemu_driver.c    |  5 +++--
>  src/qemu/qemu_migration.c | 12 ++++++------
>  src/qemu/qemu_process.c   |  1 +
>  6 files changed, 31 insertions(+), 9 deletions(-)
>
> -- 
> 2.37.1