When libvirtd reconnects to a running QEMU process that had an in-progress migration, qemuProcessReconnect first connects the monitor and only later recovers the migration job. During this window the async job is VIR_ASYNC_JOB_NONE, so any MIGRATION status events from QEMU are silently dropped by qemuProcessHandleMigrationStatus. If the migration was already cancelled or completed by QEMU during this window, no further events will be emitted. When qemuMigrationSrcCancelUnattended later restores the async job and calls qemuMigrationSrcCancel with wait=true, the wait loop calls qemuDomainObjWait (virCondWait with no timeout) and blocks forever waiting for an event that will never arrive. qemuProcessRecoverMigration already queries QEMU for the current migration state via qemuMigrationAnyRefreshStatus and passes the result to qemuProcessRecoverMigrationOut as migStatus. Plumb that value one level further into qemuMigrationSrcCancelUnattended and, when it indicates the migration has already reached a terminal state (VIR_DOMAIN_JOB_STATUS_CANCELED), skip restoring the async job and the qemuMigrationSrcCancel/virDomainObjEndAsyncJob pair entirely. Signed-off-by: Denis V. 
Lunev <den@openvz.org> Suggested-by: Jiri Denemark <jdenemar@redhat.com> CC: Peter Krempa <pkrempa@redhat.com> CC: Michal Privoznik <mprivozn@redhat.com> --- Changes from v2: * passed status from the caller to qemuMigrationSrcCancelUnattended, as suggested by Jiri Changes from v1: * moved status checking to qemuMigrationSrcCancelUnattended src/qemu/qemu_migration.c | 47 ++++++++++++++++++++++++--------------- src/qemu/qemu_migration.h | 3 ++- src/qemu/qemu_process.c | 2 +- 3 files changed, 32 insertions(+), 20 deletions(-) diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 33cc0f0ffe..ffffeea75c 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -7390,7 +7390,8 @@ qemuMigrationSrcToFile(virDomainObj *vm, */ int qemuMigrationSrcCancelUnattended(virDomainObj *vm, - virDomainJobObj *oldJob) + virDomainJobObj *oldJob, + virDomainJobStatus migStatus) { bool storage = false; size_t i; @@ -7398,25 +7399,35 @@ qemuMigrationSrcCancelUnattended(virDomainObj *vm, VIR_DEBUG("Canceling unfinished outgoing migration of domain %s", vm->def->name); - /* Make sure MIGRATION event handler can store the current migration state - * in the job. + /* If QEMU has already reached a terminal state during the reconnect + * gap, skip restoring the async job and issuing migrate_cancel: QEMU + * won't emit any further MIGRATION events, so the wait loop in + * qemuMigrationSrcCancel would block forever. The migStatus passed in + * comes from the query-migrate call in qemuProcessRecoverMigration, + * which is authoritative for the state QEMU reached while no libvirtd + * was attached. 
*/ - if (!vm->job->current) { - qemuDomainObjRestoreAsyncJob(vm, VIR_ASYNC_JOB_MIGRATION_OUT, - oldJob->phase, oldJob->asyncStarted, - VIR_DOMAIN_JOB_OPERATION_MIGRATION_OUT, - QEMU_DOMAIN_JOB_STATS_TYPE_MIGRATION, - VIR_DOMAIN_JOB_STATUS_FAILED, - VIR_JOB_NONE); - } - - /* We're inside a MODIFY job and the restored MIGRATION_OUT async job is - * used only for processing migration events from QEMU. Thus we don't want - * to start a nested job for talking to QEMU. - */ - qemuMigrationSrcCancel(vm, VIR_ASYNC_JOB_NONE, true); + if (migStatus != VIR_DOMAIN_JOB_STATUS_CANCELED) { + /* Make sure MIGRATION event handler can store the current migration + * state in the job. + */ + if (!vm->job->current) { + qemuDomainObjRestoreAsyncJob(vm, VIR_ASYNC_JOB_MIGRATION_OUT, + oldJob->phase, oldJob->asyncStarted, + VIR_DOMAIN_JOB_OPERATION_MIGRATION_OUT, + QEMU_DOMAIN_JOB_STATS_TYPE_MIGRATION, + VIR_DOMAIN_JOB_STATUS_FAILED, + VIR_JOB_NONE); + } + + /* We're inside a MODIFY job and the restored MIGRATION_OUT async job is + * used only for processing migration events from QEMU. Thus we don't + * want to start a nested job for talking to QEMU. 
+ */ + qemuMigrationSrcCancel(vm, VIR_ASYNC_JOB_NONE, true); - virDomainObjEndAsyncJob(vm); + virDomainObjEndAsyncJob(vm); + } for (i = 0; i < vm->def->ndisks; i++) { virDomainDiskDef *disk = vm->def->disks[i]; diff --git a/src/qemu/qemu_migration.h b/src/qemu/qemu_migration.h index ef6a1563a0..59f32d2ebf 100644 --- a/src/qemu/qemu_migration.h +++ b/src/qemu/qemu_migration.h @@ -253,7 +253,8 @@ qemuMigrationSrcToFile(virDomainObj *vm, int qemuMigrationSrcCancelUnattended(virDomainObj *vm, - virDomainJobObj *oldJob); + virDomainJobObj *oldJob, + virDomainJobStatus migStatus); int qemuMigrationSrcCancel(virDomainObj *vm, diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 7ebc038e54..a6d33f6746 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3798,7 +3798,7 @@ qemuProcessRecoverMigrationOut(virQEMUDriver *driver, */ VIR_DEBUG("Cancelling unfinished migration of domain %s", vm->def->name); - if (qemuMigrationSrcCancelUnattended(vm, job) < 0) { + if (qemuMigrationSrcCancelUnattended(vm, job, migStatus) < 0) { VIR_WARN("Could not cancel ongoing migration of domain %s", vm->def->name); } -- 2.51.0