[libvirt PATCH 0/4] qemu: Fix post-copy recovery after dest domain gets paused

See 3/4 for details. Jiri Denemark (4): conf: Drop virDomainJobOperation parameter from virDomainObjIsPostcopy conf: Add job parameter to virDomainObjIsFailedPostcopy qemu: Remember failed post-copy migration in job virDomainObjGetState: Promote VIR_DOMAIN_PAUSED_POSTCOPY_FAILED src/conf/domain_conf.c | 41 +++++++++++++++------------- src/conf/domain_conf.h | 5 ++-- src/conf/virdomainjob.c | 1 + src/conf/virdomainjob.h | 1 + src/qemu/qemu_domainjob.c | 9 +++++++ src/qemu/qemu_driver.c | 4 +-- src/qemu/qemu_migration.c | 56 ++++++++++++++++++++++++--------------- src/qemu/qemu_process.c | 23 +++++++++++++--- 8 files changed, 92 insertions(+), 48 deletions(-) -- 2.39.0

The parameter was only used to select which states correspond to an active or failed post-copy migration. But these states are either applicable to both operations or the check would just paper over a code bug in case of an impossible combination of state and operation. By dropping the check we can make the code simpler and also reuse existing virDomainObjIsFailedPostcopy function and only check for active post-copy states. Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- src/conf/domain_conf.c | 23 +++++++---------------- src/conf/domain_conf.h | 3 +-- src/qemu/qemu_driver.c | 4 ++-- src/qemu/qemu_migration.c | 6 +++--- src/qemu/qemu_process.c | 6 +++--- 5 files changed, 16 insertions(+), 26 deletions(-) diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index a180398b14..d1def730a4 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -27883,24 +27883,15 @@ virDomainObjIsFailedPostcopy(virDomainObj *dom) bool -virDomainObjIsPostcopy(virDomainObj *dom, - virDomainJobOperation op) +virDomainObjIsPostcopy(virDomainObj *dom) { - if (op != VIR_DOMAIN_JOB_OPERATION_MIGRATION_IN && - op != VIR_DOMAIN_JOB_OPERATION_MIGRATION_OUT) - return false; - - if (op == VIR_DOMAIN_JOB_OPERATION_MIGRATION_IN) { - return (dom->state.state == VIR_DOMAIN_PAUSED && - dom->state.reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED) || - (dom->state.state == VIR_DOMAIN_RUNNING && - (dom->state.reason == VIR_DOMAIN_RUNNING_POSTCOPY || - dom->state.reason == VIR_DOMAIN_RUNNING_POSTCOPY_FAILED)); - } + if (virDomainObjIsFailedPostcopy(dom)) + return true; - return dom->state.state == VIR_DOMAIN_PAUSED && - (dom->state.reason == VIR_DOMAIN_PAUSED_POSTCOPY || - dom->state.reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED); + return (dom->state.state == VIR_DOMAIN_PAUSED && + dom->state.reason == VIR_DOMAIN_PAUSED_POSTCOPY) || + (dom->state.state == VIR_DOMAIN_RUNNING && + dom->state.reason == VIR_DOMAIN_RUNNING_POSTCOPY); } diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index c19dfc5470..3abf018574 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -3933,8 +3933,7 @@ bool virDomainObjIsFailedPostcopy(virDomainObj *obj) ATTRIBUTE_NONNULL(1); bool -virDomainObjIsPostcopy(virDomainObj *dom, - virDomainJobOperation op) +virDomainObjIsPostcopy(virDomainObj *dom) ATTRIBUTE_NONNULL(1); virSecurityLabelDef * diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index d509582719..dea3d65b57 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -12614,7 +12614,7 @@ qemuDomainAbortJobFlags(virDomainPtr dom, if (flags & VIR_DOMAIN_ABORT_JOB_POSTCOPY && (vm->job->asyncJob != VIR_ASYNC_JOB_MIGRATION_OUT || - !virDomainObjIsPostcopy(vm, VIR_DOMAIN_JOB_OPERATION_MIGRATION_OUT))) { + !virDomainObjIsPostcopy(vm))) { virReportError(VIR_ERR_OPERATION_INVALID, "%s", _("current job is not outgoing migration in post-copy mode")); goto endjob; @@ -12639,7 +12639,7 @@ qemuDomainAbortJobFlags(virDomainPtr dom, break; case VIR_ASYNC_JOB_MIGRATION_OUT: - if (virDomainObjIsPostcopy(vm, VIR_DOMAIN_JOB_OPERATION_MIGRATION_OUT)) + if (virDomainObjIsPostcopy(vm)) ret = qemuDomainAbortJobPostcopy(vm, flags); else ret = qemuDomainAbortJobMigration(vm); diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index f4441d61ae..fd8583ca34 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -2294,7 +2294,7 @@ qemuMigrationAnyConnectionClosed(virDomainObj *vm, break; case QEMU_MIGRATION_PHASE_PERFORM3_DONE: - if (virDomainObjIsPostcopy(vm, VIR_DOMAIN_JOB_OPERATION_MIGRATION_OUT)) { + if (virDomainObjIsPostcopy(vm)) { VIR_DEBUG("Migration protocol interrupted in post-copy mode"); postcopy = true; } else { @@ -2683,7 +2683,7 @@ qemuMigrationAnyCanResume(virDomainObj *vm, return false; } - if (!virDomainObjIsPostcopy(vm, vm->job->current->operation)) { + if (!virDomainObjIsPostcopy(vm)) { virReportError(VIR_ERR_OPERATION_INVALID, _("migration of domain %s is not in post-copy phase"), vm->def->name); @@ -3911,7 +3911,7 @@ qemuMigrationSrcConfirmPhase(virQEMUDriver *driver, virCheckFlags(QEMU_MIGRATION_FLAGS, -1); if (retcode != 0 && - virDomainObjIsPostcopy(vm, VIR_DOMAIN_JOB_OPERATION_MIGRATION_OUT) && + virDomainObjIsPostcopy(vm) && currentData->stats.mig.status == QEMU_MONITOR_MIGRATION_STATUS_COMPLETED) { VIR_DEBUG("Finish phase failed, but QEMU reports post-copy migration is completed; forcing success"); retcode = 0; diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index b6adcf2f2a..2e989256ff 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -1503,7 +1503,7 @@ qemuProcessHandleMigrationStatus(qemuMonitor *mon G_GNUC_UNUSED, * watching it in any thread. Let's make sure the migration is properly * finished in case we get a "completed" event. */ - if (virDomainObjIsPostcopy(vm, vm->job->current->operation) && + if (virDomainObjIsPostcopy(vm) && vm->job->phase == QEMU_MIGRATION_PHASE_POSTCOPY_FAILED && vm->job->asyncOwner == 0) { qemuProcessEventSubmit(vm, QEMU_PROCESS_EVENT_UNATTENDED_MIGRATION, @@ -3476,7 +3476,7 @@ qemuProcessRecoverMigrationIn(virQEMUDriver *driver, /* migration finished, we started resuming the domain but didn't * confirm success or failure yet; killing it seems safest unless * we already started guest CPUs or we were in post-copy mode */ - if (virDomainObjIsPostcopy(vm, VIR_DOMAIN_JOB_OPERATION_MIGRATION_IN)) + if (virDomainObjIsPostcopy(vm)) return 1; if (state != VIR_DOMAIN_RUNNING) { @@ -3511,7 +3511,7 @@ qemuProcessRecoverMigrationOut(virQEMUDriver *driver, int reason, unsigned int *stopFlags) { - bool postcopy = virDomainObjIsPostcopy(vm, VIR_DOMAIN_JOB_OPERATION_MIGRATION_OUT); + bool postcopy = virDomainObjIsPostcopy(vm); bool resume = false; VIR_DEBUG("Active outgoing migration in phase %s", -- 2.39.0

Unused for now, but this will change soon. Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- src/conf/domain_conf.c | 8 +++++--- src/conf/domain_conf.h | 6 ++++-- src/qemu/qemu_driver.c | 4 ++-- src/qemu/qemu_migration.c | 22 +++++++++++----------- src/qemu/qemu_process.c | 8 ++++---- 5 files changed, 26 insertions(+), 22 deletions(-) diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index d1def730a4..9e2eea79e7 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -27873,7 +27873,8 @@ virDomainObjGetState(virDomainObj *dom, int *reason) bool -virDomainObjIsFailedPostcopy(virDomainObj *dom) +virDomainObjIsFailedPostcopy(virDomainObj *dom, + virDomainJobObj *job G_GNUC_UNUSED) { return ((dom->state.state == VIR_DOMAIN_PAUSED && dom->state.reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED) || @@ -27883,9 +27884,10 @@ virDomainObjIsFailedPostcopy(virDomainObj *dom) bool -virDomainObjIsPostcopy(virDomainObj *dom) +virDomainObjIsPostcopy(virDomainObj *dom, + virDomainJobObj *job) { - if (virDomainObjIsFailedPostcopy(dom)) + if (virDomainObjIsFailedPostcopy(dom, job)) return true; return (dom->state.state == VIR_DOMAIN_PAUSED && diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index 3abf018574..becc147dab 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -3930,10 +3930,12 @@ virDomainObjGetState(virDomainObj *obj, int *reason) ATTRIBUTE_NONNULL(1); bool -virDomainObjIsFailedPostcopy(virDomainObj *obj) +virDomainObjIsFailedPostcopy(virDomainObj *obj, + virDomainJobObj *job) ATTRIBUTE_NONNULL(1); bool -virDomainObjIsPostcopy(virDomainObj *dom) +virDomainObjIsPostcopy(virDomainObj *dom, + virDomainJobObj *job) ATTRIBUTE_NONNULL(1); virSecurityLabelDef * diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index dea3d65b57..2257a17cde 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -12614,7 +12614,7 @@ qemuDomainAbortJobFlags(virDomainPtr dom, if (flags & VIR_DOMAIN_ABORT_JOB_POSTCOPY && (vm->job->asyncJob != VIR_ASYNC_JOB_MIGRATION_OUT || - !virDomainObjIsPostcopy(vm))) { + !virDomainObjIsPostcopy(vm, vm->job))) { virReportError(VIR_ERR_OPERATION_INVALID, "%s", _("current job is not outgoing migration in post-copy mode")); goto endjob; @@ -12639,7 +12639,7 @@ qemuDomainAbortJobFlags(virDomainPtr dom, break; case VIR_ASYNC_JOB_MIGRATION_OUT: - if (virDomainObjIsPostcopy(vm)) + if (virDomainObjIsPostcopy(vm, vm->job)) ret = qemuDomainAbortJobPostcopy(vm, flags); else ret = qemuDomainAbortJobMigration(vm); diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index fd8583ca34..27a74795d6 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -2294,7 +2294,7 @@ qemuMigrationAnyConnectionClosed(virDomainObj *vm, break; case QEMU_MIGRATION_PHASE_PERFORM3_DONE: - if (virDomainObjIsPostcopy(vm)) { + if (virDomainObjIsPostcopy(vm, vm->job)) { VIR_DEBUG("Migration protocol interrupted in post-copy mode"); postcopy = true; } else { @@ -2683,7 +2683,7 @@ qemuMigrationAnyCanResume(virDomainObj *vm, return false; } - if (!virDomainObjIsPostcopy(vm)) { + if (!virDomainObjIsPostcopy(vm, vm->job)) { virReportError(VIR_ERR_OPERATION_INVALID, _("migration of domain %s is not in post-copy phase"), vm->def->name); @@ -2691,7 +2691,7 @@ qemuMigrationAnyCanResume(virDomainObj *vm, } if (vm->job->phase < QEMU_MIGRATION_PHASE_POSTCOPY_FAILED && - !virDomainObjIsFailedPostcopy(vm)) { + !virDomainObjIsFailedPostcopy(vm, vm->job)) { virReportError(VIR_ERR_OPERATION_INVALID, _("post-copy migration of domain %s has not failed"), vm->def->name); @@ -3911,7 +3911,7 @@ qemuMigrationSrcConfirmPhase(virQEMUDriver *driver, virCheckFlags(QEMU_MIGRATION_FLAGS, -1); if (retcode != 0 && - virDomainObjIsPostcopy(vm) && + virDomainObjIsPostcopy(vm, vm->job) && currentData->stats.mig.status == QEMU_MONITOR_MIGRATION_STATUS_COMPLETED) { VIR_DEBUG("Finish phase failed, but QEMU reports post-copy migration is completed; forcing success"); retcode = 0; @@ -3919,7 +3919,7 @@ qemuMigrationSrcConfirmPhase(virQEMUDriver *driver, if (flags & VIR_MIGRATE_POSTCOPY_RESUME) { phase = QEMU_MIGRATION_PHASE_CONFIRM_RESUME; - } else if (virDomainObjIsFailedPostcopy(vm)) { + } else if (virDomainObjIsFailedPostcopy(vm, vm->job)) { /* Keep the original migration phase in case post-copy failed as the * job will stay active even though migration API finishes with an * error. @@ -3979,7 +3979,7 @@ qemuMigrationSrcConfirmPhase(virQEMUDriver *driver, if (virDomainObjGetState(vm, &reason) == VIR_DOMAIN_PAUSED && reason == VIR_DOMAIN_PAUSED_POSTCOPY) { qemuMigrationSrcPostcopyFailed(vm); - } else if (!virDomainObjIsFailedPostcopy(vm)) { + } else if (!virDomainObjIsFailedPostcopy(vm, vm->job)) { qemuMigrationSrcRestoreDomainState(driver, vm); qemuMigrationParamsReset(vm, VIR_ASYNC_JOB_MIGRATION_OUT, @@ -4020,7 +4020,7 @@ qemuMigrationSrcConfirm(virQEMUDriver *driver, * job will stay active even though migration API finishes with an * error. */ - if (virDomainObjIsFailedPostcopy(vm)) + if (virDomainObjIsFailedPostcopy(vm, vm->job)) phase = vm->job->phase; else if (cancelled) phase = QEMU_MIGRATION_PHASE_CONFIRM3_CANCELLED; @@ -4039,7 +4039,7 @@ qemuMigrationSrcConfirm(virQEMUDriver *driver, cookiein, cookieinlen, flags, cancelled); - if (virDomainObjIsFailedPostcopy(vm)) { + if (virDomainObjIsFailedPostcopy(vm, vm->job)) { ignore_value(qemuMigrationJobSetPhase(vm, QEMU_MIGRATION_PHASE_POSTCOPY_FAILED)); qemuMigrationJobContinue(vm, qemuProcessCleanupMigrationJob); } else { @@ -6093,7 +6093,7 @@ qemuMigrationSrcPerformJob(virQEMUDriver *driver, if (ret < 0) virErrorPreserveLast(&orig_err); - if (virDomainObjIsFailedPostcopy(vm)) { + if (virDomainObjIsFailedPostcopy(vm, vm->job)) { ignore_value(qemuMigrationJobSetPhase(vm, QEMU_MIGRATION_PHASE_POSTCOPY_FAILED)); qemuMigrationJobContinue(vm, qemuProcessCleanupMigrationJob); } else { @@ -6184,7 +6184,7 @@ qemuMigrationSrcPerformPhase(virQEMUDriver *driver, ret = 0; cleanup: - if (ret < 0 && !virDomainObjIsFailedPostcopy(vm)) { + if (ret < 0 && !virDomainObjIsFailedPostcopy(vm, vm->job)) { qemuMigrationSrcRestoreDomainState(driver, vm); qemuMigrationParamsReset(vm, VIR_ASYNC_JOB_MIGRATION_OUT, jobPriv->migParams, vm->job->apiFlags); @@ -6725,7 +6725,7 @@ qemuMigrationDstFinishActive(virQEMUDriver *driver, } } - if (virDomainObjIsFailedPostcopy(vm)) { + if (virDomainObjIsFailedPostcopy(vm, vm->job)) { ignore_value(qemuMigrationJobSetPhase(vm, QEMU_MIGRATION_PHASE_POSTCOPY_FAILED)); qemuProcessAutoDestroyRemove(driver, vm); *finishJob = false; diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 2e989256ff..6091c9f1a9 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -1473,7 +1473,7 @@ qemuProcessHandleMigrationStatus(qemuMonitor *mon G_GNUC_UNUSED, break; case QEMU_MONITOR_MIGRATION_STATUS_POSTCOPY_RECOVER: - if (virDomainObjIsFailedPostcopy(vm)) { + if (virDomainObjIsFailedPostcopy(vm, vm->job)) { int eventType = -1; int eventDetail = -1; @@ -1503,7 +1503,7 @@ qemuProcessHandleMigrationStatus(qemuMonitor *mon G_GNUC_UNUSED, * watching it in any thread. Let's make sure the migration is properly * finished in case we get a "completed" event. */ - if (virDomainObjIsPostcopy(vm) && + if (virDomainObjIsPostcopy(vm, vm->job) && vm->job->phase == QEMU_MIGRATION_PHASE_POSTCOPY_FAILED && vm->job->asyncOwner == 0) { qemuProcessEventSubmit(vm, QEMU_PROCESS_EVENT_UNATTENDED_MIGRATION, @@ -3476,7 +3476,7 @@ qemuProcessRecoverMigrationIn(virQEMUDriver *driver, /* migration finished, we started resuming the domain but didn't * confirm success or failure yet; killing it seems safest unless * we already started guest CPUs or we were in post-copy mode */ - if (virDomainObjIsPostcopy(vm)) + if (virDomainObjIsPostcopy(vm, job)) return 1; if (state != VIR_DOMAIN_RUNNING) { @@ -3511,7 +3511,7 @@ qemuProcessRecoverMigrationOut(virQEMUDriver *driver, int reason, unsigned int *stopFlags) { - bool postcopy = virDomainObjIsPostcopy(vm); + bool postcopy = virDomainObjIsPostcopy(vm, job); bool resume = false; VIR_DEBUG("Active outgoing migration in phase %s", -- 2.39.0

When post-copy migration fails, the domain stays running on the destination with a VIR_DOMAIN_RUNNING_POSTCOPY_FAILED reason. Both the state and the reason can later be rewritten in case the domain gets paused for other reasons (such as an I/O error). Thus we need a separate place to remember the post-copy migration failed to be able to resume the migration. https://bugzilla.redhat.com/show_bug.cgi?id=2111948 Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- src/conf/domain_conf.c | 7 ++++++- src/conf/virdomainjob.c | 1 + src/conf/virdomainjob.h | 1 + src/qemu/qemu_domainjob.c | 9 +++++++++ src/qemu/qemu_migration.c | 34 +++++++++++++++++++++++----------- src/qemu/qemu_process.c | 15 +++++++++++++++ 6 files changed, 55 insertions(+), 12 deletions(-) diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 9e2eea79e7..f83586c549 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -27874,8 +27874,13 @@ virDomainObjGetState(virDomainObj *dom, int *reason) bool virDomainObjIsFailedPostcopy(virDomainObj *dom, - virDomainJobObj *job G_GNUC_UNUSED) + virDomainJobObj *job) { + if (job && job->asyncPaused && + (job->asyncJob == VIR_ASYNC_JOB_MIGRATION_IN || + job->asyncJob == VIR_ASYNC_JOB_MIGRATION_OUT)) + return true; + return ((dom->state.state == VIR_DOMAIN_PAUSED && dom->state.reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED) || (dom->state.state == VIR_DOMAIN_RUNNING && diff --git a/src/conf/virdomainjob.c b/src/conf/virdomainjob.c index 256b665a42..c4cbbe8f6d 100644 --- a/src/conf/virdomainjob.c +++ b/src/conf/virdomainjob.c @@ -174,6 +174,7 @@ virDomainObjResetAsyncJob(virDomainJobObj *job) job->asyncOwner = 0; g_clear_pointer(&job->asyncOwnerAPI, g_free); job->asyncStarted = 0; + job->asyncPaused = false; job->phase = 0; job->mask = VIR_JOB_DEFAULT_MASK; job->abortJob = false; diff --git a/src/conf/virdomainjob.h b/src/conf/virdomainjob.h index b1ac36a2fa..0d62bab287 100644 --- a/src/conf/virdomainjob.h +++ b/src/conf/virdomainjob.h @@ -176,6 +176,7 @@ struct _virDomainJobObj { unsigned long long asyncOwner; /* Thread which set current async job */ char *asyncOwnerAPI; /* The API which owns the async job */ unsigned long long asyncStarted; /* When the current async job started */ + bool asyncPaused; /* The async job is paused */ int phase; /* Job phase (mainly for migrations) */ unsigned long long mask; /* Jobs allowed during async job */ virDomainJobData *current; /* async job progress data */ diff --git a/src/qemu/qemu_domainjob.c b/src/qemu/qemu_domainjob.c index 8d958b9d21..27beb5229f 100644 --- a/src/qemu/qemu_domainjob.c +++ b/src/qemu/qemu_domainjob.c @@ -695,6 +695,8 @@ qemuDomainObjPrivateXMLFormatJob(virBuffer *buf, if (vm->job->asyncJob != VIR_ASYNC_JOB_NONE) { virBufferAsprintf(&attrBuf, " flags='0x%x'", vm->job->apiFlags); virBufferAsprintf(&attrBuf, " asyncStarted='%llu'", vm->job->asyncStarted); + if (vm->job->asyncPaused) + virBufferAddLit(&attrBuf, " asyncPaused='yes'"); } if (vm->job->cb && @@ -732,6 +734,7 @@ qemuDomainObjPrivateXMLParseJob(virDomainObj *vm, if ((tmp = virXPathString("string(@async)", ctxt))) { int async; + virTristateBool paused; if ((async = virDomainAsyncJobTypeFromString(tmp)) < 0) { virReportError(VIR_ERR_INTERNAL_ERROR, @@ -757,6 +760,12 @@ qemuDomainObjPrivateXMLParseJob(virDomainObj *vm, _("Invalid async job start")); return -1; } + + if (virXMLPropTristateBool(ctxt->node, "asyncPaused", VIR_XML_PROP_NONE, + &paused) < 0) + return -1; + + vm->job->asyncPaused = paused == VIR_TRISTATE_BOOL_YES; } if (virXMLPropUInt(ctxt->node, "flags", 16, VIR_XML_PROP_NONE, diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 27a74795d6..f258e7d700 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -1666,17 +1666,19 @@ qemuMigrationSrcPostcopyFailed(virDomainObj *vm) state = virDomainObjGetState(vm, &reason); - VIR_DEBUG("%s/%s", + VIR_DEBUG("%s/%s, asyncPaused=%u", virDomainStateTypeToString(state), - virDomainStateReasonToString(state, reason)); + virDomainStateReasonToString(state, reason), + vm->job->asyncPaused); if (state != VIR_DOMAIN_PAUSED || - reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED) + virDomainObjIsFailedPostcopy(vm, vm->job)) return; VIR_WARN("Migration of domain %s failed during post-copy; " "leaving the domain paused", vm->def->name); + vm->job->asyncPaused = true; virDomainObjSetState(vm, VIR_DOMAIN_PAUSED, VIR_DOMAIN_PAUSED_POSTCOPY_FAILED); event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_SUSPENDED, @@ -1696,21 +1698,31 @@ qemuMigrationDstPostcopyFailed(virDomainObj *vm) state = virDomainObjGetState(vm, &reason); - VIR_DEBUG("%s/%s", + VIR_DEBUG("%s/%s, asyncPaused=%u", virDomainStateTypeToString(state), - virDomainStateReasonToString(state, reason)); + virDomainStateReasonToString(state, reason), + vm->job->asyncPaused); - if (state != VIR_DOMAIN_RUNNING || - reason == VIR_DOMAIN_RUNNING_POSTCOPY_FAILED) + if ((state != VIR_DOMAIN_RUNNING && state != VIR_DOMAIN_PAUSED) || + virDomainObjIsFailedPostcopy(vm, vm->job)) return; VIR_WARN("Incoming migration of domain '%s' failed during post-copy; " "leaving the domain running", vm->def->name); - virDomainObjSetState(vm, VIR_DOMAIN_RUNNING, - VIR_DOMAIN_RUNNING_POSTCOPY_FAILED); - event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_RESUMED, - VIR_DOMAIN_EVENT_RESUMED_POSTCOPY_FAILED); + vm->job->asyncPaused = true; + if (state == VIR_DOMAIN_RUNNING) { + virDomainObjSetState(vm, VIR_DOMAIN_RUNNING, + VIR_DOMAIN_RUNNING_POSTCOPY_FAILED); + event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_RESUMED, + VIR_DOMAIN_EVENT_RESUMED_POSTCOPY_FAILED); + } else { + /* The domain was paused for other reasons (I/O error, ...) so we don't + * want to rewrite the original reason and just emit a postcopy-failed + * event. */ + event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_SUSPENDED, + VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY_FAILED); + } virObjectEventStateQueue(driver->domainEventState, event); } diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 6091c9f1a9..017a05d57e 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -712,6 +712,15 @@ qemuProcessHandleResume(qemuMonitor *mon G_GNUC_UNUSED, vm->def->name, virDomainRunningReasonTypeToString(reason), eventDetail); + /* When a domain is running in (failed) post-copy migration on the + * destination host, we need to make sure to set the appropriate reason + * here. */ + if (virDomainObjIsPostcopy(vm, vm->job)) { + if (virDomainObjIsFailedPostcopy(vm, vm->job)) + reason = VIR_DOMAIN_RUNNING_POSTCOPY_FAILED; + else + reason = VIR_DOMAIN_RUNNING_POSTCOPY; + } virDomainObjSetState(vm, VIR_DOMAIN_RUNNING, reason); event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_RESUMED, @@ -1491,6 +1500,7 @@ qemuProcessHandleMigrationStatus(qemuMonitor *mon G_GNUC_UNUSED, vm->def->name, virDomainStateTypeToString(state), NULLSTR(virDomainStateReasonToString(state, reason))); + vm->job->asyncPaused = false; virDomainObjSetState(vm, state, reason); event = virDomainEventLifecycleNewFromObj(vm, eventType, eventDetail); qemuDomainSaveStatus(vm); @@ -3420,6 +3430,7 @@ qemuProcessRestoreMigrationJob(virDomainObj *vm, job->privateData = g_steal_pointer(&vm->job->privateData); vm->job->privateData = jobPriv; vm->job->apiFlags = job->apiFlags; + vm->job->asyncPaused = job->asyncPaused; qemuDomainCleanupAdd(vm, qemuProcessCleanupMigrationJob); } @@ -3645,6 +3656,7 @@ qemuProcessRecoverMigration(virQEMUDriver *driver, if (migStatus == VIR_DOMAIN_JOB_STATUS_POSTCOPY) { VIR_DEBUG("Post-copy migration of domain %s still running, it will be handled as unattended", vm->def->name); + vm->job->asyncPaused = false; return 0; } @@ -3653,6 +3665,9 @@ qemuProcessRecoverMigration(virQEMUDriver *driver, qemuMigrationSrcPostcopyFailed(vm); else qemuMigrationDstPostcopyFailed(vm); + /* Set the asyncPaused flag in case we're reconnecting to a domain + * started by an older libvirt. */ + vm->job->asyncPaused = true; return 0; } -- 2.39.0

In case a domain is in failed post-copy migration but paused for a different reason, report VIR_DOMAIN_PAUSED_POSTCOPY_FAILED instead to make it more visible. --- Notes: This commit is intentionally missing Signed-off-by: Jiri Denemark <jdenemar@redhat.com> because I can find reasons for both pushing and not pushing this change. It is pretty ugly and masks the real reason behind the paused state. On the other hand, reporting the real reason means users would not really know post-copy migration failed unless they saw the corresponding event. src/conf/domain_conf.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index f83586c549..6097ea3ffc 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -27865,8 +27865,15 @@ virDomainObjCopyPersistentDef(virDomainObj *dom, virDomainState virDomainObjGetState(virDomainObj *dom, int *reason) { - if (reason) - *reason = dom->state.reason; + if (reason) { + if (dom->state.state == VIR_DOMAIN_PAUSED && + dom->job && dom->job->asyncPaused && + (dom->job->asyncJob == VIR_ASYNC_JOB_MIGRATION_IN || + dom->job->asyncJob == VIR_ASYNC_JOB_MIGRATION_OUT)) + *reason = VIR_DOMAIN_PAUSED_POSTCOPY_FAILED; + else + *reason = dom->state.reason; + } return dom->state.state; } -- 2.39.0

On 12/15/22 15:37, Jiri Denemark wrote:
See 3/4 for details.
Jiri Denemark (4): conf: Drop virDomainJobOperation parameter from virDomainObjIsPostcopy conf: Add job parameter to virDomainObjIsFailedPostcopy qemu: Remember failed post-copy migration in job virDomainObjGetState: Promote VIR_DOMAIN_PAUSED_POSTCOPY_FAILED
src/conf/domain_conf.c | 41 +++++++++++++++------------- src/conf/domain_conf.h | 5 ++-- src/conf/virdomainjob.c | 1 + src/conf/virdomainjob.h | 1 + src/qemu/qemu_domainjob.c | 9 +++++++ src/qemu/qemu_driver.c | 4 +-- src/qemu/qemu_migration.c | 56 ++++++++++++++++++++++++--------------- src/qemu/qemu_process.c | 23 +++++++++++++--- 8 files changed, 92 insertions(+), 48 deletions(-)
Reviewed-by: Michal Privoznik <mprivozn@redhat.com> Michal

On Tue, Dec 20, 2022 at 14:26:29 +0100, Michal Prívozník wrote:
On 12/15/22 15:37, Jiri Denemark wrote:
See 3/4 for details.
Jiri Denemark (4): conf: Drop virDomainJobOperation parameter from virDomainObjIsPostcopy conf: Add job parameter to virDomainObjIsFailedPostcopy qemu: Remember failed post-copy migration in job virDomainObjGetState: Promote VIR_DOMAIN_PAUSED_POSTCOPY_FAILED
src/conf/domain_conf.c | 41 +++++++++++++++------------- src/conf/domain_conf.h | 5 ++-- src/conf/virdomainjob.c | 1 + src/conf/virdomainjob.h | 1 + src/qemu/qemu_domainjob.c | 9 +++++++ src/qemu/qemu_driver.c | 4 +-- src/qemu/qemu_migration.c | 56 ++++++++++++++++++++++++--------------- src/qemu/qemu_process.c | 23 +++++++++++++--- 8 files changed, 92 insertions(+), 48 deletions(-)
Reviewed-by: Michal Privoznik <mprivozn@redhat.com>
Thanks. I decided 4/4 is so ugly we will be happier without it and pushed only the first three patches. Jirka
participants (2)
-
Jiri Denemark
-
Michal Prívozník