When recovering from a failed post-copy migration, we need to go through
all migration phases again, but don't need to repeat all the steps in
each phase. Let's create a new set of migration phases dedicated to
post-copy recovery so that we can easily distinguish between normal and
recovery code.
Signed-off-by: Jiri Denemark <jdenemar@redhat.com>
Reviewed-by: Peter Krempa <pkrempa@redhat.com>
Reviewed-by: Pavel Hrdina <phrdina@redhat.com>
---
Notes:
Version 2:
- additional comments
src/qemu/qemu_migration.c | 20 +++++++++++++++++++-
src/qemu/qemu_migration.h | 6 ++++++
src/qemu/qemu_process.c | 29 +++++++++++++++++++++++++++--
3 files changed, 52 insertions(+), 3 deletions(-)
diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c
index 02827bd975..710aae3eb7 100644
--- a/src/qemu/qemu_migration.c
+++ b/src/qemu/qemu_migration.c
@@ -79,6 +79,12 @@ VIR_ENUM_IMPL(qemuMigrationJobPhase,
"prepare",
"finish2",
"finish3",
+ "postcopy_failed",
+ "begin_resume",
+ "perform_resume",
+ "confirm_resume",
+ "prepare_resume",
+ "finish_resume",
);
@@ -139,7 +145,8 @@ qemuMigrationJobSetPhase(virDomainObj *vm,
{
qemuDomainObjPrivate *priv = vm->privateData;
- if (phase < priv->job.phase) {
+ if (phase < QEMU_MIGRATION_PHASE_POSTCOPY_FAILED &&
+ phase < priv->job.phase) {
VIR_ERROR(_("migration protocol going backwards %s => %s"),
qemuMigrationJobPhaseTypeToString(priv->job.phase),
qemuMigrationJobPhaseTypeToString(phase));
@@ -2328,18 +2335,29 @@ qemuMigrationSrcCleanup(virDomainObj *vm,
}
break;
+ case QEMU_MIGRATION_PHASE_BEGIN_RESUME:
+ case QEMU_MIGRATION_PHASE_PERFORM_RESUME:
+ qemuMigrationSrcPostcopyFailed(vm);
+ qemuDomainCleanupAdd(vm, qemuProcessCleanupMigrationJob);
+ qemuMigrationJobContinue(vm);
+ break;
+
case QEMU_MIGRATION_PHASE_PERFORM3:
/* cannot be seen without an active migration API; unreachable */
case QEMU_MIGRATION_PHASE_CONFIRM3:
case QEMU_MIGRATION_PHASE_CONFIRM3_CANCELLED:
+ case QEMU_MIGRATION_PHASE_CONFIRM_RESUME:
/* all done; unreachable */
case QEMU_MIGRATION_PHASE_PREPARE:
case QEMU_MIGRATION_PHASE_FINISH2:
case QEMU_MIGRATION_PHASE_FINISH3:
+ case QEMU_MIGRATION_PHASE_PREPARE_RESUME:
+ case QEMU_MIGRATION_PHASE_FINISH_RESUME:
/* incoming migration; unreachable */
case QEMU_MIGRATION_PHASE_PERFORM2:
/* single phase outgoing migration; unreachable */
case QEMU_MIGRATION_PHASE_NONE:
+ case QEMU_MIGRATION_PHASE_POSTCOPY_FAILED:
case QEMU_MIGRATION_PHASE_LAST:
/* unreachable */
;
diff --git a/src/qemu/qemu_migration.h b/src/qemu/qemu_migration.h
index 9351d6ac51..7eb0d4fe02 100644
--- a/src/qemu/qemu_migration.h
+++ b/src/qemu/qemu_migration.h
@@ -100,6 +100,12 @@ typedef enum {
QEMU_MIGRATION_PHASE_PREPARE,
QEMU_MIGRATION_PHASE_FINISH2,
QEMU_MIGRATION_PHASE_FINISH3,
+ QEMU_MIGRATION_PHASE_POSTCOPY_FAILED, /* marker for resume phases */
+ QEMU_MIGRATION_PHASE_BEGIN_RESUME,
+ QEMU_MIGRATION_PHASE_PERFORM_RESUME,
+ QEMU_MIGRATION_PHASE_CONFIRM_RESUME,
+ QEMU_MIGRATION_PHASE_PREPARE_RESUME,
+ QEMU_MIGRATION_PHASE_FINISH_RESUME,
QEMU_MIGRATION_PHASE_LAST
} qemuMigrationJobPhase;
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index 6dd643a38b..f752668b2f 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -3507,6 +3507,10 @@ qemuProcessRecoverMigrationIn(virQEMUDriver *driver,
case QEMU_MIGRATION_PHASE_PERFORM3_DONE:
case QEMU_MIGRATION_PHASE_CONFIRM3_CANCELLED:
case QEMU_MIGRATION_PHASE_CONFIRM3:
+ case QEMU_MIGRATION_PHASE_POSTCOPY_FAILED:
+ case QEMU_MIGRATION_PHASE_BEGIN_RESUME:
+ case QEMU_MIGRATION_PHASE_PERFORM_RESUME:
+ case QEMU_MIGRATION_PHASE_CONFIRM_RESUME:
case QEMU_MIGRATION_PHASE_LAST:
/* N/A for incoming migration */
break;
@@ -3540,6 +3544,10 @@ qemuProcessRecoverMigrationIn(virQEMUDriver *driver,
return -1;
}
break;
+
+ case QEMU_MIGRATION_PHASE_PREPARE_RESUME:
+ case QEMU_MIGRATION_PHASE_FINISH_RESUME:
+ return 1;
}
return 0;
@@ -3548,7 +3556,8 @@ qemuProcessRecoverMigrationIn(virQEMUDriver *driver,
/*
* Returns
- * -1 on error, the domain will be killed,
+ * -1 the domain should be killed (either after a successful migration or
+ * on error),
* 0 the domain should remain running with the migration job discarded,
* 1 the daemon was restarted during post-copy phase
*/
@@ -3556,6 +3565,7 @@ static int
qemuProcessRecoverMigrationOut(virQEMUDriver *driver,
virDomainObj *vm,
qemuDomainJobObj *job,
+ virDomainJobStatus migStatus,
virDomainState state,
int reason,
unsigned int *stopFlags)
@@ -3571,6 +3581,9 @@ qemuProcessRecoverMigrationOut(virQEMUDriver *driver,
case QEMU_MIGRATION_PHASE_PREPARE:
case QEMU_MIGRATION_PHASE_FINISH2:
case QEMU_MIGRATION_PHASE_FINISH3:
+ case QEMU_MIGRATION_PHASE_POSTCOPY_FAILED:
+ case QEMU_MIGRATION_PHASE_PREPARE_RESUME:
+ case QEMU_MIGRATION_PHASE_FINISH_RESUME:
case QEMU_MIGRATION_PHASE_LAST:
/* N/A for outgoing migration */
break;
@@ -3621,6 +3634,18 @@ qemuProcessRecoverMigrationOut(virQEMUDriver *driver,
/* migration completed, we need to kill the domain here */
*stopFlags |= VIR_QEMU_PROCESS_STOP_MIGRATED;
return -1;
+
+ case QEMU_MIGRATION_PHASE_CONFIRM_RESUME:
+ if (migStatus == VIR_DOMAIN_JOB_STATUS_HYPERVISOR_COMPLETED) {
+ /* migration completed, we need to kill the domain here */
+ *stopFlags |= VIR_QEMU_PROCESS_STOP_MIGRATED;
+ return -1;
+ }
+ return 1;
+
+ case QEMU_MIGRATION_PHASE_BEGIN_RESUME:
+ case QEMU_MIGRATION_PHASE_PERFORM_RESUME:
+ return 1;
}
if (resume) {
@@ -3659,7 +3684,7 @@ qemuProcessRecoverMigration(virQEMUDriver *driver,
qemuMigrationAnyRefreshStatus(driver, vm, VIR_ASYNC_JOB_NONE, &migStatus);
if (job->asyncJob == VIR_ASYNC_JOB_MIGRATION_OUT) {
- rc = qemuProcessRecoverMigrationOut(driver, vm, job,
+ rc = qemuProcessRecoverMigrationOut(driver, vm, job, migStatus,
state, reason, stopFlags);
} else {
rc = qemuProcessRecoverMigrationIn(driver, vm, job, state);
--
2.35.1