[PATCH 0/2] qemu: Avoid false failure when resuming post-copy migration

See 2/2 for details. Jiri Denemark (2): qemu: Add support for postcopy-recover-setup migration state qemu: Avoid false failure when resuming post-copy migration src/conf/virdomainjob.c | 1 + src/conf/virdomainjob.h | 1 + src/qemu/qemu_domain.h | 4 ++++ src/qemu/qemu_driver.c | 1 + src/qemu/qemu_migration.c | 32 +++++++++++++++++++++++++++++++- src/qemu/qemu_monitor.c | 1 + src/qemu/qemu_monitor.h | 1 + src/qemu/qemu_monitor_json.c | 1 + src/qemu/qemu_process.c | 4 ++++ 9 files changed, 45 insertions(+), 1 deletion(-) -- 2.45.2

This patch adds support for recognizing the new migration state reported by QEMU when post-copy recovery is requested. It is not actually used for anything yet. Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- src/conf/virdomainjob.c | 1 + src/conf/virdomainjob.h | 1 + src/qemu/qemu_domain.h | 4 ++++ src/qemu/qemu_driver.c | 1 + src/qemu/qemu_migration.c | 7 +++++++ src/qemu/qemu_monitor.c | 1 + src/qemu/qemu_monitor.h | 1 + src/qemu/qemu_monitor_json.c | 1 + src/qemu/qemu_process.c | 4 ++++ 9 files changed, 21 insertions(+) diff --git a/src/conf/virdomainjob.c b/src/conf/virdomainjob.c index 38f08f1d18..2d5a857a8c 100644 --- a/src/conf/virdomainjob.c +++ b/src/conf/virdomainjob.c @@ -106,6 +106,7 @@ virDomainJobStatusToType(virDomainJobStatus status) case VIR_DOMAIN_JOB_STATUS_HYPERVISOR_COMPLETED: case VIR_DOMAIN_JOB_STATUS_POSTCOPY: case VIR_DOMAIN_JOB_STATUS_POSTCOPY_PAUSED: + case VIR_DOMAIN_JOB_STATUS_POSTCOPY_RECOVER: case VIR_DOMAIN_JOB_STATUS_PAUSED: return VIR_DOMAIN_JOB_UNBOUNDED; diff --git a/src/conf/virdomainjob.h b/src/conf/virdomainjob.h index 0d62bab287..8b2dccd298 100644 --- a/src/conf/virdomainjob.h +++ b/src/conf/virdomainjob.h @@ -89,6 +89,7 @@ typedef enum { VIR_DOMAIN_JOB_STATUS_PAUSED, VIR_DOMAIN_JOB_STATUS_POSTCOPY, VIR_DOMAIN_JOB_STATUS_POSTCOPY_PAUSED, + VIR_DOMAIN_JOB_STATUS_POSTCOPY_RECOVER, VIR_DOMAIN_JOB_STATUS_COMPLETED, VIR_DOMAIN_JOB_STATUS_FAILED, VIR_DOMAIN_JOB_STATUS_CANCELED, diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index a5092dd7f0..af0bb04c45 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -199,6 +199,10 @@ struct _qemuDomainObjPrivate { * private XML. */ virBitmap *migrationCaps; + /* True if QEMU supports "postcopy-recover-setup" migration state. Checked + * QEMU enters the state, not to be stored in private XML. 
*/ + bool migrationRecoverSetup; + /* true if qemu-pr-helper process is running for the domain */ bool prDaemonRunning; diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 736602333e..3801ad623a 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -11963,6 +11963,7 @@ qemuDomainGetJobInfoMigrationStats(virDomainObj *vm, case VIR_DOMAIN_JOB_STATUS_POSTCOPY: case VIR_DOMAIN_JOB_STATUS_PAUSED: case VIR_DOMAIN_JOB_STATUS_POSTCOPY_PAUSED: + case VIR_DOMAIN_JOB_STATUS_POSTCOPY_RECOVER: if (qemuMigrationAnyFetchStats(vm, VIR_ASYNC_JOB_NONE, jobData, NULL) < 0) return -1; diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 4fd7a0aafb..4f02a9a053 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -1802,6 +1802,10 @@ qemuMigrationUpdateJobType(virDomainJobData *jobData) jobData->status = VIR_DOMAIN_JOB_STATUS_POSTCOPY; break; + case QEMU_MONITOR_MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP: + jobData->status = VIR_DOMAIN_JOB_STATUS_POSTCOPY_RECOVER; + break; + case QEMU_MONITOR_MIGRATION_STATUS_POSTCOPY_PAUSED: jobData->status = VIR_DOMAIN_JOB_STATUS_POSTCOPY_PAUSED; break; @@ -1943,6 +1947,7 @@ qemuMigrationJobCheckStatus(virDomainObj *vm, case VIR_DOMAIN_JOB_STATUS_MIGRATING: case VIR_DOMAIN_JOB_STATUS_HYPERVISOR_COMPLETED: case VIR_DOMAIN_JOB_STATUS_POSTCOPY: + case VIR_DOMAIN_JOB_STATUS_POSTCOPY_RECOVER: case VIR_DOMAIN_JOB_STATUS_PAUSED: break; } @@ -2028,6 +2033,7 @@ qemuMigrationAnyCompleted(virDomainObj *vm, case VIR_DOMAIN_JOB_STATUS_MIGRATING: case VIR_DOMAIN_JOB_STATUS_POSTCOPY: case VIR_DOMAIN_JOB_STATUS_PAUSED: + case VIR_DOMAIN_JOB_STATUS_POSTCOPY_RECOVER: /* The migration was aborted by us rather than QEMU itself. 
*/ jobData->status = VIR_DOMAIN_JOB_STATUS_FAILED; return -2; @@ -4669,6 +4675,7 @@ qemuMigrationSrcIsCanceled(virDomainObj *vm) case QEMU_MONITOR_MIGRATION_STATUS_POSTCOPY: case QEMU_MONITOR_MIGRATION_STATUS_POSTCOPY_RECOVER: + case QEMU_MONITOR_MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP: case QEMU_MONITOR_MIGRATION_STATUS_POSTCOPY_PAUSED: case QEMU_MONITOR_MIGRATION_STATUS_PRE_SWITCHOVER: case QEMU_MONITOR_MIGRATION_STATUS_DEVICE: diff --git a/src/qemu/qemu_monitor.c b/src/qemu/qemu_monitor.c index b1c0c6a064..7f65c23748 100644 --- a/src/qemu/qemu_monitor.c +++ b/src/qemu/qemu_monitor.c @@ -152,6 +152,7 @@ VIR_ENUM_IMPL(qemuMonitorMigrationStatus, "postcopy-active", "postcopy-paused", "postcopy-recover", + "postcopy-recover-setup", "completed", "failed", "cancelling", diff --git a/src/qemu/qemu_monitor.h b/src/qemu/qemu_monitor.h index 76c859a888..57d1b45bf5 100644 --- a/src/qemu/qemu_monitor.h +++ b/src/qemu/qemu_monitor.h @@ -762,6 +762,7 @@ typedef enum { QEMU_MONITOR_MIGRATION_STATUS_POSTCOPY, QEMU_MONITOR_MIGRATION_STATUS_POSTCOPY_PAUSED, QEMU_MONITOR_MIGRATION_STATUS_POSTCOPY_RECOVER, + QEMU_MONITOR_MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP, QEMU_MONITOR_MIGRATION_STATUS_COMPLETED, QEMU_MONITOR_MIGRATION_STATUS_ERROR, QEMU_MONITOR_MIGRATION_STATUS_CANCELLING, diff --git a/src/qemu/qemu_monitor_json.c b/src/qemu/qemu_monitor_json.c index 8a20ce57e6..2db38c1007 100644 --- a/src/qemu/qemu_monitor_json.c +++ b/src/qemu/qemu_monitor_json.c @@ -2969,6 +2969,7 @@ qemuMonitorJSONGetMigrationStatsReply(virJSONValue *reply, case QEMU_MONITOR_MIGRATION_STATUS_POSTCOPY: case QEMU_MONITOR_MIGRATION_STATUS_POSTCOPY_PAUSED: case QEMU_MONITOR_MIGRATION_STATUS_POSTCOPY_RECOVER: + case QEMU_MONITOR_MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP: case QEMU_MONITOR_MIGRATION_STATUS_COMPLETED: case QEMU_MONITOR_MIGRATION_STATUS_CANCELLING: case QEMU_MONITOR_MIGRATION_STATUS_PRE_SWITCHOVER: diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index cec739c984..a69878e8bb 100644 
--- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -1503,6 +1503,10 @@ qemuProcessHandleMigrationStatus(qemuMonitor *mon G_GNUC_UNUSED, } break; + case QEMU_MONITOR_MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP: + priv->migrationRecoverSetup = true; + break; + case QEMU_MONITOR_MIGRATION_STATUS_POSTCOPY_RECOVER: if (virDomainObjIsFailedPostcopy(vm, vm->job)) { int eventType = -1; -- 2.45.2

On Thu, Aug 08, 2024 at 13:41:03 +0200, Jiri Denemark wrote:
This patch adds support for recognizing the new migration state reported by QEMU when post-copy recovery is requested. It is not actually used for anything yet.
Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- src/conf/virdomainjob.c | 1 + src/conf/virdomainjob.h | 1 + src/qemu/qemu_domain.h | 4 ++++ src/qemu/qemu_driver.c | 1 + src/qemu/qemu_migration.c | 7 +++++++ src/qemu/qemu_monitor.c | 1 + src/qemu/qemu_monitor.h | 1 + src/qemu/qemu_monitor_json.c | 1 + src/qemu/qemu_process.c | 4 ++++ 9 files changed, 21 insertions(+) ... diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index a5092dd7f0..af0bb04c45 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -199,6 +199,10 @@ struct _qemuDomainObjPrivate { * private XML. */ virBitmap *migrationCaps;
+ /* True if QEMU supports "postcopy-recover-setup" migration state. Checked + * QEMU enters the state, not to be stored in private XML. */ + bool migrationRecoverSetup; + /* true if qemu-pr-helper process is running for the domain */ bool prDaemonRunning;
Oops, this new field also needs to be reset in qemuDomainObjPrivateDataClear(). Please consider the following hunk squashed in: diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index 1e355f0f41..f87ba6ba51 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -1873,6 +1873,8 @@ qemuDomainObjPrivateDataClear(qemuDomainObjPrivate *priv) virHashRemoveAll(priv->statsSchema); g_slist_free_full(g_steal_pointer(&priv->threadContextAliases), g_free); + + priv->migrationRecoverSetup = false; } Jirka

Depending on the timing between QEMU and libvirt, an attempt to resume a failed post-copy migration could immediately report a failure in the post-copy phase again even though the migration actually resumed and is progressing just fine. This is caused by QEMU reporting the original migration state (i.e., postcopy-paused) until migration is successfully resumed and QEMU switches to postcopy-active. QEMU 9.1 introduced a new postcopy-recover-setup migration state, which is entered immediately after requesting migration to be resumed, so we can now reliably wait for the migration to either continue or fail without being confused by the old state. https://issues.redhat.com/browse/RHEL-22166 Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- src/qemu/qemu_migration.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 4f02a9a053..7f905f8584 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -1962,6 +1962,7 @@ enum qemuMigrationCompletedFlags { QEMU_MIGRATION_COMPLETED_CHECK_STORAGE = (1 << 1), QEMU_MIGRATION_COMPLETED_POSTCOPY = (1 << 2), QEMU_MIGRATION_COMPLETED_PRE_SWITCHOVER = (1 << 3), + QEMU_MIRGATION_COMPLETED_RECOVERY = (1 << 4), }; @@ -2023,6 +2024,16 @@ qemuMigrationAnyCompleted(virDomainObj *vm, return 1; } + /* When QEMU is new enough to enter postcopy-recover-setup state during + * post-copy recovery, the source waits for the recovery to start + * before letting the destination wait for migration to complete.
+ */ + if (flags & QEMU_MIRGATION_COMPLETED_RECOVERY && + jobData->status == VIR_DOMAIN_JOB_STATUS_POSTCOPY) { + VIR_DEBUG("Post-copy recovery active"); + return 1; + } + if (jobData->status == VIR_DOMAIN_JOB_STATUS_HYPERVISOR_COMPLETED) return 1; else @@ -5115,6 +5126,7 @@ qemuMigrationSrcResume(virDomainObj *vm, char **cookieout, int *cookieoutlen, qemuMigrationSpec *spec, + virConnectPtr dconn, unsigned int flags) { qemuDomainObjPrivate *priv = vm->privateData; @@ -5145,6 +5157,17 @@ qemuMigrationSrcResume(virDomainObj *vm, if (rc < 0) return -1; + /* Wait for postcopy recovery to start (or fail) if QEMU is new enough to + * support postcopy-recover-setup migration state. */ + if (priv->migrationRecoverSetup) { + VIR_DEBUG("Waiting for post-copy recovery to start"); + if (qemuMigrationSrcWaitForCompletion(vm, VIR_ASYNC_JOB_MIGRATION_OUT, dconn, + QEMU_MIRGATION_COMPLETED_RECOVERY) < 0) + return -1; + } else { + VIR_WARN("QEMU is too old, we may report a failure in post-copy phase even though the migration may be running just fine"); + } + if (qemuMigrationCookieFormat(mig, driver, vm, QEMU_MIGRATION_SOURCE, cookieout, cookieoutlen, @@ -5249,7 +5272,7 @@ qemuMigrationSrcPerformNative(virQEMUDriver *driver, if (flags & VIR_MIGRATE_POSTCOPY_RESUME) { ret = qemuMigrationSrcResume(vm, migParams, cookiein, cookieinlen, - cookieout, cookieoutlen, &spec, flags); + cookieout, cookieoutlen, &spec, dconn, flags); } else { ret = qemuMigrationSrcRun(driver, vm, xmlin, persist_xml, cookiein, cookieinlen, cookieout, cookieoutlen, flags, resource, -- 2.45.2

On 8/8/24 13:41, Jiri Denemark wrote:
See 2/2 for details.
Jiri Denemark (2): qemu: Add support for postcopy-recover-setup migration state qemu: Avoid false failure when resuming post-copy migration
src/conf/virdomainjob.c | 1 + src/conf/virdomainjob.h | 1 + src/qemu/qemu_domain.h | 4 ++++ src/qemu/qemu_driver.c | 1 + src/qemu/qemu_migration.c | 32 +++++++++++++++++++++++++++++++- src/qemu/qemu_monitor.c | 1 + src/qemu/qemu_monitor.h | 1 + src/qemu/qemu_monitor_json.c | 1 + src/qemu/qemu_process.c | 4 ++++ 9 files changed, 45 insertions(+), 1 deletion(-)
Reviewed-by: Michal Privoznik <mprivozn@redhat.com> Michal
participants (2)
-
Jiri Denemark
-
Michal Prívozník