[libvirt PATCH 0/2] qemu_migration: Fix p2p post-copy recovery

Jiri Denemark (2): qemu_migration: Move qemuMigrationSrcPerformResume up qemu_migration: Fix p2p post-copy recovery src/qemu/qemu_migration.c | 129 ++++++++++++++++++++------------------ 1 file changed, 68 insertions(+), 61 deletions(-) -- 2.39.0

It will need to be called from a place above its current definition. Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- src/qemu/qemu_migration.c | 88 +++++++++++++++++++-------------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 2e87db883e..e8e0474558 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -5262,6 +5262,50 @@ qemuMigrationSrcPerformTunnel(virQEMUDriver *driver, } +static int +qemuMigrationSrcPerformResume(virQEMUDriver *driver, + virConnectPtr conn, + virDomainObj *vm, + const char *uri, + qemuMigrationParams *migParams, + const char *cookiein, + int cookieinlen, + char **cookieout, + int *cookieoutlen, + unsigned int flags) +{ + int ret; + + VIR_DEBUG("vm=%p, uri=%s", vm, uri); + + if (!qemuMigrationAnyCanResume(vm, VIR_ASYNC_JOB_MIGRATION_OUT, flags, + QEMU_MIGRATION_PHASE_BEGIN_RESUME)) + return -1; + + if (qemuMigrationJobStartPhase(vm, QEMU_MIGRATION_PHASE_PERFORM_RESUME) < 0) + return -1; + + virCloseCallbacksUnset(driver->closeCallbacks, vm, + qemuMigrationAnyConnectionClosed); + qemuDomainCleanupRemove(vm, qemuProcessCleanupMigrationJob); + + ret = qemuMigrationSrcPerformNative(driver, vm, NULL, uri, + cookiein, cookieinlen, + cookieout, cookieoutlen, flags, + 0, NULL, NULL, 0, NULL, migParams, NULL); + + if (virCloseCallbacksSet(driver->closeCallbacks, vm, conn, + qemuMigrationAnyConnectionClosed) < 0) + ret = -1; + + if (ret < 0) + ignore_value(qemuMigrationJobSetPhase(vm, QEMU_MIGRATION_PHASE_POSTCOPY_FAILED)); + + qemuMigrationJobContinue(vm, qemuProcessCleanupMigrationJob); + return ret; +} + + /* This is essentially a re-impl of virDomainMigrateVersion2 * from libvirt.c, but running in source libvirtd context, * instead of client app context & also adding in tunnel @@ -6074,50 +6118,6 @@ qemuMigrationSrcPerformJob(virQEMUDriver *driver, } -static int -qemuMigrationSrcPerformResume(virQEMUDriver *driver, - virConnectPtr conn, - virDomainObj *vm, - const char *uri, - qemuMigrationParams *migParams, - const char *cookiein, - int cookieinlen, - char **cookieout, - int *cookieoutlen, - unsigned int flags) -{ - int ret; - - VIR_DEBUG("vm=%p, uri=%s", vm, uri); - - if (!qemuMigrationAnyCanResume(vm, VIR_ASYNC_JOB_MIGRATION_OUT, flags, - QEMU_MIGRATION_PHASE_BEGIN_RESUME)) - return -1; - - if (qemuMigrationJobStartPhase(vm, QEMU_MIGRATION_PHASE_PERFORM_RESUME) < 0) - return -1; - - virCloseCallbacksUnset(driver->closeCallbacks, vm, - qemuMigrationAnyConnectionClosed); - qemuDomainCleanupRemove(vm, qemuProcessCleanupMigrationJob); - - ret = qemuMigrationSrcPerformNative(driver, vm, NULL, uri, - cookiein, cookieinlen, - cookieout, cookieoutlen, flags, - 0, NULL, NULL, 0, NULL, migParams, NULL); - - if (virCloseCallbacksSet(driver->closeCallbacks, vm, conn, - qemuMigrationAnyConnectionClosed) < 0) - ret = -1; - - if (ret < 0) - ignore_value(qemuMigrationJobSetPhase(vm, QEMU_MIGRATION_PHASE_POSTCOPY_FAILED)); - - qemuMigrationJobContinue(vm, qemuProcessCleanupMigrationJob); - return ret; -} - - /* * This implements perform phase of v3 migration protocol. */ -- 2.39.0

On Tue, Dec 13, 2022 at 15:15:28 +0100, Jiri Denemark wrote:
It will need to be called from a place above its current definition.
Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- src/qemu/qemu_migration.c | 88 +++++++++++++++++++-------------------- 1 file changed, 44 insertions(+), 44 deletions(-)
Reviewed-by: Peter Krempa <pkrempa@redhat.com>

Although the qemuMigrationSrcPerformResume actually got called indirectly via qemuMigrationSrcPerformNative and the recovery process worked, wrong job phases were used for the "perform" phase, which could cause issues when libvirt daemon crashed (or was otherwise restarted) during post-copy recovery. Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- src/qemu/qemu_migration.c | 41 +++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index e8e0474558..9b98ffb527 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -5642,33 +5642,40 @@ qemuMigrationSrcPerformPeer2Peer3(virQEMUDriver *driver, * confirm migration completion. */ VIR_DEBUG("Perform3 %p uri=%s", sconn, NULLSTR(uri)); - ignore_value(qemuMigrationJobSetPhase(vm, QEMU_MIGRATION_PHASE_PERFORM3)); VIR_FREE(cookiein); cookiein = g_steal_pointer(&cookieout); cookieinlen = cookieoutlen; cookieoutlen = 0; - if (flags & VIR_MIGRATE_TUNNELLED) { - ret = qemuMigrationSrcPerformTunnel(driver, vm, st, persist_xml, + + if (flags & VIR_MIGRATE_POSTCOPY_RESUME) { + ret = qemuMigrationSrcPerformResume(driver, dconn, vm, uri, migParams, cookiein, cookieinlen, - &cookieout, &cookieoutlen, - flags, bandwidth, dconn, graphicsuri, - nmigrate_disks, migrate_disks, - migParams); + &cookieout, &cookieoutlen, flags); } else { - ret = qemuMigrationSrcPerformNative(driver, vm, persist_xml, uri, - cookiein, cookieinlen, - &cookieout, &cookieoutlen, - flags, bandwidth, dconn, graphicsuri, - nmigrate_disks, migrate_disks, - migParams, nbdURI); + ignore_value(qemuMigrationJobSetPhase(vm, QEMU_MIGRATION_PHASE_PERFORM3)); + if (flags & VIR_MIGRATE_TUNNELLED) { + ret = qemuMigrationSrcPerformTunnel(driver, vm, st, persist_xml, + cookiein, cookieinlen, + &cookieout, &cookieoutlen, + flags, bandwidth, dconn, graphicsuri, + nmigrate_disks, migrate_disks, + migParams); + } else { + ret = qemuMigrationSrcPerformNative(driver, vm, persist_xml, uri, + cookiein, cookieinlen, + &cookieout, &cookieoutlen, + flags, bandwidth, dconn, graphicsuri, + nmigrate_disks, migrate_disks, + migParams, nbdURI); + } + + if (ret == 0) + ignore_value(qemuMigrationJobSetPhase(vm, QEMU_MIGRATION_PHASE_PERFORM3_DONE)); } /* Perform failed. Make sure Finish doesn't overwrite the error */ - if (ret < 0) { + if (ret < 0) virErrorPreserveLast(&orig_err); - } else { - ignore_value(qemuMigrationJobSetPhase(vm, QEMU_MIGRATION_PHASE_PERFORM3_DONE)); - } /* If Perform returns < 0, then we need to cancel the VM * startup on the destination -- 2.39.0

On Tue, Dec 13, 2022 at 15:15:29 +0100, Jiri Denemark wrote:
Although the qemuMigrationSrcPerformResume actually got called indirectly via qemuMigrationSrcPerformNative and the recovery process worked, wrong job phases were used for the "perform" phase, which could cause issues when libvirt daemon crashed (or was otherwise restarted) during post-copy recovery.
Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- src/qemu/qemu_migration.c | 41 +++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 17 deletions(-)
Reviewed-by: Peter Krempa <pkrempa@redhat.com>
participants (2)
-
Jiri Denemark
-
Peter Krempa