[libvirt] [PATCH 0/3] qemu: Fix race condition between block jobs and the end of migration

When migrating both memory and storage, we need to make sure block jobs are finished after the virtual CPUs get paused at the end of migration but before QEMU completes the migration otherwise QEMU may die with _co_do_pwritev: Assertion `!(bs->open_flags & 0x0800)' failed This is a libvirt side of the QEMU fix from Dr. David Alan Gilbert: http://lists.nongnu.org/archive/html/qemu-devel/2017-10/msg04895.html Applicable on top of my "Preparation for new QEMU migration states" series I sent yesterday. Jiri Denemark (3): qemu: Add support for migrate-continue QMP command qemu: Add pause-before-switchover migration capability qemu: Enabled pause-before-switchover migration capability src/qemu/qemu_domain.c | 1 + src/qemu/qemu_domain.h | 1 + src/qemu/qemu_migration.c | 75 +++++++++++++++++++++++++++++++++++++++++++- src/qemu/qemu_monitor.c | 18 +++++++++-- src/qemu/qemu_monitor.h | 6 ++++ src/qemu/qemu_monitor_json.c | 29 +++++++++++++++++ src/qemu/qemu_monitor_json.h | 4 +++ 7 files changed, 131 insertions(+), 3 deletions(-) -- 2.14.2

Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- src/qemu/qemu_monitor.c | 13 +++++++++++++ src/qemu/qemu_monitor.h | 3 +++ src/qemu/qemu_monitor_json.c | 27 +++++++++++++++++++++++++++ src/qemu/qemu_monitor_json.h | 4 ++++ 4 files changed, 47 insertions(+) diff --git a/src/qemu/qemu_monitor.c b/src/qemu/qemu_monitor.c index 64efb89e8..5ca3cdce2 100644 --- a/src/qemu/qemu_monitor.c +++ b/src/qemu/qemu_monitor.c @@ -4254,6 +4254,19 @@ qemuMonitorMigrateStartPostCopy(qemuMonitorPtr mon) return qemuMonitorJSONMigrateStartPostCopy(mon); } + +int +qemuMonitorMigrateContinue(qemuMonitorPtr mon, + qemuMonitorMigrationStatus status) +{ + VIR_DEBUG("status=%s", qemuMonitorMigrationStatusTypeToString(status)); + + QEMU_CHECK_MONITOR_JSON(mon); + + return qemuMonitorJSONMigrateContinue(mon, status); +} + + int qemuMonitorGetRTCTime(qemuMonitorPtr mon, struct tm *tm) diff --git a/src/qemu/qemu_monitor.h b/src/qemu/qemu_monitor.h index 1e6b97714..fe29f484e 100644 --- a/src/qemu/qemu_monitor.h +++ b/src/qemu/qemu_monitor.h @@ -1119,6 +1119,9 @@ int qemuMonitorMigrateIncoming(qemuMonitorPtr mon, int qemuMonitorMigrateStartPostCopy(qemuMonitorPtr mon); +int qemuMonitorMigrateContinue(qemuMonitorPtr mon, + qemuMonitorMigrationStatus status); + int qemuMonitorGetRTCTime(qemuMonitorPtr mon, struct tm *tm); diff --git a/src/qemu/qemu_monitor_json.c b/src/qemu/qemu_monitor_json.c index f7567eb77..def80882c 100644 --- a/src/qemu/qemu_monitor_json.c +++ b/src/qemu/qemu_monitor_json.c @@ -7374,6 +7374,33 @@ qemuMonitorJSONMigrateStartPostCopy(qemuMonitorPtr mon) return ret; } + +int +qemuMonitorJSONMigrateContinue(qemuMonitorPtr mon, + qemuMonitorMigrationStatus status) +{ + const char *statusStr = qemuMonitorMigrationStatusTypeToString(status); + int ret = -1; + virJSONValuePtr cmd; + virJSONValuePtr reply = NULL; + + if (!(cmd = qemuMonitorJSONMakeCommand("migrate-continue", + "s:state", statusStr, + NULL))) + return -1; + + if (qemuMonitorJSONCommand(mon, cmd, &reply) < 0) + goto cleanup; + + ret = qemuMonitorJSONCheckError(cmd, reply); + + cleanup: + virJSONValueFree(cmd); + virJSONValueFree(reply); + return ret; +} + + int qemuMonitorJSONGetRTCTime(qemuMonitorPtr mon, struct tm *tm) diff --git a/src/qemu/qemu_monitor_json.h b/src/qemu/qemu_monitor_json.h index b17348a11..739a99293 100644 --- a/src/qemu/qemu_monitor_json.h +++ b/src/qemu/qemu_monitor_json.h @@ -500,6 +500,10 @@ int qemuMonitorJSONMigrateIncoming(qemuMonitorPtr mon, int qemuMonitorJSONMigrateStartPostCopy(qemuMonitorPtr mon) ATTRIBUTE_NONNULL(1); +int qemuMonitorJSONMigrateContinue(qemuMonitorPtr mon, + qemuMonitorMigrationStatus status) + ATTRIBUTE_NONNULL(1); + int qemuMonitorJSONGetRTCTime(qemuMonitorPtr mon, struct tm *tm) ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2); -- 2.14.2

This new capability enables a pause before device state serialization so that we can finish all block jobs without racing with the end of the migration. The pause is indicated by "pre-switchover" state. Once we're done QEMU enters "device" migration state. This patch just defines the new capability and QEMU migration states and their mapping to our job states. Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- src/qemu/qemu_domain.c | 1 + src/qemu/qemu_domain.h | 1 + src/qemu/qemu_migration.c | 11 +++++++++++ src/qemu/qemu_monitor.c | 5 +++-- src/qemu/qemu_monitor.h | 3 +++ src/qemu/qemu_monitor_json.c | 2 ++ 6 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index ece8ee7dd..abf65094a 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -423,6 +423,7 @@ qemuDomainJobStatusToType(qemuDomainJobStatus status) case QEMU_DOMAIN_JOB_STATUS_MIGRATING: case QEMU_DOMAIN_JOB_STATUS_QEMU_COMPLETED: case QEMU_DOMAIN_JOB_STATUS_POSTCOPY: + case QEMU_DOMAIN_JOB_STATUS_PAUSED: return VIR_DOMAIN_JOB_UNBOUNDED; case QEMU_DOMAIN_JOB_STATUS_COMPLETED: diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index 3b4272047..ff5328277 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -103,6 +103,7 @@ typedef enum { QEMU_DOMAIN_JOB_STATUS_ACTIVE, QEMU_DOMAIN_JOB_STATUS_MIGRATING, QEMU_DOMAIN_JOB_STATUS_QEMU_COMPLETED, + QEMU_DOMAIN_JOB_STATUS_PAUSED, QEMU_DOMAIN_JOB_STATUS_POSTCOPY, QEMU_DOMAIN_JOB_STATUS_COMPLETED, QEMU_DOMAIN_JOB_STATUS_FAILED, diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 626b4e3ee..4b356002f 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -1366,6 +1366,14 @@ qemuMigrationUpdateJobType(qemuDomainJobInfoPtr jobInfo) jobInfo->status = QEMU_DOMAIN_JOB_STATUS_CANCELED; break; + case QEMU_MONITOR_MIGRATION_STATUS_PRE_SWITCHOVER: + jobInfo->status = QEMU_DOMAIN_JOB_STATUS_PAUSED; + break; + + case QEMU_MONITOR_MIGRATION_STATUS_DEVICE: + jobInfo->status = QEMU_DOMAIN_JOB_STATUS_MIGRATING; + break; + case QEMU_MONITOR_MIGRATION_STATUS_SETUP: case QEMU_MONITOR_MIGRATION_STATUS_ACTIVE: case QEMU_MONITOR_MIGRATION_STATUS_CANCELLING: @@ -1459,6 +1467,7 @@ qemuMigrationCheckJobStatus(virQEMUDriverPtr driver, case QEMU_DOMAIN_JOB_STATUS_MIGRATING: case QEMU_DOMAIN_JOB_STATUS_QEMU_COMPLETED: case QEMU_DOMAIN_JOB_STATUS_POSTCOPY: + case QEMU_DOMAIN_JOB_STATUS_PAUSED: break; } @@ -1474,6 +1483,7 @@ enum qemuMigrationCompletedFlags { QEMU_MIGRATION_COMPLETED_ABORT_ON_ERROR = (1 << 0), QEMU_MIGRATION_COMPLETED_CHECK_STORAGE = (1 << 1), QEMU_MIGRATION_COMPLETED_POSTCOPY = (1 << 2), + QEMU_MIGRATION_COMPLETED_PRE_SWITCHOVER = (1 << 3), }; @@ -1534,6 +1544,7 @@ qemuMigrationCompleted(virQEMUDriverPtr driver, switch (jobInfo->status) { case QEMU_DOMAIN_JOB_STATUS_MIGRATING: case QEMU_DOMAIN_JOB_STATUS_POSTCOPY: + case QEMU_DOMAIN_JOB_STATUS_PAUSED: /* The migration was aborted by us rather than QEMU itself. */ jobInfo->status = QEMU_DOMAIN_JOB_STATUS_FAILED; return -2; diff --git a/src/qemu/qemu_monitor.c b/src/qemu/qemu_monitor.c index 5ca3cdce2..dd9d64a20 100644 --- a/src/qemu/qemu_monitor.c +++ b/src/qemu/qemu_monitor.c @@ -172,14 +172,15 @@ VIR_ONCE_GLOBAL_INIT(qemuMonitor) VIR_ENUM_IMPL(qemuMonitorMigrationStatus, QEMU_MONITOR_MIGRATION_STATUS_LAST, "inactive", "setup", - "active", "postcopy-active", + "active", "pre-switchover", + "device", "postcopy-active", "completed", "failed", "cancelling", "cancelled") VIR_ENUM_IMPL(qemuMonitorMigrationCaps, QEMU_MONITOR_MIGRATION_CAPS_LAST, "xbzrle", "auto-converge", "rdma-pin-all", "events", - "postcopy-ram", "compress") + "postcopy-ram", "compress", "pause-before-switchover") VIR_ENUM_IMPL(qemuMonitorVMStatus, QEMU_MONITOR_VM_STATUS_LAST, diff --git a/src/qemu/qemu_monitor.h b/src/qemu/qemu_monitor.h index fe29f484e..bc8494fae 100644 --- a/src/qemu/qemu_monitor.h +++ b/src/qemu/qemu_monitor.h @@ -641,6 +641,8 @@ typedef enum { QEMU_MONITOR_MIGRATION_STATUS_INACTIVE, QEMU_MONITOR_MIGRATION_STATUS_SETUP, QEMU_MONITOR_MIGRATION_STATUS_ACTIVE, + QEMU_MONITOR_MIGRATION_STATUS_PRE_SWITCHOVER, + QEMU_MONITOR_MIGRATION_STATUS_DEVICE, QEMU_MONITOR_MIGRATION_STATUS_POSTCOPY, QEMU_MONITOR_MIGRATION_STATUS_COMPLETED, QEMU_MONITOR_MIGRATION_STATUS_ERROR, @@ -706,6 +708,7 @@ typedef enum { QEMU_MONITOR_MIGRATION_CAPS_EVENTS, QEMU_MONITOR_MIGRATION_CAPS_POSTCOPY, QEMU_MONITOR_MIGRATION_CAPS_COMPRESS, + QEMU_MONITOR_MIGRATION_CAPS_PAUSE_BEFORE_SWITCHOVER, QEMU_MONITOR_MIGRATION_CAPS_LAST } qemuMonitorMigrationCaps; diff --git a/src/qemu/qemu_monitor_json.c b/src/qemu/qemu_monitor_json.c index def80882c..05cc634d2 100644 --- a/src/qemu/qemu_monitor_json.c +++ b/src/qemu/qemu_monitor_json.c @@ -2846,6 +2846,8 @@ qemuMonitorJSONGetMigrationStatsReply(virJSONValuePtr reply, case QEMU_MONITOR_MIGRATION_STATUS_POSTCOPY: case QEMU_MONITOR_MIGRATION_STATUS_COMPLETED: case QEMU_MONITOR_MIGRATION_STATUS_CANCELLING: + case QEMU_MONITOR_MIGRATION_STATUS_PRE_SWITCHOVER: + case QEMU_MONITOR_MIGRATION_STATUS_DEVICE: ram = virJSONValueObjectGetObject(ret, "ram"); if (!ram) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", -- 2.14.2

QEMU identified a race condition between the device state serialization and the end of storage migration. Both QEMU and libvirt needs to be updated to fix this. Our migration work flow is modified so that after starting the migration we to wait for QEMU to enter "pre-switchover", "postcopy-active", or "completed" state. Once there, we cancel all block jobs as usual. But if QEMU is in "pre-switchover", we need to resume the migration afterwards and wait again for the real end (either "postcopy-active" or "completed" state). Old QEMU will just enter either "postcopy-active" or "completed" directly, which is still correctly handled even by new libvirt. The "pre-switchover" state will only be entered if QEMU supports it and the pause-before-switchover capability was enabled. Thus all combinations of libvirt and QEMU will work, but only new QEMU with new libvirt will avoid the race condition. Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- src/qemu/qemu_migration.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 4b356002f..af744661f 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -1525,6 +1525,16 @@ qemuMigrationCompleted(virQEMUDriverPtr driver, goto error; } + /* Migration was paused before serializing device state, let's return to + * the caller so that it can finish all block jobs, resume migration, and + * wait again for the real end of the migration. + */ + if (flags & QEMU_MIGRATION_COMPLETED_PRE_SWITCHOVER && + jobInfo->status == QEMU_DOMAIN_JOB_STATUS_PAUSED) { + VIR_DEBUG("Migration paused before switchover"); + return 1; + } + /* In case of postcopy the source considers migration completed at the * moment it switched from active to postcopy-active state. The destination * will continue waiting until the migrate state changes to completed. @@ -3600,6 +3610,28 @@ qemuMigrationConnect(virQEMUDriverPtr driver, return ret; } + +static int +qemuMigrationContinue(virQEMUDriverPtr driver, + virDomainObjPtr vm, + qemuMonitorMigrationStatus status, + qemuDomainAsyncJob asyncJob) +{ + qemuDomainObjPrivatePtr priv = vm->privateData; + int ret; + + if (qemuDomainObjEnterMonitorAsync(driver, vm, asyncJob) < 0) + return -1; + + ret = qemuMonitorMigrateContinue(priv->mon, status); + + if (qemuDomainObjExitMonitor(driver, vm) < 0) + ret = -1; + + return ret; +} + + static int qemuMigrationRun(virQEMUDriverPtr driver, virDomainObjPtr vm, @@ -3769,6 +3801,12 @@ qemuMigrationRun(virQEMUDriverPtr driver, QEMU_ASYNC_JOB_MIGRATION_OUT) < 0) goto error; + if (qemuMigrationCapsGet(vm, QEMU_MONITOR_MIGRATION_CAPS_PAUSE_BEFORE_SWITCHOVER) && + qemuMigrationSetOption(driver, vm, + QEMU_MONITOR_MIGRATION_CAPS_PAUSE_BEFORE_SWITCHOVER, + true, QEMU_ASYNC_JOB_MIGRATION_OUT) < 0) + goto error; + if (qemuMigrationSetParams(driver, vm, QEMU_ASYNC_JOB_MIGRATION_OUT, migParams) < 0) goto error; @@ -3847,7 +3885,7 @@ qemuMigrationRun(virQEMUDriverPtr driver, fd = -1; } - waitFlags = 0; + waitFlags = QEMU_MIGRATION_COMPLETED_PRE_SWITCHOVER; if (abort_on_error) waitFlags |= QEMU_MIGRATION_COMPLETED_ABORT_ON_ERROR; if (mig->nbd) @@ -3889,6 +3927,30 @@ qemuMigrationRun(virQEMUDriverPtr driver, dconn) < 0) goto error; + /* When migration was paused before serializing device state we need to + * resume it now once we finished all block jobs and wait for the real + * end of the migration. + */ + if (priv->job.current->status == QEMU_DOMAIN_JOB_STATUS_PAUSED) { + if (qemuMigrationContinue(driver, vm, + QEMU_MONITOR_MIGRATION_STATUS_PRE_SWITCHOVER, + QEMU_ASYNC_JOB_MIGRATION_OUT) < 0) + goto error; + + waitFlags ^= QEMU_MIGRATION_COMPLETED_PRE_SWITCHOVER; + + rc = qemuMigrationWaitForCompletion(driver, vm, + QEMU_ASYNC_JOB_MIGRATION_OUT, + dconn, waitFlags); + if (rc == -2) { + goto error; + } else if (rc == -1) { + /* QEMU reported failed migration, nothing to cancel anymore */ + cancel = false; + goto error; + } + } + if (iothread) { qemuMigrationIOThreadPtr io; -- 2.14.2

On Fri, Oct 20, 2017 at 02:04:26PM +0200, Jiri Denemark wrote:
When migrating both memory and storage, we need to make sure block jobs are finished after the virtual CPUs get paused at the end of migration but before QEMU completes the migration otherwise QEMU may die with
_co_do_pwritev: Assertion `!(bs->open_flags & 0x0800)' failed
This is a libvirt side of the QEMU fix from Dr. David Alan Gilbert:
http://lists.nongnu.org/archive/html/qemu-devel/2017-10/msg04895.html
Applicable on top of my "Preparation for new QEMU migration states" series I sent yesterday.
Jiri Denemark (3): qemu: Add support for migrate-continue QMP command qemu: Add pause-before-switchover migration capability qemu: Enabled pause-before-switchover migration capability
src/qemu/qemu_domain.c | 1 + src/qemu/qemu_domain.h | 1 + src/qemu/qemu_migration.c | 75 +++++++++++++++++++++++++++++++++++++++++++- src/qemu/qemu_monitor.c | 18 +++++++++-- src/qemu/qemu_monitor.h | 6 ++++ src/qemu/qemu_monitor_json.c | 29 +++++++++++++++++ src/qemu/qemu_monitor_json.h | 4 +++ 7 files changed, 131 insertions(+), 3 deletions(-)
ACK series Jan
participants (2)
-
Jiri Denemark
-
Ján Tomko