When post-copy migration fails, the domain stays running on the
destination with a VIR_DOMAIN_RUNNING_POSTCOPY_FAILED reason. Both the
state and the reason can later be rewritten in case the domain gets
paused for other reasons (such as an I/O error). Thus we need a separate
place to remember that the post-copy migration failed, so that the
migration can still be resumed later.
https://bugzilla.redhat.com/show_bug.cgi?id=2111948
Signed-off-by: Jiri Denemark <jdenemar(a)redhat.com>
---
src/conf/domain_conf.c | 7 ++++++-
src/conf/virdomainjob.c | 1 +
src/conf/virdomainjob.h | 1 +
src/qemu/qemu_domainjob.c | 9 +++++++++
src/qemu/qemu_migration.c | 34 +++++++++++++++++++++++-----------
src/qemu/qemu_process.c | 15 +++++++++++++++
6 files changed, 55 insertions(+), 12 deletions(-)
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
index 9e2eea79e7..f83586c549 100644
--- a/src/conf/domain_conf.c
+++ b/src/conf/domain_conf.c
@@ -27874,8 +27874,13 @@ virDomainObjGetState(virDomainObj *dom, int *reason)
bool
virDomainObjIsFailedPostcopy(virDomainObj *dom,
- virDomainJobObj *job G_GNUC_UNUSED)
+ virDomainJobObj *job)
{
+ if (job && job->asyncPaused &&
+ (job->asyncJob == VIR_ASYNC_JOB_MIGRATION_IN ||
+ job->asyncJob == VIR_ASYNC_JOB_MIGRATION_OUT))
+ return true;
+
return ((dom->state.state == VIR_DOMAIN_PAUSED &&
dom->state.reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED) ||
(dom->state.state == VIR_DOMAIN_RUNNING &&
diff --git a/src/conf/virdomainjob.c b/src/conf/virdomainjob.c
index 256b665a42..c4cbbe8f6d 100644
--- a/src/conf/virdomainjob.c
+++ b/src/conf/virdomainjob.c
@@ -174,6 +174,7 @@ virDomainObjResetAsyncJob(virDomainJobObj *job)
job->asyncOwner = 0;
g_clear_pointer(&job->asyncOwnerAPI, g_free);
job->asyncStarted = 0;
+ job->asyncPaused = false;
job->phase = 0;
job->mask = VIR_JOB_DEFAULT_MASK;
job->abortJob = false;
diff --git a/src/conf/virdomainjob.h b/src/conf/virdomainjob.h
index b1ac36a2fa..0d62bab287 100644
--- a/src/conf/virdomainjob.h
+++ b/src/conf/virdomainjob.h
@@ -176,6 +176,7 @@ struct _virDomainJobObj {
unsigned long long asyncOwner; /* Thread which set current async job */
char *asyncOwnerAPI; /* The API which owns the async job */
unsigned long long asyncStarted; /* When the current async job started */
+ bool asyncPaused; /* The async job is paused */
int phase; /* Job phase (mainly for migrations) */
unsigned long long mask; /* Jobs allowed during async job */
virDomainJobData *current; /* async job progress data */
diff --git a/src/qemu/qemu_domainjob.c b/src/qemu/qemu_domainjob.c
index 8d958b9d21..27beb5229f 100644
--- a/src/qemu/qemu_domainjob.c
+++ b/src/qemu/qemu_domainjob.c
@@ -695,6 +695,8 @@ qemuDomainObjPrivateXMLFormatJob(virBuffer *buf,
if (vm->job->asyncJob != VIR_ASYNC_JOB_NONE) {
virBufferAsprintf(&attrBuf, " flags='0x%x'",
vm->job->apiFlags);
virBufferAsprintf(&attrBuf, " asyncStarted='%llu'",
vm->job->asyncStarted);
+ if (vm->job->asyncPaused)
+ virBufferAddLit(&attrBuf, " asyncPaused='yes'");
}
if (vm->job->cb &&
@@ -732,6 +734,7 @@ qemuDomainObjPrivateXMLParseJob(virDomainObj *vm,
if ((tmp = virXPathString("string(@async)", ctxt))) {
int async;
+ virTristateBool paused;
if ((async = virDomainAsyncJobTypeFromString(tmp)) < 0) {
virReportError(VIR_ERR_INTERNAL_ERROR,
@@ -757,6 +760,12 @@ qemuDomainObjPrivateXMLParseJob(virDomainObj *vm,
_("Invalid async job start"));
return -1;
}
+
+ if (virXMLPropTristateBool(ctxt->node, "asyncPaused", VIR_XML_PROP_NONE,
+ &paused) < 0)
+ return -1;
+
+ vm->job->asyncPaused = paused == VIR_TRISTATE_BOOL_YES;
}
if (virXMLPropUInt(ctxt->node, "flags", 16, VIR_XML_PROP_NONE,
diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c
index 27a74795d6..f258e7d700 100644
--- a/src/qemu/qemu_migration.c
+++ b/src/qemu/qemu_migration.c
@@ -1666,17 +1666,19 @@ qemuMigrationSrcPostcopyFailed(virDomainObj *vm)
state = virDomainObjGetState(vm, &reason);
- VIR_DEBUG("%s/%s",
+ VIR_DEBUG("%s/%s, asyncPaused=%u",
virDomainStateTypeToString(state),
- virDomainStateReasonToString(state, reason));
+ virDomainStateReasonToString(state, reason),
+ vm->job->asyncPaused);
if (state != VIR_DOMAIN_PAUSED ||
- reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED)
+ virDomainObjIsFailedPostcopy(vm, vm->job))
return;
VIR_WARN("Migration of domain %s failed during post-copy; "
"leaving the domain paused", vm->def->name);
+ vm->job->asyncPaused = true;
virDomainObjSetState(vm, VIR_DOMAIN_PAUSED,
VIR_DOMAIN_PAUSED_POSTCOPY_FAILED);
event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_SUSPENDED,
@@ -1696,21 +1698,31 @@ qemuMigrationDstPostcopyFailed(virDomainObj *vm)
state = virDomainObjGetState(vm, &reason);
- VIR_DEBUG("%s/%s",
+ VIR_DEBUG("%s/%s, asyncPaused=%u",
virDomainStateTypeToString(state),
- virDomainStateReasonToString(state, reason));
+ virDomainStateReasonToString(state, reason),
+ vm->job->asyncPaused);
- if (state != VIR_DOMAIN_RUNNING ||
- reason == VIR_DOMAIN_RUNNING_POSTCOPY_FAILED)
+ if ((state != VIR_DOMAIN_RUNNING && state != VIR_DOMAIN_PAUSED) ||
+ virDomainObjIsFailedPostcopy(vm, vm->job))
return;
VIR_WARN("Incoming migration of domain '%s' failed during post-copy; "
"leaving the domain running", vm->def->name);
- virDomainObjSetState(vm, VIR_DOMAIN_RUNNING,
- VIR_DOMAIN_RUNNING_POSTCOPY_FAILED);
- event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_RESUMED,
- VIR_DOMAIN_EVENT_RESUMED_POSTCOPY_FAILED);
+ vm->job->asyncPaused = true;
+ if (state == VIR_DOMAIN_RUNNING) {
+ virDomainObjSetState(vm, VIR_DOMAIN_RUNNING,
+ VIR_DOMAIN_RUNNING_POSTCOPY_FAILED);
+ event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_RESUMED,
+ VIR_DOMAIN_EVENT_RESUMED_POSTCOPY_FAILED);
+ } else {
+ /* The domain was paused for other reasons (I/O error, ...) so we don't
+ * want to rewrite the original reason and just emit a postcopy-failed
+ * event. */
+ event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_SUSPENDED,
+ VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY_FAILED);
+ }
virObjectEventStateQueue(driver->domainEventState, event);
}
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index 6091c9f1a9..017a05d57e 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -712,6 +712,15 @@ qemuProcessHandleResume(qemuMonitor *mon G_GNUC_UNUSED,
vm->def->name, virDomainRunningReasonTypeToString(reason),
eventDetail);
+ /* When a domain is running in (failed) post-copy migration on the
+ * destination host, we need to make sure to set the appropriate reason
+ * here. */
+ if (virDomainObjIsPostcopy(vm, vm->job)) {
+ if (virDomainObjIsFailedPostcopy(vm, vm->job))
+ reason = VIR_DOMAIN_RUNNING_POSTCOPY_FAILED;
+ else
+ reason = VIR_DOMAIN_RUNNING_POSTCOPY;
+ }
virDomainObjSetState(vm, VIR_DOMAIN_RUNNING, reason);
event = virDomainEventLifecycleNewFromObj(vm,
VIR_DOMAIN_EVENT_RESUMED,
@@ -1491,6 +1500,7 @@ qemuProcessHandleMigrationStatus(qemuMonitor *mon G_GNUC_UNUSED,
vm->def->name,
virDomainStateTypeToString(state),
NULLSTR(virDomainStateReasonToString(state, reason)));
+ vm->job->asyncPaused = false;
virDomainObjSetState(vm, state, reason);
event = virDomainEventLifecycleNewFromObj(vm, eventType, eventDetail);
qemuDomainSaveStatus(vm);
@@ -3420,6 +3430,7 @@ qemuProcessRestoreMigrationJob(virDomainObj *vm,
job->privateData = g_steal_pointer(&vm->job->privateData);
vm->job->privateData = jobPriv;
vm->job->apiFlags = job->apiFlags;
+ vm->job->asyncPaused = job->asyncPaused;
qemuDomainCleanupAdd(vm, qemuProcessCleanupMigrationJob);
}
@@ -3645,6 +3656,7 @@ qemuProcessRecoverMigration(virQEMUDriver *driver,
if (migStatus == VIR_DOMAIN_JOB_STATUS_POSTCOPY) {
VIR_DEBUG("Post-copy migration of domain %s still running, it will be handled as unattended",
vm->def->name);
+ vm->job->asyncPaused = false;
return 0;
}
@@ -3653,6 +3665,9 @@ qemuProcessRecoverMigration(virQEMUDriver *driver,
qemuMigrationSrcPostcopyFailed(vm);
else
qemuMigrationDstPostcopyFailed(vm);
+ /* Set the asyncPaused flag in case we're reconnecting to a domain
+ * started by an older libvirt. */
+ vm->job->asyncPaused = true;
return 0;
}
--
2.39.0