[libvirt] [PATCH v2 0/5] Improve migration errors

When QEMU exits on destination during migration, the source reports either success (if the failure happened at the very end) or unhelpful "unexpectedly failed" error message, only the destination host knows more about the error, but usually there's no way to get the error back to the source because the domain is already gone at the time virDomainMigrateFinish* is called. This series changes libvirt to remember incoming migration errors so that they can be sent back to a source host. https://bugzilla.redhat.com/show_bug.cgi?id=1090093 Jiri Denemark (5): Introduce virHashAtomic Introduce virErrorCopyNew qemu: Remember incoming migration errors qemu: Don't report false error from MigrateFinish qemu: Use error from Finish instead of "unexpectedly failed" docs/apibuild.py | 1 + include/libvirt/virterror.h | 1 + src/libvirt-domain.c | 30 ++++++++++++- src/libvirt_private.syms | 4 ++ src/qemu/qemu_conf.h | 3 ++ src/qemu/qemu_driver.c | 31 +++++++++---- src/qemu/qemu_migration.c | 104 ++++++++++++++++++++++++++++++++++++++++++-- src/qemu/qemu_migration.h | 7 +++ src/qemu/qemu_monitor.c | 14 ++++++ src/qemu/qemu_monitor.h | 2 + src/qemu/qemu_process.c | 4 ++ src/util/virerror.c | 22 ++++++++++ src/util/virerror.h | 1 + src/util/virhash.c | 81 ++++++++++++++++++++++++++++++++++ src/util/virhash.h | 10 +++++ 15 files changed, 302 insertions(+), 13 deletions(-) -- 2.4.5

This is a self-locking wrapper around virHashTable. Only a limited set of APIs are implemented now (the ones which are used in the following patch) as more can be added on demand. Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- Notes: Version 2: - s/virHashLockable/virHashAtomic/ src/libvirt_private.syms | 3 ++ src/util/virhash.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++ src/util/virhash.h | 10 ++++++ 3 files changed, 94 insertions(+) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 720afdf..65168b1 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1476,6 +1476,9 @@ virFirewallStartTransaction; # util/virhash.h virHashAddEntry; +virHashAtomicNew; +virHashAtomicSteal; +virHashAtomicUpdate; virHashCreate; virHashEqual; virHashForEach; diff --git a/src/util/virhash.c b/src/util/virhash.c index e3c1880..3cfcc69 100644 --- a/src/util/virhash.c +++ b/src/util/virhash.c @@ -31,6 +31,7 @@ #include "virhashcode.h" #include "virrandom.h" #include "virstring.h" +#include "virobject.h" #define VIR_FROM_THIS VIR_FROM_NONE @@ -76,6 +77,28 @@ struct _virHashTable { virHashKeyFree keyFree; }; +struct _virHashAtomic { + virObjectLockable parent; + virHashTablePtr hash; +}; + +static virClassPtr virHashAtomicClass; +static void virHashAtomicDispose(void *obj); + +static int virHashAtomicOnceInit(void) +{ + virHashAtomicClass = virClassNew(virClassForObjectLockable(), + "virHashAtomic", + sizeof(virHashAtomic), + virHashAtomicDispose); + if (!virHashAtomicClass) + return -1; + else + return 0; +} +VIR_ONCE_GLOBAL_INIT(virHashAtomic) + + static uint32_t virHashStrCode(const void *name, uint32_t seed) { return virHashCodeGen(name, strlen(name), seed); @@ -178,6 +201,36 @@ virHashTablePtr virHashCreate(ssize_t size, virHashDataFree dataFree) virHashStrFree); } + +virHashAtomicPtr +virHashAtomicNew(ssize_t size, + virHashDataFree dataFree) +{ + virHashAtomicPtr hash; + + if (virHashAtomicInitialize() < 0) + return NULL; + + if (!(hash = virObjectLockableNew(virHashAtomicClass))) + return NULL; + + if (!(hash->hash = virHashCreate(size, dataFree))) { + virObjectUnref(hash); + return NULL; + } + return hash; +} + + +static void +virHashAtomicDispose(void *obj) +{ + virHashAtomicPtr hash = obj; + + virHashFree(hash->hash); +} + + /** * virHashGrow: * @table: the hash table @@ -360,6 +413,21 @@ virHashUpdateEntry(virHashTablePtr table, const void *name, return virHashAddOrUpdateEntry(table, name, userdata, true); } +int +virHashAtomicUpdate(virHashAtomicPtr table, + const void *name, + void *userdata) +{ + int ret; + + virObjectLock(table); + ret = virHashAddOrUpdateEntry(table->hash, name, userdata, true); + virObjectUnlock(table); + + return ret; +} + + /** * virHashLookup: * @table: the hash table @@ -409,6 +477,19 @@ void *virHashSteal(virHashTablePtr table, const void *name) return data; } +void * +virHashAtomicSteal(virHashAtomicPtr table, + const void *name) +{ + void *data; + + virObjectLock(table); + data = virHashSteal(table->hash, name); + virObjectUnlock(table); + + return data; +} + /** * virHashSize: diff --git a/src/util/virhash.h b/src/util/virhash.h index a137137..50b9a04 100644 --- a/src/util/virhash.h +++ b/src/util/virhash.h @@ -21,6 +21,9 @@ typedef struct _virHashTable virHashTable; typedef virHashTable *virHashTablePtr; +typedef struct _virHashAtomic virHashAtomic; +typedef virHashAtomic *virHashAtomicPtr; + /* * function types: */ @@ -101,6 +104,8 @@ typedef void (*virHashKeyFree)(void *name); */ virHashTablePtr virHashCreate(ssize_t size, virHashDataFree dataFree); +virHashAtomicPtr virHashAtomicNew(ssize_t size, + virHashDataFree dataFree); virHashTablePtr virHashCreateFull(ssize_t size, virHashDataFree dataFree, virHashKeyCode keyCode, @@ -119,6 +124,9 @@ int virHashAddEntry(virHashTablePtr table, int virHashUpdateEntry(virHashTablePtr table, const void *name, void *userdata); +int virHashAtomicUpdate(virHashAtomicPtr table, + const void *name, + void *userdata); /* * Remove an entry from the hash table. @@ -140,6 +148,8 @@ void *virHashLookup(const virHashTable *table, const void *name); * Retrieve & remove the userdata. */ void *virHashSteal(virHashTablePtr table, const void *name); +void *virHashAtomicSteal(virHashAtomicPtr table, + const void *name); /* * Get the hash table's key/value pairs and have them optionally sorted. -- 2.4.5

On Fri, Jul 10, 2015 at 11:26:14 +0200, Jiri Denemark wrote:
This is a self-locking wrapper around virHashTable. Only a limited set of APIs are implemented now (the ones which are used in the following patch) as more can be added on demand.
Signed-off-by: Jiri Denemark <jdenemar@redhat.com> ---
Notes: Version 2: - s/virHashLockable/virHashAtomic/
src/libvirt_private.syms | 3 ++ src/util/virhash.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++ src/util/virhash.h | 10 ++++++ 3 files changed, 94 insertions(+)
ACK

A helper function for copying error objects. Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- Notes: Version 2: - new patch docs/apibuild.py | 1 + src/libvirt_private.syms | 1 + src/util/virerror.c | 19 +++++++++++++++++++ src/util/virerror.h | 1 + 4 files changed, 22 insertions(+) diff --git a/docs/apibuild.py b/docs/apibuild.py index 69f991d..f934fb2 100755 --- a/docs/apibuild.py +++ b/docs/apibuild.py @@ -102,6 +102,7 @@ ignored_functions = { "virDomainMigratePrepare3Params": "private function for migration", "virDomainMigrateConfirm3Params": "private function for migration", "virDomainMigratePrepareTunnel3Params": "private function for tunnelled migration", + "virErrorCopyNew": "private", } ignored_macros = { diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 65168b1..1a03268 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1365,6 +1365,7 @@ ebtablesRemoveForwardAllowIn; # util/virerror.h virDispatchError; +virErrorCopyNew; virErrorInitialize; virErrorSetErrnoFromLastError; virLastErrorIsSystemErrno; diff --git a/src/util/virerror.c b/src/util/virerror.c index fae627b..3ec12a3 100644 --- a/src/util/virerror.c +++ b/src/util/virerror.c @@ -214,6 +214,25 @@ virCopyError(virErrorPtr from, return ret; } + +virErrorPtr +virErrorCopyNew(virErrorPtr err) +{ + virErrorPtr ret; + + if (!err) + return NULL; + + if (VIR_ALLOC_QUIET(ret) < 0) + return NULL; + + if (virCopyError(err, ret) < 0) + VIR_FREE(ret); + + return ret; +} + + static virErrorPtr virLastErrorObject(void) { diff --git a/src/util/virerror.h b/src/util/virerror.h index baa2d08..2348648 100644 --- a/src/util/virerror.h +++ b/src/util/virerror.h @@ -185,6 +185,7 @@ void virReportOOMErrorFull(int domcode, virRaiseErrorObject(__FILE__, __FUNCTION__, __LINE__, obj) int virSetError(virErrorPtr newerr); +virErrorPtr virErrorCopyNew(virErrorPtr err); void virDispatchError(virConnectPtr conn); const char *virStrerror(int theerrno, char *errBuf, size_t errBufLen); -- 2.4.5

On Fri, Jul 10, 2015 at 11:26:15 +0200, Jiri Denemark wrote:
A helper function for copying error objects.
Signed-off-by: Jiri Denemark <jdenemar@redhat.com> ---
Notes: Version 2: - new patch
docs/apibuild.py | 1 + src/libvirt_private.syms | 1 + src/util/virerror.c | 19 +++++++++++++++++++ src/util/virerror.h | 1 + 4 files changed, 22 insertions(+)
ACK

On Fri, Jul 10, 2015 at 11:26:15 +0200, Jiri Denemark wrote:
A helper function for copying error objects.
Signed-off-by: Jiri Denemark <jdenemar@redhat.com> ---
Notes: Version 2: - new patch
docs/apibuild.py | 1 + src/libvirt_private.syms | 1 + src/util/virerror.c | 19 +++++++++++++++++++ src/util/virerror.h | 1 + 4 files changed, 22 insertions(+)
...
diff --git a/src/util/virerror.c b/src/util/virerror.c index fae627b..3ec12a3 100644 --- a/src/util/virerror.c +++ b/src/util/virerror.c @@ -214,6 +214,25 @@ virCopyError(virErrorPtr from, return ret; }
+ +virErrorPtr +virErrorCopyNew(virErrorPtr err) +{ + virErrorPtr ret; + + if (!err) + return NULL;
Perhaps drop the above hunk so that it's enforced that this func returns NULL only on failure. ACK stads if you choose to do so.
+ + if (VIR_ALLOC_QUIET(ret) < 0) + return NULL; + + if (virCopyError(err, ret) < 0) + VIR_FREE(ret); + + return ret; +}

If QEMU fails during incoming migration, the domain disappears including a possibly useful error message read from QEMU log file. Let's remember the error in virQEMUDriver so that Finish can report more than just "no such domain". Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- Notes: Version 2: - s/virHashLockable/virHashAtomic/ - use virErrorCopyNew - more comments src/qemu/qemu_conf.h | 3 +++ src/qemu/qemu_driver.c | 31 ++++++++++++++++++------- src/qemu/qemu_migration.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++- src/qemu/qemu_migration.h | 7 ++++++ src/qemu/qemu_monitor.c | 14 +++++++++++ src/qemu/qemu_monitor.h | 2 ++ src/qemu/qemu_process.c | 4 ++++ 7 files changed, 111 insertions(+), 9 deletions(-) diff --git a/src/qemu/qemu_conf.h b/src/qemu/qemu_conf.h index b74c283..3f73929 100644 --- a/src/qemu/qemu_conf.h +++ b/src/qemu/qemu_conf.h @@ -252,6 +252,9 @@ struct _virQEMUDriver { /* Immutable pointer, self-clocking APIs */ virCloseCallbacksPtr closeCallbacks; + + /* Immutable pointer, self-locking APIs */ + virHashAtomicPtr migrationErrors; }; typedef struct _qemuDomainCmdlineDef qemuDomainCmdlineDef; diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index c4b3979..c8cbd57 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -775,6 +775,9 @@ qemuStateInitialize(bool privileged, if (!(qemu_driver->sharedDevices = virHashCreate(30, qemuSharedDeviceEntryFree))) goto error; + if (qemuMigrationErrorInit(qemu_driver) < 0) + goto error; + if (privileged) { char *channeldir; @@ -1091,6 +1094,7 @@ qemuStateCleanup(void) virObjectUnref(qemu_driver->remotePorts); virObjectUnref(qemu_driver->webSocketPorts); virObjectUnref(qemu_driver->migrationPorts); + virObjectUnref(qemu_driver->migrationErrors); virObjectUnref(qemu_driver->xmlopt); @@ -12117,6 +12121,7 @@ qemuDomainMigrateFinish2(virConnectPtr dconn, if (!vm) { virReportError(VIR_ERR_NO_DOMAIN, _("no domain with matching name '%s'"), dname); + qemuMigrationErrorReport(driver, dname); goto cleanup; } @@ -12566,11 +12571,16 @@ qemuDomainMigrateFinish3(virConnectPtr dconn, virCheckFlags(QEMU_MIGRATION_FLAGS, NULL); - if (!dname || - !(vm = virDomainObjListFindByName(driver->domains, dname))) { + if (!dname) { + virReportError(VIR_ERR_NO_DOMAIN, "%s", _("missing domain name")); + return NULL; + } + + vm = virDomainObjListFindByName(driver->domains, dname); + if (!vm) { virReportError(VIR_ERR_NO_DOMAIN, - _("no domain with matching name '%s'"), - NULLSTR(dname)); + _("no domain with matching name '%s'"), dname); + qemuMigrationErrorReport(driver, dname); return NULL; } @@ -12609,11 +12619,16 @@ qemuDomainMigrateFinish3Params(virConnectPtr dconn, &dname) < 0) return NULL; - if (!dname || - !(vm = virDomainObjListFindByName(driver->domains, dname))) { + if (!dname) { + virReportError(VIR_ERR_NO_DOMAIN, "%s", _("missing domain name")); + return NULL; + } + + vm = virDomainObjListFindByName(driver->domains, dname); + if (!vm) { virReportError(VIR_ERR_NO_DOMAIN, - _("no domain with matching name '%s'"), - NULLSTR(dname)); + _("no domain with matching name '%s'"), dname); + qemuMigrationErrorReport(driver, dname); return NULL; } diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 7257182..58874ee 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -5543,8 +5543,10 @@ qemuMigrationFinish(virQEMUDriverPtr driver, if (!(caps = virQEMUDriverGetCapabilities(driver, false))) goto cleanup; - if (!qemuMigrationJobIsActive(vm, QEMU_ASYNC_JOB_MIGRATION_IN)) + if (!qemuMigrationJobIsActive(vm, QEMU_ASYNC_JOB_MIGRATION_IN)) { + qemuMigrationErrorReport(driver, vm->def->name); goto cleanup; + } qemuMigrationJobStartPhase(driver, vm, v3proto ? QEMU_MIGRATION_PHASE_FINISH3 @@ -5570,6 +5572,7 @@ qemuMigrationFinish(virQEMUDriverPtr driver, if (!virDomainObjIsActive(vm) && !(flags & VIR_MIGRATE_OFFLINE)) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("guest unexpectedly quit")); + qemuMigrationErrorReport(driver, vm->def->name); goto endjob; } @@ -6094,3 +6097,57 @@ qemuMigrationJobFinish(virQEMUDriverPtr driver, virDomainObjPtr vm) { qemuDomainObjEndAsyncJob(driver, vm); } + + +static void +qemuMigrationErrorFree(void *data, + const void *name ATTRIBUTE_UNUSED) +{ + virErrorPtr err = data; + virFreeError(err); +} + +int +qemuMigrationErrorInit(virQEMUDriverPtr driver) +{ + driver->migrationErrors = virHashAtomicNew(64, qemuMigrationErrorFree); + if (driver->migrationErrors) + return 0; + else + return -1; +} + +/** + * This function consumes @err; the caller should consider the @err pointer + * invalid after calling this function. + */ +void +qemuMigrationErrorSave(virQEMUDriverPtr driver, + const char *name, + virErrorPtr err) +{ + if (!err) + return; + + VIR_DEBUG("Saving incoming migration error for domain %s: %s", + name, err->message); + if (virHashAtomicUpdate(driver->migrationErrors, name, err) < 0) { + VIR_WARN("Failed to save migration error for domain '%s'", name); + virFreeError(err); + } +} + +void +qemuMigrationErrorReport(virQEMUDriverPtr driver, + const char *name) +{ + virErrorPtr err; + + if (!(err = virHashAtomicSteal(driver->migrationErrors, name))) + return; + + VIR_DEBUG("Restoring saved incoming migration error for domain %s: %s", + name, err->message); + virSetError(err); + virFreeError(err); +} diff --git a/src/qemu/qemu_migration.h b/src/qemu/qemu_migration.h index 48c2e8c..fa14274 100644 --- a/src/qemu/qemu_migration.h +++ b/src/qemu/qemu_migration.h @@ -193,4 +193,11 @@ int qemuMigrationFetchJobStatus(virQEMUDriverPtr driver, qemuDomainAsyncJob asyncJob, qemuDomainJobInfoPtr jobInfo); +int qemuMigrationErrorInit(virQEMUDriverPtr driver); +void qemuMigrationErrorSave(virQEMUDriverPtr driver, + const char *name, + virErrorPtr err); +void qemuMigrationErrorReport(virQEMUDriverPtr driver, + const char *name); + #endif /* __QEMU_MIGRATION_H__ */ diff --git a/src/qemu/qemu_monitor.c b/src/qemu/qemu_monitor.c index 6b54f71..4f30b15 100644 --- a/src/qemu/qemu_monitor.c +++ b/src/qemu/qemu_monitor.c @@ -1057,6 +1057,20 @@ qemuMonitorSend(qemuMonitorPtr mon, } +/** + * This function returns a new virError object; the caller is responsible + * for freeing it. + */ +virErrorPtr +qemuMonitorLastError(qemuMonitorPtr mon) +{ + if (mon->lastError.code == VIR_ERR_OK) + return NULL; + + return virErrorCopyNew(&mon->lastError); +} + + virJSONValuePtr qemuMonitorGetOptions(qemuMonitorPtr mon) { diff --git a/src/qemu/qemu_monitor.h b/src/qemu/qemu_monitor.h index ab7d5a7..1e7b6bb 100644 --- a/src/qemu/qemu_monitor.h +++ b/src/qemu/qemu_monitor.h @@ -240,6 +240,8 @@ qemuMonitorPtr qemuMonitorOpenFD(virDomainObjPtr vm, void qemuMonitorClose(qemuMonitorPtr mon); +virErrorPtr qemuMonitorLastError(qemuMonitorPtr mon); + int qemuMonitorSetCapabilities(qemuMonitorPtr mon); int qemuMonitorSetLink(qemuMonitorPtr mon, diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 129cd69..2a529f7 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -310,6 +310,10 @@ qemuProcessHandleMonitorEOF(qemuMonitorPtr mon ATTRIBUTE_UNUSED, auditReason = "failed"; } + if (qemuMigrationJobIsActive(vm, QEMU_ASYNC_JOB_MIGRATION_IN)) + qemuMigrationErrorSave(driver, vm->def->name, + qemuMonitorLastError(priv->mon)); + event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_STOPPED, eventReason); -- 2.4.5

On Fri, Jul 10, 2015 at 11:26:16 +0200, Jiri Denemark wrote:
If QEMU fails during incoming migration, the domain disappears including a possibly useful error message read from QEMU log file. Let's remember the error in virQEMUDriver so that Finish can report more than just "no such domain".
Signed-off-by: Jiri Denemark <jdenemar@redhat.com> ---
Notes: Version 2: - s/virHashLockable/virHashAtomic/ - use virErrorCopyNew - more comments
src/qemu/qemu_conf.h | 3 +++ src/qemu/qemu_driver.c | 31 ++++++++++++++++++------- src/qemu/qemu_migration.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++- src/qemu/qemu_migration.h | 7 ++++++ src/qemu/qemu_monitor.c | 14 +++++++++++ src/qemu/qemu_monitor.h | 2 ++ src/qemu/qemu_process.c | 4 ++++ 7 files changed, 111 insertions(+), 9 deletions(-)
ACK

virDomainMigrateFinish* APIs were unfortunately designed to return the pointer to the domain on destination and NULL on error. This looks OK in normal cases but the same API is also called when we know migration failed and thus we expect Finish to return NULL even if it actually did all it was supposed to do without any error. The call is defined to return nonnull domain pointer over RPC, which means returning NULL will always result in an error being send. If this was not in fact an error, the API itself wouldn't set anything to the thread local virError, which makes the RPC layer come up with it's own "Library function returned error but did not set virError" error. This is quite confusing and also hard to detect by the caller. This patch adds a special error code which can be used to check that Finish successfully aborted migration. Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- Notes: Version 2: - no change include/libvirt/virterror.h | 1 + src/qemu/qemu_migration.c | 6 ++++++ src/util/virerror.c | 3 +++ 3 files changed, 10 insertions(+) diff --git a/include/libvirt/virterror.h b/include/libvirt/virterror.h index 6325030..f716cb9 100644 --- a/include/libvirt/virterror.h +++ b/include/libvirt/virterror.h @@ -307,6 +307,7 @@ typedef enum { VIR_ERR_CPU_INCOMPATIBLE = 91, /* given CPU is incompatible with host CPU*/ VIR_ERR_XML_INVALID_SCHEMA = 92, /* XML document doesn't validate against schema */ + VIR_ERR_MIGRATE_FINISH_OK = 93, /* Finish API succeeded but it is expected to return NULL */ } virErrorNumber; /** diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 58874ee..a9cbada 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -5751,6 +5751,12 @@ qemuMigrationFinish(virQEMUDriverPtr driver, } virObjectUnref(caps); virObjectUnref(cfg); + + /* Set a special error if Finish is expected to return NULL as a result of + * successful call with retcode != 0 + */ + if (retcode != 0 && !dom && !virGetLastError()) + virReportError(VIR_ERR_MIGRATE_FINISH_OK, NULL); return dom; } diff --git a/src/util/virerror.c b/src/util/virerror.c index 3ec12a3..26ac691 100644 --- a/src/util/virerror.c +++ b/src/util/virerror.c @@ -1372,6 +1372,9 @@ virErrorMsg(virErrorNumber error, const char *info) else errmsg = _("XML document failed to validate against schema: %s"); break; + case VIR_ERR_MIGRATE_FINISH_OK: + errmsg = _("migration successfully aborted"); + break; } return errmsg; } -- 2.4.5

On Fri, Jul 10, 2015 at 11:26:17AM +0200, Jiri Denemark wrote:
virDomainMigrateFinish* APIs were unfortunately designed to return the pointer to the domain on destination and NULL on error. This looks OK in normal cases but the same API is also called when we know migration failed and thus we expect Finish to return NULL even if it actually did all it was supposed to do without any error. The call is defined to return nonnull domain pointer over RPC, which means returning NULL will always result in an error being send. If this was not in fact an error, the API itself wouldn't set anything to the thread local virError, which makes the RPC layer come up with it's own "Library function returned error but did not set virError" error.
This is quite confusing and also hard to detect by the caller. This patch adds a special error code which can be used to check that Finish successfully aborted migration.
FWIW, I have come to the conclusion that it is impossible to 100% reliably detect successful from the virDomainMigrate* APIs, because it is possible that the libvirt connection to the dest host may fail, while the domain is in fact successfully running on the target. So there is always the possbility to get a error from the RPC layer despite everything at QEMU level being 100% successful. So in openstack we now ignore the return code for virDomainMigrate and instead try to detect succcess from the domain job info which will show that the domain on the source has completed, even if we loose the dest host RPC connection. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

When QEMU exits on destination during migration, the source reports either success (if the failure happened at the very end) or unhelpful "unexpectedly failed" error message. However, the Finish API called on the destination may report a real error so let's use it instead of the generic one. Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- Notes: Version 2: - cleaner control flow - more comments src/libvirt-domain.c | 30 ++++++++++++++++++++++++++++-- src/qemu/qemu_migration.c | 39 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 65 insertions(+), 4 deletions(-) diff --git a/src/libvirt-domain.c b/src/libvirt-domain.c index 909c264..837933f 100644 --- a/src/libvirt-domain.c +++ b/src/libvirt-domain.c @@ -3175,8 +3175,34 @@ virDomainMigrateVersion3Full(virDomainPtr domain, (dconn, dname, cookiein, cookieinlen, &cookieout, &cookieoutlen, NULL, uri, destflags, cancelled); } - if (cancelled && ddomain) - VIR_ERROR(_("finish step ignored that migration was cancelled")); + + if (cancelled) { + if (ddomain) { + VIR_ERROR(_("finish step ignored that migration was cancelled")); + } else { + /* If Finish reported a useful error, use it instead of the + * original "migration unexpectedly failed" error. + * + * This is ugly but we can't do better with the APIs we have. We + * only replace the error if Finish was called with cancelled == 1 + * and reported a real error (old libvirt would report an error + * from RPC instead of MIGRATE_FINISH_OK), which only happens when + * the domain died on destination. To further reduce a possibility + * of false positives we also check that Perform returned + * VIR_ERR_OPERATION_FAILED. + */ + if (orig_err && + orig_err->domain == VIR_FROM_QEMU && + orig_err->code == VIR_ERR_OPERATION_FAILED) { + virErrorPtr err = virGetLastError(); + if (err->domain == VIR_FROM_QEMU && + err->code != VIR_ERR_MIGRATE_FINISH_OK) { + virFreeError(orig_err); + orig_err = NULL; + } + } + } + } /* If ddomain is NULL, then we were unable to start * the guest on the target, and must restart on the diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index a9cbada..d789110 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -4982,8 +4982,34 @@ doPeer2PeerMigrate3(virQEMUDriverPtr driver, dconnuri, uri, destflags, cancelled); qemuDomainObjExitRemote(vm); } - if (cancelled && ddomain) - VIR_ERROR(_("finish step ignored that migration was cancelled")); + + if (cancelled) { + if (ddomain) { + VIR_ERROR(_("finish step ignored that migration was cancelled")); + } else { + /* If Finish reported a useful error, use it instead of the + * original "migration unexpectedly failed" error. + * + * This is ugly but we can't do better with the APIs we have. We + * only replace the error if Finish was called with cancelled == 1 + * and reported a real error (old libvirt would report an error + * from RPC instead of MIGRATE_FINISH_OK), which only happens when + * the domain died on destination. To further reduce a possibility + * of false positives we also check that Perform returned + * VIR_ERR_OPERATION_FAILED. + */ + if (orig_err && + orig_err->domain == VIR_FROM_QEMU && + orig_err->code == VIR_ERR_OPERATION_FAILED) { + virErrorPtr err = virGetLastError(); + if (err->domain == VIR_FROM_QEMU && + err->code != VIR_ERR_MIGRATE_FINISH_OK) { + virFreeError(orig_err); + orig_err = NULL; + } + } + } + } /* If ddomain is NULL, then we were unable to start * the guest on the target, and must restart on the @@ -5719,6 +5745,15 @@ qemuMigrationFinish(virQEMUDriverPtr driver, /* Guest is successfully running, so cancel previous auto destroy */ qemuProcessAutoDestroyRemove(driver, vm); } else if (!(flags & VIR_MIGRATE_OFFLINE)) { + qemuDomainJobInfo info; + + /* Check for a possible error on the monitor in case Finish was called + * earlier than monitor EOF handler got a chance to process the error + */ + qemuMigrationFetchJobStatus(driver, vm, + QEMU_ASYNC_JOB_MIGRATION_IN, + &info); + qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED, VIR_QEMU_PROCESS_STOP_MIGRATED); virDomainAuditStop(vm, "failed"); -- 2.4.5

On Fri, Jul 10, 2015 at 11:26:18 +0200, Jiri Denemark wrote:
When QEMU exits on destination during migration, the source reports either success (if the failure happened at the very end) or unhelpful "unexpectedly failed" error message. However, the Finish API called on the destination may report a real error so let's use it instead of the generic one.
Signed-off-by: Jiri Denemark <jdenemar@redhat.com> ---
Notes: Version 2: - cleaner control flow - more comments
src/libvirt-domain.c | 30 ++++++++++++++++++++++++++++-- src/qemu/qemu_migration.c | 39 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 65 insertions(+), 4 deletions(-)
ACK
participants (3)
-
Daniel P. Berrange
-
Jiri Denemark
-
Peter Krempa