[libvirt] [PATCH 0/4] Improve migration errors

When QEMU exits on destination during migration, the source reports either success (if the failure happened at the very end) or unhelpful "unexpectedly failed" error message, only the destination host knows more about the error, but usually there's no way to get the error back to the source because the domain is already gone at the time virDomainMigrateFinish* is called. This series changes libvirt to remember incoming migration errors so that they can be sent back to a source host. https://bugzilla.redhat.com/show_bug.cgi?id=1090093 Jiri Denemark (4): Introduce virHashLockable qemu: Remember incoming migration errors qemu: Don't report false error from MigrateFinish qemu: Use error from Finish instead of "unexpectedly failed" include/libvirt/virterror.h | 1 + src/libvirt-domain.c | 21 ++++++++++- src/libvirt_private.syms | 3 ++ src/qemu/qemu_conf.h | 3 ++ src/qemu/qemu_driver.c | 31 +++++++++++---- src/qemu/qemu_migration.c | 91 +++++++++++++++++++++++++++++++++++++++++++-- src/qemu/qemu_migration.h | 7 ++++ src/qemu/qemu_monitor.c | 19 ++++++++++ src/qemu/qemu_monitor.h | 2 + src/qemu/qemu_process.c | 4 ++ src/util/virerror.c | 3 ++ src/util/virhash.c | 81 ++++++++++++++++++++++++++++++++++++++++ src/util/virhash.h | 10 +++++ 13 files changed, 263 insertions(+), 13 deletions(-) -- 2.4.5

This is a self-locking wrapper around virHashTable. Only a limited set of APIs are implemented now (the ones which are used in the following patch) as more can be added on demand. Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- src/libvirt_private.syms | 3 ++ src/util/virhash.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++ src/util/virhash.h | 10 ++++++ 3 files changed, 94 insertions(+) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 1566d11..377b7d8 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1482,6 +1482,9 @@ virHashEqual; virHashForEach; virHashFree; virHashGetItems; +virHashLockableNew; +virHashLockableSteal; +virHashLockableUpdate; virHashLookup; virHashRemoveAll; virHashRemoveEntry; diff --git a/src/util/virhash.c b/src/util/virhash.c index e3c1880..0f3934f 100644 --- a/src/util/virhash.c +++ b/src/util/virhash.c @@ -31,6 +31,7 @@ #include "virhashcode.h" #include "virrandom.h" #include "virstring.h" +#include "virobject.h" #define VIR_FROM_THIS VIR_FROM_NONE @@ -76,6 +77,28 @@ struct _virHashTable { virHashKeyFree keyFree; }; +struct _virHashLockable { + virObjectLockable parent; + virHashTablePtr hash; +}; + +static virClassPtr virHashLockableClass; +static void virHashLockableDispose(void *obj); + +static int virHashLockableOnceInit(void) +{ + virHashLockableClass = virClassNew(virClassForObjectLockable(), + "virHashLockable", + sizeof(virHashLockable), + virHashLockableDispose); + if (!virHashLockableClass) + return -1; + else + return 0; +} +VIR_ONCE_GLOBAL_INIT(virHashLockable) + + static uint32_t virHashStrCode(const void *name, uint32_t seed) { return virHashCodeGen(name, strlen(name), seed); @@ -178,6 +201,36 @@ virHashTablePtr virHashCreate(ssize_t size, virHashDataFree dataFree) virHashStrFree); } + +virHashLockablePtr +virHashLockableNew(ssize_t size, + virHashDataFree dataFree) +{ + virHashLockablePtr hash; + + if (virHashLockableInitialize() < 0) + return NULL; + + if (!(hash = virObjectLockableNew(virHashLockableClass))) + return NULL; + + if (!(hash->hash = virHashCreate(size, dataFree))) { + virObjectUnref(hash); + return NULL; + } + return hash; +} + + +static void +virHashLockableDispose(void *obj) +{ + virHashLockablePtr hash = obj; + + virHashFree(hash->hash); +} + + /** * virHashGrow: * @table: the hash table @@ -360,6 +413,21 @@ virHashUpdateEntry(virHashTablePtr table, const void *name, return virHashAddOrUpdateEntry(table, name, userdata, true); } +int +virHashLockableUpdate(virHashLockablePtr table, + const void *name, + void *userdata) +{ + int ret; + + virObjectLock(table); + ret = virHashAddOrUpdateEntry(table->hash, name, userdata, true); + virObjectUnlock(table); + + return ret; +} + + /** * virHashLookup: * @table: the hash table @@ -409,6 +477,19 @@ void *virHashSteal(virHashTablePtr table, const void *name) return data; } +void * +virHashLockableSteal(virHashLockablePtr table, + const void *name) +{ + void *data; + + virObjectLock(table); + data = virHashSteal(table->hash, name); + virObjectUnlock(table); + + return data; +} + /** * virHashSize: diff --git a/src/util/virhash.h b/src/util/virhash.h index a137137..bef20fd 100644 --- a/src/util/virhash.h +++ b/src/util/virhash.h @@ -21,6 +21,9 @@ typedef struct _virHashTable virHashTable; typedef virHashTable *virHashTablePtr; +typedef struct _virHashLockable virHashLockable; +typedef virHashLockable *virHashLockablePtr; + /* * function types: */ @@ -101,6 +104,8 @@ typedef void (*virHashKeyFree)(void *name); */ virHashTablePtr virHashCreate(ssize_t size, virHashDataFree dataFree); +virHashLockablePtr virHashLockableNew(ssize_t size, + virHashDataFree dataFree); virHashTablePtr virHashCreateFull(ssize_t size, virHashDataFree dataFree, virHashKeyCode keyCode, @@ -119,6 +124,9 @@ int virHashAddEntry(virHashTablePtr table, int virHashUpdateEntry(virHashTablePtr table, const void *name, void *userdata); +int virHashLockableUpdate(virHashLockablePtr table, + const void *name, + void *userdata); /* * Remove an entry from the hash table. @@ -140,6 +148,8 @@ void *virHashLookup(const virHashTable *table, const void *name); * Retrieve & remove the userdata. */ void *virHashSteal(virHashTablePtr table, const void *name); +void *virHashLockableSteal(virHashLockablePtr table, + const void *name); /* * Get the hash table's key/value pairs and have them optionally sorted. -- 2.4.5

On Wed, Jul 08, 2015 at 15:22:49 +0200, Jiri Denemark wrote:
This is a self-locking wrapper around virHashTable. Only a limited set of APIs are implemented now (the ones which are used in the following patch) as more can be added on demand.
Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- src/libvirt_private.syms | 3 ++ src/util/virhash.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++ src/util/virhash.h | 10 ++++++ 3 files changed, 94 insertions(+)
ACK although I dislike the "Lockable" part since you are not "able" to lock that hash, but that hash is self-locking. But I don't have a better name suggestion. Peter

On Thu, Jul 09, 2015 at 15:43:32 +0200, Peter Krempa wrote:
On Wed, Jul 08, 2015 at 15:22:49 +0200, Jiri Denemark wrote:
This is a self-locking wrapper around virHashTable. Only a limited set of APIs are implemented now (the ones which are used in the following patch) as more can be added on demand.
Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- src/libvirt_private.syms | 3 ++ src/util/virhash.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++ src/util/virhash.h | 10 ++++++ 3 files changed, 94 insertions(+)
ACK although I dislike the "Lockable" part since you are not "able" to lock that hash, but that hash is self-locking. But I don't have a better name suggestion.
Yeah, I think virHashAtomic sounds better. Jirka

If QEMU fails during incoming migration, the domain disappears including a possibly useful error message read from QEMU log file. Let's remember the error in virQEMUDriver so that Finish can report more than just "no such domain". Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- src/qemu/qemu_conf.h | 3 +++ src/qemu/qemu_driver.c | 31 +++++++++++++++++++------- src/qemu/qemu_migration.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++- src/qemu/qemu_migration.h | 7 ++++++ src/qemu/qemu_monitor.c | 19 ++++++++++++++++ src/qemu/qemu_monitor.h | 2 ++ src/qemu/qemu_process.c | 4 ++++ 7 files changed, 112 insertions(+), 9 deletions(-) diff --git a/src/qemu/qemu_conf.h b/src/qemu/qemu_conf.h index b74c283..b4881c3 100644 --- a/src/qemu/qemu_conf.h +++ b/src/qemu/qemu_conf.h @@ -252,6 +252,9 @@ struct _virQEMUDriver { /* Immutable pointer, self-clocking APIs */ virCloseCallbacksPtr closeCallbacks; + + /* Immutable pointer, self-locking APIs */ + virHashLockablePtr migrationErrors; }; typedef struct _qemuDomainCmdlineDef qemuDomainCmdlineDef; diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 4cfae03..cb5c2e8 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -775,6 +775,9 @@ qemuStateInitialize(bool privileged, if (!(qemu_driver->sharedDevices = virHashCreate(30, qemuSharedDeviceEntryFree))) goto error; + if (qemuMigrationErrorInit(qemu_driver) < 0) + goto error; + if (privileged) { char *channeldir; @@ -1091,6 +1094,7 @@ qemuStateCleanup(void) virObjectUnref(qemu_driver->remotePorts); virObjectUnref(qemu_driver->webSocketPorts); virObjectUnref(qemu_driver->migrationPorts); + virObjectUnref(qemu_driver->migrationErrors); virObjectUnref(qemu_driver->xmlopt); @@ -12125,6 +12129,7 @@ qemuDomainMigrateFinish2(virConnectPtr dconn, if (!vm) { virReportError(VIR_ERR_NO_DOMAIN, _("no domain with matching name '%s'"), dname); + qemuMigrationErrorReport(driver, dname); goto cleanup; } @@ -12574,11 +12579,16 @@ qemuDomainMigrateFinish3(virConnectPtr dconn, virCheckFlags(QEMU_MIGRATION_FLAGS, NULL); - if (!dname || - !(vm = virDomainObjListFindByName(driver->domains, dname))) { + if (!dname) { + virReportError(VIR_ERR_NO_DOMAIN, "%s", _("missing domain name")); + return NULL; + } + + vm = virDomainObjListFindByName(driver->domains, dname); + if (!vm) { virReportError(VIR_ERR_NO_DOMAIN, - _("no domain with matching name '%s'"), - NULLSTR(dname)); + _("no domain with matching name '%s'"), dname); + qemuMigrationErrorReport(driver, dname); return NULL; } @@ -12617,11 +12627,16 @@ qemuDomainMigrateFinish3Params(virConnectPtr dconn, &dname) < 0) return NULL; - if (!dname || - !(vm = virDomainObjListFindByName(driver->domains, dname))) { + if (!dname) { + virReportError(VIR_ERR_NO_DOMAIN, "%s", _("missing domain name")); + return NULL; + } + + vm = virDomainObjListFindByName(driver->domains, dname); + if (!vm) { virReportError(VIR_ERR_NO_DOMAIN, - _("no domain with matching name '%s'"), - NULLSTR(dname)); + _("no domain with matching name '%s'"), dname); + qemuMigrationErrorReport(driver, dname); return NULL; } diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index a57a177..82069a1 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -5500,8 +5500,10 @@ qemuMigrationFinish(virQEMUDriverPtr driver, if (!(caps = virQEMUDriverGetCapabilities(driver, false))) goto cleanup; - if (!qemuMigrationJobIsActive(vm, QEMU_ASYNC_JOB_MIGRATION_IN)) + if (!qemuMigrationJobIsActive(vm, QEMU_ASYNC_JOB_MIGRATION_IN)) { + qemuMigrationErrorReport(driver, vm->def->name); goto cleanup; + } qemuMigrationJobStartPhase(driver, vm, v3proto ? QEMU_MIGRATION_PHASE_FINISH3 @@ -5527,6 +5529,7 @@ qemuMigrationFinish(virQEMUDriverPtr driver, if (!virDomainObjIsActive(vm) && !(flags & VIR_MIGRATE_OFFLINE)) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("guest unexpectedly quit")); + qemuMigrationErrorReport(driver, vm->def->name); goto endjob; } @@ -6051,3 +6054,53 @@ qemuMigrationJobFinish(virQEMUDriverPtr driver, virDomainObjPtr vm) { qemuDomainObjEndAsyncJob(driver, vm); } + + +static void +qemuMigrationErrorFree(void *data, + const void *name ATTRIBUTE_UNUSED) +{ + virErrorPtr err = data; + virFreeError(err); +} + +int +qemuMigrationErrorInit(virQEMUDriverPtr driver) +{ + driver->migrationErrors = virHashLockableNew(64, qemuMigrationErrorFree); + if (driver->migrationErrors) + return 0; + else + return -1; +} + +void +qemuMigrationErrorSave(virQEMUDriverPtr driver, + const char *name, + virErrorPtr err) +{ + if (!err) + return; + + VIR_DEBUG("Saving incoming migration error for domain %s: %s", + name, err->message); + if (virHashLockableUpdate(driver->migrationErrors, name, err) < 0) { + VIR_WARN("Failed to save migration error for domain '%s'", name); + virFreeError(err); + } +} + +void +qemuMigrationErrorReport(virQEMUDriverPtr driver, + const char *name) +{ + virErrorPtr err; + + if (!(err = virHashLockableSteal(driver->migrationErrors, name))) + return; + + VIR_DEBUG("Restoring saved incoming migration error for domain %s: %s", + name, err->message); + virSetError(err); + virFreeError(err); +} diff --git a/src/qemu/qemu_migration.h b/src/qemu/qemu_migration.h index 48c2e8c..fa14274 100644 --- a/src/qemu/qemu_migration.h +++ b/src/qemu/qemu_migration.h @@ -193,4 +193,11 @@ int qemuMigrationFetchJobStatus(virQEMUDriverPtr driver, qemuDomainAsyncJob asyncJob, qemuDomainJobInfoPtr jobInfo); +int qemuMigrationErrorInit(virQEMUDriverPtr driver); +void qemuMigrationErrorSave(virQEMUDriverPtr driver, + const char *name, + virErrorPtr err); +void qemuMigrationErrorReport(virQEMUDriverPtr driver, + const char *name); + #endif /* __QEMU_MIGRATION_H__ */ diff --git a/src/qemu/qemu_monitor.c b/src/qemu/qemu_monitor.c index 896d9fd..9db05c5 100644 --- a/src/qemu/qemu_monitor.c +++ b/src/qemu/qemu_monitor.c @@ -1057,6 +1057,25 @@ qemuMonitorSend(qemuMonitorPtr mon, } +virErrorPtr +qemuMonitorLastError(qemuMonitorPtr mon) +{ + virErrorPtr old; + virErrorPtr err; + + if (mon->lastError.code == VIR_ERR_OK) + return NULL; + + old = virSaveLastError(); + virSetError(&mon->lastError); + err = virSaveLastError(); + virSetError(old); + virFreeError(old); + + return err; +} + + virJSONValuePtr qemuMonitorGetOptions(qemuMonitorPtr mon) { diff --git a/src/qemu/qemu_monitor.h b/src/qemu/qemu_monitor.h index ec1724f..13791be 100644 --- a/src/qemu/qemu_monitor.h +++ b/src/qemu/qemu_monitor.h @@ -234,6 +234,8 @@ qemuMonitorPtr qemuMonitorOpenFD(virDomainObjPtr vm, void qemuMonitorClose(qemuMonitorPtr mon); +virErrorPtr qemuMonitorLastError(qemuMonitorPtr mon); + int qemuMonitorSetCapabilities(qemuMonitorPtr mon); int qemuMonitorSetLink(qemuMonitorPtr mon, diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index ee522b9..89a7c48 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -310,6 +310,10 @@ qemuProcessHandleMonitorEOF(qemuMonitorPtr mon ATTRIBUTE_UNUSED, auditReason = "failed"; } + if (qemuMigrationJobIsActive(vm, QEMU_ASYNC_JOB_MIGRATION_IN)) + qemuMigrationErrorSave(driver, vm->def->name, + qemuMonitorLastError(priv->mon)); + event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_STOPPED, eventReason); -- 2.4.5

On Wed, Jul 08, 2015 at 15:22:50 +0200, Jiri Denemark wrote:
If QEMU fails during incoming migration, the domain disappears including a possibly useful error message read from QEMU log file. Let's remember the error in virQEMUDriver so that Finish can report more than just "no such domain".
Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- src/qemu/qemu_conf.h | 3 +++ src/qemu/qemu_driver.c | 31 +++++++++++++++++++------- src/qemu/qemu_migration.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++- src/qemu/qemu_migration.h | 7 ++++++ src/qemu/qemu_monitor.c | 19 ++++++++++++++++ src/qemu/qemu_monitor.h | 2 ++ src/qemu/qemu_process.c | 4 ++++ 7 files changed, 112 insertions(+), 9 deletions(-)
...
diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index a57a177..82069a1 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c
...
@@ -6051,3 +6054,53 @@ qemuMigrationJobFinish(virQEMUDriverPtr driver, virDomainObjPtr vm) { qemuDomainObjEndAsyncJob(driver, vm); } + + +static void +qemuMigrationErrorFree(void *data, + const void *name ATTRIBUTE_UNUSED) +{ + virErrorPtr err = data; + virFreeError(err); +} + +int +qemuMigrationErrorInit(virQEMUDriverPtr driver) +{ + driver->migrationErrors = virHashLockableNew(64, qemuMigrationErrorFree); + if (driver->migrationErrors) + return 0; + else + return -1; +} +
This function consumes @err. A comment noting that would be helpful.
+void +qemuMigrationErrorSave(virQEMUDriverPtr driver, + const char *name, + virErrorPtr err) +{ + if (!err) + return; + + VIR_DEBUG("Saving incoming migration error for domain %s: %s", + name, err->message); + if (virHashLockableUpdate(driver->migrationErrors, name, err) < 0) { + VIR_WARN("Failed to save migration error for domain '%s'", name); + virFreeError(err); + } +} + +void +qemuMigrationErrorReport(virQEMUDriverPtr driver, + const char *name) +{ + virErrorPtr err; + + if (!(err = virHashLockableSteal(driver->migrationErrors, name))) + return; + + VIR_DEBUG("Restoring saved incoming migration error for domain %s: %s", + name, err->message); + virSetError(err); + virFreeError(err); +}
...
diff --git a/src/qemu/qemu_monitor.c b/src/qemu/qemu_monitor.c index 896d9fd..9db05c5 100644 --- a/src/qemu/qemu_monitor.c +++ b/src/qemu/qemu_monitor.c @@ -1057,6 +1057,25 @@ qemuMonitorSend(qemuMonitorPtr mon, }
Telling that the user is responsible for freeing the value would be helpful here.
+virErrorPtr +qemuMonitorLastError(qemuMonitorPtr mon) +{ + virErrorPtr old; + virErrorPtr err; + + if (mon->lastError.code == VIR_ERR_OK) + return NULL; + + old = virSaveLastError(); + virSetError(&mon->lastError); + err = virSaveLastError(); + virSetError(old); + virFreeError(old);
Ummm, how about exporting virCopyError rather than using this rather opaque and ugly way to copy the error?
+ + return err; +} + + virJSONValuePtr qemuMonitorGetOptions(qemuMonitorPtr mon) {
Peter

virDomainMigrateFinish* APIs were unfortunately designed to return the pointer to the domain on destination and NULL on error. This looks OK in normal cases but the same API is also called when we know migration failed and thus we expect Finish to return NULL even if it actually did all it was supposed to do without any error. The call is defined to return nonnull domain pointer over RPC, which means returning NULL will always result in an error being send. If this was not in fact an error, the API itself wouldn't set anything to the thread local virError, which makes the RPC layer come up with it's own "Library function returned error but did not set virError" error. This is quite confusing and also hard to detect by the caller. This patch adds a special error code which can be used to check that Finish successfully aborted migration. Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- include/libvirt/virterror.h | 1 + src/qemu/qemu_migration.c | 6 ++++++ src/util/virerror.c | 3 +++ 3 files changed, 10 insertions(+) diff --git a/include/libvirt/virterror.h b/include/libvirt/virterror.h index 6325030..f716cb9 100644 --- a/include/libvirt/virterror.h +++ b/include/libvirt/virterror.h @@ -307,6 +307,7 @@ typedef enum { VIR_ERR_CPU_INCOMPATIBLE = 91, /* given CPU is incompatible with host CPU*/ VIR_ERR_XML_INVALID_SCHEMA = 92, /* XML document doesn't validate against schema */ + VIR_ERR_MIGRATE_FINISH_OK = 93, /* Finish API succeeded but it is expected to return NULL */ } virErrorNumber; /** diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 82069a1..3548d73 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -5708,6 +5708,12 @@ qemuMigrationFinish(virQEMUDriverPtr driver, } virObjectUnref(caps); virObjectUnref(cfg); + + /* Set a special error if Finish is expected to return NULL as a result of + * successful call with retcode != 0 + */ + if (retcode != 0 && !dom && !virGetLastError()) + virReportError(VIR_ERR_MIGRATE_FINISH_OK, NULL); return dom; } diff --git a/src/util/virerror.c b/src/util/virerror.c index fae627b..4c16d97 100644 --- a/src/util/virerror.c +++ b/src/util/virerror.c @@ -1353,6 +1353,9 @@ virErrorMsg(virErrorNumber error, const char *info) else errmsg = _("XML document failed to validate against schema: %s"); break; + case VIR_ERR_MIGRATE_FINISH_OK: + errmsg = _("migration successfully aborted"); + break; } return errmsg; } -- 2.4.5

On Wed, Jul 08, 2015 at 15:22:51 +0200, Jiri Denemark wrote:
virDomainMigrateFinish* APIs were unfortunately designed to return the pointer to the domain on destination and NULL on error. This looks OK in normal cases but the same API is also called when we know migration failed and thus we expect Finish to return NULL even if it actually did all it was supposed to do without any error. The call is defined to return nonnull domain pointer over RPC, which means returning NULL will always result in an error being send. If this was not in fact an error, the API itself wouldn't set anything to the thread local virError, which makes the RPC layer come up with it's own "Library function returned error but did not set virError" error.
This is quite confusing and also hard to detect by the caller. This patch adds a special error code which can be used to check that Finish successfully aborted migration.
Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- include/libvirt/virterror.h | 1 + src/qemu/qemu_migration.c | 6 ++++++ src/util/virerror.c | 3 +++ 3 files changed, 10 insertions(+)
I'm kind of not sure whether I like this solution or not. I understand the reasons for having it but I'm not liking it. ACK but I'd prefer another opinion on this if possible. Peter

On Thu, Jul 09, 2015 at 18:10:51 +0200, Peter Krempa wrote:
On Wed, Jul 08, 2015 at 15:22:51 +0200, Jiri Denemark wrote:
virDomainMigrateFinish* APIs were unfortunately designed to return the pointer to the domain on destination and NULL on error. This looks OK in normal cases but the same API is also called when we know migration failed and thus we expect Finish to return NULL even if it actually did all it was supposed to do without any error. The call is defined to return nonnull domain pointer over RPC, which means returning NULL will always result in an error being send. If this was not in fact an error, the API itself wouldn't set anything to the thread local virError, which makes the RPC layer come up with it's own "Library function returned error but did not set virError" error.
This is quite confusing and also hard to detect by the caller. This patch adds a special error code which can be used to check that Finish successfully aborted migration.
Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- include/libvirt/virterror.h | 1 + src/qemu/qemu_migration.c | 6 ++++++ src/util/virerror.c | 3 +++ 3 files changed, 10 insertions(+)
I'm kind of not sure whether I like this solution or not. I understand the reasons for having it but I'm not liking it. ACK but I'd prefer another opinion on this if possible.
I don't like it either but fixing it in RPC is not an option since returning NULL domain would be an ABI incompatible change. And I know for sure I don't want to introduce a completely new Finish API just for this because it is an enormous pain :-) Jirka

When QEMU exits on destination during migration, the source reports either success (if the failure happened at the very end) or unhelpful "unexpectedly failed" error message. However, the Finish API called on the destination may report a real error so let's use it instead of the generic one. Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- src/libvirt-domain.c | 21 +++++++++++++++++++-- src/qemu/qemu_migration.c | 30 ++++++++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/src/libvirt-domain.c b/src/libvirt-domain.c index 909c264..f18fee2 100644 --- a/src/libvirt-domain.c +++ b/src/libvirt-domain.c @@ -3175,8 +3175,25 @@ virDomainMigrateVersion3Full(virDomainPtr domain, (dconn, dname, cookiein, cookieinlen, &cookieout, &cookieoutlen, NULL, uri, destflags, cancelled); } - if (cancelled && ddomain) - VIR_ERROR(_("finish step ignored that migration was cancelled")); + + if (cancelled) { + if (ddomain) + VIR_ERROR(_("finish step ignored that migration was cancelled")); + + /* If Finish reported a useful error, use it instead of the original + * "migration unexpectedly failed" error. + */ + if (orig_err && + orig_err->domain == VIR_FROM_QEMU && + orig_err->code == VIR_ERR_OPERATION_FAILED) { + virErrorPtr err = virGetLastError(); + if (err->domain == VIR_FROM_QEMU && + err->code != VIR_ERR_MIGRATE_FINISH_OK) { + virFreeError(orig_err); + orig_err = NULL; + } + } + } /* If ddomain is NULL, then we were unable to start * the guest on the target, and must restart on the diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 3548d73..d02a0c6 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -4957,8 +4957,25 @@ doPeer2PeerMigrate3(virQEMUDriverPtr driver, dconnuri, uri, destflags, cancelled); qemuDomainObjExitRemote(vm); } - if (cancelled && ddomain) - VIR_ERROR(_("finish step ignored that migration was cancelled")); + + if (cancelled) { + if (ddomain) + VIR_ERROR(_("finish step ignored that migration was cancelled")); + + /* If Finish reported a useful error, use it instead of the original + * "migration unexpectedly failed" error. + */ + if (orig_err && + orig_err->domain == VIR_FROM_QEMU && + orig_err->code == VIR_ERR_OPERATION_FAILED) { + virErrorPtr err = virGetLastError(); + if (err->domain == VIR_FROM_QEMU && + err->code != VIR_ERR_MIGRATE_FINISH_OK) { + virFreeError(orig_err); + orig_err = NULL; + } + } + } /* If ddomain is NULL, then we were unable to start * the guest on the target, and must restart on the @@ -5676,6 +5693,15 @@ qemuMigrationFinish(virQEMUDriverPtr driver, /* Guest is successfully running, so cancel previous auto destroy */ qemuProcessAutoDestroyRemove(driver, vm); } else if (!(flags & VIR_MIGRATE_OFFLINE)) { + qemuDomainJobInfo info; + + /* Check for a possible error on the monitor in case Finish was called + * earlier than monitor EOF handler got a chance to process the error + */ + qemuMigrationFetchJobStatus(driver, vm, + QEMU_ASYNC_JOB_MIGRATION_IN, + &info); + qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED, VIR_QEMU_PROCESS_STOP_MIGRATED); virDomainAuditStop(vm, "failed"); -- 2.4.5

On Wed, Jul 08, 2015 at 15:22:52 +0200, Jiri Denemark wrote:
When QEMU exits on destination during migration, the source reports either success (if the failure happened at the very end) or unhelpful "unexpectedly failed" error message. However, the Finish API called on the destination may report a real error so let's use it instead of the generic one.
Signed-off-by: Jiri Denemark <jdenemar@redhat.com> --- src/libvirt-domain.c | 21 +++++++++++++++++++-- src/qemu/qemu_migration.c | 30 ++++++++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 4 deletions(-)
diff --git a/src/libvirt-domain.c b/src/libvirt-domain.c index 909c264..f18fee2 100644 --- a/src/libvirt-domain.c +++ b/src/libvirt-domain.c @@ -3175,8 +3175,25 @@ virDomainMigrateVersion3Full(virDomainPtr domain, (dconn, dname, cookiein, cookieinlen, &cookieout, &cookieoutlen, NULL, uri, destflags, cancelled); } - if (cancelled && ddomain) - VIR_ERROR(_("finish step ignored that migration was cancelled")); +
See below for comments:
+ if (cancelled) { + if (ddomain) + VIR_ERROR(_("finish step ignored that migration was cancelled")); + + /* If Finish reported a useful error, use it instead of the original + * "migration unexpectedly failed" error. + */ + if (orig_err && + orig_err->domain == VIR_FROM_QEMU && + orig_err->code == VIR_ERR_OPERATION_FAILED) { + virErrorPtr err = virGetLastError(); + if (err->domain == VIR_FROM_QEMU && + err->code != VIR_ERR_MIGRATE_FINISH_OK) { + virFreeError(orig_err); + orig_err = NULL; + } + } + }
/* If ddomain is NULL, then we were unable to start * the guest on the target, and must restart on the diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 3548d73..d02a0c6 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -4957,8 +4957,25 @@ doPeer2PeerMigrate3(virQEMUDriverPtr driver, dconnuri, uri, destflags, cancelled); qemuDomainObjExitRemote(vm); } - if (cancelled && ddomain) - VIR_ERROR(_("finish step ignored that migration was cancelled")); +
The control flow below is weird. Here are a few facts from the code above: - ddomain is set via the domainMigrateFinish3(Params) and not anywhere else - domainMigrateFinish3 either reports an error or returns ddomain thus:
+ if (cancelled) { + if (ddomain) + VIR_ERROR(_("finish step ignored that migration was cancelled")); +
Basically all the code below is an else section to the above if statement due to the previous fact, so ... the below code makes it kind of opaque.
+ /* If Finish reported a useful error, use it instead of the original + * "migration unexpectedly failed" error. + */ + if (orig_err && + orig_err->domain == VIR_FROM_QEMU && + orig_err->code == VIR_ERR_OPERATION_FAILED) {
The code check isn't robust enough IMO. If you want to keep it, the fact that this happens in a way like this should be noted in a comment for doNativeMigrate/doTunnelMigrate that set the errors.
+ virErrorPtr err = virGetLastError();
And additionally there is no error reported that this could possibly overwrite in case where ddomain is not NULL except for the one reported in virTypedParamsReplaceString but I doubt that will be a better one. (If that is the intention a comment would really be helpful.
+ if (err->domain == VIR_FROM_QEMU && + err->code != VIR_ERR_MIGRATE_FINISH_OK) { + virFreeError(orig_err); + orig_err = NULL; + } + } + }
/* If ddomain is NULL, then we were unable to start * the guest on the target, and must restart on the
Otherwise makes sense. I'd like to either have an explanation of the above control flow or a fixed version. Peter
participants (2)
-
Jiri Denemark
-
Peter Krempa