[libvirt] [PATCH 0/3] libxl migration improvements

This series of patches fixes problems discovered in libxl migration. The first patch fixes an issue that went undetected while testing the initial implementation of migration. Receiving migration data occurs in the context of an event loop callback, effectively blocking the event loop during the entire migration process. The patch moves the work of receiving migration data to a thread. Interestingly, this issue manifested in a failed migration due to failed keepalives, which would kill virsh's connection to dst host. The dst host failed to respond to keepalives since its event loop was blocked on receiving migration data. Ultimately the migration perform phase would succeed leaving a running domain on dst. However, the subsequent finish phase would fail since virsh's connection to dst had been killed by the keepalive failure. Since finish failed, the confirm phase would resume the domain on src. Yikes! Same domain running on two different hosts :(. Patches 2 and 3 improve handling of errors in the event the perform or finish phases of migration fail. See the individual patches for details. Jim Fehlig (3): libxl: Receive migration data in a thread libxl: start domain paused on migration dst libxl: destroy domain in migration finish phase on failure src/libxl/libxl_migration.c | 75 ++++++++++++++++++++++++++++++--------------- 1 file changed, 51 insertions(+), 24 deletions(-) -- 1.8.4.5

The libxl driver receives migration data within an IO callback invoked by the event loop, effectively disabling the event loop while migration occurs. This patch moves receving of the migration data to a thread. The incoming connection is still accepted in the IO callback, but control is immediately returned to the event loop after spawning the thread. Signed-off-by: Jim Fehlig <jfehlig@suse.com> --- src/libxl/libxl_migration.c | 60 +++++++++++++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/src/libxl/libxl_migration.c b/src/libxl/libxl_migration.c index 0b562f7..1940209 100644 --- a/src/libxl/libxl_migration.c +++ b/src/libxl/libxl_migration.c @@ -35,6 +35,7 @@ #include "vircommand.h" #include "virstring.h" #include "virobject.h" +#include "virthread.h" #include "rpc/virnetsocket.h" #include "libxl_domain.h" #include "libxl_driver.h" @@ -48,6 +49,7 @@ VIR_LOG_INIT("libxl.libxl_migration"); typedef struct _libxlMigrationDstArgs { virObject parent; + int recvfd; virConnectPtr conn; virDomainObjPtr vm; unsigned int flags; @@ -82,31 +84,18 @@ libxlMigrationDstArgsOnceInit(void) VIR_ONCE_GLOBAL_INIT(libxlMigrationDstArgs) static void -libxlDoMigrateReceive(virNetSocketPtr sock, - int events ATTRIBUTE_UNUSED, - void *opaque) +libxlDoMigrateReceive(void *opaque) { libxlMigrationDstArgs *args = opaque; - virConnectPtr conn = args->conn; virDomainObjPtr vm = args->vm; virNetSocketPtr *socks = args->socks; size_t nsocks = args->nsocks; bool paused = args->flags & VIR_MIGRATE_PAUSED; - libxlDriverPrivatePtr driver = conn->privateData; - virNetSocketPtr client_sock; - int recvfd = -1; + libxlDriverPrivatePtr driver = args->conn->privateData; + int recvfd = args->recvfd; size_t i; int ret; - if (virNetSocketAccept(sock, &client_sock) < 0) { - virReportError(VIR_ERR_OPERATION_INVALID, "%s", - _("Fail to accept migration connection")); - goto cleanup; - } - VIR_DEBUG("Accepted migration connection\n"); - recvfd = virNetSocketDupFD(client_sock, true); - virObjectUnref(client_sock); - virObjectLock(vm); ret = libxlDomainStart(driver, vm, paused, recvfd); virObjectUnlock(vm); @@ -114,7 +103,6 @@ libxlDoMigrateReceive(virNetSocketPtr sock, if (ret < 0 && !vm->persistent) virDomainObjListRemove(driver->domains, vm); - cleanup: /* Remove all listen socks from event handler, and close them. */ for (i = 0; i < nsocks; i++) { virNetSocketUpdateIOCallback(socks[i], 0); @@ -127,6 +115,42 @@ libxlDoMigrateReceive(virNetSocketPtr sock, VIR_FORCE_CLOSE(recvfd); } + +static void +libxlMigrateReceive(virNetSocketPtr sock, + int events ATTRIBUTE_UNUSED, + void *opaque) +{ + libxlMigrationDstArgs *args = opaque; + virNetSocketPtr client_sock; + int recvfd; + virThread thread; + + /* Accept migration connection */ + virNetSocketAccept(sock, &client_sock); + if (client_sock == NULL) { + virReportError(VIR_ERR_OPERATION_FAILED, "%s", + _("Failed to accept migration connection")); + return; + } + VIR_DEBUG("Accepted migration connection." + " Spawing thread to process migration data"); + recvfd = virNetSocketDupFD(client_sock, true); + virObjectUnref(client_sock); + + /* + * Avoid blocking the event loop. Start a thread to receive + * the migration data + */ + args->recvfd = recvfd; + if (virThreadCreate(&thread, false, + libxlDoMigrateReceive, args) < 0) { + virReportError(VIR_ERR_OPERATION_FAILED, "%s", + _("Failed to create thread for receiving migration data")); + } +} + + static int libxlDoMigrateSend(libxlDriverPrivatePtr driver, virDomainObjPtr vm, @@ -376,7 +400,7 @@ libxlDomainMigrationPrepare(virConnectPtr dconn, if (virNetSocketAddIOCallback(socks[i], VIR_EVENT_HANDLE_READABLE, - libxlDoMigrateReceive, + libxlMigrateReceive, args, virObjectFreeCallback) < 0) continue; -- 1.8.4.5

On 13.11.2014 03:36, Jim Fehlig wrote:
The libxl driver receives migration data within an IO callback invoked by the event loop, effectively disabling the event loop while migration occurs.
This patch moves receving of the migration data to a thread. The incoming connection is still accepted in the IO callback, but control is immediately returned to the event loop after spawning the thread.
Signed-off-by: Jim Fehlig <jfehlig@suse.com> --- src/libxl/libxl_migration.c | 60 +++++++++++++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 18 deletions(-)
diff --git a/src/libxl/libxl_migration.c b/src/libxl/libxl_migration.c index 0b562f7..1940209 100644 --- a/src/libxl/libxl_migration.c +++ b/src/libxl/libxl_migration.c @@ -35,6 +35,7 @@ #include "vircommand.h" #include "virstring.h" #include "virobject.h" +#include "virthread.h" #include "rpc/virnetsocket.h" #include "libxl_domain.h" #include "libxl_driver.h" @@ -48,6 +49,7 @@ VIR_LOG_INIT("libxl.libxl_migration"); typedef struct _libxlMigrationDstArgs { virObject parent;
+ int recvfd; virConnectPtr conn; virDomainObjPtr vm; unsigned int flags; @@ -82,31 +84,18 @@ libxlMigrationDstArgsOnceInit(void) VIR_ONCE_GLOBAL_INIT(libxlMigrationDstArgs)
static void -libxlDoMigrateReceive(virNetSocketPtr sock, - int events ATTRIBUTE_UNUSED, - void *opaque) +libxlDoMigrateReceive(void *opaque) { libxlMigrationDstArgs *args = opaque; - virConnectPtr conn = args->conn; virDomainObjPtr vm = args->vm; virNetSocketPtr *socks = args->socks; size_t nsocks = args->nsocks; bool paused = args->flags & VIR_MIGRATE_PAUSED; - libxlDriverPrivatePtr driver = conn->privateData; - virNetSocketPtr client_sock; - int recvfd = -1; + libxlDriverPrivatePtr driver = args->conn->privateData; + int recvfd = args->recvfd; size_t i; int ret;
- if (virNetSocketAccept(sock, &client_sock) < 0) { - virReportError(VIR_ERR_OPERATION_INVALID, "%s", - _("Fail to accept migration connection")); - goto cleanup; - } - VIR_DEBUG("Accepted migration connection\n"); - recvfd = virNetSocketDupFD(client_sock, true); - virObjectUnref(client_sock); - virObjectLock(vm); ret = libxlDomainStart(driver, vm, paused, recvfd); virObjectUnlock(vm); @@ -114,7 +103,6 @@ libxlDoMigrateReceive(virNetSocketPtr sock, if (ret < 0 && !vm->persistent) virDomainObjListRemove(driver->domains, vm);
- cleanup: /* Remove all listen socks from event handler, and close them. */ for (i = 0; i < nsocks; i++) { virNetSocketUpdateIOCallback(socks[i], 0); @@ -127,6 +115,42 @@ libxlDoMigrateReceive(virNetSocketPtr sock, VIR_FORCE_CLOSE(recvfd); }
+ +static void +libxlMigrateReceive(virNetSocketPtr sock, + int events ATTRIBUTE_UNUSED, + void *opaque) +{ + libxlMigrationDstArgs *args = opaque; + virNetSocketPtr client_sock; + int recvfd; + virThread thread; + + /* Accept migration connection */ + virNetSocketAccept(sock, &client_sock); + if (client_sock == NULL) { + virReportError(VIR_ERR_OPERATION_FAILED, "%s", + _("Failed to accept migration connection")); + return; + } + VIR_DEBUG("Accepted migration connection." + " Spawing thread to process migration data"); + recvfd = virNetSocketDupFD(client_sock, true); + virObjectUnref(client_sock); + + /* + * Avoid blocking the event loop. Start a thread to receive + * the migration data + */ + args->recvfd = recvfd; + if (virThreadCreate(&thread, false, + libxlDoMigrateReceive, args) < 0) { + virReportError(VIR_ERR_OPERATION_FAILED, "%s", + _("Failed to create thread for receiving migration data"));
I believe you want VIR_FORCE_CLOSE(args->recvfd) in case thread creation failed.
+ } +} + + static int libxlDoMigrateSend(libxlDriverPrivatePtr driver, virDomainObjPtr vm, @@ -376,7 +400,7 @@ libxlDomainMigrationPrepare(virConnectPtr dconn,
if (virNetSocketAddIOCallback(socks[i], VIR_EVENT_HANDLE_READABLE, - libxlDoMigrateReceive, + libxlMigrateReceive, args, virObjectFreeCallback) < 0) continue;
ACK with that fixed. Michal

Michal Privoznik wrote:
On 13.11.2014 03:36, Jim Fehlig wrote:
The libxl driver receives migration data within an IO callback invoked by the event loop, effectively disabling the event loop while migration occurs.
This patch moves receving of the migration data to a thread. The incoming connection is still accepted in the IO callback, but control is immediately returned to the event loop after spawning the thread.
Signed-off-by: Jim Fehlig <jfehlig@suse.com> --- src/libxl/libxl_migration.c | 60 +++++++++++++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 18 deletions(-)
diff --git a/src/libxl/libxl_migration.c b/src/libxl/libxl_migration.c index 0b562f7..1940209 100644 --- a/src/libxl/libxl_migration.c +++ b/src/libxl/libxl_migration.c @@ -35,6 +35,7 @@ #include "vircommand.h" #include "virstring.h" #include "virobject.h" +#include "virthread.h" #include "rpc/virnetsocket.h" #include "libxl_domain.h" #include "libxl_driver.h" @@ -48,6 +49,7 @@ VIR_LOG_INIT("libxl.libxl_migration"); typedef struct _libxlMigrationDstArgs { virObject parent;
+ int recvfd; virConnectPtr conn; virDomainObjPtr vm; unsigned int flags; @@ -82,31 +84,18 @@ libxlMigrationDstArgsOnceInit(void) VIR_ONCE_GLOBAL_INIT(libxlMigrationDstArgs)
static void -libxlDoMigrateReceive(virNetSocketPtr sock, - int events ATTRIBUTE_UNUSED, - void *opaque) +libxlDoMigrateReceive(void *opaque) { libxlMigrationDstArgs *args = opaque; - virConnectPtr conn = args->conn; virDomainObjPtr vm = args->vm; virNetSocketPtr *socks = args->socks; size_t nsocks = args->nsocks; bool paused = args->flags & VIR_MIGRATE_PAUSED; - libxlDriverPrivatePtr driver = conn->privateData; - virNetSocketPtr client_sock; - int recvfd = -1; + libxlDriverPrivatePtr driver = args->conn->privateData; + int recvfd = args->recvfd; size_t i; int ret;
- if (virNetSocketAccept(sock, &client_sock) < 0) { - virReportError(VIR_ERR_OPERATION_INVALID, "%s", - _("Fail to accept migration connection")); - goto cleanup; - } - VIR_DEBUG("Accepted migration connection\n"); - recvfd = virNetSocketDupFD(client_sock, true); - virObjectUnref(client_sock); - virObjectLock(vm); ret = libxlDomainStart(driver, vm, paused, recvfd); virObjectUnlock(vm); @@ -114,7 +103,6 @@ libxlDoMigrateReceive(virNetSocketPtr sock, if (ret < 0 && !vm->persistent) virDomainObjListRemove(driver->domains, vm);
- cleanup: /* Remove all listen socks from event handler, and close them. */ for (i = 0; i < nsocks; i++) { virNetSocketUpdateIOCallback(socks[i], 0); @@ -127,6 +115,42 @@ libxlDoMigrateReceive(virNetSocketPtr sock, VIR_FORCE_CLOSE(recvfd); }
+ +static void +libxlMigrateReceive(virNetSocketPtr sock, + int events ATTRIBUTE_UNUSED, + void *opaque) +{ + libxlMigrationDstArgs *args = opaque; + virNetSocketPtr client_sock; + int recvfd; + virThread thread; + + /* Accept migration connection */ + virNetSocketAccept(sock, &client_sock); + if (client_sock == NULL) { + virReportError(VIR_ERR_OPERATION_FAILED, "%s", + _("Failed to accept migration connection")); + return; + } + VIR_DEBUG("Accepted migration connection." + " Spawing thread to process migration data"); + recvfd = virNetSocketDupFD(client_sock, true); + virObjectUnref(client_sock); + + /* + * Avoid blocking the event loop. Start a thread to receive + * the migration data + */ + args->recvfd = recvfd; + if (virThreadCreate(&thread, false, + libxlDoMigrateReceive, args) < 0) { + virReportError(VIR_ERR_OPERATION_FAILED, "%s", + _("Failed to create thread for receiving migration data"));
I believe you want VIR_FORCE_CLOSE(args->recvfd) in case thread creation failed.
Ah, yes, thanks for catching that. In fact, I need to cleanup all the listening sockets on failure too. Also, while reordering this series I mistakenly dropped a patch that fixes a race condition introduced by moving the receive processing to a thread. V2 on the way. Sorry for wasting your time on an incomplete V1. Regards, Jim

During the perform phase of migration, the domain is started on the dst host in a running state if VIR_MIGRATE_PAUSED flag is not specified. In the finish phase, the domain is also unpaused if VIR_MIGRATE_PAUSED flag is unset. I've noticed this second unpause fails if the domain was already unpaused following the perform phase. This patch changes the perform phase to always start the domain paused, and defers unpausing, if requested, to the finish phase. Unpausing should occur in the finish phase anyhow, where the domain can be properly destroyed if the perform phase fails and migration is cancelled. Signed-off-by: Jim Fehlig <jfehlig@suse.com> --- src/libxl/libxl_migration.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/libxl/libxl_migration.c b/src/libxl/libxl_migration.c index 1940209..260b247 100644 --- a/src/libxl/libxl_migration.c +++ b/src/libxl/libxl_migration.c @@ -90,14 +90,17 @@ libxlDoMigrateReceive(void *opaque) virDomainObjPtr vm = args->vm; virNetSocketPtr *socks = args->socks; size_t nsocks = args->nsocks; - bool paused = args->flags & VIR_MIGRATE_PAUSED; libxlDriverPrivatePtr driver = args->conn->privateData; int recvfd = args->recvfd; size_t i; int ret; + /* + * Always start the domain paused. If needed, unpause in the + * finish phase, after transfer of the domain is complete. + */ virObjectLock(vm); - ret = libxlDomainStart(driver, vm, paused, recvfd); + ret = libxlDomainStart(driver, vm, true, recvfd); virObjectUnlock(vm); if (ret < 0 && !vm->persistent) -- 1.8.4.5

The 'cleanup' label was missplaced in libxlDomainMigrationFinish, causing a migrated domain to remain if the migration was cancelled or encountered an error during perform phase. Signed-off-by: Jim Fehlig <jfehlig@suse.com> --- src/libxl/libxl_migration.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/libxl/libxl_migration.c b/src/libxl/libxl_migration.c index 260b247..0a41568 100644 --- a/src/libxl/libxl_migration.c +++ b/src/libxl/libxl_migration.c @@ -539,16 +539,17 @@ libxlDomainMigrationFinish(virConnectPtr dconn, VIR_DOMAIN_EVENT_SUSPENDED_PAUSED); } - if (virDomainSaveStatus(driver->xmlopt, cfg->stateDir, vm) < 0) - goto cleanup; - if (event) { libxlDomainEventQueue(driver, event); event = NULL; } + if (virDomainSaveStatus(driver->xmlopt, cfg->stateDir, vm) < 0) + goto cleanup; + dom = virGetDomain(dconn, vm->def->name, vm->def->uuid); + cleanup: if (dom == NULL) { libxl_domain_destroy(priv->ctx, vm->def->id, NULL); libxlDomainCleanup(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED); @@ -556,7 +557,6 @@ libxlDomainMigrationFinish(virConnectPtr dconn, VIR_DOMAIN_EVENT_STOPPED_FAILED); } - cleanup: if (event) libxlDomainEventQueue(driver, event); virObjectUnlock(vm); -- 1.8.4.5

On 13.11.2014 03:36, Jim Fehlig wrote:
This series of patches fixes problems discovered in libxl migration.
The first patch fixes an issue that went undetected while testing the initial implementation of migration. Receiving migration data occurs in the context of an event loop callback, effectively blocking the event loop during the entire migration process. The patch moves the work of receiving migration data to a thread.
Interestingly, this issue manifested in a failed migration due to failed keepalives, which would kill virsh's connection to dst host. The dst host failed to respond to keepalives since its event loop was blocked on receiving migration data. Ultimately the migration perform phase would succeed leaving a running domain on dst. However, the subsequent finish phase would fail since virsh's connection to dst had been killed by the keepalive failure. Since finish failed, the confirm phase would resume the domain on src. Yikes! Same domain running on two different hosts :(.
Patches 2 and 3 improve handling of errors in the event the perform or finish phases of migration fail. See the individual patches for details.
Jim Fehlig (3): libxl: Receive migration data in a thread libxl: start domain paused on migration dst libxl: destroy domain in migration finish phase on failure
src/libxl/libxl_migration.c | 75 ++++++++++++++++++++++++++++++--------------- 1 file changed, 51 insertions(+), 24 deletions(-)
ACK series, but see my comment to 1/3. Michal
participants (2)
-
Jim Fehlig
-
Michal Privoznik