[libvirt] [PATCH V2 0/4] libxl migration improvements

V2 of https://www.redhat.com/archives/libvir-list/2014-November/msg00423.html Patch 1 has been fixed based on Michal's feedback. Patch 2 is new in V2. I mistakenly dropped it when reordering the patches for the V1 submission. Patches 3 and 4 also have some minor adjustments from V1. See individual patches for details. Jim Fehlig (4): libxl: Receive migration data in a thread libxl: acquire job in migration finish phase libxl: start domain paused on migration dst libxl: destroy domain in migration finish phase on failure src/libxl/libxl_driver.c | 17 +++++--- src/libxl/libxl_migration.c | 103 +++++++++++++++++++++++++++++++++----------- 2 files changed, 90 insertions(+), 30 deletions(-) -- 1.8.4.5

The libxl driver receives migration data within an IO callback invoked by the event loop, effectively disabling the event loop while migration occurs. This patch moves receving of the migration data to a thread. The incoming connection is still accepted in the IO callback, but control is immediately returned to the event loop after spawning the thread. Signed-off-by: Jim Fehlig <jfehlig@suse.com> --- V2: - Address issues noted by mprivozn in V1 review src/libxl/libxl_migration.c | 77 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 59 insertions(+), 18 deletions(-) diff --git a/src/libxl/libxl_migration.c b/src/libxl/libxl_migration.c index 0b562f7..c728fa2 100644 --- a/src/libxl/libxl_migration.c +++ b/src/libxl/libxl_migration.c @@ -35,6 +35,7 @@ #include "vircommand.h" #include "virstring.h" #include "virobject.h" +#include "virthread.h" #include "rpc/virnetsocket.h" #include "libxl_domain.h" #include "libxl_driver.h" @@ -48,6 +49,7 @@ VIR_LOG_INIT("libxl.libxl_migration"); typedef struct _libxlMigrationDstArgs { virObject parent; + int recvfd; virConnectPtr conn; virDomainObjPtr vm; unsigned int flags; @@ -82,39 +84,78 @@ libxlMigrationDstArgsOnceInit(void) VIR_ONCE_GLOBAL_INIT(libxlMigrationDstArgs) static void -libxlDoMigrateReceive(virNetSocketPtr sock, - int events ATTRIBUTE_UNUSED, - void *opaque) +libxlDoMigrateReceive(void *opaque) { libxlMigrationDstArgs *args = opaque; - virConnectPtr conn = args->conn; virDomainObjPtr vm = args->vm; virNetSocketPtr *socks = args->socks; size_t nsocks = args->nsocks; bool paused = args->flags & VIR_MIGRATE_PAUSED; - libxlDriverPrivatePtr driver = conn->privateData; + libxlDriverPrivatePtr driver = args->conn->privateData; + int recvfd = args->recvfd; + size_t i; + int ret; + + virObjectLock(vm); + ret = libxlDomainStart(driver, vm, paused, recvfd); + virObjectUnlock(vm); + + if (ret < 0 && !vm->persistent) + virDomainObjListRemove(driver->domains, vm); + + /* Remove all listen socks from event handler, and close them. */ + for (i = 0; i < nsocks; i++) { + virNetSocketUpdateIOCallback(socks[i], 0); + virNetSocketRemoveIOCallback(socks[i]); + virNetSocketClose(socks[i]); + virObjectUnref(socks[i]); + socks[i] = NULL; + } + args->nsocks = 0; + VIR_FORCE_CLOSE(recvfd); +} + + +static void +libxlMigrateReceive(virNetSocketPtr sock, + int events ATTRIBUTE_UNUSED, + void *opaque) +{ + libxlMigrationDstArgs *args = opaque; + virNetSocketPtr *socks = args->socks; + size_t nsocks = args->nsocks; virNetSocketPtr client_sock; int recvfd = -1; + virThread thread; size_t i; - int ret; - if (virNetSocketAccept(sock, &client_sock) < 0) { - virReportError(VIR_ERR_OPERATION_INVALID, "%s", - _("Fail to accept migration connection")); - goto cleanup; + /* Accept migration connection */ + virNetSocketAccept(sock, &client_sock); + if (client_sock == NULL) { + virReportError(VIR_ERR_OPERATION_FAILED, "%s", + _("Failed to accept migration connection")); + goto fail; } - VIR_DEBUG("Accepted migration connection\n"); + VIR_DEBUG("Accepted migration connection." + " Spawing thread to process migration data"); recvfd = virNetSocketDupFD(client_sock, true); virObjectUnref(client_sock); - virObjectLock(vm); - ret = libxlDomainStart(driver, vm, paused, recvfd); - virObjectUnlock(vm); + /* + * Avoid blocking the event loop. Start a thread to receive + * the migration data + */ + args->recvfd = recvfd; + if (virThreadCreate(&thread, false, + libxlDoMigrateReceive, args) < 0) { + virReportError(VIR_ERR_OPERATION_FAILED, "%s", + _("Failed to create thread for receiving migration data")); + goto fail; + } - if (ret < 0 && !vm->persistent) - virDomainObjListRemove(driver->domains, vm); + return; - cleanup: + fail: /* Remove all listen socks from event handler, and close them. */ for (i = 0; i < nsocks; i++) { virNetSocketUpdateIOCallback(socks[i], 0); @@ -376,7 +417,7 @@ libxlDomainMigrationPrepare(virConnectPtr dconn, if (virNetSocketAddIOCallback(socks[i], VIR_EVENT_HANDLE_READABLE, - libxlDoMigrateReceive, + libxlMigrateReceive, args, virObjectFreeCallback) < 0) continue; -- 1.8.4.5

Moving data reception of the perform phase of migration to a thread introduces a race with the finish phase, where checking if the domain is active races with the thread finishing the perform phase. The race is easily solved by acquiring a job in the finish phase, which must wait for the perform phase job to complete. While wrapping the finish phase in a job, noticed the virDomainObj was being unlocked in a callee - libxlDomainMigrationFinish. Move the unlocking to libxlDomainMigrateFinish3Params, where the lock is acquired. Signed-off-by: Jim Fehlig <jfehlig@suse.com> --- V2: - Unlock virDomainObj in libxlDomainMigrateFinish3Params instead of callee src/libxl/libxl_driver.c | 20 +++++++++++++++++--- src/libxl/libxl_migration.c | 1 - 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/src/libxl/libxl_driver.c b/src/libxl/libxl_driver.c index d2c077c..651d4eb 100644 --- a/src/libxl/libxl_driver.c +++ b/src/libxl/libxl_driver.c @@ -4654,6 +4654,7 @@ libxlDomainMigrateFinish3Params(virConnectPtr dconn, libxlDriverPrivatePtr driver = dconn->privateData; virDomainObjPtr vm = NULL; const char *dname = NULL; + virDomainPtr ret = NULL; #ifdef LIBXL_HAVE_NO_SUSPEND_RESUME virReportUnsupportedError(); @@ -4684,16 +4685,29 @@ libxlDomainMigrateFinish3Params(virConnectPtr dconn, return NULL; } + if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0) { + virObjectUnlock(vm); + return NULL; + } + if (!virDomainObjIsActive(vm)) { /* Migration failed if domain is inactive */ virReportError(VIR_ERR_OPERATION_FAILED, "%s", _("Migration failed. Domain is not running " "on destination host")); - virObjectUnlock(vm); - return NULL; + goto endjob; } - return libxlDomainMigrationFinish(dconn, vm, flags, cancelled); + ret = libxlDomainMigrationFinish(dconn, vm, flags, cancelled); + + endjob: + if (!libxlDomainObjEndJob(driver, vm)) + vm = NULL; + + if (vm) + virObjectUnlock(vm); + + return ret; } static int diff --git a/src/libxl/libxl_migration.c b/src/libxl/libxl_migration.c index c728fa2..fa80a0c 100644 --- a/src/libxl/libxl_migration.c +++ b/src/libxl/libxl_migration.c @@ -573,7 +573,6 @@ libxlDomainMigrationFinish(virConnectPtr dconn, cleanup: if (event) libxlDomainEventQueue(driver, event); - virObjectUnlock(vm); virObjectUnref(cfg); return dom; } -- 1.8.4.5

During the perform phase of migration, the domain is started on the dst host in a running state if VIR_MIGRATE_PAUSED flag is not specified. In the finish phase, the domain is also unpaused if VIR_MIGRATE_PAUSED flag is unset. I've noticed this second unpause fails if the domain was already unpaused following the perform phase. This patch changes the perform phase to always start the domain paused, and defers unpausing, if requested, to the finish phase. Unpausing should occur in the finish phase anyhow, where the domain can be properly destroyed if the perform phase fails and migration is cancelled. Signed-off-by: Jim Fehlig <jfehlig@suse.com> --- src/libxl/libxl_migration.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/libxl/libxl_migration.c b/src/libxl/libxl_migration.c index fa80a0c..6f83b08 100644 --- a/src/libxl/libxl_migration.c +++ b/src/libxl/libxl_migration.c @@ -90,14 +90,17 @@ libxlDoMigrateReceive(void *opaque) virDomainObjPtr vm = args->vm; virNetSocketPtr *socks = args->socks; size_t nsocks = args->nsocks; - bool paused = args->flags & VIR_MIGRATE_PAUSED; libxlDriverPrivatePtr driver = args->conn->privateData; int recvfd = args->recvfd; size_t i; int ret; + /* + * Always start the domain paused. If needed, unpause in the + * finish phase, after transfer of the domain is complete. + */ virObjectLock(vm); - ret = libxlDomainStart(driver, vm, paused, recvfd); + ret = libxlDomainStart(driver, vm, true, recvfd); virObjectUnlock(vm); if (ret < 0 && !vm->persistent) -- 1.8.4.5

This patch contains three domain cleanup improvements in the migration finish phase, ensuring a domain is properly disposed when a failure is detected or the migration is cancelled. The check for virDomainObjIsActive is moved to libxlDomainMigrationFinish, where cleanup can occur if migration failed and the domain is inactive. The 'cleanup' label was missplaced in libxlDomainMigrationFinish, causing a migrated domain to remain in the event of an error or cancelled migration. In cleanup, the domain was not removed from the driver's list of domains. Signed-off-by: Jim Fehlig <jfehlig@suse.com> --- Changes in V2: - Move virDomainObjIsActive to libxlDomainMigrationFinish - On error or migration cancel, remove domain from driver's list of domains src/libxl/libxl_driver.c | 9 --------- src/libxl/libxl_migration.c | 20 ++++++++++++++++---- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/src/libxl/libxl_driver.c b/src/libxl/libxl_driver.c index 651d4eb..d01ff6e 100644 --- a/src/libxl/libxl_driver.c +++ b/src/libxl/libxl_driver.c @@ -4690,17 +4690,8 @@ libxlDomainMigrateFinish3Params(virConnectPtr dconn, return NULL; } - if (!virDomainObjIsActive(vm)) { - /* Migration failed if domain is inactive */ - virReportError(VIR_ERR_OPERATION_FAILED, - "%s", _("Migration failed. Domain is not running " - "on destination host")); - goto endjob; - } - ret = libxlDomainMigrationFinish(dconn, vm, flags, cancelled); - endjob: if (!libxlDomainObjEndJob(driver, vm)) vm = NULL; diff --git a/src/libxl/libxl_migration.c b/src/libxl/libxl_migration.c index 6f83b08..0caa3d0 100644 --- a/src/libxl/libxl_migration.c +++ b/src/libxl/libxl_migration.c @@ -537,6 +537,16 @@ libxlDomainMigrationFinish(virConnectPtr dconn, if (cancelled) goto cleanup; + /* Check if domain is alive */ + if (!virDomainObjIsActive(vm)) { + /* Migration failed if domain is inactive */ + virReportError(VIR_ERR_OPERATION_FAILED, + "%s", _("Migration failed. Domain is not running " + "on destination host")); + goto cleanup; + } + + /* Unpause if requested */ if (!(flags & VIR_MIGRATE_PAUSED)) { if (libxl_domain_unpause(priv->ctx, vm->def->id) != 0) { virReportError(VIR_ERR_OPERATION_FAILED, "%s", @@ -556,24 +566,26 @@ libxlDomainMigrationFinish(virConnectPtr dconn, VIR_DOMAIN_EVENT_SUSPENDED_PAUSED); } - if (virDomainSaveStatus(driver->xmlopt, cfg->stateDir, vm) < 0) - goto cleanup; - if (event) { libxlDomainEventQueue(driver, event); event = NULL; } + if (virDomainSaveStatus(driver->xmlopt, cfg->stateDir, vm) < 0) + goto cleanup; + dom = virGetDomain(dconn, vm->def->name, vm->def->uuid); + cleanup: if (dom == NULL) { libxl_domain_destroy(priv->ctx, vm->def->id, NULL); libxlDomainCleanup(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED); event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_STOPPED, VIR_DOMAIN_EVENT_STOPPED_FAILED); + if (!vm->persistent) + virDomainObjListRemove(driver->domains, vm); } - cleanup: if (event) libxlDomainEventQueue(driver, event); virObjectUnref(cfg); -- 1.8.4.5

On 14.11.2014 05:33, Jim Fehlig wrote:
V2 of
https://www.redhat.com/archives/libvir-list/2014-November/msg00423.html
Patch 1 has been fixed based on Michal's feedback. Patch 2 is new in V2. I mistakenly dropped it when reordering the patches for the V1 submission. Patches 3 and 4 also have some minor adjustments from V1. See individual patches for details.
Jim Fehlig (4): libxl: Receive migration data in a thread libxl: acquire job in migration finish phase libxl: start domain paused on migration dst libxl: destroy domain in migration finish phase on failure
src/libxl/libxl_driver.c | 17 +++++--- src/libxl/libxl_migration.c | 103 +++++++++++++++++++++++++++++++++----------- 2 files changed, 90 insertions(+), 30 deletions(-)
ACK series. Michal

On 11/19/2014 06:56 AM, Michal Privoznik wrote:
On 14.11.2014 05:33, Jim Fehlig wrote:
V2 of
https://www.redhat.com/archives/libvir-list/2014-November/msg00423.html
Patch 1 has been fixed based on Michal's feedback. Patch 2 is new in V2. I mistakenly dropped it when reordering the patches for the V1 submission. Patches 3 and 4 also have some minor adjustments from V1. See individual patches for details.
Jim Fehlig (4): libxl: Receive migration data in a thread libxl: acquire job in migration finish phase libxl: start domain paused on migration dst libxl: destroy domain in migration finish phase on failure
src/libxl/libxl_driver.c | 17 +++++--- src/libxl/libxl_migration.c | 103 +++++++++++++++++++++++++++++++++----------- 2 files changed, 90 insertions(+), 30 deletions(-)
ACK series.
Thanks, Pushed now. Regards, Jim
participants (2)
-
Jim Fehlig
-
Michal Privoznik