[libvirt] [PATCH 0/5] libxl: various migration V3 improvements

Patch 5 fixes a long standing problem found by some very slow hosts in xen's osstest https://lists.xenproject.org/archives/html/xen-devel/2018-08/msg01945.html While working on the fix, I discovered other problems in libxl's V3 migration protocol. E.g. a modify job on the migrating VM was not handled properly across the phases on either src or dst host. Patches 1-4 fix this and other problems found along the way. Jim Fehlig (5): libxl: migration: defer removing VM until finish phase libxl: fix logic in P2P migration libxl: fix job handling across migration phases on src libxl: fix job handling across migration phases on dst libxl: join with thread receiving migration data src/libxl/libxl_domain.h | 1 + src/libxl/libxl_driver.c | 7 -- src/libxl/libxl_migration.c | 168 ++++++++++++++++++++++++------------ 3 files changed, 114 insertions(+), 62 deletions(-) -- 2.18.0

If for any reason the restore of a VM fails on the destination host in a migration operation, the VM is removed (if not persistent) from the virDomainObjList, meaning it is no longer available for additional cleanup or processing in the finish phase. Defer removing the VM from the virDomainObjList until the finish phase, which already contains logic to remove the VM. Signed-off-by: Jim Fehlig <jfehlig@suse.com> --- src/libxl/libxl_migration.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/libxl/libxl_migration.c b/src/libxl/libxl_migration.c index b2e5847c58..97f72d0390 100644 --- a/src/libxl/libxl_migration.c +++ b/src/libxl/libxl_migration.c @@ -264,7 +264,6 @@ libxlDoMigrateDstReceive(void *opaque) libxlDriverPrivatePtr driver = args->conn->privateData; int recvfd = args->recvfd; size_t i; - int ret; virObjectRef(vm); virObjectLock(vm); @@ -274,12 +273,10 @@ libxlDoMigrateDstReceive(void *opaque) /* * Always start the domain paused. If needed, unpause in the * finish phase, after transfer of the domain is complete. + * Errors and cleanup are also handled in the finish phase. */ - ret = libxlDomainStartRestore(driver, vm, true, recvfd, - args->migcookie->xenMigStreamVer); - - if (ret < 0 && !vm->persistent) - virDomainObjListRemove(driver->domains, vm); + libxlDomainStartRestore(driver, vm, true, recvfd, + args->migcookie->xenMigStreamVer); /* Remove all listen socks from event handler, and close them. */ for (i = 0; i < nsocks; i++) { -- 2.18.0

libxlDoMigrateSrcP2P() performs all phases of the migration protocol for peer-to-peer migration. Unfortunately the logic was a bit flawed since it is possible to skip the confirm phase after a successfull begin and prepare phase. Fix the logic to always call the confirm phase after a successful begin and perform. Skip the confirm phase if begin or perform fail. Signed-off-by: Jim Fehlig <jfehlig@suse.com> --- src/libxl/libxl_migration.c | 48 ++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/src/libxl/libxl_migration.c b/src/libxl/libxl_migration.c index 97f72d0390..e4f2895690 100644 --- a/src/libxl/libxl_migration.c +++ b/src/libxl/libxl_migration.c @@ -972,30 +972,35 @@ libxlDoMigrateSrcP2P(libxlDriverPrivatePtr driver, char *cookieout = NULL; int cookieoutlen; bool cancelled = true; + bool notify_source = true; virErrorPtr orig_err = NULL; int ret = -1; /* For tunnel migration */ virStreamPtr st = NULL; struct libxlTunnelControl *tc = NULL; + if (dname && + virTypedParamsAddString(¶ms, &nparams, &maxparams, + VIR_MIGRATE_PARAM_DEST_NAME, dname) < 0) + goto cleanup; + + if (uri && + virTypedParamsAddString(¶ms, &nparams, &maxparams, + VIR_MIGRATE_PARAM_URI, uri) < 0) + goto cleanup; + dom_xml = libxlDomainMigrationSrcBegin(sconn, vm, xmlin, &cookieout, &cookieoutlen); + /* + * If dom_xml is non-NULL the begin phase has succeeded, and the + * confirm phase must be called to cleanup the migration operation. + */ if (!dom_xml) goto cleanup; if (virTypedParamsAddString(¶ms, &nparams, &maxparams, VIR_MIGRATE_PARAM_DEST_XML, dom_xml) < 0) - goto cleanup; - - if (dname && - virTypedParamsAddString(¶ms, &nparams, &maxparams, - VIR_MIGRATE_PARAM_DEST_NAME, dname) < 0) - goto cleanup; - - if (uri && - virTypedParamsAddString(¶ms, &nparams, &maxparams, - VIR_MIGRATE_PARAM_URI, uri) < 0) - goto cleanup; + goto confirm; /* We don't require the destination to have P2P support * as it looks to be normal migration from the receiver perpective. @@ -1006,7 +1011,7 @@ libxlDoMigrateSrcP2P(libxlDriverPrivatePtr driver, virObjectUnlock(vm); if (flags & VIR_MIGRATE_TUNNELLED) { if (!(st = virStreamNew(dconn, 0))) - goto cleanup; + goto confirm; ret = dconn->driver->domainMigratePrepareTunnel3Params (dconn, st, params, nparams, cookieout, cookieoutlen, NULL, NULL, destflags); } else { @@ -1016,7 +1021,7 @@ libxlDoMigrateSrcP2P(libxlDriverPrivatePtr driver, virObjectLock(vm); if (ret == -1) - goto cleanup; + goto confirm; if (!(flags & VIR_MIGRATE_TUNNELLED)) { if (uri_out) { @@ -1038,8 +1043,10 @@ libxlDoMigrateSrcP2P(libxlDriverPrivatePtr driver, else ret = libxlDomainMigrationSrcPerform(driver, vm, NULL, NULL, uri_out, NULL, flags); - if (ret < 0) + if (ret < 0) { + notify_source = false; orig_err = virSaveLastError(); + } cancelled = (ret < 0); @@ -1067,12 +1074,15 @@ libxlDoMigrateSrcP2P(libxlDriverPrivatePtr driver, if (!orig_err) orig_err = virSaveLastError(); - VIR_DEBUG("Confirm3 cancelled=%d vm=%p", cancelled, vm); - ret = libxlDomainMigrationSrcConfirm(driver, vm, flags, cancelled); + confirm: + if (notify_source) { + VIR_DEBUG("Confirm3 cancelled=%d vm=%p", cancelled, vm); + ret = libxlDomainMigrationSrcConfirm(driver, vm, flags, cancelled); - if (ret < 0) - VIR_WARN("Guest %s probably left in 'paused' state on source", - vm->def->name); + if (ret < 0) + VIR_WARN("Guest %s probably left in 'paused' state on source", + vm->def->name); + } cleanup: if (flags & VIR_MIGRATE_TUNNELLED) { -- 2.18.0

The libxlDomainMigrationSrc* functions are a bit flawed in their handling of modify jobs. A job begins at the start of the begin phase but ends before the phase completes. No job is running for the remaining phases of migration on the source host. Change the logic to keep the job running after a successful begin phase, and end the job in the confirm phase. The job must also end in the perform phase in the case of error since confirm phase would not be executed. Signed-off-by: Jim Fehlig <jfehlig@suse.com> --- src/libxl/libxl_migration.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/src/libxl/libxl_migration.c b/src/libxl/libxl_migration.c index e4f2895690..191973edeb 100644 --- a/src/libxl/libxl_migration.c +++ b/src/libxl/libxl_migration.c @@ -399,6 +399,11 @@ libxlDomainMigrationSrcBegin(virConnectPtr conn, virDomainDefPtr def; char *xml = NULL; + /* + * In the case of successful migration, a job is started here and + * terminated in the confirm phase. Errors in the begin or perform + * phase will also terminate the job. + */ if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0) goto cleanup; @@ -428,6 +433,9 @@ libxlDomainMigrationSrcBegin(virConnectPtr conn, goto endjob; xml = virDomainDefFormat(def, cfg->caps, VIR_DOMAIN_DEF_FORMAT_SECURE); + /* Valid xml means success! EndJob in the confirm phase */ + if (xml) + goto cleanup; endjob: libxlDomainObjEndJob(driver, vm); @@ -1169,6 +1177,14 @@ libxlDomainMigrationSrcPerformP2P(libxlDriverPrivatePtr driver, ret = libxlDoMigrateSrcP2P(driver, vm, sconn, xmlin, dconn, dconnuri, dname, uri_str, flags); + if (ret < 0) { + /* + * Confirm phase will not be executed if perform fails. End the + * job started in begin phase. + */ + libxlDomainObjEndJob(driver, vm); + } + cleanup: orig_err = virSaveLastError(); virObjectUnlock(vm); @@ -1232,11 +1248,17 @@ libxlDomainMigrationSrcPerform(libxlDriverPrivatePtr driver, ret = libxlDoMigrateSrcSend(driver, vm, flags, sockfd); virObjectLock(vm); - if (ret < 0) + if (ret < 0) { virDomainLockProcessResume(driver->lockManager, "xen:///system", vm, priv->lockState); + /* + * Confirm phase will not be executed if perform fails. End the + * job started in begin phase. + */ + libxlDomainObjEndJob(driver, vm); + } cleanup: VIR_FORCE_CLOSE(sockfd); @@ -1386,6 +1408,8 @@ libxlDomainMigrationSrcConfirm(libxlDriverPrivatePtr driver, ret = 0; cleanup: + /* EndJob for corresponding BeginJob in begin phase */ + libxlDomainObjEndJob(driver, vm); virObjectEventStateQueue(driver->domainEventState, event); virObjectUnref(cfg); return ret; -- 2.18.0

The libxlDomainMigrationDst* functions are a bit flawed in their handling of modify jobs. A job begins when the destination host begins receiving the incoming VM and ends after the VM is started. The finish phase contains another BeginJob/EndJob sequence. This patch changes the logic to begin a job for the incoming VM in the prepare phase and end the job in the finish phase. Signed-off-by: Jim Fehlig <jfehlig@suse.com> --- src/libxl/libxl_driver.c | 7 ---- src/libxl/libxl_migration.c | 65 +++++++++++++++++++++++-------------- 2 files changed, 40 insertions(+), 32 deletions(-) diff --git a/src/libxl/libxl_driver.c b/src/libxl/libxl_driver.c index 5a5e792957..73c2ff3546 100644 --- a/src/libxl/libxl_driver.c +++ b/src/libxl/libxl_driver.c @@ -6020,15 +6020,8 @@ libxlDomainMigrateFinish3Params(virConnectPtr dconn, return NULL; } - if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0) { - virDomainObjEndAPI(&vm); - return NULL; - } - ret = libxlDomainMigrationDstFinish(dconn, vm, flags, cancelled); - libxlDomainObjEndJob(driver, vm); - virDomainObjEndAPI(&vm); return ret; diff --git a/src/libxl/libxl_migration.c b/src/libxl/libxl_migration.c index 191973edeb..54b01a3169 100644 --- a/src/libxl/libxl_migration.c +++ b/src/libxl/libxl_migration.c @@ -266,9 +266,6 @@ libxlDoMigrateDstReceive(void *opaque) size_t i; virObjectRef(vm); - virObjectLock(vm); - if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0) - goto cleanup; /* * Always start the domain paused. If needed, unpause in the @@ -288,10 +285,6 @@ libxlDoMigrateDstReceive(void *opaque) args->nsocks = 0; VIR_FORCE_CLOSE(recvfd); virObjectUnref(args); - - libxlDomainObjEndJob(driver, vm); - - cleanup: virDomainObjEndAPI(&vm); } @@ -583,6 +576,13 @@ libxlDomainMigrationDstPrepareTunnel3(virConnectPtr dconn, goto error; *def = NULL; + /* + * Unless an error is encountered in this function, the job will + * be terminated in the finish phase. + */ + if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0) + goto error; + priv = vm->privateData; if (taint_hook) { @@ -595,18 +595,18 @@ libxlDomainMigrationDstPrepareTunnel3(virConnectPtr dconn, * stream -> pipe -> recvfd of libxlDomainStartRestore */ if (pipe(dataFD) < 0) - goto error; + goto endjob; /* Stream data will be written to pipeIn */ if (virFDStreamOpen(st, dataFD[1]) < 0) - goto error; + goto endjob; dataFD[1] = -1; /* 'st' owns the FD now & will close it */ if (libxlMigrationDstArgsInitialize() < 0) - goto error; + goto endjob; if (!(args = virObjectNew(libxlMigrationDstArgsClass))) - goto error; + goto endjob; args->conn = virObjectRef(dconn); args->vm = virObjectRef(vm); @@ -620,12 +620,15 @@ libxlDomainMigrationDstPrepareTunnel3(virConnectPtr dconn, if (virThreadCreate(&thread, false, libxlDoMigrateDstReceive, args) < 0) { virReportError(VIR_ERR_OPERATION_FAILED, "%s", _("Failed to create thread for receiving migration data")); - goto error; + goto endjob; } ret = 0; goto done; + endjob: + libxlDomainObjEndJob(driver, vm); + error: libxlMigrationCookieFree(mig); VIR_FORCE_CLOSE(dataFD[1]); @@ -679,6 +682,13 @@ libxlDomainMigrationDstPrepare(virConnectPtr dconn, goto error; *def = NULL; + /* + * Unless an error is encountered in this function, the job will + * be terminated in the finish phase. + */ + if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0) + goto error; + priv = vm->privateData; if (taint_hook) { @@ -689,27 +699,27 @@ libxlDomainMigrationDstPrepare(virConnectPtr dconn, /* Create socket connection to receive migration data */ if (!uri_in) { if ((hostname = virGetHostname()) == NULL) - goto error; + goto endjob; if (STRPREFIX(hostname, "localhost")) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("hostname on destination resolved to localhost," " but migration requires an FQDN")); - goto error; + goto endjob; } if (virPortAllocatorAcquire(driver->migrationPorts, &port) < 0) - goto error; + goto endjob; priv->migrationPort = port; if (virAsprintf(uri_out, "tcp://%s:%d", hostname, port) < 0) - goto error; + goto endjob; } else { if (!(STRPREFIX(uri_in, "tcp://"))) { /* not full URI, add prefix tcp:// */ char *tmp; if (virAsprintf(&tmp, "tcp://%s", uri_in) < 0) - goto error; + goto endjob; uri = virURIParse(tmp); VIR_FREE(tmp); } else { @@ -720,20 +730,20 @@ libxlDomainMigrationDstPrepare(virConnectPtr dconn, virReportError(VIR_ERR_INVALID_ARG, _("unable to parse URI: %s"), uri_in); - goto error; + goto endjob; } if (uri->server == NULL) { virReportError(VIR_ERR_INVALID_ARG, _("missing host in migration URI: %s"), uri_in); - goto error; + goto endjob; } hostname = uri->server; if (uri->port == 0) { if (virPortAllocatorAcquire(driver->migrationPorts, &port) < 0) - goto error; + goto endjob; priv->migrationPort = port; } else { @@ -741,7 +751,7 @@ libxlDomainMigrationDstPrepare(virConnectPtr dconn, } if (virAsprintf(uri_out, "tcp://%s:%d", hostname, port) < 0) - goto error; + goto endjob; } snprintf(portstr, sizeof(portstr), "%d", port); @@ -751,14 +761,14 @@ libxlDomainMigrationDstPrepare(virConnectPtr dconn, &socks, &nsocks) < 0) { virReportError(VIR_ERR_OPERATION_FAILED, "%s", _("Fail to create socket for incoming migration")); - goto error; + goto endjob; } if (libxlMigrationDstArgsInitialize() < 0) - goto error; + goto endjob; if (!(args = virObjectNew(libxlMigrationDstArgsClass))) - goto error; + goto endjob; args->conn = virObjectRef(dconn); args->vm = virObjectRef(vm); @@ -786,11 +796,14 @@ libxlDomainMigrationDstPrepare(virConnectPtr dconn, } if (!nsocks_listen) - goto error; + goto endjob; ret = 0; goto done; + endjob: + libxlDomainObjEndJob(driver, vm); + error: for (i = 0; i < nsocks; i++) { virNetSocketClose(socks[i]); @@ -1354,6 +1367,8 @@ libxlDomainMigrationDstFinish(virConnectPtr dconn, virDomainObjListRemove(driver->domains, vm); } + /* EndJob for corresponding BeginJob in prepare phase */ + libxlDomainObjEndJob(driver, vm); virObjectEventStateQueue(driver->domainEventState, event); virObjectUnref(cfg); return dom; -- 2.18.0

It is possible the incoming VM is not fully started when the finish phase of migration is executed. In libxlDomainMigrationDstFinish, wait for the thread receiving the VM to complete before executing finish phase tasks. Signed-off-by: Jim Fehlig <jfehlig@suse.com> --- src/libxl/libxl_domain.h | 1 + src/libxl/libxl_migration.c | 20 ++++++++++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/src/libxl/libxl_domain.h b/src/libxl/libxl_domain.h index 5d83230cd6..e193881450 100644 --- a/src/libxl/libxl_domain.h +++ b/src/libxl/libxl_domain.h @@ -65,6 +65,7 @@ struct _libxlDomainObjPrivate { /* console */ virChrdevsPtr devs; libxl_evgen_domain_death *deathW; + virThreadPtr migrationDstReceiveThr; unsigned short migrationPort; char *lockState; diff --git a/src/libxl/libxl_migration.c b/src/libxl/libxl_migration.c index 54b01a3169..fc7ccb53d0 100644 --- a/src/libxl/libxl_migration.c +++ b/src/libxl/libxl_migration.c @@ -297,9 +297,9 @@ libxlMigrateDstReceive(virNetSocketPtr sock, libxlMigrationDstArgs *args = opaque; virNetSocketPtr *socks = args->socks; size_t nsocks = args->nsocks; + libxlDomainObjPrivatePtr priv = args->vm->privateData; virNetSocketPtr client_sock; int recvfd = -1; - virThread thread; size_t i; /* Accept migration connection */ @@ -318,7 +318,10 @@ libxlMigrateDstReceive(virNetSocketPtr sock, * the migration data */ args->recvfd = recvfd; - if (virThreadCreate(&thread, false, + VIR_FREE(priv->migrationDstReceiveThr); + if (VIR_ALLOC(priv->migrationDstReceiveThr) < 0) + goto fail; + if (virThreadCreate(priv->migrationDstReceiveThr, true, libxlDoMigrateDstReceive, args) < 0) { virReportError(VIR_ERR_OPERATION_FAILED, "%s", _("Failed to create thread for receiving migration data")); @@ -557,7 +560,6 @@ libxlDomainMigrationDstPrepareTunnel3(virConnectPtr dconn, libxlDriverPrivatePtr driver = dconn->privateData; virDomainObjPtr vm = NULL; libxlMigrationDstArgs *args = NULL; - virThread thread; bool taint_hook = false; libxlDomainObjPrivatePtr priv = NULL; char *xmlout = NULL; @@ -617,7 +619,10 @@ libxlDomainMigrationDstPrepareTunnel3(virConnectPtr dconn, args->nsocks = 0; mig = NULL; - if (virThreadCreate(&thread, false, libxlDoMigrateDstReceive, args) < 0) { + VIR_FREE(priv->migrationDstReceiveThr); + if (VIR_ALLOC(priv->migrationDstReceiveThr) < 0) + goto error; + if (virThreadCreate(priv->migrationDstReceiveThr, true, libxlDoMigrateDstReceive, args) < 0) { virReportError(VIR_ERR_OPERATION_FAILED, "%s", _("Failed to create thread for receiving migration data")); goto endjob; @@ -1291,6 +1296,13 @@ libxlDomainMigrationDstFinish(virConnectPtr dconn, virObjectEventPtr event = NULL; virDomainPtr dom = NULL; + if (priv->migrationDstReceiveThr) { + virObjectUnlock(vm); + virThreadJoin(priv->migrationDstReceiveThr); + virObjectLock(vm); + VIR_FREE(priv->migrationDstReceiveThr); + } + virPortAllocatorRelease(priv->migrationPort); priv->migrationPort = 0; -- 2.18.0

On 09/05/2018 11:27 PM, Jim Fehlig wrote:
Patch 5 fixes a long standing problem found by some very slow hosts in xen's osstest
https://lists.xenproject.org/archives/html/xen-devel/2018-08/msg01945.html
While working on the fix, I discovered other problems in libxl's V3 migration protocol. E.g. a modify job on the migrating VM was not handled properly across the phases on either src or dst host. Patches 1-4 fix this and other problems found along the way.
Jim Fehlig (5): libxl: migration: defer removing VM until finish phase libxl: fix logic in P2P migration libxl: fix job handling across migration phases on src libxl: fix job handling across migration phases on dst libxl: join with thread receiving migration data
src/libxl/libxl_domain.h | 1 + src/libxl/libxl_driver.c | 7 -- src/libxl/libxl_migration.c | 168 ++++++++++++++++++++++++------------ 3 files changed, 114 insertions(+), 62 deletions(-)
ACK series. Michal
participants (2)
-
Jim Fehlig
-
Michal Privoznik