[libvirt] [PATCH 0/3] libxl: domain destroy fixes

This small series of patches fixes some issues wrt domain destroy in the libxl driver. The primary motivation for this work is to prevent locking the virDomainObj during long running destroy operations on large memory domains. Patch 1 moves job acquisition from libxlDomainStart to it's callers so they have more control over when the job is acquired. Patch 2 fixes a few spots where we never acquired a job during domain destroy. Patch 3 contains the interesting change, where the virDomainObj is unlocked during the long-running destroy operation. This series wraps up my work to improve parallel OpenStack Tempest runs against the libxl driver. With libvirt.git master + this series + a patched libxl [1], I've successfully run a reproducer that was hitting the same issues encountered by Tempest. [1] libxl commits from xen.git: 93699882d, f1335f0d, 4783c99a, 1c91d6fba, and 188e9c54. I'll contact the stable branch maintainers and ask them to include these commits in the next Xen 4.4.x and 4.5.x releases. Jim Fehlig (3): libxl: Move job acquisition in libxlDomainStart to callers libxl: acquire a job when destroying a domain libxl: drop virDomainObj lock when destroying a domain src/libxl/libxl_domain.c | 77 +++++++++++++++++++---------------------------- src/libxl/libxl_domain.h | 4 --- src/libxl/libxl_driver.c | 78 ++++++++++++++++++++++++++++++++++++------------ 3 files changed, 89 insertions(+), 70 deletions(-) -- 1.8.4.5

Let callers of libxlDomainStart decide when it is appropriate to acquire a job on the associated virDomainObj. Signed-off-by: Jim Fehlig <jfehlig@suse.com> --- src/libxl/libxl_domain.c | 24 ++++++++-------------- src/libxl/libxl_driver.c | 53 +++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 52 insertions(+), 25 deletions(-) diff --git a/src/libxl/libxl_domain.c b/src/libxl/libxl_domain.c index 8feb4dc..eca1ff0 100644 --- a/src/libxl/libxl_domain.c +++ b/src/libxl/libxl_domain.c @@ -907,16 +907,13 @@ libxlDomainStart(libxlDriverPrivatePtr driver, virDomainObjPtr vm, libxl_domain_config_init(&d_config); - if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0) - return ret; - cfg = libxlDriverConfigGet(driver); /* If there is a managed saved state restore it instead of starting * from scratch. The old state is removed once the restoring succeeded. */ if (restore_fd < 0) { managed_save_path = libxlDomainManagedSavePath(driver, vm); if (managed_save_path == NULL) - goto endjob; + goto cleanup; if (virFileExists(managed_save_path)) { @@ -924,7 +921,7 @@ libxlDomainStart(libxlDriverPrivatePtr driver, virDomainObjPtr vm, managed_save_path, &def, &hdr); if (managed_save_fd < 0) - goto endjob; + goto cleanup; restore_fd = managed_save_fd; @@ -938,7 +935,7 @@ libxlDomainStart(libxlDriverPrivatePtr driver, virDomainObjPtr vm, _("cannot restore domain '%s' uuid %s from a file" " which belongs to domain '%s' uuid %s"), vm->def->name, vm_uuidstr, def->name, def_uuidstr); - goto endjob; + goto cleanup; } virDomainObjAssignDef(vm, def, true, NULL); @@ -955,18 +952,18 @@ libxlDomainStart(libxlDriverPrivatePtr driver, virDomainObjPtr vm, if (libxlBuildDomainConfig(driver->reservedVNCPorts, vm->def, cfg->ctx, &d_config) < 0) - goto endjob; + goto cleanup; if (cfg->autoballoon && libxlDomainFreeMem(cfg->ctx, &d_config) < 0) { virReportError(VIR_ERR_INTERNAL_ERROR, _("libxenlight failed to get free memory for domain '%s'"), d_config.c_info.name); - goto endjob; + goto cleanup; } if (virHostdevPrepareDomainDevices(hostdev_mgr, LIBXL_DRIVER_NAME, vm->def, VIR_HOSTDEV_SP_PCI) < 0) - goto endjob; + goto cleanup; /* Unlock virDomainObj while creating the domain */ virObjectUnlock(vm); @@ -998,7 +995,7 @@ libxlDomainStart(libxlDriverPrivatePtr driver, virDomainObjPtr vm, virReportError(VIR_ERR_INTERNAL_ERROR, _("libxenlight failed to restore domain '%s'"), d_config.c_info.name); - goto endjob; + goto cleanup; } /* @@ -1045,7 +1042,7 @@ libxlDomainStart(libxlDriverPrivatePtr driver, virDomainObjPtr vm, libxlDomainEventQueue(driver, event); ret = 0; - goto endjob; + goto cleanup; cleanup_dom: if (priv->deathW) { @@ -1056,10 +1053,7 @@ libxlDomainStart(libxlDriverPrivatePtr driver, virDomainObjPtr vm, vm->def->id = -1; virDomainObjSetState(vm, VIR_DOMAIN_SHUTOFF, VIR_DOMAIN_SHUTOFF_FAILED); - endjob: - if (!libxlDomainObjEndJob(driver, vm)) - vm = NULL; - + cleanup: libxl_domain_config_dispose(&d_config); VIR_FREE(dom_xml); VIR_FREE(managed_save_path); diff --git a/src/libxl/libxl_driver.c b/src/libxl/libxl_driver.c index 05f6eb1..da4c1c7 100644 --- a/src/libxl/libxl_driver.c +++ b/src/libxl/libxl_driver.c @@ -303,18 +303,26 @@ libxlAutostartDomain(virDomainObjPtr vm, virObjectLock(vm); virResetLastError(); + if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0) { + virObjectUnlock(vm); + return ret; + } + if (vm->autostart && !virDomainObjIsActive(vm) && libxlDomainStart(driver, vm, false, -1) < 0) { err = virGetLastError(); VIR_ERROR(_("Failed to autostart VM '%s': %s"), vm->def->name, err ? err->message : _("unknown error")); - goto cleanup; + goto endjob; } ret = 0; - cleanup: - virObjectUnlock(vm); + + endjob: + if (libxlDomainObjEndJob(driver, vm)) + virObjectUnlock(vm); + return ret; } @@ -885,17 +893,26 @@ libxlDomainCreateXML(virConnectPtr conn, const char *xml, goto cleanup; def = NULL; + if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0) { + virDomainObjListRemove(driver->domains, vm); + vm = NULL; + goto cleanup; + } + if (libxlDomainStart(driver, vm, (flags & VIR_DOMAIN_START_PAUSED) != 0, -1) < 0) { virDomainObjListRemove(driver->domains, vm); - vm = NULL; - goto cleanup; + goto endjob; } dom = virGetDomain(conn, vm->def->name, vm->def->uuid); if (dom) dom->id = vm->def->id; + endjob: + if (!libxlDomainObjEndJob(driver, vm)) + vm = NULL; + cleanup: virDomainDefFree(def); if (vm) @@ -1681,7 +1698,7 @@ libxlDomainRestoreFlags(virConnectPtr conn, const char *from, fd = libxlDomainSaveImageOpen(driver, cfg, from, &def, &hdr); if (fd < 0) - return -1; + goto cleanup; if (virDomainRestoreFlagsEnsureACL(conn, def) < 0) goto cleanup; @@ -1695,11 +1712,20 @@ libxlDomainRestoreFlags(virConnectPtr conn, const char *from, def = NULL; + if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0) { + if (!vm->persistent) { + virDomainObjListRemove(driver->domains, vm); + vm = NULL; + } + goto cleanup; + } + ret = libxlDomainStart(driver, vm, (flags & VIR_DOMAIN_SAVE_PAUSED) != 0, fd); - if (ret < 0 && !vm->persistent) { + if (ret < 0 && !vm->persistent) virDomainObjListRemove(driver->domains, vm); + + if (!libxlDomainObjEndJob(driver, vm)) vm = NULL; - } cleanup: if (VIR_CLOSE(fd) < 0) @@ -2567,17 +2593,24 @@ libxlDomainCreateWithFlags(virDomainPtr dom, if (virDomainCreateWithFlagsEnsureACL(dom->conn, vm->def) < 0) goto cleanup; + if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0) + goto cleanup; + if (virDomainObjIsActive(vm)) { virReportError(VIR_ERR_OPERATION_INVALID, "%s", _("Domain is already running")); - goto cleanup; + goto endjob; } ret = libxlDomainStart(driver, vm, (flags & VIR_DOMAIN_START_PAUSED) != 0, -1); if (ret < 0) - goto cleanup; + goto endjob; dom->id = vm->def->id; + endjob: + if (!libxlDomainObjEndJob(driver, vm)) + vm = NULL; + cleanup: if (vm) virObjectUnlock(vm); -- 1.8.4.5

On Wed, Mar 25, 2015 at 02:08:34PM -0600, Jim Fehlig wrote:
Let callers of libxlDomainStart decide when it is appropriate to acquire a job on the associated virDomainObj.
This makes sense, I see many bugs this fixes, but how come libxlDomainShutdownThread(), libxlDomainRestoreFlags() and libxlDoMigrateReceive() don't need to start the job when they all call libxlDomainStart()?
Signed-off-by: Jim Fehlig <jfehlig@suse.com> --- src/libxl/libxl_domain.c | 24 ++++++++-------------- src/libxl/libxl_driver.c | 53 +++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 52 insertions(+), 25 deletions(-)
diff --git a/src/libxl/libxl_driver.c b/src/libxl/libxl_driver.c index 05f6eb1..da4c1c7 100644 --- a/src/libxl/libxl_driver.c +++ b/src/libxl/libxl_driver.c @@ -885,17 +893,26 @@ libxlDomainCreateXML(virConnectPtr conn, const char *xml, goto cleanup; def = NULL;
+ if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0) { + virDomainObjListRemove(driver->domains, vm); + vm = NULL; + goto cleanup; + } +
This should be acquired before virDomainObjListAdd() since you need to check whether it's active after creating the job. If I'm wrong, then virDomainObjListRemove() should be only called if the vm is not persistent (as CreateXML can be called on persistent ones as well), shouldn't it? [...]
@@ -1695,11 +1712,20 @@ libxlDomainRestoreFlags(virConnectPtr conn, const char *from,
def = NULL;
+ if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0) { + if (!vm->persistent) { + virDomainObjListRemove(driver->domains, vm); + vm = NULL; + } + goto cleanup; + } +
Same here, I guess.
ret = libxlDomainStart(driver, vm, (flags & VIR_DOMAIN_SAVE_PAUSED) != 0, fd); - if (ret < 0 && !vm->persistent) { + if (ret < 0 && !vm->persistent) virDomainObjListRemove(driver->domains, vm); + + if (!libxlDomainObjEndJob(driver, vm)) vm = NULL; - }
cleanup: if (VIR_CLOSE(fd) < 0) @@ -2567,17 +2593,24 @@ libxlDomainCreateWithFlags(virDomainPtr dom, if (virDomainCreateWithFlagsEnsureACL(dom->conn, vm->def) < 0) goto cleanup;
+ if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0) + goto cleanup; + if (virDomainObjIsActive(vm)) { virReportError(VIR_ERR_OPERATION_INVALID, "%s", _("Domain is already running")); - goto cleanup; + goto endjob; }
ret = libxlDomainStart(driver, vm, (flags & VIR_DOMAIN_START_PAUSED) != 0, -1); if (ret < 0) - goto cleanup; + goto endjob; dom->id = vm->def->id;
+ endjob: + if (!libxlDomainObjEndJob(driver, vm)) + vm = NULL; + cleanup: if (vm) virObjectUnlock(vm); -- 1.8.4.5
-- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list

Martin Kletzander wrote:
On Wed, Mar 25, 2015 at 02:08:34PM -0600, Jim Fehlig wrote:
Let callers of libxlDomainStart decide when it is appropriate to acquire a job on the associated virDomainObj.
This makes sense, I see many bugs this fixes, but how come libxlDomainShutdownThread(), libxlDomainRestoreFlags() and libxlDoMigrateReceive() don't need to start the job when they all call libxlDomainStart()?
Commit 0521d658 starts a job for libxlDomainShutdownThread. This patch adds starting a job in libxlDomainRestoreFlags. For migration, I'll need to work on a follow-up series that fixes job handling in general.
Signed-off-by: Jim Fehlig <jfehlig@suse.com> --- src/libxl/libxl_domain.c | 24 ++++++++-------------- src/libxl/libxl_driver.c | 53 +++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 52 insertions(+), 25 deletions(-)
diff --git a/src/libxl/libxl_driver.c b/src/libxl/libxl_driver.c index 05f6eb1..da4c1c7 100644 --- a/src/libxl/libxl_driver.c +++ b/src/libxl/libxl_driver.c @@ -885,17 +893,26 @@ libxlDomainCreateXML(virConnectPtr conn, const char *xml, goto cleanup; def = NULL;
+ if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0) { + virDomainObjListRemove(driver->domains, vm); + vm = NULL; + goto cleanup; + } +
This should be acquired before virDomainObjListAdd() since you need to check whether it's active after creating the job.
Acquiring the job requires a virDomainObj, which we get from the call to virDomainObjListAdd().
If I'm wrong, then virDomainObjListRemove() should be only called if the vm is not persistent (as CreateXML can be called on persistent ones as well), shouldn't it?
Yes, I think you are right. This was coded up assuming CreateXML only handled transient domains.
[...]
@@ -1695,11 +1712,20 @@ libxlDomainRestoreFlags(virConnectPtr conn, const char *from,
def = NULL;
+ if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0) { + if (!vm->persistent) { + virDomainObjListRemove(driver->domains, vm); + vm = NULL; + } + goto cleanup; + } +
Same here, I guess.
Yes :-). A virDomainObj is needed to acquire a job. Thanks for the review. I'll fix calling virDomainObjListRemove() on persistent domains in V2. Regards, Jim

A job should be acquired at the beginning of a domain destroy operation, not at the end when cleaning up the domain. Fix two occurances of this late job acquisition in the libxl driver. Doing so renders libxlDomainCleanup unused, so it is removed. Signed-off-by: Jim Fehlig <jfehlig@suse.com> --- src/libxl/libxl_domain.c | 49 +++++++++++++++++------------------------------- src/libxl/libxl_domain.h | 4 ---- src/libxl/libxl_driver.c | 20 ++++++++++++-------- 3 files changed, 29 insertions(+), 44 deletions(-) diff --git a/src/libxl/libxl_domain.c b/src/libxl/libxl_domain.c index eca1ff0..e240eae 100644 --- a/src/libxl/libxl_domain.c +++ b/src/libxl/libxl_domain.c @@ -371,6 +371,9 @@ libxlDomainShutdownThread(void *opaque) cfg = libxlDriverConfigGet(driver); + if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0) + goto cleanup; + if (xl_reason == LIBXL_SHUTDOWN_REASON_POWEROFF) { dom_event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_STOPPED, @@ -384,7 +387,7 @@ libxlDomainShutdownThread(void *opaque) goto restart; case VIR_DOMAIN_LIFECYCLE_PRESERVE: case VIR_DOMAIN_LIFECYCLE_LAST: - goto cleanup; + goto endjob; } } else if (xl_reason == LIBXL_SHUTDOWN_REASON_CRASH) { dom_event = virDomainEventLifecycleNewFromObj(vm, @@ -399,7 +402,7 @@ libxlDomainShutdownThread(void *opaque) goto restart; case VIR_DOMAIN_LIFECYCLE_CRASH_PRESERVE: case VIR_DOMAIN_LIFECYCLE_CRASH_LAST: - goto cleanup; + goto endjob; case VIR_DOMAIN_LIFECYCLE_CRASH_COREDUMP_DESTROY: libxlDomainAutoCoreDump(driver, vm); goto destroy; @@ -420,11 +423,11 @@ libxlDomainShutdownThread(void *opaque) goto restart; case VIR_DOMAIN_LIFECYCLE_PRESERVE: case VIR_DOMAIN_LIFECYCLE_LAST: - goto cleanup; + goto endjob; } } else { VIR_INFO("Unhandled shutdown_reason %d", xl_reason); - goto cleanup; + goto endjob; } destroy: @@ -433,13 +436,11 @@ libxlDomainShutdownThread(void *opaque) dom_event = NULL; } libxl_domain_destroy(cfg->ctx, vm->def->id, NULL); - if (libxlDomainCleanupJob(driver, vm, reason)) { - if (!vm->persistent) { - virDomainObjListRemove(driver->domains, vm); - vm = NULL; - } - } - goto cleanup; + libxlDomainCleanup(driver, vm, reason); + if (!vm->persistent) + virDomainObjListRemove(driver->domains, vm); + + goto endjob; restart: if (dom_event) { @@ -447,13 +448,17 @@ libxlDomainShutdownThread(void *opaque) dom_event = NULL; } libxl_domain_destroy(cfg->ctx, vm->def->id, NULL); - libxlDomainCleanupJob(driver, vm, VIR_DOMAIN_SHUTOFF_SHUTDOWN); + libxlDomainCleanup(driver, vm, VIR_DOMAIN_SHUTOFF_SHUTDOWN); if (libxlDomainStart(driver, vm, false, -1) < 0) { virErrorPtr err = virGetLastError(); VIR_ERROR(_("Failed to restart VM '%s': %s"), vm->def->name, err ? err->message : _("unknown error")); } + endjob: + if (!libxlDomainObjEndJob(driver, vm)) + vm = NULL; + cleanup: if (vm) virObjectUnlock(vm); @@ -690,26 +695,6 @@ libxlDomainCleanup(libxlDriverPrivatePtr driver, } /* - * Cleanup function for domain that has reached shutoff state. - * Executed in the context of a job. - * - * virDomainObjPtr should be locked on invocation - * Returns true if references remain on virDomainObjPtr, false otherwise. - */ -bool -libxlDomainCleanupJob(libxlDriverPrivatePtr driver, - virDomainObjPtr vm, - virDomainShutoffReason reason) -{ - if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_DESTROY) < 0) - return true; - - libxlDomainCleanup(driver, vm, reason); - - return libxlDomainObjEndJob(driver, vm); -} - -/* * Core dump domain to default dump path. * * virDomainObjPtr must be locked on invocation diff --git a/src/libxl/libxl_domain.h b/src/libxl/libxl_domain.h index a032e46..30855a2 100644 --- a/src/libxl/libxl_domain.h +++ b/src/libxl/libxl_domain.h @@ -108,10 +108,6 @@ libxlDomainCleanup(libxlDriverPrivatePtr driver, virDomainObjPtr vm, virDomainShutoffReason reason); -bool -libxlDomainCleanupJob(libxlDriverPrivatePtr driver, - virDomainObjPtr vm, - virDomainShutoffReason reason); /* * Note: Xen 4.3 removed the const from the event handler signature. * Detect which signature to use based on diff --git a/src/libxl/libxl_driver.c b/src/libxl/libxl_driver.c index da4c1c7..a1977c2 100644 --- a/src/libxl/libxl_driver.c +++ b/src/libxl/libxl_driver.c @@ -1238,10 +1238,13 @@ libxlDomainDestroyFlags(virDomainPtr dom, if (virDomainDestroyFlagsEnsureACL(dom->conn, vm->def) < 0) goto cleanup; + if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0) + goto cleanup; + if (!virDomainObjIsActive(vm)) { virReportError(VIR_ERR_OPERATION_INVALID, "%s", _("Domain is not running")); - goto cleanup; + goto endjob; } event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_STOPPED, @@ -1250,18 +1253,19 @@ libxlDomainDestroyFlags(virDomainPtr dom, if (libxl_domain_destroy(cfg->ctx, vm->def->id, NULL) < 0) { virReportError(VIR_ERR_INTERNAL_ERROR, _("Failed to destroy domain '%d'"), vm->def->id); - goto cleanup; + goto endjob; } - if (libxlDomainCleanupJob(driver, vm, VIR_DOMAIN_SHUTOFF_DESTROYED)) { - if (!vm->persistent) { - virDomainObjListRemove(driver->domains, vm); - vm = NULL; - } - } + libxlDomainCleanup(driver, vm, VIR_DOMAIN_SHUTOFF_DESTROYED); + if (!vm->persistent) + virDomainObjListRemove(driver->domains, vm); ret = 0; + endjob: + if (!libxlDomainObjEndJob(driver, vm)) + vm = NULL; + cleanup: if (vm) virObjectUnlock(vm); -- 1.8.4.5

On Wed, Mar 25, 2015 at 02:08:35PM -0600, Jim Fehlig wrote:
A job should be acquired at the beginning of a domain destroy operation, not at the end when cleaning up the domain. Fix two occurances of this late job acquisition in the libxl driver. Doing so renders libxlDomainCleanup unused, so it is removed.
Signed-off-by: Jim Fehlig <jfehlig@suse.com> --- src/libxl/libxl_domain.c | 49 +++++++++++++++++------------------------------- src/libxl/libxl_domain.h | 4 ---- src/libxl/libxl_driver.c | 20 ++++++++++++-------- 3 files changed, 29 insertions(+), 44 deletions(-)
diff --git a/src/libxl/libxl_domain.c b/src/libxl/libxl_domain.c index eca1ff0..e240eae 100644 --- a/src/libxl/libxl_domain.c +++ b/src/libxl/libxl_domain.c @@ -371,6 +371,9 @@ libxlDomainShutdownThread(void *opaque)
cfg = libxlDriverConfigGet(driver);
+ if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0) + goto cleanup; +
Won't this a deadlock with 'libxlDomainAutoCoreDump' (line 410) (which also calls : 727 if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0) 728 goto cleanup; well, not deadlock - but spin up to 30 seconds? and then give up? Perhaps this patch should be folded in?
From 9f2bac0c28815fc51850290c4b1962881691bfca Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Date: Wed, 25 Mar 2015 22:34:51 -0400 Subject: [PATCH] squash
--- src/libxl/libxl_domain.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/libxl/libxl_domain.c b/src/libxl/libxl_domain.c index aef0157..774b070 100644 --- a/src/libxl/libxl_domain.c +++ b/src/libxl/libxl_domain.c @@ -723,15 +723,11 @@ libxlDomainAutoCoreDump(libxlDriverPrivatePtr driver, timestr) < 0) goto cleanup; - if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0) - goto cleanup; - /* Unlock virDomainObj while dumping core */ virObjectUnlock(vm); libxl_domain_core_dump(cfg->ctx, vm->def->id, dumpfile, NULL); virObjectLock(vm); - ignore_value(libxlDomainObjEndJob(driver, vm)); ret = 0; cleanup: -- 1.8.4.2
if (xl_reason == LIBXL_SHUTDOWN_REASON_POWEROFF) { dom_event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_STOPPED, @@ -384,7 +387,7 @@ libxlDomainShutdownThread(void *opaque) goto restart; case VIR_DOMAIN_LIFECYCLE_PRESERVE: case VIR_DOMAIN_LIFECYCLE_LAST: - goto cleanup; + goto endjob; } } else if (xl_reason == LIBXL_SHUTDOWN_REASON_CRASH) { dom_event = virDomainEventLifecycleNewFromObj(vm, @@ -399,7 +402,7 @@ libxlDomainShutdownThread(void *opaque) goto restart; case VIR_DOMAIN_LIFECYCLE_CRASH_PRESERVE: case VIR_DOMAIN_LIFECYCLE_CRASH_LAST: - goto cleanup; + goto endjob; case VIR_DOMAIN_LIFECYCLE_CRASH_COREDUMP_DESTROY: libxlDomainAutoCoreDump(driver, vm); goto destroy; @@ -420,11 +423,11 @@ libxlDomainShutdownThread(void *opaque) goto restart; case VIR_DOMAIN_LIFECYCLE_PRESERVE: case VIR_DOMAIN_LIFECYCLE_LAST: - goto cleanup; + goto endjob; } } else { VIR_INFO("Unhandled shutdown_reason %d", xl_reason); - goto cleanup; + goto endjob; }
destroy: @@ -433,13 +436,11 @@ libxlDomainShutdownThread(void *opaque) dom_event = NULL; } libxl_domain_destroy(cfg->ctx, vm->def->id, NULL); - if (libxlDomainCleanupJob(driver, vm, reason)) { - if (!vm->persistent) { - virDomainObjListRemove(driver->domains, vm); - vm = NULL; - } - } - goto cleanup; + libxlDomainCleanup(driver, vm, reason); + if (!vm->persistent) + virDomainObjListRemove(driver->domains, vm); + + goto endjob;
restart: if (dom_event) { @@ -447,13 +448,17 @@ libxlDomainShutdownThread(void *opaque) dom_event = NULL; } libxl_domain_destroy(cfg->ctx, vm->def->id, NULL); - libxlDomainCleanupJob(driver, vm, VIR_DOMAIN_SHUTOFF_SHUTDOWN); + libxlDomainCleanup(driver, vm, VIR_DOMAIN_SHUTOFF_SHUTDOWN); if (libxlDomainStart(driver, vm, false, -1) < 0) { virErrorPtr err = virGetLastError(); VIR_ERROR(_("Failed to restart VM '%s': %s"), vm->def->name, err ? err->message : _("unknown error")); }
+ endjob: + if (!libxlDomainObjEndJob(driver, vm)) + vm = NULL; + cleanup: if (vm) virObjectUnlock(vm); @@ -690,26 +695,6 @@ libxlDomainCleanup(libxlDriverPrivatePtr driver, }
/* - * Cleanup function for domain that has reached shutoff state. - * Executed in the context of a job. - * - * virDomainObjPtr should be locked on invocation - * Returns true if references remain on virDomainObjPtr, false otherwise. - */ -bool -libxlDomainCleanupJob(libxlDriverPrivatePtr driver, - virDomainObjPtr vm, - virDomainShutoffReason reason) -{ - if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_DESTROY) < 0) - return true; - - libxlDomainCleanup(driver, vm, reason); - - return libxlDomainObjEndJob(driver, vm); -} - -/* * Core dump domain to default dump path. * * virDomainObjPtr must be locked on invocation diff --git a/src/libxl/libxl_domain.h b/src/libxl/libxl_domain.h index a032e46..30855a2 100644 --- a/src/libxl/libxl_domain.h +++ b/src/libxl/libxl_domain.h @@ -108,10 +108,6 @@ libxlDomainCleanup(libxlDriverPrivatePtr driver, virDomainObjPtr vm, virDomainShutoffReason reason);
-bool -libxlDomainCleanupJob(libxlDriverPrivatePtr driver, - virDomainObjPtr vm, - virDomainShutoffReason reason); /* * Note: Xen 4.3 removed the const from the event handler signature. * Detect which signature to use based on diff --git a/src/libxl/libxl_driver.c b/src/libxl/libxl_driver.c index da4c1c7..a1977c2 100644 --- a/src/libxl/libxl_driver.c +++ b/src/libxl/libxl_driver.c @@ -1238,10 +1238,13 @@ libxlDomainDestroyFlags(virDomainPtr dom, if (virDomainDestroyFlagsEnsureACL(dom->conn, vm->def) < 0) goto cleanup;
+ if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0) + goto cleanup; + if (!virDomainObjIsActive(vm)) { virReportError(VIR_ERR_OPERATION_INVALID, "%s", _("Domain is not running")); - goto cleanup; + goto endjob; }
event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_STOPPED, @@ -1250,18 +1253,19 @@ libxlDomainDestroyFlags(virDomainPtr dom, if (libxl_domain_destroy(cfg->ctx, vm->def->id, NULL) < 0) { virReportError(VIR_ERR_INTERNAL_ERROR, _("Failed to destroy domain '%d'"), vm->def->id); - goto cleanup; + goto endjob; }
- if (libxlDomainCleanupJob(driver, vm, VIR_DOMAIN_SHUTOFF_DESTROYED)) { - if (!vm->persistent) { - virDomainObjListRemove(driver->domains, vm); - vm = NULL; - } - } + libxlDomainCleanup(driver, vm, VIR_DOMAIN_SHUTOFF_DESTROYED); + if (!vm->persistent) + virDomainObjListRemove(driver->domains, vm);
ret = 0;
+ endjob: + if (!libxlDomainObjEndJob(driver, vm)) + vm = NULL; + cleanup: if (vm) virObjectUnlock(vm); -- 1.8.4.5
_______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel

Konrad Rzeszutek Wilk wrote:
On Wed, Mar 25, 2015 at 02:08:35PM -0600, Jim Fehlig wrote:
A job should be acquired at the beginning of a domain destroy operation, not at the end when cleaning up the domain. Fix two occurances of this late job acquisition in the libxl driver. Doing so renders libxlDomainCleanup unused, so it is removed.
Just noticed this should be libxlDomainCleanupJob.
Signed-off-by: Jim Fehlig <jfehlig@suse.com> --- src/libxl/libxl_domain.c | 49 +++++++++++++++++------------------------------- src/libxl/libxl_domain.h | 4 ---- src/libxl/libxl_driver.c | 20 ++++++++++++-------- 3 files changed, 29 insertions(+), 44 deletions(-)
diff --git a/src/libxl/libxl_domain.c b/src/libxl/libxl_domain.c index eca1ff0..e240eae 100644 --- a/src/libxl/libxl_domain.c +++ b/src/libxl/libxl_domain.c @@ -371,6 +371,9 @@ libxlDomainShutdownThread(void *opaque)
cfg = libxlDriverConfigGet(driver);
+ if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0) + goto cleanup; +
Won't this a deadlock with 'libxlDomainAutoCoreDump' (line 410) (which also calls :
727 if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0) 728 goto cleanup;
well, not deadlock - but spin up to 30 seconds? and then give up?
Yes, another good catch!
Perhaps this patch should be folded in?
From 9f2bac0c28815fc51850290c4b1962881691bfca Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Date: Wed, 25 Mar 2015 22:34:51 -0400 Subject: [PATCH] squash
--- src/libxl/libxl_domain.c | 4 ---- 1 file changed, 4 deletions(-)
diff --git a/src/libxl/libxl_domain.c b/src/libxl/libxl_domain.c index aef0157..774b070 100644 --- a/src/libxl/libxl_domain.c +++ b/src/libxl/libxl_domain.c @@ -723,15 +723,11 @@ libxlDomainAutoCoreDump(libxlDriverPrivatePtr driver, timestr) < 0) goto cleanup;
- if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0) - goto cleanup; - /* Unlock virDomainObj while dumping core */ virObjectUnlock(vm); libxl_domain_core_dump(cfg->ctx, vm->def->id, dumpfile, NULL); virObjectLock(vm);
- ignore_value(libxlDomainObjEndJob(driver, vm));
I've squashed this in my branch and fixed the commit message typo. Regards, Jim

On Thu, Mar 26, 2015 at 03:29:51PM -0600, Jim Fehlig wrote:
Konrad Rzeszutek Wilk wrote:
On Wed, Mar 25, 2015 at 02:08:35PM -0600, Jim Fehlig wrote:
A job should be acquired at the beginning of a domain destroy operation, not at the end when cleaning up the domain. Fix two occurances of this
And s/occurances/occurrences/ here. It looks fine, though, with the squash-in. Also, if you want to have a look at some other things that might be fixed here, plus some speed-up gained, have a look at my commit 540c339a, that does some similar things in the QEMU driver.
late job acquisition in the libxl driver. Doing so renders libxlDomainCleanup unused, so it is removed.
Just noticed this should be libxlDomainCleanupJob.
Yes :)

Martin Kletzander wrote:
On Thu, Mar 26, 2015 at 03:29:51PM -0600, Jim Fehlig wrote:
Konrad Rzeszutek Wilk wrote:
On Wed, Mar 25, 2015 at 02:08:35PM -0600, Jim Fehlig wrote:
A job should be acquired at the beginning of a domain destroy operation, not at the end when cleaning up the domain. Fix two occurances of this
And s/occurances/occurrences/ here.
It looks fine, though, with the squash-in.
Thanks. I've sent a V2 addressing comments from you and Konrad https://www.redhat.com/archives/libvir-list/2015-April/msg00072.html
Also, if you want to have a look at some other things that might be fixed here, plus some speed-up gained, have a look at my commit 540c339a, that does some similar things in the QEMU driver.
Ah, that is nice work! I recall seeing the series and thinking it was pertinent to the libxl driver, but then forgot to take a closer look. Thanks for the reminder! I'll work on a similar improvement in the libxl driver. But IMO that should not hold up this series, which is a big improvement in itself. Regards, Jim

A destroy operation can take considerable time on large memory domains due to scrubbing the domain' memory. The operation is running in the context of a job, so unlocking the domain and allowing query operations is safe. Signed-off-by: Jim Fehlig <jfehlig@suse.com> --- src/libxl/libxl_domain.c | 4 ++++ src/libxl/libxl_driver.c | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/libxl/libxl_domain.c b/src/libxl/libxl_domain.c index e240eae..aef0157 100644 --- a/src/libxl/libxl_domain.c +++ b/src/libxl/libxl_domain.c @@ -435,7 +435,9 @@ libxlDomainShutdownThread(void *opaque) libxlDomainEventQueue(driver, dom_event); dom_event = NULL; } + virObjectUnlock(vm); libxl_domain_destroy(cfg->ctx, vm->def->id, NULL); + virObjectLock(vm); libxlDomainCleanup(driver, vm, reason); if (!vm->persistent) virDomainObjListRemove(driver->domains, vm); @@ -447,7 +449,9 @@ libxlDomainShutdownThread(void *opaque) libxlDomainEventQueue(driver, dom_event); dom_event = NULL; } + virObjectUnlock(vm); libxl_domain_destroy(cfg->ctx, vm->def->id, NULL); + virObjectLock(vm); libxlDomainCleanup(driver, vm, VIR_DOMAIN_SHUTOFF_SHUTDOWN); if (libxlDomainStart(driver, vm, false, -1) < 0) { virErrorPtr err = virGetLastError(); diff --git a/src/libxl/libxl_driver.c b/src/libxl/libxl_driver.c index a1977c2..21e20bb 100644 --- a/src/libxl/libxl_driver.c +++ b/src/libxl/libxl_driver.c @@ -1250,7 +1250,10 @@ libxlDomainDestroyFlags(virDomainPtr dom, event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_STOPPED, VIR_DOMAIN_EVENT_STOPPED_DESTROYED); - if (libxl_domain_destroy(cfg->ctx, vm->def->id, NULL) < 0) { + virObjectUnlock(vm); + ret = libxl_domain_destroy(cfg->ctx, vm->def->id, NULL); + virObjectLock(vm); + if (ret < 0) { virReportError(VIR_ERR_INTERNAL_ERROR, _("Failed to destroy domain '%d'"), vm->def->id); goto endjob; -- 1.8.4.5

On Wed, 2015-03-25 at 14:08 -0600, Jim Fehlig wrote:
A destroy operation can take considerable time on large memory domains due to scrubbing the domain' memory. The operation is running in the context of a job, so unlocking the domain and allowing query operations is safe.
Signed-off-by: Jim Fehlig <jfehlig@suse.com>
I don't really know enough about the libvirt job/obj locking to comment on the previous patches or that aspect of this one, but in general the idea of dropping the lock before calling libxl_destroy seems reasonable to me. In principal you could use the async stuff (the final NULL to the call), but you'd still be wanting to drop the lock for that period, so it's not clear it's any better from your PoV. Ian.
--- src/libxl/libxl_domain.c | 4 ++++ src/libxl/libxl_driver.c | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-)
diff --git a/src/libxl/libxl_domain.c b/src/libxl/libxl_domain.c index e240eae..aef0157 100644 --- a/src/libxl/libxl_domain.c +++ b/src/libxl/libxl_domain.c @@ -435,7 +435,9 @@ libxlDomainShutdownThread(void *opaque) libxlDomainEventQueue(driver, dom_event); dom_event = NULL; } + virObjectUnlock(vm); libxl_domain_destroy(cfg->ctx, vm->def->id, NULL); + virObjectLock(vm); libxlDomainCleanup(driver, vm, reason); if (!vm->persistent) virDomainObjListRemove(driver->domains, vm); @@ -447,7 +449,9 @@ libxlDomainShutdownThread(void *opaque) libxlDomainEventQueue(driver, dom_event); dom_event = NULL; } + virObjectUnlock(vm); libxl_domain_destroy(cfg->ctx, vm->def->id, NULL); + virObjectLock(vm); libxlDomainCleanup(driver, vm, VIR_DOMAIN_SHUTOFF_SHUTDOWN); if (libxlDomainStart(driver, vm, false, -1) < 0) { virErrorPtr err = virGetLastError(); diff --git a/src/libxl/libxl_driver.c b/src/libxl/libxl_driver.c index a1977c2..21e20bb 100644 --- a/src/libxl/libxl_driver.c +++ b/src/libxl/libxl_driver.c @@ -1250,7 +1250,10 @@ libxlDomainDestroyFlags(virDomainPtr dom, event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_STOPPED, VIR_DOMAIN_EVENT_STOPPED_DESTROYED);
- if (libxl_domain_destroy(cfg->ctx, vm->def->id, NULL) < 0) { + virObjectUnlock(vm); + ret = libxl_domain_destroy(cfg->ctx, vm->def->id, NULL); + virObjectLock(vm); + if (ret < 0) { virReportError(VIR_ERR_INTERNAL_ERROR, _("Failed to destroy domain '%d'"), vm->def->id); goto endjob;

Ian Campbell wrote:
On Wed, 2015-03-25 at 14:08 -0600, Jim Fehlig wrote:
A destroy operation can take considerable time on large memory domains due to scrubbing the domain' memory. The operation is running in the context of a job, so unlocking the domain and allowing query operations is safe.
Signed-off-by: Jim Fehlig <jfehlig@suse.com>
I don't really know enough about the libvirt job/obj locking to comment on the previous patches or that aspect of this one, but in general the idea of dropping the lock before calling libxl_destroy seems reasonable to me.
In principal you could use the async stuff (the final NULL to the call), but you'd still be wanting to drop the lock for that period, so it's not clear it's any better from your PoV.
I tried the async approach as well, but in the end decided it was no better. In case you're interested, I pushed the async variant to libxl-dom-destroy-async branch in my github fork https://github.com/jfehlig/libvirt Regards, Jim

On Wed, Mar 25, 2015 at 02:08:33PM -0600, Jim Fehlig wrote:
This small series of patches fixes some issues wrt domain destroy in the libxl driver. The primary motivation for this work is to prevent locking the virDomainObj during long running destroy operations on large memory domains.
Patch 1 moves job acquisition from libxlDomainStart to it's callers so they have more control over when the job is acquired. Patch 2 fixes a few spots where we never acquired a job during domain destroy. Patch 3 contains the interesting change, where the virDomainObj is unlocked during the long-running destroy operation.
This series wraps up my work to improve parallel OpenStack Tempest runs against the libxl driver. With libvirt.git master + this series + a patched libxl [1], I've successfully run a reproducer that was hitting the same issues encountered by Tempest.
[1] libxl commits from xen.git: 93699882d, f1335f0d, 4783c99a, 1c91d6fba, and 188e9c54. I'll contact the stable branch maintainers and ask them to include these commits in the next Xen 4.4.x and 4.5.x releases.
Jim Fehlig (3): libxl: Move job acquisition in libxlDomainStart to callers libxl: acquire a job when destroying a domain libxl: drop virDomainObj lock when destroying a domain
I am no expert at this- but I dug through the code to understand how the job and locking is done and now I am more comfortable with it. Since I am new to this I went through all of the the callsites (which used the job now) from the driver to make sure that there are no chained calls (one function calling another which also uses a mutex or job locking). I only found one culprit (libxlDomainAutoCoreDump being called from libxlDomainShutdownThread). Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
src/libxl/libxl_domain.c | 77 +++++++++++++++++++---------------------------- src/libxl/libxl_domain.h | 4 --- src/libxl/libxl_driver.c | 78 ++++++++++++++++++++++++++++++++++++------------ 3 files changed, 89 insertions(+), 70 deletions(-)
-- 1.8.4.5
_______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel

Konrad Rzeszutek Wilk wrote:
On Wed, Mar 25, 2015 at 02:08:33PM -0600, Jim Fehlig wrote:
This small series of patches fixes some issues wrt domain destroy in the libxl driver. The primary motivation for this work is to prevent locking the virDomainObj during long running destroy operations on large memory domains.
Patch 1 moves job acquisition from libxlDomainStart to it's callers so they have more control over when the job is acquired. Patch 2 fixes a few spots where we never acquired a job during domain destroy. Patch 3 contains the interesting change, where the virDomainObj is unlocked during the long-running destroy operation.
This series wraps up my work to improve parallel OpenStack Tempest runs against the libxl driver. With libvirt.git master + this series + a patched libxl [1], I've successfully run a reproducer that was hitting the same issues encountered by Tempest.
[1] libxl commits from xen.git: 93699882d, f1335f0d, 4783c99a, 1c91d6fba, and 188e9c54. I'll contact the stable branch maintainers and ask them to include these commits in the next Xen 4.4.x and 4.5.x releases.
Jim Fehlig (3): libxl: Move job acquisition in libxlDomainStart to callers libxl: acquire a job when destroying a domain libxl: drop virDomainObj lock when destroying a domain
I am no expert at this- but I dug through the code to understand how the job and locking is done and now I am more comfortable with it.
Since I am new to this I went through all of the the callsites (which used the job now) from the driver to make sure that there are no chained calls (one function calling another which also uses a mutex or job locking).
I only found one culprit (libxlDomainAutoCoreDump being called from libxlDomainShutdownThread).
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Thanks for taking time to review the series and familiarize yourself with the libvirt libxl code! As mentioned, I squashed in your libxlDomainAutoCoreDump fix in 2/3. Do any other libvirt devs have time for a quick review? I'd like to push this series, and the dom0 ballooning fix, for 1.2.14 if there are no objections. The latter was reviewed by Stefano Stabellini before the freeze https://www.redhat.com/archives/libvir-list/2015-March/msg01248.html Regards, Jim

Jim Fehlig wrote:
Konrad Rzeszutek Wilk wrote:
On Wed, Mar 25, 2015 at 02:08:33PM -0600, Jim Fehlig wrote:
This small series of patches fixes some issues wrt domain destroy in the libxl driver. The primary motivation for this work is to prevent locking the virDomainObj during long running destroy operations on large memory domains.
Patch 1 moves job acquisition from libxlDomainStart to it's callers so they have more control over when the job is acquired. Patch 2 fixes a few spots where we never acquired a job during domain destroy. Patch 3 contains the interesting change, where the virDomainObj is unlocked during the long-running destroy operation.
This series wraps up my work to improve parallel OpenStack Tempest runs against the libxl driver. With libvirt.git master + this series + a patched libxl [1], I've successfully run a reproducer that was hitting the same issues encountered by Tempest.
[1] libxl commits from xen.git: 93699882d, f1335f0d, 4783c99a, 1c91d6fba, and 188e9c54. I'll contact the stable branch maintainers and ask them to include these commits in the next Xen 4.4.x and 4.5.x releases.
Jim Fehlig (3): libxl: Move job acquisition in libxlDomainStart to callers libxl: acquire a job when destroying a domain libxl: drop virDomainObj lock when destroying a domain
I am no expert at this- but I dug through the code to understand how the job and locking is done and now I am more comfortable with it.
Since I am new to this I went through all of the the callsites (which used the job now) from the driver to make sure that there are no chained calls (one function calling another which also uses a mutex or job locking).
I only found one culprit (libxlDomainAutoCoreDump being called from libxlDomainShutdownThread).
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Thanks for taking time to review the series and familiarize yourself with the libvirt libxl code! As mentioned, I squashed in your libxlDomainAutoCoreDump fix in 2/3.
Do any other libvirt devs have time for a quick review? I'd like to push this series, and the dom0 ballooning fix, for 1.2.14 if there are no objections. The latter was reviewed by Stefano Stabellini before the freeze
https://www.redhat.com/archives/libvir-list/2015-March/msg01248.html
Ping! I'd like to include these fixes for 1.2.14. The patches have been "Reviewed-by" Konrad and Stefano. Anthony also responded today that his OpenStack Tempest runs are much happier https://www.redhat.com/archives/libvir-list/2015-March/msg01540.html Regards, Jim

On Wed, Mar 25, 2015 at 02:08:33PM -0600, Jim Fehlig wrote:
This small series of patches fixes some issues wrt domain destroy in the libxl driver. The primary motivation for this work is to prevent locking the virDomainObj during long running destroy operations on large memory domains.
Patch 1 moves job acquisition from libxlDomainStart to it's callers so they have more control over when the job is acquired. Patch 2 fixes a few spots where we never acquired a job during domain destroy. Patch 3 contains the interesting change, where the virDomainObj is unlocked during the long-running destroy operation.
This series wraps up my work to improve parallel OpenStack Tempest runs against the libxl driver. With libvirt.git master + this series + a patched libxl [1], I've successfully run a reproducer that was hitting the same issues encountered by Tempest.
[1] libxl commits from xen.git: 93699882d, f1335f0d, 4783c99a, 1c91d6fba, and 188e9c54. I'll contact the stable branch maintainers and ask them to include these commits in the next Xen 4.4.x and 4.5.x releases.
Hi, I gave a try to this series with OpenStack Tempest. And it is much better. Thanks! -- Anthony PERARD
participants (5)
-
Anthony PERARD
-
Ian Campbell
-
Jim Fehlig
-
Konrad Rzeszutek Wilk
-
Martin Kletzander