[PATCH 0/6] Fix potential deadlock and crash in libxl driver

I've been investigating what turns out to be some long-standing issues in the libxl driver. One of them causes libvirtd to deadlock, the other can lead to a segmentation fault. Both can be triggered by repeatedly rebooting a collection of VMs. My reproducer continually reboots 8 VMs on a host where libvirtd runs in a VM (dom0) confined to 4 vcpus. Patches 1-4 contain improvements and preparation for the fixes in patches 5 and 6. Patch 5 fixes the potential deadlock, and patch 6 fixes the potential crash. Both contain more detail on the respective issues. My reproducer has run for 5 days without issue. Before the patches, it would trigger within 2 days. Jim Fehlig (6): libxl: Disable death events after receiving a shutdown event libxl: Rename libxlShutdownThreadInfo struct libxl: Modify name of shutdown thread libxl: Handle domain death events in a thread libxl: Search for virDomainObj in event handler threads libxl: Protect access to libxlLogger files hash table src/libxl/libxl_domain.c | 115 ++++++++++++++++++++++----------------- src/libxl/libxl_domain.h | 3 - src/libxl/libxl_logger.c | 12 ++++ 3 files changed, 77 insertions(+), 53 deletions(-) -- 2.33.0

The libxl driver will handle all domain destruction and cleanup when receiving a domain shutdown event from libxl. Commit fa30ee04a2a introduced the ignoreDeathEvent boolean in the DomainObjPrivate struct to ignore subsequent death events from libxl. But libxl already provides a mechanism to disable death events via libxl_evdisable_domain_death. This patch partially reverts commit fa30ee04a2a and instead uses libxl_evdisable_domain_death to disable subsequent death events when processing a shutdown event. Signed-off-by: Jim Fehlig <jfehlig@suse.com> --- src/libxl/libxl_domain.c | 23 +++++------------------ src/libxl/libxl_domain.h | 3 --- 2 files changed, 5 insertions(+), 21 deletions(-) diff --git a/src/libxl/libxl_domain.c b/src/libxl/libxl_domain.c index db2966a599..bbcbd4c74f 100644 --- a/src/libxl/libxl_domain.c +++ b/src/libxl/libxl_domain.c @@ -616,12 +616,6 @@ static void libxlDomainHandleDeath(libxlDriverPrivate *driver, virDomainObj *vm) { virObjectEvent *dom_event = NULL; - libxlDomainObjPrivate *priv = vm->privateData; - - if (priv->ignoreDeathEvent) { - priv->ignoreDeathEvent = false; - return; - } if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0) return; @@ -671,7 +665,6 @@ libxlDomainEventHandler(void *data, libxl_event *event) } if (event->type == LIBXL_EVENT_TYPE_DOMAIN_SHUTDOWN) { - libxlDomainObjPrivate *priv = vm->privateData; struct libxlShutdownThreadInfo *shutdown_info = NULL; virThread thread; g_autofree char *name = NULL; @@ -688,12 +681,9 @@ libxlDomainEventHandler(void *data, libxl_event *event) name = g_strdup_printf("ev-%d", event->domid); /* * Cleanup will be handled by the shutdown thread. - * Ignore the forthcoming death event from libxl */ - priv->ignoreDeathEvent = true; if (virThreadCreateFull(&thread, false, libxlDomainShutdownThread, name, false, shutdown_info) < 0) { - priv->ignoreDeathEvent = false; /* * Not much we can do on error here except log it. 
*/ @@ -859,18 +849,17 @@ libxlDomainDestroyInternal(libxlDriverPrivate *driver, libxlDomainObjPrivate *priv = vm->privateData; int ret = -1; - /* Ignore next LIBXL_EVENT_TYPE_DOMAIN_DEATH as the caller will handle - * domain death appropriately already (having more info, like the reason). - */ - priv->ignoreDeathEvent = true; + if (priv->deathW) { + libxl_evdisable_domain_death(cfg->ctx, priv->deathW); + priv->deathW = NULL; + } + /* Unlock virDomainObj during destroy, which can take considerable * time on large memory domains. */ virObjectUnlock(vm); ret = libxl_domain_destroy(cfg->ctx, vm->def->id, NULL); virObjectLock(vm); - if (ret) - priv->ignoreDeathEvent = false; return ret; } @@ -921,8 +910,6 @@ libxlDomainCleanup(libxlDriverPrivate *driver, priv->deathW = NULL; } - priv->ignoreDeathEvent = false; - if (!!g_atomic_int_dec_and_test(&driver->nactive) && driver->inhibitCallback) driver->inhibitCallback(false, driver->inhibitOpaque); diff --git a/src/libxl/libxl_domain.h b/src/libxl/libxl_domain.h index 661610bd3f..981bfc2bca 100644 --- a/src/libxl/libxl_domain.h +++ b/src/libxl/libxl_domain.h @@ -54,9 +54,6 @@ struct _libxlDomainObjPrivate { /* console */ virChrdevs *devs; libxl_evgen_domain_death *deathW; - /* Flag to indicate the upcoming LIBXL_EVENT_TYPE_DOMAIN_DEATH is caused - * by libvirt and should not be handled separately */ - bool ignoreDeathEvent; virThread *migrationDstReceiveThr; unsigned short migrationPort; char *lockState; -- 2.33.0

An upcoming change will use the struct in a thread created to process death events. Rename libxlShutdownThreadInfo to libxlEventHandlerThreadInfo to reflect the more generic usage. Signed-off-by: Jim Fehlig <jfehlig@suse.com> --- src/libxl/libxl_domain.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/libxl/libxl_domain.c b/src/libxl/libxl_domain.c index bbcbd4c74f..2c0cbab269 100644 --- a/src/libxl/libxl_domain.c +++ b/src/libxl/libxl_domain.c @@ -477,7 +477,7 @@ libxlDomainShutdownHandleRestart(libxlDriverPrivate *driver, } -struct libxlShutdownThreadInfo +struct libxlEventHandlerThreadInfo { libxlDriverPrivate *driver; virDomainObj *vm; @@ -488,7 +488,7 @@ struct libxlShutdownThreadInfo static void libxlDomainShutdownThread(void *opaque) { - struct libxlShutdownThreadInfo *shutdown_info = opaque; + struct libxlEventHandlerThreadInfo *shutdown_info = opaque; virDomainObj *vm = shutdown_info->vm; libxl_event *ev = shutdown_info->event; libxlDriverPrivate *driver = shutdown_info->driver; @@ -665,7 +665,7 @@ libxlDomainEventHandler(void *data, libxl_event *event) } if (event->type == LIBXL_EVENT_TYPE_DOMAIN_SHUTDOWN) { - struct libxlShutdownThreadInfo *shutdown_info = NULL; + struct libxlEventHandlerThreadInfo *shutdown_info = NULL; virThread thread; g_autofree char *name = NULL; @@ -673,7 +673,7 @@ libxlDomainEventHandler(void *data, libxl_event *event) * Start a thread to handle shutdown. We don't want to be tying up * libxl's event machinery by doing a potentially lengthy shutdown. */ - shutdown_info = g_new0(struct libxlShutdownThreadInfo, 1); + shutdown_info = g_new0(struct libxlEventHandlerThreadInfo, 1); shutdown_info->driver = driver; shutdown_info->vm = vm; @@ -693,7 +693,7 @@ libxlDomainEventHandler(void *data, libxl_event *event) } /* * virDomainObjEndAPI is called in the shutdown thread, where - * libxlShutdownThreadInfo and libxl_event are also freed. + * libxlEventHandlerThreadInfo and libxl_event are also freed. 
*/ return; } else if (event->type == LIBXL_EVENT_TYPE_DOMAIN_DEATH) { -- 2.33.0

The current thread name 'ev-<domid>' is a bit terse. Change the name to 'shutdown-event-<domid>', allowing it to be distinguished from threads handling other event types. Signed-off-by: Jim Fehlig <jfehlig@suse.com> --- src/libxl/libxl_domain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libxl/libxl_domain.c b/src/libxl/libxl_domain.c index 2c0cbab269..5d0034102e 100644 --- a/src/libxl/libxl_domain.c +++ b/src/libxl/libxl_domain.c @@ -678,7 +678,7 @@ libxlDomainEventHandler(void *data, libxl_event *event) shutdown_info->driver = driver; shutdown_info->vm = vm; shutdown_info->event = (libxl_event *)event; - name = g_strdup_printf("ev-%d", event->domid); + name = g_strdup_printf("shutdown-event-%d", event->domid); /* * Cleanup will be handled by the shutdown thread. */ -- 2.33.0

Similar to domain shutdown events, processing domain death events can be a lengthy process and we don't want to block the event handler while the operation completes. Move the death handling function to a thread. Signed-off-by: Jim Fehlig <jfehlig@suse.com> --- src/libxl/libxl_domain.c | 67 ++++++++++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 20 deletions(-) diff --git a/src/libxl/libxl_domain.c b/src/libxl/libxl_domain.c index 5d0034102e..d6c5f7e5b7 100644 --- a/src/libxl/libxl_domain.c +++ b/src/libxl/libxl_domain.c @@ -613,12 +613,17 @@ libxlDomainShutdownThread(void *opaque) } static void -libxlDomainHandleDeath(libxlDriverPrivate *driver, virDomainObj *vm) +libxlDomainDeathThread(void *opaque) { + struct libxlEventHandlerThreadInfo *death_info = opaque; + virDomainObj *vm = death_info->vm; + libxl_event *ev = death_info->event; + libxlDriverPrivate *driver = death_info->driver; virObjectEvent *dom_event = NULL; + g_autoptr(libxlDriverConfig) cfg = libxlDriverConfigGet(driver); if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0) - return; + goto cleanup; virDomainObjSetState(vm, VIR_DOMAIN_SHUTOFF, VIR_DOMAIN_SHUTOFF_DESTROYED); dom_event = virDomainEventLifecycleNewFromObj(vm, @@ -629,6 +634,11 @@ libxlDomainHandleDeath(libxlDriverPrivate *driver, virDomainObj *vm) virDomainObjListRemove(driver->domains, vm); libxlDomainObjEndJob(driver, vm); virObjectEventStateQueue(driver->domainEventState, dom_event); + + cleanup: + virDomainObjEndAPI(&vm); + libxl_event_free(cfg->ctx, ev); + VIR_FREE(death_info); } @@ -642,6 +652,9 @@ libxlDomainEventHandler(void *data, libxl_event *event) libxl_shutdown_reason xl_reason = event->u.domain_shutdown.shutdown_reason; virDomainObj *vm = NULL; g_autoptr(libxlDriverConfig) cfg = NULL; + struct libxlEventHandlerThreadInfo *thread_info = NULL; + virThread thread; + g_autofree char *thread_name = NULL; VIR_DEBUG("Received libxl event '%d' for domid '%d'", event->type, event->domid); @@ -664,31 
+677,27 @@ libxlDomainEventHandler(void *data, libxl_event *event) goto cleanup; } + /* + * Start event-specific threads to handle shutdown and death. + * They are potentially lengthy operations and we don't want to be + * blocking this event handler while they are in progress. + */ if (event->type == LIBXL_EVENT_TYPE_DOMAIN_SHUTDOWN) { - struct libxlEventHandlerThreadInfo *shutdown_info = NULL; - virThread thread; - g_autofree char *name = NULL; + thread_info = g_new0(struct libxlEventHandlerThreadInfo, 1); - /* - * Start a thread to handle shutdown. We don't want to be tying up - * libxl's event machinery by doing a potentially lengthy shutdown. - */ - shutdown_info = g_new0(struct libxlEventHandlerThreadInfo, 1); - - shutdown_info->driver = driver; - shutdown_info->vm = vm; - shutdown_info->event = (libxl_event *)event; - name = g_strdup_printf("shutdown-event-%d", event->domid); + thread_info->driver = driver; + thread_info->vm = vm; + thread_info->event = (libxl_event *)event; + thread_name = g_strdup_printf("shutdown-event-%d", event->domid); /* * Cleanup will be handled by the shutdown thread. */ if (virThreadCreateFull(&thread, false, libxlDomainShutdownThread, - name, false, shutdown_info) < 0) { + thread_name, false, thread_info) < 0) { /* * Not much we can do on error here except log it. */ VIR_ERROR(_("Failed to create thread to handle domain shutdown")); - VIR_FREE(shutdown_info); goto cleanup; } /* @@ -697,15 +706,33 @@ libxlDomainEventHandler(void *data, libxl_event *event) */ return; } else if (event->type == LIBXL_EVENT_TYPE_DOMAIN_DEATH) { + thread_info = g_new0(struct libxlEventHandlerThreadInfo, 1); + + thread_info->driver = driver; + thread_info->vm = vm; + thread_info->event = (libxl_event *)event; + thread_name = g_strdup_printf("death-event-%d", event->domid); /* - * On death the domain is cleaned up from Xen's perspective. - * Cleanup on the libvirt side can be done synchronously. + * Cleanup will be handled by the death thread. 
*/ - libxlDomainHandleDeath(driver, vm); + if (virThreadCreateFull(&thread, false, libxlDomainDeathThread, + thread_name, false, thread_info) < 0) { + /* + * Not much we can do on error here except log it. + */ + VIR_ERROR(_("Failed to create thread to handle domain death")); + goto cleanup; + } + /* + * virDomainObjEndAPI is called in the death thread, where + * libxlEventHandlerThreadInfo and libxl_event are also freed. + */ + return; } cleanup: virDomainObjEndAPI(&vm); + VIR_FREE(thread_info); cfg = libxlDriverConfigGet(driver); /* Cast away any const */ libxl_event_free(cfg->ctx, (libxl_event *)event); -- 2.33.0

libxl can deliver events and invoke callbacks on any application thread calling into libxl. This can cause a deadlock in the libvirt libxl driver: Thread 19 (Thread 0x7f31411ec700 (LWP 14068) "libvirtd"): #0 0x00007f318520cc7d in __lll_lock_wait () from /lib64/libpthread.so.0 #1 0x00007f3185205ed5 in pthread_mutex_lock () from /lib64/libpthread.so.0 #2 0x00007f3189488015 in virMutexLock (m=<optimized out>) at ../../src/util/virthread.c:79 #3 0x00007f3189463f3b in virObjectLock (anyobj=<optimized out>) at ../../src/util/virobject.c:433 #4 0x00007f31894f2f41 in virDomainObjListSearchID (payload=0x7f317400a6d0, name=<optimized out>, data=0x7f31411eaeac) at ../../src/conf/virdomainobjlist.c:105 JWF: the 'payload' is virDomainObj with domid 28712, 'data' is domid 28710. So looking for 28710 but first compare with 28712, which is the virDomainObj we started working with in f15 and is locked. #5 0x00007f3189437ac5 in virHashSearch (ctable=0x7f3124025a30, iter=iter@entry=0x7f31894f2f30 <virDomainObjListSearchID>, data=data@entry=0x7f31411eaeac, name=name@entry=0x0) at ../../src/util/virhash.c:745 #6 0x00007f31894f3919 in virDomainObjListFindByID (doms=0x7f3124025430, id=<optimized out>) at ../../src/conf/virdomainobjlist.c:121 #7 0x00007f3152f292e5 in libxlDomainEventHandler (data=0x7f3124023d80, event=0x7f310c010ae0) at ../../src/libxl/libxl_domain.c:660 #8 0x00007f3152c6ff5d in egc_run_callbacks (egc=egc@entry=0x7f31411eaf50) at libxl_event.c:1427 #9 0x00007f3152c718bd in libxl__egc_cleanup (egc=0x7f31411eaf50) at libxl_event.c:1458 #10 libxl__ao_inprogress (ao=ao@entry=0x7f310c00b8a0, file=file@entry=0x7f3152cce987 "libxl_domain.c", line=line@entry=730, func=func@entry=0x7f3152ccf750 <__func__.22238> "libxl_domain_unpause") at libxl_event.c:2047 #11 0x00007f3152c8c5b8 in libxl_domain_unpause (ctx=0x7f3124015a40, domid=<optimized out>, ao_how=ao_how@entry=0x0) at libxl_domain.c:730 #12 0x00007f3152f2a584 in libxl_domain_unpause_0x041200 (domid=<optimized out>, 
ctx=<optimized out>) at /usr/include/libxl.h:1756 #13 libxlDomainStart (driver=driver@entry=0x7f3124023d80, vm=vm@entry=0x7f317400a6d0, start_paused=start_paused@entry=false, restore_fd=restore_fd@entry=-1, restore_ver=<optimized out>, restore_ver@entry=2) at ../../src/libxl/libxl_domain.c:1482 #14 0x00007f3152f2a6e3 in libxlDomainStartNew (driver=driver@entry=0x7f3124023d80, vm=vm@entry=0x7f317400a6d0, start_paused=start_paused@entry=false) at ../../src/libxl/libxl_domain.c:1545 #15 0x00007f3152f2a789 in libxlDomainShutdownHandleRestart (driver=0x7f3124023d80, vm=0x7f317400a6d0) at ../../src/libxl/libxl_domain.c:464 JWF: domid here is 28712 #16 0x00007f3152f2a9e4 in libxlDomainShutdownThread (opaque=<optimized out>) at ../../src/libxl/libxl_domain.c:559 #17 0x00007f3189487ee2 in virThreadHelper (data=<optimized out>) at ../../src/util/virthread.c:196 #18 0x00007f3185203539 in start_thread () from /lib64/libpthread.so.0 #19 0x00007f3184f3becf in clone () from /lib64/libc.so.6 Frame 16 runs a thread created to handle domain shutdown processing for domid 28712. In this case the event contained the reboot reason, so the old domain is destroyed and a new one is created by libxlDomainStartNew. After starting the domain, it is unpaused by calling libxl_domain_unpause in frame 12. While the thread is running within libxl, libxl takes the opportunity to deliver a pending domain shutdown event for unrelated domid 28710. While searching for the associated virDomainObj by ID, a deadlock is encountered when attempting to lock the virDomainObj for domid 28712, which is already locked since this thread is processing its shutdown event. The deadlock can be avoided by moving the search for a virDomainObj associated with the event domid to the shutdown thread. The same is done for the death thread. 
Signed-off-by: Jim Fehlig <jfehlig@suse.com> --- src/libxl/libxl_domain.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/src/libxl/libxl_domain.c b/src/libxl/libxl_domain.c index d6c5f7e5b7..366e3b9263 100644 --- a/src/libxl/libxl_domain.c +++ b/src/libxl/libxl_domain.c @@ -480,7 +480,6 @@ libxlDomainShutdownHandleRestart(libxlDriverPrivate *driver, struct libxlEventHandlerThreadInfo { libxlDriverPrivate *driver; - virDomainObj *vm; libxl_event *event; }; @@ -489,7 +488,7 @@ static void libxlDomainShutdownThread(void *opaque) { struct libxlEventHandlerThreadInfo *shutdown_info = opaque; - virDomainObj *vm = shutdown_info->vm; + virDomainObj *vm = NULL; libxl_event *ev = shutdown_info->event; libxlDriverPrivate *driver = shutdown_info->driver; virObjectEvent *dom_event = NULL; @@ -499,6 +498,12 @@ libxlDomainShutdownThread(void *opaque) libxl_domain_config_init(&d_config); + vm = virDomainObjListFindByID(driver->domains, ev->domid); + if (!vm) { + /* Nothing to do if we can't find the virDomainObj */ + goto cleanup; + } + if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0) goto cleanup; @@ -616,12 +621,18 @@ static void libxlDomainDeathThread(void *opaque) { struct libxlEventHandlerThreadInfo *death_info = opaque; - virDomainObj *vm = death_info->vm; + virDomainObj *vm = NULL; libxl_event *ev = death_info->event; libxlDriverPrivate *driver = death_info->driver; virObjectEvent *dom_event = NULL; g_autoptr(libxlDriverConfig) cfg = libxlDriverConfigGet(driver); + vm = virDomainObjListFindByID(driver->domains, ev->domid); + if (!vm) { + /* Nothing to do if we can't find the virDomainObj */ + goto cleanup; + } + if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0) goto cleanup; @@ -650,7 +661,6 @@ libxlDomainEventHandler(void *data, libxl_event *event) { libxlDriverPrivate *driver = data; libxl_shutdown_reason xl_reason = event->u.domain_shutdown.shutdown_reason; - virDomainObj *vm = NULL; 
g_autoptr(libxlDriverConfig) cfg = NULL; struct libxlEventHandlerThreadInfo *thread_info = NULL; virThread thread; @@ -671,12 +681,6 @@ libxlDomainEventHandler(void *data, libxl_event *event) if (xl_reason == LIBXL_SHUTDOWN_REASON_SUSPEND) goto cleanup; - vm = virDomainObjListFindByID(driver->domains, event->domid); - if (!vm) { - /* Nothing to do if we can't find the virDomainObj */ - goto cleanup; - } - /* * Start event-specific threads to handle shutdown and death. * They are potentially lengthy operations and we don't want to be @@ -686,7 +690,6 @@ libxlDomainEventHandler(void *data, libxl_event *event) thread_info = g_new0(struct libxlEventHandlerThreadInfo, 1); thread_info->driver = driver; - thread_info->vm = vm; thread_info->event = (libxl_event *)event; thread_name = g_strdup_printf("shutdown-event-%d", event->domid); /* @@ -701,15 +704,14 @@ libxlDomainEventHandler(void *data, libxl_event *event) goto cleanup; } /* - * virDomainObjEndAPI is called in the shutdown thread, where - * libxlEventHandlerThreadInfo and libxl_event are also freed. + * libxlEventHandlerThreadInfo and libxl_event are freed in the + * shutdown thread */ return; } else if (event->type == LIBXL_EVENT_TYPE_DOMAIN_DEATH) { thread_info = g_new0(struct libxlEventHandlerThreadInfo, 1); thread_info->driver = driver; - thread_info->vm = vm; thread_info->event = (libxl_event *)event; thread_name = g_strdup_printf("death-event-%d", event->domid); /* @@ -724,14 +726,13 @@ libxlDomainEventHandler(void *data, libxl_event *event) goto cleanup; } /* - * virDomainObjEndAPI is called in the death thread, where - * libxlEventHandlerThreadInfo and libxl_event are also freed. + * libxlEventHandlerThreadInfo and libxl_event are freed in the + * death thread */ return; } cleanup: - virDomainObjEndAPI(&vm); VIR_FREE(thread_info); cfg = libxlDriverConfigGet(driver); /* Cast away any const */ -- 2.33.0

The hash table of log file objects in libxlLogger is not protected against concurrent access. It is possible for one thread to remove an entry while another is updating it. Add a mutex to the libxlLogger object and lock it when accessing the files hash table. Signed-off-by: Jim Fehlig <jfehlig@suse.com> --- src/libxl/libxl_logger.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/libxl/libxl_logger.c b/src/libxl/libxl_logger.c index f7b5c8ee16..27f8f50b1c 100644 --- a/src/libxl/libxl_logger.c +++ b/src/libxl/libxl_logger.c @@ -28,6 +28,7 @@ #include "util/virfile.h" #include "util/virhash.h" #include "util/virstring.h" +#include "util/virthread.h" #include "util/virtime.h" #define VIR_FROM_THIS VIR_FROM_LIBXL @@ -43,6 +44,7 @@ struct xentoollog_logger_libvirt { /* map storing the opened fds: "domid" -> FILE* */ GHashTable *files; + virMutex tableLock; FILE *defaultLogFile; }; @@ -85,7 +87,9 @@ libvirt_vmessage(xentoollog_logger *logger_in, start = start + 9; *end = '\0'; + virMutexLock(&lg->tableLock); domainLogFile = virHashLookup(lg->files, start); + virMutexUnlock(&lg->tableLock); if (domainLogFile) logFile = domainLogFile; @@ -149,11 +153,14 @@ libxlLoggerNew(const char *logDir, virLogPriority minLevel) break; } logger.logDir = logDir; + if (virMutexInit(&logger.tableLock) < 0) + return NULL; logger.files = virHashNew(libxlLoggerFileFree); path = g_strdup_printf("%s/libxl-driver.log", logDir); if ((logger.defaultLogFile = fopen(path, "a")) == NULL) { + virMutexDestroy(&logger.tableLock); virHashFree(logger.files); return NULL; } @@ -168,6 +175,7 @@ libxlLoggerFree(libxlLogger *logger) if (logger->defaultLogFile) VIR_FORCE_FCLOSE(logger->defaultLogFile); virHashFree(logger->files); + virMutexDestroy(&logger->tableLock); xtl_logger_destroy(xtl_logger); } @@ -189,7 +197,9 @@ libxlLoggerOpenFile(libxlLogger *logger, path, g_strerror(errno)); return; } + virMutexLock(&logger->tableLock); ignore_value(virHashAddEntry(logger->files, domidstr, 
logFile)); + virMutexUnlock(&logger->tableLock); /* domain_config is non NULL only when starting a new domain */ if (domain_config) { @@ -204,5 +214,7 @@ libxlLoggerCloseFile(libxlLogger *logger, int id) g_autofree char *domidstr = NULL; domidstr = g_strdup_printf("%d", id); + virMutexLock(&logger->tableLock); ignore_value(virHashRemoveEntry(logger->files, domidstr)); + virMutexUnlock(&logger->tableLock); } -- 2.33.0

On Mon, Nov 29, 2021 at 09:38:39AM -0700, Jim Fehlig wrote:
I've been investigating what turns out to be some long-standing issues in the libxl driver. One of them causes libvirtd to deadlock, the other can lead to a segmentation fault. Both can be triggered by repeatedly rebooting a collection of VMs. My reproducer continually reboots 8 VMs on a host where libvirtd runs in a VM (dom0) confined to 4 vcpus.
Patches 1-4 contain improvements and preparation for the fixes in patches 5 and 6. Patch 5 fixes the potential deadlock, and patch 6 fixes the potential crash. Both contain more detail on the respective issues. My reproducer has run for 5 days without issue. Before the patches, it would trigger within 2 days.
For all patches Reviewed-by: Daniel P. Berrangé <berrange@redhat.com> Regards, Daniel -- |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o- https://fstop138.berrange.com :| |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|

On a Monday in 2021, Jim Fehlig wrote:
I've been investigating what turns out to be some long-standing issues in the libxl driver. One of them causes libvirtd to deadlock, the other can lead to a segmentation fault. Both can be triggered by repeatedly rebooting a collection of VMs. My reproducer continually reboots 8 VMs on a host where libvirtd runs in a VM (dom0) confined to 4 vcpus.
Patches 1-4 contain improvements and preparation for the fixes in patches 5 and 6. Patch 5 fixes the potential deadlock, and patch 6 fixes the potential crash. Both contain more detail on the respective issues. My reproducer has run for 5 days without issue. Before the patches, it would trigger within 2 days.
Jim Fehlig (6): libxl: Disable death events after receiving a shutdown event libxl: Rename libxlShutdownThreadInfo struct libxl: Modify name of shutdown thread libxl: Handle domain death events in a thread libxl: Search for virDomainObj in event handler threads libxl: Protect access to libxlLogger files hash table
src/libxl/libxl_domain.c | 115 ++++++++++++++++++++++----------------- src/libxl/libxl_domain.h | 3 - src/libxl/libxl_logger.c | 12 ++++ 3 files changed, 77 insertions(+), 53 deletions(-)
Reviewed-by: Ján Tomko <jtomko@redhat.com> Jano
participants (3)
-
Daniel P. Berrangé
-
Jim Fehlig
-
Ján Tomko