[libvirt] [PATCH] unlock eventLoop before calling callback function

When I use newest libvirt to save a domain, libvirtd will be deadlock. Here is the output of gdb: (gdb) thread 3 [Switching to thread 3 (Thread 0x7f972a1fc710 (LWP 30265))]#0 0x000000351fe0e034 in __lll_lock_wait () from /lib64/libpthread.so.0 (gdb) bt #0 0x000000351fe0e034 in __lll_lock_wait () from /lib64/libpthread.so.0 #1 0x000000351fe09345 in _L_lock_870 () from /lib64/libpthread.so.0 #2 0x000000351fe09217 in pthread_mutex_lock () from /lib64/libpthread.so.0 #3 0x00007f9737b0e663 in virMutexLock (m=0x6fdd60) at util/threads-pthread.c:80 #4 0x000000000041853d in virEventUpdateTimeoutImpl (timer=12, frequency=0) at event.c:247 #5 0x00007f9737af8ac2 in virEventUpdateTimeout (timer=12, timeout=0) at util/event.c:70 #6 0x000000000046654e in qemuDomainEventQueue (driver=0x19c5440, event=0x7f9724005f80) at qemu/qemu_domain.c:97 #7 0x000000000044184a in qemudDomainSaveFlag (driver=0x19c5440, dom=0x7f9724000970, vm=0x1a63ac0, path=0x7f9724000940 "/var/lib/libvirt/images/memory.save", compressed=0) at qemu/qemu_driver.c:2074 #8 0x0000000000441b30 in qemudDomainSave (dom=0x7f9724000970, path=0x7f9724000940 "/var/lib/libvirt/images/memory.save") at qemu/qemu_driver.c:2137 #9 0x00007f9737b66d7d in virDomainSave (domain=0x7f9724000970, to=0x7f9724000940 "/var/lib/libvirt/images/memory.save") at libvirt.c:2280 #10 0x00000000004273e6 in remoteDispatchDomainSave (server=0x19ac120, client=0x7f972c081740, conn=0x7f9720000b40, hdr=0x7f972c041250, rerr=0x7f972a1fbb40, args=0x7f972a1fbc40, ret=0x7f972a1fbbe0) at remote.c:2273 #11 0x0000000000431b6f in remoteDispatchClientCall (server=0x19ac120, client=0x7f972c081740, msg=0x7f972c001240, qemu_protocol=false) at dispatch.c:529 #12 0x00000000004316fe in remoteDispatchClientRequest (server=0x19ac120, client=0x7f972c081740, msg=0x7f972c001240) at dispatch.c:407 #13 0x000000000041d9a2 in qemudWorker (data=0x7f972c000908) at libvirtd.c:1629 #14 0x000000351fe077e1 in start_thread () from /lib64/libpthread.so.0 #15 0x000000351f2e153d in 
clone () from /lib64/libc.so.6 (gdb) thread 7 [Switching to thread 7 (Thread 0x7f9730bcd710 (LWP 30261))]#0 0x000000351fe0e034 in __lll_lock_wait () from /lib64/libpthread.so.0 (gdb) bt #0 0x000000351fe0e034 in __lll_lock_wait () from /lib64/libpthread.so.0 #1 0x000000351fe09345 in _L_lock_870 () from /lib64/libpthread.so.0 #2 0x000000351fe09217 in pthread_mutex_lock () from /lib64/libpthread.so.0 #3 0x00007f9737b0e663 in virMutexLock (m=0x1a63ac0) at util/threads-pthread.c:80 #4 0x00007f9737b32cb8 in virDomainObjLock (obj=0x1a63ac0) at conf/domain_conf.c:8447 #5 0x00000000004732d2 in qemuProcessHandleMonitorDestroy (mon=0x7f9718002610, vm=0x1a63ac0) at qemu/qemu_process.c:599 #6 0x000000000047c05b in qemuMonitorFree (mon=0x7f9718002610) at qemu/qemu_monitor.c:209 #7 0x000000000047c0f9 in qemuMonitorUnref (mon=0x7f9718002610) at qemu/qemu_monitor.c:229 #8 0x000000000047c135 in qemuMonitorUnwatch (monitor=0x7f9718002610) at qemu/qemu_monitor.c:242 #9 0x00000000004194b4 in virEventCleanupHandles () at event.c:538 #10 0x00000000004197a4 in virEventRunOnce () at event.c:603 #11 0x000000000041f0bd in qemudOneLoop () at libvirtd.c:2285 #12 0x000000000041f5e9 in qemudRunLoop (opaque=0x19ac120) at libvirtd.c:2395 #13 0x000000351fe077e1 in start_thread () from /lib64/libpthread.so.0 #14 0x000000351f2e153d in clone () from /lib64/libc.so.6 (gdb) p *(virMutexPtr)0x6fdd60 $2 = {lock = {__data = {__lock = 2, __count = 0, __owner = 30261, __nusers = 1, __kind = 0, __spins = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = "\002\000\000\000\000\000\000\000\065v\000\000\001", '\000' <repeats 26 times>, __align = 2}} (gdb) p *(virMutexPtr)0x1a63ac0 $3 = {lock = {__data = {__lock = 2, __count = 0, __owner = 30265, __nusers = 1, __kind = 0, __spins = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = "\002\000\000\000\000\000\000\000\071v\000\000\001", '\000' <repeats 26 times>, __align = 2}} (gdb) info threads 7 Thread 0x7f9730bcd710 (LWP 30261) 0x000000351fe0e034 in 
__lll_lock_wait () from /lib64/libpthread.so.0 6 Thread 0x7f972bfff710 (LWP 30262) 0x000000351fe0b43c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 5 Thread 0x7f972b5fe710 (LWP 30263) 0x000000351fe0b43c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 4 Thread 0x7f972abfd710 (LWP 30264) 0x000000351fe0b43c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 * 3 Thread 0x7f972a1fc710 (LWP 30265) 0x000000351fe0e034 in __lll_lock_wait () from /lib64/libpthread.so.0 2 Thread 0x7f97297fb710 (LWP 30266) 0x000000351fe0b43c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 1 Thread 0x7f9737aac800 (LWP 30260) 0x000000351fe0803d in pthread_join () from /lib64/libpthread.so.0 The reason is that we will try to lock some object in callback function, and we may call event API with locking the same object. In the function virEventDispatchHandles(), we unlock eventLoop before calling callback function. I think we should do the same thing in the function virEventCleanupTimeouts() and virEventCleanupHandles(). 
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com> --- daemon/event.c | 10 ++++++++-- 1 files changed, 8 insertions(+), 2 deletions(-) diff --git a/daemon/event.c b/daemon/event.c index 1a31717..0d45014 100644 --- a/daemon/event.c +++ b/daemon/event.c @@ -493,8 +493,11 @@ static int virEventCleanupTimeouts(void) { EVENT_DEBUG("Purging timeout %d with id %d", i, eventLoop.timeouts[i].timer); - if (eventLoop.timeouts[i].ff) + if (eventLoop.timeouts[i].ff) { + virMutexUnlock(&eventLoop.lock); (eventLoop.timeouts[i].ff)(eventLoop.timeouts[i].opaque); + virMutexLock(&eventLoop.lock); + } if ((i+1) < eventLoop.timeoutsCount) { memmove(eventLoop.timeouts+i, @@ -534,8 +537,11 @@ static int virEventCleanupHandles(void) { continue; } - if (eventLoop.handles[i].ff) + if (eventLoop.handles[i].ff) { + virMutexUnlock(&eventLoop.lock); (eventLoop.handles[i].ff)(eventLoop.handles[i].opaque); + virMutexLock(&eventLoop.lock); + } if ((i+1) < eventLoop.handlesCount) { memmove(eventLoop.handles+i, -- 1.7.1

On Mon, Mar 07, 2011 at 02:06:49PM +0800, Wen Congyang wrote:
diff --git a/daemon/event.c b/daemon/event.c index 1a31717..0d45014 100644 --- a/daemon/event.c +++ b/daemon/event.c @@ -493,8 +493,11 @@ static int virEventCleanupTimeouts(void) {
EVENT_DEBUG("Purging timeout %d with id %d", i, eventLoop.timeouts[i].timer); - if (eventLoop.timeouts[i].ff) + if (eventLoop.timeouts[i].ff) { + virMutexUnlock(&eventLoop.lock); (eventLoop.timeouts[i].ff)(eventLoop.timeouts[i].opaque); + virMutexLock(&eventLoop.lock); + }
if ((i+1) < eventLoop.timeoutsCount) { memmove(eventLoop.timeouts+i, @@ -534,8 +537,11 @@ static int virEventCleanupHandles(void) { continue; }
- if (eventLoop.handles[i].ff) + if (eventLoop.handles[i].ff) { + virMutexUnlock(&eventLoop.lock); (eventLoop.handles[i].ff)(eventLoop.handles[i].opaque); + virMutexLock(&eventLoop.lock); + }
if ((i+1) < eventLoop.handlesCount) { memmove(eventLoop.handles+i,
I'm a little concerned as to whether the rest of the code in virEventCleanupHandles/CleanupTimeouts is safe, if we release the lock here. eg, if some other thread calls virEventAddTimeout or AddHandle, is there any way this could cause problems for us here. So far I think this is safe because AddTimeout/AddHandle will simply append to the end of the array we're iterating over, but would like a second opinion before ACK'ing Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On Mon, Mar 07, 2011 at 11:15:38 +0000, Daniel P. Berrange wrote:
- if (eventLoop.handles[i].ff) + if (eventLoop.handles[i].ff) { + virMutexUnlock(&eventLoop.lock); (eventLoop.handles[i].ff)(eventLoop.handles[i].opaque); + virMutexLock(&eventLoop.lock); + }
I'm a little concerned as to whether the rest of the code in virEventCleanupHandles/CleanupTimeouts is safe, if we release the lock here. eg, if some other thread calls virEventAddTimeout or AddHandle, is there any way this could cause problems for us here. So far I think this is safe because AddTimeout/AddHandle will simply append to the end of the array we're iterating over, but would like a second opinion before ACK'ing
I don't think it's safe to unlock eventloop.lock even in the existing Dispatch{Timeouts,Handles} cases because Add{Timeout,Handle} use realloc which is allowed to allocate a new array, move the contents of the old one into it and free the old array. So the for loop can easily end up accessing memory which has already been freed. Jirka

On Mon, Mar 07, 2011 at 02:13:23PM +0100, Jiri Denemark wrote:
On Mon, Mar 07, 2011 at 11:15:38 +0000, Daniel P. Berrange wrote:
- if (eventLoop.handles[i].ff) + if (eventLoop.handles[i].ff) { + virMutexUnlock(&eventLoop.lock); (eventLoop.handles[i].ff)(eventLoop.handles[i].opaque); + virMutexLock(&eventLoop.lock); + }
I'm a little concerned as to whether the rest of the code in virEventCleanupHandles/CleanupTimeouts is safe, if we release the lock here. eg, if some other thread calls virEventAddTimeout or AddHandle, is there any way this could cause problems for us here. So far I think this is safe because AddTimeout/AddHandle will simply append to the end of the array we're iterating over, but would like a second opinion before ACK'ing
I don't think it's safe to unlock eventloop.lock even in the existing Dispatch{Timeouts,Handles} cases because Add{Timeout,Handle} use realloc which is allowed to allocate a new array, move the contents of the old one into it and free the old array. So the for loop can easily end up accessing memory which has already been freed.
That's a very unlikely scenario, but yes it could happen. We'd need to save a copy of the row we're accessing. Thus instead of if (eventLoop.handles[i].ff) { virMutexUnlock(&eventLoop.lock); (eventLoop.handles[i].ff)(eventLoop.handles[i].opaque); virMutexLock(&eventLoop.lock); } We probably need if (eventLoop.handles[i].ff) { virFreeCallback ff = eventLoop.handles[i].ff; void *opaque = eventLoop.handles[i].opaque; virMutexUnlock(&eventLoop.lock); ff(opaque); virMutexLock(&eventLoop.lock); } Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 03/07/2011 08:17 AM, Daniel P. Berrange wrote:
On Mon, Mar 07, 2011 at 02:13:23PM +0100, Jiri Denemark wrote:
On Mon, Mar 07, 2011 at 11:15:38 +0000, Daniel P. Berrange wrote:
- if (eventLoop.handles[i].ff) + if (eventLoop.handles[i].ff) { + virMutexUnlock(&eventLoop.lock); (eventLoop.handles[i].ff)(eventLoop.handles[i].opaque); + virMutexLock(&eventLoop.lock); + } I'm a little concerned as to whether the rest of the code in virEventCleanupHandles/CleanupTimeouts is safe, if we release the lock here. eg, if some other thread calls virEventAddTimeout of AddHandle, is there any way this could cause problems for us here. So far I think this is safe because AddTimeout/AddHandle will simply append to the end of the array we're iterating over, but would like a second opinion before ACK'ing I don't think it's safe to unlock eventloop.lock even in the existing Dispatch{Timeouts,Handles} cases because Add{Timeout,Handle} use realloc which is allowed to allocate a new array, move the contents of the old one into it and free the old array. So the for loop can easily end up accessing memory which has already been freed. That's a very unlikely scenario, but yes it could happen. We'd need to save a copy of the row we're accessing. Thus instead of
if (eventLoop.handles[i].ff) { virMutexUnlock(&eventLoop.lock); (eventLoop.handles[i].ff)(eventLoop.handles[i].opaque); virMutexLock(&eventLoop.lock); }
We probably need
if (eventLoop.handles[i].ff) { virFreeCallback ff = eventLoop.handles[i].ff; void *opaque = eventLoop.handles[i].opaque; virMutexUnlock(&eventLoop.lock); ff(opaque); virMutexLock(&eventLoop.lock); }
I tested this patch on the two cleanup functions with a save/restore of both a transient and a persistent domain, and both completed without problems. Based on this, and Dan's explanation of the way that the eventLoop lock *should* work, I can ACK a version of Wen Congyang's patch with Dan's suggestion from above.

On Mon, Mar 07, 2011 at 14:13:23 +0100, Jiri Denemark wrote:
On Mon, Mar 07, 2011 at 11:15:38 +0000, Daniel P. Berrange wrote:
- if (eventLoop.handles[i].ff) + if (eventLoop.handles[i].ff) { + virMutexUnlock(&eventLoop.lock); (eventLoop.handles[i].ff)(eventLoop.handles[i].opaque); + virMutexLock(&eventLoop.lock); + }
I'm a little concerned as to whether the rest of the code in virEventCleanupHandles/CleanupTimeouts is safe, if we release the lock here. eg, if some other thread calls virEventAddTimeout or AddHandle, is there any way this could cause problems for us here. So far I think this is safe because AddTimeout/AddHandle will simply append to the end of the array we're iterating over, but would like a second opinion before ACK'ing
I don't think it's safe to unlock eventloop.lock even in the existing Dispatch{Timeouts,Handles} cases because Add{Timeout,Handle} use realloc which is allowed to allocate a new array, move the contents of the old one into it and free the old array. So the for loop can easily end up accessing memory which has already been freed.
Eh, my fingers were faster than my brain :-) Dispatch* are safe since they don't do anything with the old pointer after getting back from a callback. Jirka

On Mon, Mar 07, 2011 at 14:18:06 +0100, Jiri Denemark wrote:
On Mon, Mar 07, 2011 at 14:13:23 +0100, Jiri Denemark wrote:
On Mon, Mar 07, 2011 at 11:15:38 +0000, Daniel P. Berrange wrote:
- if (eventLoop.handles[i].ff) + if (eventLoop.handles[i].ff) { + virMutexUnlock(&eventLoop.lock); (eventLoop.handles[i].ff)(eventLoop.handles[i].opaque); + virMutexLock(&eventLoop.lock); + }
I'm a little concerned as to whether the rest of the code in virEventCleanupHandles/CleanupTimeouts is safe, if we release the lock here. eg, if some other thread calls virEventAddTimeout or AddHandle, is there any way this could cause problems for us here. So far I think this is safe because AddTimeout/AddHandle will simply append to the end of the array we're iterating over, but would like a second opinion before ACK'ing
I don't think it's safe to unlock eventloop.lock even in the existing Dispatch{Timeouts,Handles} cases because Add{Timeout,Handle} use realloc which is allowed to allocate a new array, move the contents of the old one into it and free the old array. So the for loop can easily end up accessing memory which has already been freed.
Eh, my fingers were faster than my brain :-) Dispatch* are safe since they don't do anything with the old pointer after getting back from a callback.
And the rest of the for loops in Cleanup* is safe as well for the reason you already mentioned. The code moves stuff from i+1 to the end of the array one position closer to the beginning of the array and it will move the possibly added items as well. The only thing which may happen is that if Remove* is called while the mutex is unlocked, an item which we already went through can be marked as deleted so we are no longer guaranteed that when Cleanup* finishes, the array only contains items which were not deleted. But AFAICT no-one counts on that behavior so it's safe to break it. Sorry for the noise. Jirka

On 03/07/2011 06:27 AM, Jiri Denemark wrote:
And the rest of the for loops in Cleanup* is safe as well for the reason you already mentioned. The code moves stuff from i+1 to the end of the array one position closer the beginning of the array and it will move the possibly added items as well. The only thing which may happen is that if Remove* is called while the mutex is unlocked, an item which we already went through can be marked as deleted so we are no longer guaranteed that when Cleanup* finishes, the array only contains items which were not deleted. But AFAICT no-one counts with that behavior so it's safe to break it.
Actually, someone does count on it: virEventRunOnce first runs the cleanup handlers, then calls virEventMakePollFDs, all while holding the lock. virEventMakePollFDs currently adds a slot in the poll fd array for every slot of the array, because it "knows" that there are no deleted entries in the array (since the cleanup completed without dropping lock). Once we make the cleanup functions drop and regain lock, we also need to teach virEventMakePollFDs to ignore deleted entries. -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org

From: Wen Congyang <wency@cn.fujitsu.com> When I use newest libvirt to save a domain, libvirtd will be deadlock. Here is the output of gdb: (gdb) thread 3 [Switching to thread 3 (Thread 0x7f972a1fc710 (LWP 30265))]#0 0x000000351fe0e034 in __lll_lock_wait () from /lib64/libpthread.so.0 (gdb) bt at qemu/qemu_driver.c:2074 ret=0x7f972a1fbbe0) at remote.c:2273 (gdb) thread 7 [Switching to thread 7 (Thread 0x7f9730bcd710 (LWP 30261))]#0 0x000000351fe0e034 in __lll_lock_wait () from /lib64/libpthread.so.0 (gdb) bt (gdb) p *(virMutexPtr)0x6fdd60 $2 = {lock = {__data = {__lock = 2, __count = 0, __owner = 30261, __nusers = 1, __kind = 0, __spins = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = "\002\000\000\000\000\000\000\000\065v\000\000\001", '\000' <repeats 26 times>, __align = 2}} (gdb) p *(virMutexPtr)0x1a63ac0 $3 = {lock = {__data = {__lock = 2, __count = 0, __owner = 30265, __nusers = 1, __kind = 0, __spins = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = "\002\000\000\000\000\000\000\000\071v\000\000\001", '\000' <repeats 26 times>, __align = 2}} (gdb) info threads 7 Thread 0x7f9730bcd710 (LWP 30261) 0x000000351fe0e034 in __lll_lock_wait () from /lib64/libpthread.so.0 6 Thread 0x7f972bfff710 (LWP 30262) 0x000000351fe0b43c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 5 Thread 0x7f972b5fe710 (LWP 30263) 0x000000351fe0b43c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 4 Thread 0x7f972abfd710 (LWP 30264) 0x000000351fe0b43c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 * 3 Thread 0x7f972a1fc710 (LWP 30265) 0x000000351fe0e034 in __lll_lock_wait () from /lib64/libpthread.so.0 2 Thread 0x7f97297fb710 (LWP 30266) 0x000000351fe0b43c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 1 Thread 0x7f9737aac800 (LWP 30260) 0x000000351fe0803d in pthread_join () from /lib64/libpthread.so.0 The reason is that we will try to lock some object in callback function, and we may call event API with locking 
the same object. In the function virEventDispatchHandles(), we unlock eventLoop before calling callback function. I think we should do the same thing in the function virEventCleanupTimeouts() and virEventCleanupHandles(). Signed-off-by: Wen Congyang <wency@cn.fujitsu.com> Signed-off-by: Eric Blake <eblake@redhat.com> --- v2: incorporate comments from reviewers, and rebase on top of file move I tested that this avoided deadlock for my 'virsh save' case where I was reporting failure last week. src/util/event_poll.c | 27 +++++++++++++++++++-------- 1 files changed, 19 insertions(+), 8 deletions(-) diff --git a/src/util/event_poll.c b/src/util/event_poll.c index dd83fc3..91000e2 100644 --- a/src/util/event_poll.c +++ b/src/util/event_poll.c @@ -354,7 +354,7 @@ static struct pollfd *virEventPollMakePollFDs(int *nfds) { *nfds = 0; for (i = 0 ; i < eventLoop.handlesCount ; i++) { - if (eventLoop.handles[i].events) + if (eventLoop.handles[i].events && !eventLoop.handles[i].deleted) (*nfds)++; } @@ -366,11 +366,12 @@ static struct pollfd *virEventPollMakePollFDs(int *nfds) { *nfds = 0; for (i = 0 ; i < eventLoop.handlesCount ; i++) { - EVENT_DEBUG("Prepare n=%d w=%d, f=%d e=%d", i, + EVENT_DEBUG("Prepare n=%d w=%d, f=%d e=%d d=%d", i, eventLoop.handles[i].watch, eventLoop.handles[i].fd, - eventLoop.handles[i].events); - if (!eventLoop.handles[i].events) + eventLoop.handles[i].events, + eventLoop.handles[i].deleted); + if (!eventLoop.handles[i].events || eventLoop.handles[i].deleted) continue; fds[*nfds].fd = eventLoop.handles[i].fd; fds[*nfds].events = eventLoop.handles[i].events; @@ -506,8 +507,13 @@ static void virEventPollCleanupTimeouts(void) { EVENT_DEBUG("Purging timeout %d with id %d", i, eventLoop.timeouts[i].timer); - if (eventLoop.timeouts[i].ff) - (eventLoop.timeouts[i].ff)(eventLoop.timeouts[i].opaque); + if (eventLoop.timeouts[i].ff) { + virFreeCallback ff = eventLoop.timeouts[i].ff; + void *opaque = eventLoop.timeouts[i].opaque; + 
virMutexUnlock(&eventLoop.lock); + ff(opaque); + virMutexLock(&eventLoop.lock); + } if ((i+1) < eventLoop.timeoutsCount) { memmove(eventLoop.timeouts+i, @@ -546,8 +552,13 @@ static void virEventPollCleanupHandles(void) { continue; } - if (eventLoop.handles[i].ff) - (eventLoop.handles[i].ff)(eventLoop.handles[i].opaque); + if (eventLoop.handles[i].ff) { + virFreeCallback ff = eventLoop.handles[i].ff; + void *opaque = eventLoop.handles[i].opaque; + virMutexUnlock(&eventLoop.lock); + ff(opaque); + virMutexLock(&eventLoop.lock); + } if ((i+1) < eventLoop.handlesCount) { memmove(eventLoop.handles+i, -- 1.7.4

On Mon, Mar 07, 2011 at 10:09:36AM -0700, Eric Blake wrote:
From: Wen Congyang <wency@cn.fujitsu.com>
When I use newest libvirt to save a domain, libvirtd will be deadlock. Here is the output of gdb: (gdb) thread 3 [Switching to thread 3 (Thread 0x7f972a1fc710 (LWP 30265))]#0 0x000000351fe0e034 in __lll_lock_wait () from /lib64/libpthread.so.0 (gdb) bt at qemu/qemu_driver.c:2074 ret=0x7f972a1fbbe0) at remote.c:2273 (gdb) thread 7 [Switching to thread 7 (Thread 0x7f9730bcd710 (LWP 30261))]#0 0x000000351fe0e034 in __lll_lock_wait () from /lib64/libpthread.so.0 (gdb) bt (gdb) p *(virMutexPtr)0x6fdd60 $2 = {lock = {__data = {__lock = 2, __count = 0, __owner = 30261, __nusers = 1, __kind = 0, __spins = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = "\002\000\000\000\000\000\000\000\065v\000\000\001", '\000' <repeats 26 times>, __align = 2}} (gdb) p *(virMutexPtr)0x1a63ac0 $3 = {lock = {__data = {__lock = 2, __count = 0, __owner = 30265, __nusers = 1, __kind = 0, __spins = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = "\002\000\000\000\000\000\000\000\071v\000\000\001", '\000' <repeats 26 times>, __align = 2}} (gdb) info threads 7 Thread 0x7f9730bcd710 (LWP 30261) 0x000000351fe0e034 in __lll_lock_wait () from /lib64/libpthread.so.0 6 Thread 0x7f972bfff710 (LWP 30262) 0x000000351fe0b43c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 5 Thread 0x7f972b5fe710 (LWP 30263) 0x000000351fe0b43c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 4 Thread 0x7f972abfd710 (LWP 30264) 0x000000351fe0b43c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 * 3 Thread 0x7f972a1fc710 (LWP 30265) 0x000000351fe0e034 in __lll_lock_wait () from /lib64/libpthread.so.0 2 Thread 0x7f97297fb710 (LWP 30266) 0x000000351fe0b43c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 1 Thread 0x7f9737aac800 (LWP 30260) 0x000000351fe0803d in pthread_join () from /lib64/libpthread.so.0
The reason is that we will try to lock some object in callback function, and we may call event API with locking the same object. In the function virEventDispatchHandles(), we unlock eventLoop before calling callback function. I think we should do the same thing in the function virEventCleanupTimeouts() and virEventCleanupHandles().
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com> Signed-off-by: Eric Blake <eblake@redhat.com> ---
v2: incorporate comments from reviewers, and rebase on top of file move
I tested that this avoided deadlock for my 'virsh save' case where I was reporting failure last week.
src/util/event_poll.c | 27 +++++++++++++++++++-------- 1 files changed, 19 insertions(+), 8 deletions(-)
diff --git a/src/util/event_poll.c b/src/util/event_poll.c index dd83fc3..91000e2 100644 --- a/src/util/event_poll.c +++ b/src/util/event_poll.c @@ -354,7 +354,7 @@ static struct pollfd *virEventPollMakePollFDs(int *nfds) {
*nfds = 0; for (i = 0 ; i < eventLoop.handlesCount ; i++) { - if (eventLoop.handles[i].events) + if (eventLoop.handles[i].events && !eventLoop.handles[i].deleted) (*nfds)++; }
@@ -366,11 +366,12 @@ static struct pollfd *virEventPollMakePollFDs(int *nfds) {
*nfds = 0; for (i = 0 ; i < eventLoop.handlesCount ; i++) { - EVENT_DEBUG("Prepare n=%d w=%d, f=%d e=%d", i, + EVENT_DEBUG("Prepare n=%d w=%d, f=%d e=%d d=%d", i, eventLoop.handles[i].watch, eventLoop.handles[i].fd, - eventLoop.handles[i].events); - if (!eventLoop.handles[i].events) + eventLoop.handles[i].events, + eventLoop.handles[i].deleted); + if (!eventLoop.handles[i].events || eventLoop.handles[i].deleted) continue; fds[*nfds].fd = eventLoop.handles[i].fd; fds[*nfds].events = eventLoop.handles[i].events; @@ -506,8 +507,13 @@ static void virEventPollCleanupTimeouts(void) {
EVENT_DEBUG("Purging timeout %d with id %d", i, eventLoop.timeouts[i].timer); - if (eventLoop.timeouts[i].ff) - (eventLoop.timeouts[i].ff)(eventLoop.timeouts[i].opaque); + if (eventLoop.timeouts[i].ff) { + virFreeCallback ff = eventLoop.timeouts[i].ff; + void *opaque = eventLoop.timeouts[i].opaque; + virMutexUnlock(&eventLoop.lock); + ff(opaque); + virMutexLock(&eventLoop.lock); + }
if ((i+1) < eventLoop.timeoutsCount) { memmove(eventLoop.timeouts+i, @@ -546,8 +552,13 @@ static void virEventPollCleanupHandles(void) { continue; }
- if (eventLoop.handles[i].ff) - (eventLoop.handles[i].ff)(eventLoop.handles[i].opaque); + if (eventLoop.handles[i].ff) { + virFreeCallback ff = eventLoop.handles[i].ff; + void *opaque = eventLoop.handles[i].opaque; + virMutexUnlock(&eventLoop.lock); + ff(opaque); + virMutexLock(&eventLoop.lock); + }
if ((i+1) < eventLoop.handlesCount) { memmove(eventLoop.handles+i,
ACK Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 03/07/2011 10:18 AM, Daniel P. Berrange wrote:
On Mon, Mar 07, 2011 at 10:09:36AM -0700, Eric Blake wrote:
From: Wen Congyang <wency@cn.fujitsu.com>
When I use newest libvirt to save a domain, libvirtd will be deadlock.
The reason is that we will try to lock some object in callback function, and we may call event API with locking the same object. In the function virEventDispatchHandles(), we unlock eventLoop before calling callback function. I think we should do the same thing in the function virEventCleanupTimeouts() and virEventCleanupHandles().
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com> Signed-off-by: Eric Blake <eblake@redhat.com> ---
v2: incorporate comments from reviewers, and rebase on top of file move
ACK
Thanks; pushed. -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org
participants (5)
-
Daniel P. Berrange
-
Eric Blake
-
Jiri Denemark
-
Laine Stump
-
Wen Congyang