On Mon, Mar 07, 2011 at 10:09:36AM -0700, Eric Blake wrote:
From: Wen Congyang <wency(a)cn.fujitsu.com>
When I use the newest libvirt to save a domain, libvirtd deadlocks.
Here is the output of gdb:
(gdb) thread 3
[Switching to thread 3 (Thread 0x7f972a1fc710 (LWP 30265))]#0 0x000000351fe0e034 in
__lll_lock_wait () from /lib64/libpthread.so.0
(gdb) bt
at qemu/qemu_driver.c:2074
ret=0x7f972a1fbbe0) at remote.c:2273
(gdb) thread 7
[Switching to thread 7 (Thread 0x7f9730bcd710 (LWP 30261))]#0 0x000000351fe0e034 in
__lll_lock_wait () from /lib64/libpthread.so.0
(gdb) bt
(gdb) p *(virMutexPtr)0x6fdd60
$2 = {lock = {__data = {__lock = 2, __count = 0, __owner = 30261, __nusers = 1, __kind =
0, __spins = 0, __list = {__prev = 0x0, __next = 0x0}},
__size = "\002\000\000\000\000\000\000\000\065v\000\000\001",
'\000' <repeats 26 times>, __align = 2}}
(gdb) p *(virMutexPtr)0x1a63ac0
$3 = {lock = {__data = {__lock = 2, __count = 0, __owner = 30265, __nusers = 1, __kind =
0, __spins = 0, __list = {__prev = 0x0, __next = 0x0}},
__size = "\002\000\000\000\000\000\000\000\071v\000\000\001",
'\000' <repeats 26 times>, __align = 2}}
(gdb) info threads
7 Thread 0x7f9730bcd710 (LWP 30261) 0x000000351fe0e034 in __lll_lock_wait () from
/lib64/libpthread.so.0
6 Thread 0x7f972bfff710 (LWP 30262) 0x000000351fe0b43c in
pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
5 Thread 0x7f972b5fe710 (LWP 30263) 0x000000351fe0b43c in
pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
4 Thread 0x7f972abfd710 (LWP 30264) 0x000000351fe0b43c in
pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
* 3 Thread 0x7f972a1fc710 (LWP 30265) 0x000000351fe0e034 in __lll_lock_wait () from
/lib64/libpthread.so.0
2 Thread 0x7f97297fb710 (LWP 30266) 0x000000351fe0b43c in
pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
1 Thread 0x7f9737aac800 (LWP 30260) 0x000000351fe0803d in pthread_join () from
/lib64/libpthread.so.0
The reason is that a callback function may try to lock some object, while we may call
the event API with that same object already locked.
In the function virEventDispatchHandles(), we unlock eventLoop before calling the callback
function. I think we should
do the same thing in the functions virEventCleanupTimeouts() and
virEventCleanupHandles().
Signed-off-by: Wen Congyang <wency(a)cn.fujitsu.com>
Signed-off-by: Eric Blake <eblake(a)redhat.com>
---
v2: incorporate comments from reviewers, and rebase on top of file move
I tested that this avoided deadlock for my 'virsh save' case
where I was reporting failure last week.
src/util/event_poll.c | 27 +++++++++++++++++++--------
1 files changed, 19 insertions(+), 8 deletions(-)
diff --git a/src/util/event_poll.c b/src/util/event_poll.c
index dd83fc3..91000e2 100644
--- a/src/util/event_poll.c
+++ b/src/util/event_poll.c
@@ -354,7 +354,7 @@ static struct pollfd *virEventPollMakePollFDs(int *nfds) {
*nfds = 0;
for (i = 0 ; i < eventLoop.handlesCount ; i++) {
- if (eventLoop.handles[i].events)
+ if (eventLoop.handles[i].events && !eventLoop.handles[i].deleted)
(*nfds)++;
}
@@ -366,11 +366,12 @@ static struct pollfd *virEventPollMakePollFDs(int *nfds) {
*nfds = 0;
for (i = 0 ; i < eventLoop.handlesCount ; i++) {
- EVENT_DEBUG("Prepare n=%d w=%d, f=%d e=%d", i,
+ EVENT_DEBUG("Prepare n=%d w=%d, f=%d e=%d d=%d", i,
eventLoop.handles[i].watch,
eventLoop.handles[i].fd,
- eventLoop.handles[i].events);
- if (!eventLoop.handles[i].events)
+ eventLoop.handles[i].events,
+ eventLoop.handles[i].deleted);
+ if (!eventLoop.handles[i].events || eventLoop.handles[i].deleted)
continue;
fds[*nfds].fd = eventLoop.handles[i].fd;
fds[*nfds].events = eventLoop.handles[i].events;
@@ -506,8 +507,13 @@ static void virEventPollCleanupTimeouts(void) {
EVENT_DEBUG("Purging timeout %d with id %d", i,
eventLoop.timeouts[i].timer);
- if (eventLoop.timeouts[i].ff)
- (eventLoop.timeouts[i].ff)(eventLoop.timeouts[i].opaque);
+ if (eventLoop.timeouts[i].ff) {
+ virFreeCallback ff = eventLoop.timeouts[i].ff;
+ void *opaque = eventLoop.timeouts[i].opaque;
+ virMutexUnlock(&eventLoop.lock);
+ ff(opaque);
+ virMutexLock(&eventLoop.lock);
+ }
if ((i+1) < eventLoop.timeoutsCount) {
memmove(eventLoop.timeouts+i,
@@ -546,8 +552,13 @@ static void virEventPollCleanupHandles(void) {
continue;
}
- if (eventLoop.handles[i].ff)
- (eventLoop.handles[i].ff)(eventLoop.handles[i].opaque);
+ if (eventLoop.handles[i].ff) {
+ virFreeCallback ff = eventLoop.handles[i].ff;
+ void *opaque = eventLoop.handles[i].opaque;
+ virMutexUnlock(&eventLoop.lock);
+ ff(opaque);
+ virMutexLock(&eventLoop.lock);
+ }
if ((i+1) < eventLoop.handlesCount) {
memmove(eventLoop.handles+i,
ACK
Daniel
--
|:
http://berrange.com -o-
http://www.flickr.com/photos/dberrange/ :|
|:
http://libvirt.org -o-
http://virt-manager.org :|
|:
http://autobuild.org -o-
http://search.cpan.org/~danberr/ :|
|:
http://entangle-photo.org -o-
http://live.gnome.org/gtk-vnc :|