[libvirt] [PATCH 0/3] fix crash on libvirtd termination

Libvirtd termination can crash. One can use [2] patch to trigger it. Call domstats function and send TERM to libvirtd. You'd probably see stacktrace [1]. The problem is that threads with clients requests are joined after drivers cleanup. This patch series address this issue. [1] Crash stacktrace Program received signal SIGSEGV, Segmentation fault. Thread 5 (Thread 0x7fffe6a4d700 (LWP 921916)): #0 0x00007fffd9cb3f14 in qemuDomainObjBeginJobInternal (driver=driver@entry=0x7fffcc103e40, obj=obj@entry=0x7fffcc1a6ca0, job=job@entry=QEMU_JOB_QUERY, asyncJob=asyncJob@entry=QEMU_ASYNC_JOB_NONE) at qemu/qemu_domain.c:4114 #1 0x00007fffd9cb82ab in qemuDomainObjBeginJob (driver=driver@entry=0x7fffcc103e40, obj=obj@entry=0x7fffcc1a6ca0, job=job@entry=QEMU_JOB_QUERY) at qemu/qemu_domain.c:4240 #2 0x00007fffd9d23094 in qemuConnectGetAllDomainStats (conn=0x7fffcc1bc140, doms=<optimized out>, ndoms=<optimized out>, stats=127, retStats=0x7fffe6a4cb10, flags=<optimized out>) at qemu/qemu_driver.c:20116 #3 0x00007ffff744a166 in virDomainListGetStats (doms=0x7fffa8000a10, stats=0, retStats=retStats@entry=0x7fffe6a4cb10, flags=0) at libvirt-domain.c:11592 #4 0x000055555557af15 in remoteDispatchConnectGetAllDomainStats (server=<optimized out>, msg=<optimized out>, ret=0x7fffa80008e0, args=0x7fffa80008c0, rerr=0x7fffe6a4cc50, client=<optimized out>) at remote.c:6532 #5 remoteDispatchConnectGetAllDomainStatsHelper (server=<optimized out>, client=<optimized out>, msg=<optimized out>, rerr=0x7fffe6a4cc50, args=0x7fffa80008c0, ret=0x7fffa80008e0) at remote_dispatch.h:615 #6 0x00007ffff74abba2 in virNetServerProgramDispatchCall (msg=0x55555583bf50, client=0x55555583c580, server=0x555555810f40, prog=0x55555583a140) at rpc/virnetserverprogram.c:437 #7 virNetServerProgramDispatch (prog=0x55555583a140, server=server@entry=0x555555810f40, client=0x55555583c580, msg=0x55555583bf50) at rpc/virnetserverprogram.c:307 #8 0x00005555555ae10d in virNetServerProcessMsg (msg=<optimized out>, prog=<optimized out>, client=<optimized out>, srv=0x555555810f40) at rpc/virnetserver.c:148 #9 virNetServerHandleJob (jobOpaque=<optimized out>, opaque=0x555555810f40) at rpc/virnetserver.c:169 #10 0x00007ffff7390fd1 in virThreadPoolWorker (opaque=opaque@entry=0x5555558057a0) at util/virthreadpool.c:167 #11 0x00007ffff7390358 in virThreadHelper (data=<optimized out>) at util/virthread.c:206 #12 0x00007ffff457be25 in start_thread () from /lib64/libpthread.so.0 #13 0x00007ffff42a934d in clone () from /lib64/libc.so.6 Thread 1 (Thread 0x7ffff7fae880 (LWP 921909)): #0 0x00007ffff457f945 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #1 0x00007ffff73905c6 in virCondWait (c=<optimized out>, m=<optimized out>) at util/virthread.c:154 #2 0x00007ffff73911e0 in virThreadPoolFree (pool=0x555555811030) at util/virthreadpool.c:290 #3 0x00005555555adb44 in virNetServerDispose (obj=0x555555810f40) at rpc/virnetserver.c:767 #4 0x00007ffff736f62b in virObjectUnref (anyobj=<optimized out>) at util/virobject.c:356 #5 0x00007ffff7343e19 in virHashFree (table=0x55555581ba40) at util/virhash.c:318 #6 0x00007ffff74a46b5 in virNetDaemonDispose (obj=0x555555812c50) at rpc/virnetdaemon.c:105 #7 0x00007ffff736f62b in virObjectUnref (anyobj=anyobj@entry=0x555555812c50) at util/virobject.c:356 #8 0x0000555555570479 in main (argc=<optimized out>, argv=<optimized out>) at libvirtd.c:1539 [2] patch to trigger crash # diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c # index cf5e4ad..39a57aa 100644 # --- a/src/qemu/qemu_driver.c # +++ b/src/qemu/qemu_driver.c # @@ -20144,6 +20144,8 @@ qemuConnectGetAllDomainStats(virConnectPtr conn, # domflags = 0; # vm = vms[i]; # # + sleep(5); # + # virObjectLock(vm); # # if (HAVE_JOB(privflags) && Nikolay Shirokovskiy (3): daemon: finish threads on close qemu: monitor: check monitor not closed on send qemu: implement state driver shutdown function daemon/libvirtd.c | 2 ++ src/driver-state.h | 4 ++++ src/libvirt.c | 18 ++++++++++++++++++ src/libvirt_internal.h | 1 + src/libvirt_private.syms | 1 + src/qemu/qemu_driver.c | 39 +++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_monitor.c | 27 +++++++++++++-------------- src/rpc/virnetserver.c | 5 +++-- 8 files changed, 81 insertions(+), 16 deletions(-) -- 1.8.3.1

Current daemon shutdown can cause crashes. The problem is that threads serving client request are joined on daemon dispose after drivers already cleaned up. But this threads typically uses drivers and thus crashes come. We need to join threads before virStateCleanup. virNetDaemonClose is a good candidate. But it turns out that we can hang on join. The problem is that at this moment event loop is not functional and for example threads waiting for qemu response will never finish. Let's introduce extra shutdown step for drivers so that they can signal any API calls in progress to finish. --- daemon/libvirtd.c | 2 ++ src/driver-state.h | 4 ++++ src/libvirt.c | 18 ++++++++++++++++++ src/libvirt_internal.h | 1 + src/libvirt_private.syms | 1 + src/rpc/virnetserver.c | 5 +++-- 6 files changed, 29 insertions(+), 2 deletions(-) diff --git a/daemon/libvirtd.c b/daemon/libvirtd.c index 589b321..d2bbe1e 100644 --- a/daemon/libvirtd.c +++ b/daemon/libvirtd.c @@ -1504,6 +1504,8 @@ int main(int argc, char **argv) { virObjectUnref(lxcProgram); virObjectUnref(qemuProgram); virObjectUnref(adminProgram); + if (driversInitialized) + virStateShutdown(); virNetDaemonClose(dmn); virObjectUnref(srv); virObjectUnref(srvAdm); diff --git a/src/driver-state.h b/src/driver-state.h index 1cb3e4f..ea549a7 100644 --- a/src/driver-state.h +++ b/src/driver-state.h @@ -42,6 +42,9 @@ typedef int typedef int (*virDrvStateStop)(void); +typedef void +(*virDrvStateShutdown)(void); + typedef struct _virStateDriver virStateDriver; typedef virStateDriver *virStateDriverPtr; @@ -52,6 +55,7 @@ struct _virStateDriver { virDrvStateCleanup stateCleanup; virDrvStateReload stateReload; virDrvStateStop stateStop; + virDrvStateShutdown stateShutdown; }; diff --git a/src/libvirt.c b/src/libvirt.c index 6d66fa4..ff41764 100644 --- a/src/libvirt.c +++ b/src/libvirt.c @@ -812,6 +812,24 @@ virStateCleanup(void) /** + * virStateShutdown: + * + * Run each virtualization driver's shutdown method. + * + */ +void +virStateShutdown(void) +{ + int r; + + for (r = virStateDriverTabCount - 1; r >= 0; r--) { + if (virStateDriverTab[r]->stateShutdown) + virStateDriverTab[r]->stateShutdown(); + } +} + + +/** * virStateReload: * * Run each virtualization driver's reload method. diff --git a/src/libvirt_internal.h b/src/libvirt_internal.h index 62f490a..9863b07 100644 --- a/src/libvirt_internal.h +++ b/src/libvirt_internal.h @@ -36,6 +36,7 @@ int virStateInitialize(bool privileged, int virStateCleanup(void); int virStateReload(void); int virStateStop(void); +void virStateShutdown(void); /* Feature detection. This is a libvirt-private interface for determining * what features are supported by the driver. diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 5b1bc5e..59f8207 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1189,6 +1189,7 @@ virSetSharedStorageDriver; virStateCleanup; virStateInitialize; virStateReload; +virStateShutdown; virStateStop; virStreamInData; diff --git a/src/rpc/virnetserver.c b/src/rpc/virnetserver.c index 2b76daa..7dc8018 100644 --- a/src/rpc/virnetserver.c +++ b/src/rpc/virnetserver.c @@ -764,8 +764,6 @@ void virNetServerDispose(void *obj) for (i = 0; i < srv->nservices; i++) virNetServerServiceToggle(srv->services[i], false); - virThreadPoolFree(srv->workers); - for (i = 0; i < srv->nservices; i++) virObjectUnref(srv->services[i]); VIR_FREE(srv->services); @@ -796,6 +794,9 @@ void virNetServerClose(virNetServerPtr srv) for (i = 0; i < srv->nservices; i++) virNetServerServiceClose(srv->services[i]); + virThreadPoolFree(srv->workers); + srv->workers = NULL; + virObjectUnlock(srv); } -- 1.8.3.1

On 09/27/2017 08:45 AM, Nikolay Shirokovskiy wrote:
Current daemon shutdown can cause crashes. The problem is that threads serving client request are joined on daemon dispose after drivers already cleaned up. But this threads typically uses drivers and thus crashes come. We need to join threads before virStateCleanup. virNetDaemonClose is a good candidate.
But it turns out that we can hang on join. The problem is that at this moment event loop is not functional and for example threads waiting for qemu response will never finish. Let's introduce extra shutdown step for drivers so that they can signal any API calls in progress to finish. --- daemon/libvirtd.c | 2 ++ src/driver-state.h | 4 ++++ src/libvirt.c | 18 ++++++++++++++++++ src/libvirt_internal.h | 1 + src/libvirt_private.syms | 1 + src/rpc/virnetserver.c | 5 +++-- 6 files changed, 29 insertions(+), 2 deletions(-)
So - first off - this patch is doing 2 things: 1. Introduce a new driver State function - "Shutdown" 2. Moving virThreadPoolFree from virNetServerDispose to virNetServerClose The two do not seem to be related so, they'd need to be separated. It appears the motivation behind the StateShutdown is to "force" the close the qemu monitor and agent, but it doesn't seem that's really related to the virNet{Daemon|Server}* timing issue. So let's consider that separately... and I'm not considering it now... Focusing more on #2 - the move of the virThreadPoolFree would seem to hasten the eventual free. Or more to the point ensure it happens. While that does resolve the problem, I don't think it's the best or actual fix to what it appears the real problem is. Taking it from the Start/Alloc/Ref side - @srv gets a Ref at virNetServerNew and then again at virNetDaemonAddServer. So each @srv has two refs, which means in order for virNetServerDispose to be called, there would need to be two Unref's; however, I can only find one. During cleanup: @srv is Unref'd after virNetDaemonClose, but I'm not finding the other one. Do you recall where you may have seen it? I'm assuming the answer is no, there wasn't one and hence why you moved that virThreadPoolFree call. Since at virNetDaemonAddServer we add a Ref to @srv, then during virNetDaemonClose I'd expect that for each server on dmn->servers there'd be the virNetServerClose and a removal from the HashTable and Unref of the @srv object which I'm not seeing. If that was there, then the virNetServerDispose would call virThreadPoolFree right when the Unref was done on @srv. The better fix, I believe is a call to virHashRemoveAll in virNetDaemonClose which does that removal of @srv from the dmn->servers hash table. NB this would fix a memory leak since the eventual virHashFree(dmn->servers) doesn't do free the elements of the hash table when virNetDaemonDispose is called as a result of the virObjectUnref(dmn) at the bottom of main() in daemon/libvirtd.c. As an aside (and I think something else that needs to be fixed), there's virNetDaemonAddServerPostExec which adds a @srv to dmn->services, but never does the virObjectRef after the virHashAddEntry call. That would need to be a patch before the patch that adds the virHashRemoveAll. Make sense? This is a very interesting/good catch to a problem - let's just get the right fix! Tks - John
diff --git a/daemon/libvirtd.c b/daemon/libvirtd.c index 589b321..d2bbe1e 100644 --- a/daemon/libvirtd.c +++ b/daemon/libvirtd.c @@ -1504,6 +1504,8 @@ int main(int argc, char **argv) { virObjectUnref(lxcProgram); virObjectUnref(qemuProgram); virObjectUnref(adminProgram); + if (driversInitialized) + virStateShutdown(); virNetDaemonClose(dmn); virObjectUnref(srv); virObjectUnref(srvAdm); diff --git a/src/driver-state.h b/src/driver-state.h index 1cb3e4f..ea549a7 100644 --- a/src/driver-state.h +++ b/src/driver-state.h @@ -42,6 +42,9 @@ typedef int typedef int (*virDrvStateStop)(void);
+typedef void +(*virDrvStateShutdown)(void); + typedef struct _virStateDriver virStateDriver; typedef virStateDriver *virStateDriverPtr;
@@ -52,6 +55,7 @@ struct _virStateDriver { virDrvStateCleanup stateCleanup; virDrvStateReload stateReload; virDrvStateStop stateStop; + virDrvStateShutdown stateShutdown; };
diff --git a/src/libvirt.c b/src/libvirt.c index 6d66fa4..ff41764 100644 --- a/src/libvirt.c +++ b/src/libvirt.c @@ -812,6 +812,24 @@ virStateCleanup(void)
/** + * virStateShutdown: + * + * Run each virtualization driver's shutdown method. + * + */ +void +virStateShutdown(void) +{ + int r; + + for (r = virStateDriverTabCount - 1; r >= 0; r--) { + if (virStateDriverTab[r]->stateShutdown) + virStateDriverTab[r]->stateShutdown(); + } +} + + +/** * virStateReload: * * Run each virtualization driver's reload method. diff --git a/src/libvirt_internal.h b/src/libvirt_internal.h index 62f490a..9863b07 100644 --- a/src/libvirt_internal.h +++ b/src/libvirt_internal.h @@ -36,6 +36,7 @@ int virStateInitialize(bool privileged, int virStateCleanup(void); int virStateReload(void); int virStateStop(void); +void virStateShutdown(void);
/* Feature detection. This is a libvirt-private interface for determining * what features are supported by the driver. diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 5b1bc5e..59f8207 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1189,6 +1189,7 @@ virSetSharedStorageDriver; virStateCleanup; virStateInitialize; virStateReload; +virStateShutdown; virStateStop; virStreamInData;
diff --git a/src/rpc/virnetserver.c b/src/rpc/virnetserver.c index 2b76daa..7dc8018 100644 --- a/src/rpc/virnetserver.c +++ b/src/rpc/virnetserver.c @@ -764,8 +764,6 @@ void virNetServerDispose(void *obj) for (i = 0; i < srv->nservices; i++) virNetServerServiceToggle(srv->services[i], false);
- virThreadPoolFree(srv->workers); - for (i = 0; i < srv->nservices; i++) virObjectUnref(srv->services[i]); VIR_FREE(srv->services); @@ -796,6 +794,9 @@ void virNetServerClose(virNetServerPtr srv) for (i = 0; i < srv->nservices; i++) virNetServerServiceClose(srv->services[i]);
+ virThreadPoolFree(srv->workers); + srv->workers = NULL; + virObjectUnlock(srv); }

On 16.10.2017 15:47, John Ferlan wrote:
On 09/27/2017 08:45 AM, Nikolay Shirokovskiy wrote:
Current daemon shutdown can cause crashes. The problem is that threads serving client request are joined on daemon dispose after drivers already cleaned up. But this threads typically uses drivers and thus crashes come. We need to join threads before virStateCleanup. virNetDaemonClose is a good candidate.
But it turns out that we can hang on join. The problem is that at this moment event loop is not functional and for example threads waiting for qemu response will never finish. Let's introduce extra shutdown step for drivers so that they can signal any API calls in progress to finish. --- daemon/libvirtd.c | 2 ++ src/driver-state.h | 4 ++++ src/libvirt.c | 18 ++++++++++++++++++ src/libvirt_internal.h | 1 + src/libvirt_private.syms | 1 + src/rpc/virnetserver.c | 5 +++-- 6 files changed, 29 insertions(+), 2 deletions(-)
So - first off - this patch is doing 2 things:
1. Introduce a new driver State function - "Shutdown"
2. Moving virThreadPoolFree from virNetServerDispose to virNetServerClose
The two do not seem to be related so, they'd need to be separated.
It appears the motivation behind the StateShutdown is to "force" the close the qemu monitor and agent, but it doesn't seem that's really related to the virNet{Daemon|Server}* timing issue. So let's consider that separately... and I'm not considering it now...
Focusing more on #2 - the move of the virThreadPoolFree would seem to hasten the eventual free. Or more to the point ensure it happens. While that does resolve the problem, I don't think it's the best or actual fix to what it appears the real problem is.
The problem is related to the fact that virThreadPoolFree does 2 things. 1. finishes all worker threads 2. do actual free What I want is to hasten #1 not actual free.
Taking it from the Start/Alloc/Ref side - @srv gets a Ref at virNetServerNew and then again at virNetDaemonAddServer. So each @srv has two refs, which means in order for virNetServerDispose to be called, there would need to be two Unref's; however, I can only find one.
During cleanup: @srv is Unref'd after virNetDaemonClose, but I'm not finding the other one. Do you recall where you may have seen it? I'm
When the daemon object is unref'd at the end of daemon main function servers are unrefed as part of daemon dispose.
assuming the answer is no, there wasn't one and hence why you moved that virThreadPoolFree call.
Not entirely true. I moved this call so that all client request are finished before we clean up drivers state. Otherwise client requests will see disposed objects in the middele of operation and crashes occur.
Since at virNetDaemonAddServer we add a Ref to @srv, then during virNetDaemonClose I'd expect that for each server on dmn->servers there'd be the virNetServerClose and a removal from the HashTable and Unref of the @srv object which I'm not seeing. If that was there, then the virNetServerDispose would call virThreadPoolFree right when the Unref was done on @srv. The better fix, I believe is a call to virHashRemoveAll in virNetDaemonClose which does that removal of @srv from the dmn->servers hash table. NB this would fix a memory leak since the eventual virHashFree(dmn->servers) doesn't do free the elements of the hash table when virNetDaemonDispose is called as a result of the virObjectUnref(dmn) at the bottom of main() in daemon/libvirtd.c.
Servers hash table created with free function virObjectFreeHashData which will unref servers when hash table is freed. As to clearing servers hash table at virNetDaemonClose. To me daemon has close and dispose function and looks like dispose function is more suitable for freeing servers for the sake of functional groupping. Why we distinct close/dispose functions - I guess one can potentially close/open daemon and reuse daemon object but there is no such usage currently.
As an aside (and I think something else that needs to be fixed), there's virNetDaemonAddServerPostExec which adds a @srv to dmn->services, but never does the virObjectRef after the virHashAddEntry call. That would need to be a patch before the patch that adds the virHashRemoveAll.
Agree. By the way virtlogd and virtlockd treat differently virNetDaemonAddServerPostExec. The first stores reference to server object and unref it on daemon dispose as if virNetDaemonAddServerPostExec return server with extra reference. The second just drops the returned server object as if there is no extra reference. So if we add extra reference we need to patch virtlockd as well. The original server reference owner is servers table I mean.
Make sense? This is a very interesting/good catch to a problem - let's just get the right fix!
Tks -
John
diff --git a/daemon/libvirtd.c b/daemon/libvirtd.c index 589b321..d2bbe1e 100644 --- a/daemon/libvirtd.c +++ b/daemon/libvirtd.c @@ -1504,6 +1504,8 @@ int main(int argc, char **argv) { virObjectUnref(lxcProgram); virObjectUnref(qemuProgram); virObjectUnref(adminProgram); + if (driversInitialized) + virStateShutdown(); virNetDaemonClose(dmn); virObjectUnref(srv); virObjectUnref(srvAdm); diff --git a/src/driver-state.h b/src/driver-state.h index 1cb3e4f..ea549a7 100644 --- a/src/driver-state.h +++ b/src/driver-state.h @@ -42,6 +42,9 @@ typedef int typedef int (*virDrvStateStop)(void);
+typedef void +(*virDrvStateShutdown)(void); + typedef struct _virStateDriver virStateDriver; typedef virStateDriver *virStateDriverPtr;
@@ -52,6 +55,7 @@ struct _virStateDriver { virDrvStateCleanup stateCleanup; virDrvStateReload stateReload; virDrvStateStop stateStop; + virDrvStateShutdown stateShutdown; };
diff --git a/src/libvirt.c b/src/libvirt.c index 6d66fa4..ff41764 100644 --- a/src/libvirt.c +++ b/src/libvirt.c @@ -812,6 +812,24 @@ virStateCleanup(void)
/** + * virStateShutdown: + * + * Run each virtualization driver's shutdown method. + * + */ +void +virStateShutdown(void) +{ + int r; + + for (r = virStateDriverTabCount - 1; r >= 0; r--) { + if (virStateDriverTab[r]->stateShutdown) + virStateDriverTab[r]->stateShutdown(); + } +} + + +/** * virStateReload: * * Run each virtualization driver's reload method. diff --git a/src/libvirt_internal.h b/src/libvirt_internal.h index 62f490a..9863b07 100644 --- a/src/libvirt_internal.h +++ b/src/libvirt_internal.h @@ -36,6 +36,7 @@ int virStateInitialize(bool privileged, int virStateCleanup(void); int virStateReload(void); int virStateStop(void); +void virStateShutdown(void);
/* Feature detection. This is a libvirt-private interface for determining * what features are supported by the driver. diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 5b1bc5e..59f8207 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1189,6 +1189,7 @@ virSetSharedStorageDriver; virStateCleanup; virStateInitialize; virStateReload; +virStateShutdown; virStateStop; virStreamInData;
diff --git a/src/rpc/virnetserver.c b/src/rpc/virnetserver.c index 2b76daa..7dc8018 100644 --- a/src/rpc/virnetserver.c +++ b/src/rpc/virnetserver.c @@ -764,8 +764,6 @@ void virNetServerDispose(void *obj) for (i = 0; i < srv->nservices; i++) virNetServerServiceToggle(srv->services[i], false);
- virThreadPoolFree(srv->workers); - for (i = 0; i < srv->nservices; i++) virObjectUnref(srv->services[i]); VIR_FREE(srv->services); @@ -796,6 +794,9 @@ void virNetServerClose(virNetServerPtr srv) for (i = 0; i < srv->nservices; i++) virNetServerServiceClose(srv->services[i]);
+ virThreadPoolFree(srv->workers); + srv->workers = NULL; + virObjectUnlock(srv); }

On 10/17/2017 03:40 AM, Nikolay Shirokovskiy wrote:
On 16.10.2017 15:47, John Ferlan wrote:
On 09/27/2017 08:45 AM, Nikolay Shirokovskiy wrote:
Current daemon shutdown can cause crashes. The problem is that threads serving client request are joined on daemon dispose after drivers already cleaned up. But this threads typically uses drivers and thus crashes come. We need to join threads before virStateCleanup. virNetDaemonClose is a good candidate.
But it turns out that we can hang on join. The problem is that at this moment event loop is not functional and for example threads waiting for qemu response will never finish. Let's introduce extra shutdown step for drivers so that they can signal any API calls in progress to finish. --- daemon/libvirtd.c | 2 ++ src/driver-state.h | 4 ++++ src/libvirt.c | 18 ++++++++++++++++++ src/libvirt_internal.h | 1 + src/libvirt_private.syms | 1 + src/rpc/virnetserver.c | 5 +++-- 6 files changed, 29 insertions(+), 2 deletions(-)
So - first off - this patch is doing 2 things:
1. Introduce a new driver State function - "Shutdown"
2. Moving virThreadPoolFree from virNetServerDispose to virNetServerClose
The two do not seem to be related so, they'd need to be separated.
It appears the motivation behind the StateShutdown is to "force" the close the qemu monitor and agent, but it doesn't seem that's really related to the virNet{Daemon|Server}* timing issue. So let's consider that separately... and I'm not considering it now...
Focusing more on #2 - the move of the virThreadPoolFree would seem to hasten the eventual free. Or more to the point ensure it happens. While that does resolve the problem, I don't think it's the best or actual fix to what it appears the real problem is.
The problem is related to the fact that virThreadPoolFree does 2 things.
1. finishes all worker threads 2. do actual free
What I want is to hasten #1 not actual free.
Right - but because your fix just calls virThreadPoolFree and sets srv->workers = NULL - that caused me to wonder why we had to do that when it "should" be done once we're done using the servers hash table entry... IOW: Something should Unref sooner. BTW: Your 1 and 2 are all the same to me - the Unref being the more key component because we know virObjectUnref(srv) should be the "last" reference to @srv. It's one of those ordering things. It should be do A, then B to allocate/start and then undo B and undo A on cleanup. In this case we partially undo B, then undo A, and eventually complete the undo B much later on.
Taking it from the Start/Alloc/Ref side - @srv gets a Ref at virNetServerNew and then again at virNetDaemonAddServer. So each @srv has two refs, which means in order for virNetServerDispose to be called, there would need to be two Unref's; however, I can only find one.
During cleanup: @srv is Unref'd after virNetDaemonClose, but I'm not finding the other one. Do you recall where you may have seen it? I'm
When the daemon object is unref'd at the end of daemon main function servers are unrefed as part of daemon dispose.>
OK - right, virHashFree does end up making that free call, but as you note, much too late.
assuming the answer is no, there wasn't one and hence why you moved that virThreadPoolFree call.
Not entirely true. I moved this call so that all client request are finished before we clean up drivers state. Otherwise client requests will see disposed objects in the middele of operation and crashes occur.
So while the move "works" it does so because it's bypassing the 'proper' way to resolve this by Unref() the table elements when the decision is made by the the code that it is done with them...
Since at virNetDaemonAddServer we add a Ref to @srv, then during virNetDaemonClose I'd expect that for each server on dmn->servers there'd be the virNetServerClose and a removal from the HashTable and Unref of the @srv object which I'm not seeing. If that was there, then the virNetServerDispose would call virThreadPoolFree right when the Unref was done on @srv. The better fix, I believe is a call to virHashRemoveAll in virNetDaemonClose which does that removal of @srv from the dmn->servers hash table. NB this would fix a memory leak since the eventual virHashFree(dmn->servers) doesn't do free the elements of the hash table when virNetDaemonDispose is called as a result of the virObjectUnref(dmn) at the bottom of main() in daemon/libvirtd.c.
Servers hash table created with free function virObjectFreeHashData which will unref servers when hash table is freed.
As to clearing servers hash table at virNetDaemonClose. To me daemon has close and dispose function and looks like dispose function is more suitable for freeing servers for the sake of functional groupping. Why we distinct close/dispose functions - I guess one can potentially close/open daemon and reuse daemon object but there is no such usage currently.
If virNetDaemonNew() took @srv as a parameter, then I agree as part of Dispose, the Unref of each @srv would be more appropriate. E.g., the dmn->servers hash table is allocated during New and Free'd during Dispose. Logically, since @srv is added to dmn->servers after @dmn is allocated, then when we Close the @dmn, we should then remove the @srv's that we find, right? The Close function will currently call for each @srv the daemonServerClose function to call the virNetServerClose helper. Since we have the dmn lock and for every dmn->servers hash table entry we've closed, then we should also remove the entries from dmn->servers at that time. That means either doing them one at a time during daemonServerClose (remember that @srv is our entry) or all at once after we've gone through each entry. BTW: I did actually test using the more simplified approach of calling virHashRemoveAll after virHashForEach in virNetDaemonClose. Prior to the adjustment, I saw the stack trace (more or less) as noted in the cover, but with the patch in place, the client would eventually get the keepalive timeout message as opposed to the connection reset message.
As an aside (and I think something else that needs to be fixed), there's virNetDaemonAddServerPostExec which adds a @srv to dmn->services, but never does the virObjectRef after the virHashAddEntry call. That would need to be a patch before the patch that adds the virHashRemoveAll.
Agree. By the way virtlogd and virtlockd treat differently virNetDaemonAddServerPostExec. The first stores reference to server object and unref it on daemon dispose as if virNetDaemonAddServerPostExec return server with extra reference. The second just drops the returned server object as if there is no extra reference. So if we add extra reference we need to patch virtlockd as well.
I didn't dig through all that code... My viewpoint was more that we allocate @srv, then we Add it to the dmn->servers, but don't Ref it as we did during virNetDaemonAddServer when @srv was added to dmn->servers. But yes, I see virtlockd would need some adjustment too since that would seem to be "more correct". To some degree the fact that Disposal of @srv is done during virHashFree probably helps in this case. Still makes my brain hurt thinking about it. John
The original server reference owner is servers table I mean.
Make sense? This is a very interesting/good catch to a problem - let's just get the right fix!
Tks -
John
diff --git a/daemon/libvirtd.c b/daemon/libvirtd.c index 589b321..d2bbe1e 100644 --- a/daemon/libvirtd.c +++ b/daemon/libvirtd.c @@ -1504,6 +1504,8 @@ int main(int argc, char **argv) { virObjectUnref(lxcProgram); virObjectUnref(qemuProgram); virObjectUnref(adminProgram); + if (driversInitialized) + virStateShutdown(); virNetDaemonClose(dmn); virObjectUnref(srv); virObjectUnref(srvAdm); diff --git a/src/driver-state.h b/src/driver-state.h index 1cb3e4f..ea549a7 100644 --- a/src/driver-state.h +++ b/src/driver-state.h @@ -42,6 +42,9 @@ typedef int typedef int (*virDrvStateStop)(void);
+typedef void +(*virDrvStateShutdown)(void); + typedef struct _virStateDriver virStateDriver; typedef virStateDriver *virStateDriverPtr;
@@ -52,6 +55,7 @@ struct _virStateDriver { virDrvStateCleanup stateCleanup; virDrvStateReload stateReload; virDrvStateStop stateStop; + virDrvStateShutdown stateShutdown; };
diff --git a/src/libvirt.c b/src/libvirt.c index 6d66fa4..ff41764 100644 --- a/src/libvirt.c +++ b/src/libvirt.c @@ -812,6 +812,24 @@ virStateCleanup(void)
/** + * virStateShutdown: + * + * Run each virtualization driver's shutdown method. + * + */ +void +virStateShutdown(void) +{ + int r; + + for (r = virStateDriverTabCount - 1; r >= 0; r--) { + if (virStateDriverTab[r]->stateShutdown) + virStateDriverTab[r]->stateShutdown(); + } +} + + +/** * virStateReload: * * Run each virtualization driver's reload method. diff --git a/src/libvirt_internal.h b/src/libvirt_internal.h index 62f490a..9863b07 100644 --- a/src/libvirt_internal.h +++ b/src/libvirt_internal.h @@ -36,6 +36,7 @@ int virStateInitialize(bool privileged, int virStateCleanup(void); int virStateReload(void); int virStateStop(void); +void virStateShutdown(void);
/* Feature detection. This is a libvirt-private interface for determining * what features are supported by the driver. diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 5b1bc5e..59f8207 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1189,6 +1189,7 @@ virSetSharedStorageDriver; virStateCleanup; virStateInitialize; virStateReload; +virStateShutdown; virStateStop; virStreamInData;
diff --git a/src/rpc/virnetserver.c b/src/rpc/virnetserver.c index 2b76daa..7dc8018 100644 --- a/src/rpc/virnetserver.c +++ b/src/rpc/virnetserver.c @@ -764,8 +764,6 @@ void virNetServerDispose(void *obj) for (i = 0; i < srv->nservices; i++) virNetServerServiceToggle(srv->services[i], false);
- virThreadPoolFree(srv->workers); - for (i = 0; i < srv->nservices; i++) virObjectUnref(srv->services[i]); VIR_FREE(srv->services); @@ -796,6 +794,9 @@ void virNetServerClose(virNetServerPtr srv) for (i = 0; i < srv->nservices; i++) virNetServerServiceClose(srv->services[i]);
+ virThreadPoolFree(srv->workers); + srv->workers = NULL; + virObjectUnlock(srv); }

On 17.10.2017 15:34, John Ferlan wrote:
On 10/17/2017 03:40 AM, Nikolay Shirokovskiy wrote:
On 16.10.2017 15:47, John Ferlan wrote:
On 09/27/2017 08:45 AM, Nikolay Shirokovskiy wrote:
Current daemon shutdown can cause crashes. The problem is that threads serving client request are joined on daemon dispose after drivers already cleaned up. But this threads typically uses drivers and thus crashes come. We need to join threads before virStateCleanup. virNetDaemonClose is a good candidate.
But it turns out that we can hang on join. The problem is that at this moment event loop is not functional and for example threads waiting for qemu response will never finish. Let's introduce extra shutdown step for drivers so that they can signal any API calls in progress to finish. --- daemon/libvirtd.c | 2 ++ src/driver-state.h | 4 ++++ src/libvirt.c | 18 ++++++++++++++++++ src/libvirt_internal.h | 1 + src/libvirt_private.syms | 1 + src/rpc/virnetserver.c | 5 +++-- 6 files changed, 29 insertions(+), 2 deletions(-)
So - first off - this patch is doing 2 things:
1. Introduce a new driver State function - "Shutdown"
2. Moving virThreadPoolFree from virNetServerDispose to virNetServerClose
The two do not seem to be related so, they'd need to be separated.
It appears the motivation behind the StateShutdown is to "force" the close the qemu monitor and agent, but it doesn't seem that's really related to the virNet{Daemon|Server}* timing issue. So let's consider that separately... and I'm not considering it now...
Focusing more on #2 - the move of the virThreadPoolFree would seem to hasten the eventual free. Or more to the point ensure it happens. While that does resolve the problem, I don't think it's the best or actual fix to what it appears the real problem is.
The problem is related to the fact that virThreadPoolFree does 2 things.
1. finishes all worker threads 2. do actual free
What I want is to hasten #1 not actual free.
Right - but because your fix just calls virThreadPoolFree and sets srv->workers = NULL - that caused me to wonder why we had to do that when it "should" be done once we're done using the servers hash table entry... IOW: Something should Unref sooner.
BTW: Your 1 and 2 are all the same to me - the Unref being the more key component because we know virObjectUnref(srv) should be the "last" reference to @srv. It's one of those ordering things. It should be do A, then B to allocate/start and then undo B and undo A on cleanup. In this case we partially undo B, then undo A, and eventually complete the undo B much later on.
Taking it from the Start/Alloc/Ref side - @srv gets a Ref at virNetServerNew and then again at virNetDaemonAddServer. So each @srv has two refs, which means in order for virNetServerDispose to be called, there would need to be two Unref's; however, I can only find one.
During cleanup: @srv is Unref'd after virNetDaemonClose, but I'm not finding the other one. Do you recall where you may have seen it? I'm
When the daemon object is unref'd at the end of daemon main function servers are unrefed as part of daemon dispose.>
OK - right, virHashFree does end up making that free call, but as you note, much too late.
assuming the answer is no, there wasn't one and hence why you moved that virThreadPoolFree call.
Not entirely true. I moved this call so that all client request are finished before we clean up drivers state. Otherwise client requests will see disposed objects in the middele of operation and crashes occur.
So while the move "works" it does so because it's bypassing the 'proper' way to resolve this by Unref() the table elements when the decision is made by the the code that it is done with them...
Since at virNetDaemonAddServer we add a Ref to @srv, then during virNetDaemonClose I'd expect that for each server on dmn->servers there'd be the virNetServerClose and a removal from the HashTable and Unref of the @srv object which I'm not seeing. If that was there, then the virNetServerDispose would call virThreadPoolFree right when the Unref was done on @srv. The better fix, I believe is a call to virHashRemoveAll in virNetDaemonClose which does that removal of @srv from the dmn->servers hash table. NB this would fix a memory leak since the eventual virHashFree(dmn->servers) doesn't do free the elements of the hash table when virNetDaemonDispose is called as a result of the virObjectUnref(dmn) at the bottom of main() in daemon/libvirtd.c.
Servers hash table created with free function virObjectFreeHashData which will unref servers when hash table is freed.
As to clearing servers hash table at virNetDaemonClose. To me daemon has close and dispose function and looks like dispose function is more suitable for freeing servers for the sake of functional groupping. Why we distinct close/dispose functions - I guess one can potentially close/open daemon and reuse daemon object but there is no such usage currently.
If virNetDaemonNew() took @srv as a parameter, then I agree as part of Dispose, the Unref of each @srv would be more appropriate. E.g., the dmn->servers hash table is allocated during New and Free'd during Dispose.
Logically, since @srv is added to dmn->servers after @dmn is allocated, then when we Close the @dmn, we should then remove the @srv's that we find, right? The Close function will currently call for each @srv the daemonServerClose function to call the virNetServerClose helper.
Since we have the dmn lock and for every dmn->servers hash table entry we've closed, then we should also remove the entries from dmn->servers at that time. That means either doing them one at a time during daemonServerClose (remember that @srv is our entry) or all at once after we've gone through each entry.
I think we can go any of the ways until there is no more daemon usecases.
BTW: I did actually test using the more simplified approach of calling virHashRemoveAll after virHashForEach in virNetDaemonClose. Prior to the adjustment, I saw the stack trace (more or less) as noted in the cover, but with the patch in place, the client would eventually get the keepalive timeout message as opposed to the connection reset message.
Yes and the patch series fixes this hung besides crash too )
As an aside (and I think something else that needs to be fixed), there's virNetDaemonAddServerPostExec which adds a @srv to dmn->services, but never does the virObjectRef after the virHashAddEntry call. That would need to be a patch before the patch that adds the virHashRemoveAll.
Agree. By the way virtlogd and virtlockd treat differently virNetDaemonAddServerPostExec. The first stores reference to server object and unref it on daemon dispose as if virNetDaemonAddServerPostExec return server with extra reference. The second just drops the returned server object as if there is no extra reference. So if we add extra reference we need to patch virtlockd as well.
I didn't dig through all that code... My viewpoint was more that we allocate @srv, then we Add it to the dmn->servers, but don't Ref it as we did during virNetDaemonAddServer when @srv was added to dmn->servers.
Yes virNetDaemonAddServerPostExec definetely have to take extra ref. One ref goes to hash table and one ref goes to the caller. We can not put taking ref to the caller because of multi threading.
But yes, I see virtlockd would need some adjustment too since that would seem to be "more correct". To some degree the fact that Disposal of @srv is done during virHashFree probably helps in this case. Still makes my brain hurt thinking about it.
John
So I can split this patch to 2 and clear servers hash table as you suggusted instead of tossing virThreadPoolFree and add 2 extra patches to make referencing at virNetDaemonAddServerPostExec straight. This will take almost no time thus let's move to the other parts of the series. Nikolay
The original server reference owner is servers table I mean.
Make sense? This is a very interesting/good catch to a problem - let's just get the right fix!
Tks -
John
diff --git a/daemon/libvirtd.c b/daemon/libvirtd.c index 589b321..d2bbe1e 100644 --- a/daemon/libvirtd.c +++ b/daemon/libvirtd.c @@ -1504,6 +1504,8 @@ int main(int argc, char **argv) { virObjectUnref(lxcProgram); virObjectUnref(qemuProgram); virObjectUnref(adminProgram); + if (driversInitialized) + virStateShutdown(); virNetDaemonClose(dmn); virObjectUnref(srv); virObjectUnref(srvAdm); diff --git a/src/driver-state.h b/src/driver-state.h index 1cb3e4f..ea549a7 100644 --- a/src/driver-state.h +++ b/src/driver-state.h @@ -42,6 +42,9 @@ typedef int typedef int (*virDrvStateStop)(void);
+typedef void +(*virDrvStateShutdown)(void); + typedef struct _virStateDriver virStateDriver; typedef virStateDriver *virStateDriverPtr;
@@ -52,6 +55,7 @@ struct _virStateDriver { virDrvStateCleanup stateCleanup; virDrvStateReload stateReload; virDrvStateStop stateStop; + virDrvStateShutdown stateShutdown; };
diff --git a/src/libvirt.c b/src/libvirt.c index 6d66fa4..ff41764 100644 --- a/src/libvirt.c +++ b/src/libvirt.c @@ -812,6 +812,24 @@ virStateCleanup(void)
/** + * virStateShutdown: + * + * Run each virtualization driver's shutdown method. + * + */ +void +virStateShutdown(void) +{ + int r; + + for (r = virStateDriverTabCount - 1; r >= 0; r--) { + if (virStateDriverTab[r]->stateShutdown) + virStateDriverTab[r]->stateShutdown(); + } +} + + +/** * virStateReload: * * Run each virtualization driver's reload method. diff --git a/src/libvirt_internal.h b/src/libvirt_internal.h index 62f490a..9863b07 100644 --- a/src/libvirt_internal.h +++ b/src/libvirt_internal.h @@ -36,6 +36,7 @@ int virStateInitialize(bool privileged, int virStateCleanup(void); int virStateReload(void); int virStateStop(void); +void virStateShutdown(void);
/* Feature detection. This is a libvirt-private interface for determining * what features are supported by the driver. diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 5b1bc5e..59f8207 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1189,6 +1189,7 @@ virSetSharedStorageDriver; virStateCleanup; virStateInitialize; virStateReload; +virStateShutdown; virStateStop; virStreamInData;
diff --git a/src/rpc/virnetserver.c b/src/rpc/virnetserver.c index 2b76daa..7dc8018 100644 --- a/src/rpc/virnetserver.c +++ b/src/rpc/virnetserver.c @@ -764,8 +764,6 @@ void virNetServerDispose(void *obj) for (i = 0; i < srv->nservices; i++) virNetServerServiceToggle(srv->services[i], false);
- virThreadPoolFree(srv->workers); - for (i = 0; i < srv->nservices; i++) virObjectUnref(srv->services[i]); VIR_FREE(srv->services); @@ -796,6 +794,9 @@ void virNetServerClose(virNetServerPtr srv) for (i = 0; i < srv->nservices; i++) virNetServerServiceClose(srv->services[i]);
+ virThreadPoolFree(srv->workers); + srv->workers = NULL; + virObjectUnlock(srv); }

[...]
So I can split this patch to 2 and clear servers hash table as you suggusted instead of tossing virThreadPoolFree and add 2 extra patches to make referencing at virNetDaemonAddServerPostExec straight. This will take almost no time thus let's move to the other parts of the series.
Nikolay
As I said originally I wasn't considering the StateShutdown part of this series. Post that separately, but be sure to properly state what the purpose is. I think there is one sentence from the patch1 commit message. It doesn't seem that it would avoid a crash on its own. When I first took a quick look I wondered what the difference with that new state was and the stateStop functionality - then I noted stateStop functionality in daemon/libvirtd.c was gated on WITH_DBUS, but that's about as far as I got to thinking about adding a new state and its usefulness or purpose. John [...]

On 18.10.2017 14:17, John Ferlan wrote:
[...]
So I can split this patch to 2 and clear servers hash table as you suggusted instead of tossing virThreadPoolFree and add 2 extra patches to make referencing at virNetDaemonAddServerPostExec straight. This will take almost no time thus let's move to the other parts of the series.
Nikolay
As I said originally I wasn't considering the StateShutdown part of this series. Post that separately, but be sure to properly state what the purpose is. I think there is one sentence from the patch1 commit
Ok.
message. It doesn't seem that it would avoid a crash on its own. When I first took a quick look I wondered what the difference with that new state was and the stateStop functionality - then I noted stateStop functionality in daemon/libvirtd.c was gated on WITH_DBUS, but that's about as far as I got to thinking about adding a new state and its usefulness or purpose.
stateShutdown does not really introduces a state. It just helps to terminate state driver. Nikolay

It is convinient to have closed state for qemu monitor. I'm going to use it on daemon shutdown. We can use fd to check for this state. Now we don't need to set lastError on qemuMonitorClose as the error will be reported explicitly by qemuMonitorSend. Let's report VIR_ERR_OPERATION_INVALID instead of VIR_ERR_OPERATION_FAILED as closing monitor is a part of stopping domain (and will be a part of shutting driver down). Particularly we won't see error messages in daemon log. --- src/qemu/qemu_monitor.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/src/qemu/qemu_monitor.c b/src/qemu/qemu_monitor.c index 363ad76..ba2371f 100644 --- a/src/qemu/qemu_monitor.c +++ b/src/qemu/qemu_monitor.c @@ -1000,22 +1000,9 @@ qemuMonitorClose(qemuMonitorPtr mon) } /* In case another thread is waiting for its monitor command to be - * processed, we need to wake it up with appropriate error set. + * processed, we need to wake it up. */ if (mon->msg) { - if (mon->lastError.code == VIR_ERR_OK) { - virErrorPtr err = virSaveLastError(); - - virReportError(VIR_ERR_OPERATION_FAILED, "%s", - _("QEMU monitor was closed")); - virCopyLastError(&mon->lastError); - if (err) { - virSetError(err); - virFreeError(err); - } else { - virResetLastError(); - } - } mon->msg->finished = 1; virCondSignal(&mon->notify); } @@ -1047,6 +1034,12 @@ qemuMonitorSend(qemuMonitorPtr mon, { int ret = -1; + if (mon->fd < 0) { + virReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("QEMU monitor was closed")); + return -1; + } + /* Check whether qemu quit unexpectedly */ if (mon->lastError.code != VIR_ERR_OK) { VIR_DEBUG("Attempt to send command while error is set %s", @@ -1070,6 +1063,12 @@ qemuMonitorSend(qemuMonitorPtr mon, } } + if (mon->fd < 0) { + virReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("QEMU monitor was closed")); + goto cleanup; + } + if (mon->lastError.code != VIR_ERR_OK) { VIR_DEBUG("Send command resulted in error %s", NULLSTR(mon->lastError.message)); -- 1.8.3.1

Let's close agent and qemu monitors. This should trigger any API calls awaiting response to finish eventually. --- src/qemu/qemu_driver.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 4855c90..31d100c 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -1067,6 +1067,44 @@ qemuStateStop(void) return ret; } + +static int +qemuDomainDisconnect(virDomainObjPtr vm, void *opaque ATTRIBUTE_UNUSED) +{ + + qemuDomainObjPrivatePtr priv; + + virObjectLock(vm); + priv = vm->privateData; + + if (priv->mon) { + /* Take extra reference to monitor so it won't be disposed + * by qemuMonitorClose last unref. */ + virObjectRef(priv->mon); + qemuMonitorClose(priv->mon); + } + + if (priv->agent) { + /* Other threads are ready for priv->agent to became NULL meanwhile */ + qemuAgentClose(priv->agent); + priv->agent = NULL; + } + + virObjectUnlock(vm); + return 0; +} + + +static void +qemuStateShutdown(void) +{ + if (!qemu_driver) + return; + + virDomainObjListForEach(qemu_driver->domains, qemuDomainDisconnect, NULL); +} + + /** * qemuStateCleanup: * @@ -21147,6 +21185,7 @@ static virStateDriver qemuStateDriver = { .stateCleanup = qemuStateCleanup, .stateReload = qemuStateReload, .stateStop = qemuStateStop, + .stateShutdown = qemuStateShutdown, }; int qemuRegister(void) -- 1.8.3.1
participants (2)
-
John Ferlan
-
Nikolay Shirokovskiy