[libvirt] [PATCH] daemon: Dynamically create worker threads when some get stuck

Up to now, we've created new worker threads only on new connections. This patch monitors worker threads for liveness and dynamically creates a new one if all are stuck waiting for the hypervisor to reply. This situation can happen; all one needs to do is send a STOP signal to qemu. The amount of time after which we consider a thread stuck is defined in the WORKER_TIMEOUT macro.

With this approach we don't need to create new worker threads on incoming connections. However, as the number of active worker threads grows, we may need to size up the pool of worker threads and hence exceed the max_workers configuration value.
---
 daemon/libvirtd.c |   77 +++++++++++++++++++++++++++++++++++++++++++---------
 daemon/libvirtd.h |    4 +++
 2 files changed, 67 insertions(+), 14 deletions(-)

diff --git a/daemon/libvirtd.c b/daemon/libvirtd.c
index bcaa37b..df019b3 100644
--- a/daemon/libvirtd.c
+++ b/daemon/libvirtd.c
@@ -1491,19 +1491,6 @@ static int qemudDispatchServer(struct qemud_server *server, struct qemud_socket
 
     server->clients[server->nclients++] = client;
 
-    if (server->nclients > server->nactiveworkers &&
-        server->nactiveworkers < server->nworkers) {
-        for (i = 0 ; i < server->nworkers ; i++) {
-            if (!server->workers[i].hasThread) {
-                if (qemudStartWorker(server, &server->workers[i]) < 0)
-                    return -1;
-                server->nactiveworkers++;
-                break;
-            }
-        }
-    }
-
-
     return 0;
 
 error:
@@ -1629,6 +1616,9 @@ static void *qemudWorker(void *data)
 
         virMutexLock(&server->lock);
         worker->processingCall = 0;
+        /* The fact we're here means this worker is not stuck
+         * and can serve clients. Thus suspend the timeout */
+        virEventUpdateTimeout(server->workerTimeout, -1);
         virMutexUnlock(&server->lock);
     }
 }
@@ -1900,8 +1890,12 @@ readmore:
         }
 
         /* Move completed message to the end of the dispatch queue */
-        if (msg)
+        if (msg) {
             qemudClientMessageQueuePush(&client->dx, msg);
+            /* For a queued message set the timeout. If a worker thread
+             * serves this, it will also remove the timeout. */
+            virEventUpdateTimeout(server->workerTimeout, WORKER_TIMEOUT);
+        }
         client->nrequests++;
 
         /* Possibly need to create another receive buffer */
@@ -2303,6 +2297,53 @@ static void qemudInactiveTimer(int timerid, void *data) {
     }
 }
 
+static void qemudWorkerTimeout(int timerid, void *data) {
+    struct qemud_server *server = (struct qemud_server *)data;
+    int i;
+
+    virMutexLock(&server->lock);
+    VIR_DEBUG("Some threads are not responding");
+    for (i = 0; i < server->nclients; i++) {
+        if (server->clients[i]->dx)
+            break;
+    }
+
+    if (i == server->nclients) {
+        /* All clients have been/are being served,
+         * there is no call waiting in the queue */
+        virEventUpdateTimeout(timerid, -1);
+    } else {
+        VIR_DEBUG("Got a queued call. Need to create worker");
+        if (server->nactiveworkers == server->nworkers) {
+            /* We need to size up the worker thread pool */
+            if (VIR_REALLOC_N(server->workers, server->nworkers + 1) < 0) {
+                virReportOOMError();
+                goto cleanup;
+            }
+            /* Init new worker structure */
+            memset(&server->workers[server->nworkers], 0,
+                   sizeof(struct qemud_worker));
+            server->nworkers++;
+        }
+
+        /* Find a free worker and start it */
+        for (i = 0; i < server->nworkers; i++) {
+            if (!server->workers[i].hasThread) {
+                if (qemudStartWorker(server, &server->workers[i]) < 0) {
+                    VIR_DEBUG("Could not start new worker");
+                    goto cleanup;
+                }
+                server->nactiveworkers++;
+                VIR_DEBUG("created worker %d", i);
+                break;
+            }
+        }
+    }
+
+cleanup:
+    virMutexUnlock(&server->lock);
+}
+
 static void qemudFreeClient(struct qemud_client *client) {
     while (client->rx) {
         struct qemud_client_message *msg
@@ -2346,6 +2387,14 @@ static void *qemudRunLoop(void *opaque) {
         return NULL;
     }
 
+    server->workerTimeout = virEventAddTimeout(-1,
+                                               qemudWorkerTimeout,
+                                               server, NULL);
+    if (server->workerTimeout < 0) {
+        VIR_ERROR(_("Failed to register worker threads timeout"));
+        return NULL;
+    }
+
     if (min_workers > max_workers)
         max_workers = min_workers;
 
diff --git a/daemon/libvirtd.h b/daemon/libvirtd.h
index ea00d5c..1d828be 100644
--- a/daemon/libvirtd.h
+++ b/daemon/libvirtd.h
@@ -271,6 +271,8 @@ struct qemud_worker {
     struct qemud_server *server;
 };
 
+#define WORKER_TIMEOUT 3000
+
 /* Main server state */
 struct qemud_server {
     virMutex lock;
@@ -278,6 +280,8 @@ struct qemud_server {
 
     int privileged;
 
+    int workerTimeout;
+
     size_t nworkers;
     size_t nactiveworkers;
     struct qemud_worker *workers;
-- 
1.7.5.rc3

On Thu, Jun 16, 2011 at 04:29:55PM +0200, Michal Privoznik wrote:
Up to now, we've created new worker threads only on new connections. This patch monitors worker threads for liveness and dynamically creates a new one if all are stuck waiting for the hypervisor to reply. This situation can happen; all one needs to do is send a STOP signal to qemu. The amount of time after which we consider a thread stuck is defined in the WORKER_TIMEOUT macro.
With this approach we don't need to create new worker threads on incoming connections. However, as the number of active worker threads grows, we may need to size up the pool of worker threads and hence exceed the max_workers configuration value.
This is really not desirable. The max_workers limit is in the configuration as a static limit, to prevent client applications from making libvirtd spawn an unlimited number of threads. We must *always* respect the max_workers limit.

I don't think automatically spawning workers is the right way to deal with the QEMU issue anyway. As mentioned before, we need to improve the QEMU monitor driver so that we can safely allow monitor commands to time out.

Regards, Daniel

On Thu, Jun 16, 2011 at 06:29:09PM +0100, Daniel P. Berrange wrote:
On Thu, Jun 16, 2011 at 04:29:55PM +0200, Michal Privoznik wrote:
Up to now, we've created new worker threads only on new connections. This patch monitors worker threads for liveness and dynamically creates a new one if all are stuck waiting for the hypervisor to reply. This situation can happen; all one needs to do is send a STOP signal to qemu.
We will also need to consider the case of qemu processes in uninterruptible sleep, e.g., in the case of a failed NFS mount.
The amount of time after which we consider a thread stuck is defined in the WORKER_TIMEOUT macro.
With this approach we don't need to create new worker threads on incoming connections. However, as the number of active worker threads grows, we may need to size up the pool of worker threads and hence exceed the max_workers configuration value.
This is really not desirable. The max_workers limit is in the configuration as a static limit, to prevent client applications from making libvirtd spawn an unlimited number of threads. We must *always* respect the max_workers limit.
I don't think automatically spawning workers is the right way to deal with the QEMU issue anyway. As mentioned before, we need to improve the QEMU monitor driver so that we can safely allow monitor commands to time out.
Dan, can you suggest some possible strategies here? I don't have a strong opinion on the implementation, although I agree with your concern about spawning unlimited numbers of threads.

Dave

On Thu, Jun 16, 2011 at 04:03:36PM -0400, Dave Allan wrote:
On Thu, Jun 16, 2011 at 06:29:09PM +0100, Daniel P. Berrange wrote:
On Thu, Jun 16, 2011 at 04:29:55PM +0200, Michal Privoznik wrote:
Up to now, we've created new worker threads only on new connections. This patch monitors worker threads for liveness and dynamically creates a new one if all are stuck waiting for the hypervisor to reply. This situation can happen; all one needs to do is send a STOP signal to qemu.
We will also need to consider the case of qemu processes in uninterruptible sleep, e.g., in the case of a failed NFS mount.
That is no different as far as libvirt is concerned. The end result is always simply that libvirt sends a monitor command & does not get a response in an appropriate timeframe.
The amount of time after which we consider a thread stuck is defined in the WORKER_TIMEOUT macro.
With this approach we don't need to create new worker threads on incoming connections. However, as the number of active worker threads grows, we may need to size up the pool of worker threads and hence exceed the max_workers configuration value.
This is really not desirable. The max_workers limit is in the configuration as a static limit, to prevent client applications from making libvirtd spawn an unlimited number of threads. We must *always* respect the max_workers limit.
I don't think automatically spawning workers is the right way to deal with the QEMU issue anyway. As mentioned before, we need to improve the QEMU monitor driver so that we can safely allow monitor commands to time out.
Dan, can you suggest some possible strategies here? I don't have a strong opinion on the implementation, although I agree with your concern about spawning unlimited numbers of threads.
As I mentioned, we need to make the QEMU monitor time out after waiting for some period of time, and ensure that the monitor for that VM cannot be used thereafter.

Daniel
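A minimal, self-contained sketch of that idea, not libvirt code: plain POSIX threads stand in for libvirt's virMutex/virCond wrappers, and struct monitor, its fields, and monitor_send() are invented names for illustration.

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <time.h>

struct monitor {
    pthread_mutex_t lock;
    pthread_cond_t notify;   /* signalled when a reply arrives */
    bool reply_ready;        /* set by the event loop on reply */
    bool broken;             /* set once any command has timed out */
};

/* Send a command and wait up to timeout_ms for the event loop to set
 * reply_ready. Returns 0 on success, -1 on timeout or broken monitor. */
static int monitor_send(struct monitor *mon, long timeout_ms)
{
    struct timespec deadline;
    int ret = 0;

    clock_gettime(CLOCK_REALTIME, &deadline);
    deadline.tv_sec += timeout_ms / 1000;
    deadline.tv_nsec += (timeout_ms % 1000) * 1000000L;
    if (deadline.tv_nsec >= 1000000000L) {
        deadline.tv_sec++;
        deadline.tv_nsec -= 1000000000L;
    }

    pthread_mutex_lock(&mon->lock);
    if (mon->broken) {
        /* An earlier command timed out; refuse to reuse the monitor */
        pthread_mutex_unlock(&mon->lock);
        return -1;
    }

    mon->reply_ready = false;
    /* ... write the command to the monitor socket here ... */

    while (!mon->reply_ready && ret == 0) {
        if (pthread_cond_timedwait(&mon->notify, &mon->lock,
                                   &deadline) == ETIMEDOUT) {
            mon->broken = true;   /* poison the monitor from now on */
            ret = -1;
        }
    }
    pthread_mutex_unlock(&mon->lock);
    return ret;
}

The point of the broken flag is that once a single command has timed out, every later caller fails fast instead of blocking behind a dead monitor, which is what keeps worker threads from piling up.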

On Fri, Jun 17, 2011 at 10:55:43 +0100, Daniel P. Berrange wrote:
On Thu, Jun 16, 2011 at 04:03:36PM -0400, Dave Allan wrote:
Dan, can you suggest some possible strategies here? I don't have a strong opinion on the implementation, although I agree with your concern about spawning unlimited numbers of threads.
As I mentioned, we need to make the QEMU monitor time out after waiting for some period of time, and ensure that the monitor for that VM cannot be used thereafter.
I'm not sure that's the best way to deal with this either. I've hated this kind of timeout ever since I worked on Xen :-) The problem with such a timeout is that no matter how big it is, it is usually pretty easy to get into a situation where it is not big enough. If anything in the system goes crazy (the easiest way is just to cause lots of disk writes), the monitor command times out and you cannot do anything with the domain except destroy it (or shut it down from inside), even though you fixed the issue and the system returned to normal operation.

Another issue is that the threads don't have to be stuck in the QEMU monitor after all; they can be doing migration, for example. Let's say one client connects to libvirtd and starts 5 migrations. Then 15 other clients connect and each issues 1 additional migration. So we have 16 clients connected and all 20 threads consumed. Even though a new client can connect to libvirtd, it can't do anything (not even cancel the migrations) since no worker is free. I know this is not a probable scenario, but I just wanted to show that we need to think about more of the ways libvirtd can become unresponsive.

I'm afraid we won't find any perfect solution and we'll just need to take the one that we think sucks less.

Jirka

On Fri, Jun 17, 2011 at 12:46:33PM +0200, Jiri Denemark wrote:
On Fri, Jun 17, 2011 at 10:55:43 +0100, Daniel P. Berrange wrote:
On Thu, Jun 16, 2011 at 04:03:36PM -0400, Dave Allan wrote:
Dan, can you suggest some possible strategies here? I don't have a strong opinion on the implementation, although I agree with your concern about spawning unlimited numbers of threads.
As I mentioned, we need to make the QEMU monitor time out after waiting for some period of time, and ensure that the monitor for that VM cannot be used thereafter.
I'm not sure that's the best way to deal with this either. I've hated this kind of timeout ever since I worked on Xen :-) The problem with such a timeout is that no matter how big it is, it is usually pretty easy to get into a situation where it is not big enough. If anything in the system goes crazy (the easiest way is just to cause lots of disk writes), the monitor command times out and you cannot do anything with the domain except destroy it (or shut it down from inside), even though you fixed the issue and the system returned to normal operation.
Depending on the commands being issued, it would be possible to recover after a monitor timeout. E.g. for a 'query-XXXX' command, we could likely just discard any reply that arrives after a timeout. For things like 'cont' and 'stop' we can rely on the fact that QEMU notifies us via async events when the VM actually pauses/resumes. Migration, hotplug & unplug are the main problem areas, where you end up in a tricky state to recover from.
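One plausible shape for the reply-discarding, with invented names: QMP lets each command carry an 'id' member that QEMU echoes back in its response, so the dispatcher can match replies to commands and silently drop any reply whose command was already abandoned.

#include <stdbool.h>

struct monitor_cmd {
    unsigned long long serial;   /* id sent with the command */
    bool active;                 /* cleared when the waiter times out */
};

/* Invoked from the event loop when qemu eventually answers. */
static void monitor_reply_cb(struct monitor_cmd *cmd,
                             unsigned long long reply_serial)
{
    if (!cmd->active || reply_serial != cmd->serial) {
        /* The waiter gave up on this command. For a side-effect-free
         * 'query-XXXX' command, the stale reply is safe to discard. */
        return;
    }
    /* ... wake the worker thread that is waiting for this reply ... */
}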
Another issue is that the threads don't have to be stuck in the QEMU monitor after all; they can be doing migration, for example. Let's say one client connects to libvirtd and starts 5 migrations. Then 15 other clients connect and each issues 1 additional migration. So we have 16 clients connected and all 20 threads consumed. Even though a new client can connect to libvirtd, it can't do anything (not even cancel the migrations) since no worker is free. I know this is not a probable scenario, but I just wanted to show that we need to think about more of the ways libvirtd can become unresponsive.
IMHO, the scenario you describe illustrates the configuration parameters operating as intended, by limiting the number of concurrent API requests that are processed.

What I do think we need is an administrative interface to libvirtd. We ought to be able to connect to libvirtd without opening a hypervisor connection, and do various admin operations. In particular I'd like to be able to change logging filters + outputs on the fly & kill off active client connections. We could also make it possible to increase the thread limits on the fly.

Adding an administrative interface, separate from the main RPC interface, is one of the goals of the RPC rewrite patches I've posted many times.

Daniel
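To make the idea concrete, a purely hypothetical sketch of the entry points such an admin API might expose; every name and signature below is invented for illustration and does not exist in libvirt.

/* Hypothetical admin API: a connection to the daemon itself,
 * not to any hypervisor driver. */

typedef struct _virAdmConnect virAdmConnect;
typedef virAdmConnect *virAdmConnectPtr;

/* Open the daemon's dedicated admin socket; unlike virConnectOpen(),
 * no hypervisor driver connection is involved. */
virAdmConnectPtr virAdmConnectOpen(const char *sockpath);

/* Change logging filters/outputs on the running daemon. */
int virAdmSetLoggingFilters(virAdmConnectPtr conn, const char *filters);

/* Forcibly disconnect one client, e.g. one hogging all the workers. */
int virAdmCloseClient(virAdmConnectPtr conn, unsigned long long clientID);

/* Raise the worker thread limit without restarting libvirtd. */
int virAdmSetMaxWorkers(virAdmConnectPtr conn, unsigned int maxWorkers);

A separate socket matters here: if admin traffic shared the main RPC interface, the very worker exhaustion being debugged would also lock out the administrator.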

On Fri, Jun 17, 2011 at 01:26:06PM +0100, Daniel P. Berrange wrote:
On Fri, Jun 17, 2011 at 12:46:33PM +0200, Jiri Denemark wrote:
On Fri, Jun 17, 2011 at 10:55:43 +0100, Daniel P. Berrange wrote:
[...]
What I do think we need is an administrative interface to libvirtd. We ought to be able to connect to libvirtd without opening a hypervisor connection, and do various admin operations. In particular I'd like to be able to change logging filters + outputs on the fly & kill off active client connections. We could also make it possible to increase the thread limits on the fly.
Yes, that would be a welcome addition, agreed.
Adding an administrative interface, separate from the main RPC interface, is one of the goals of the RPC rewrite patches I've posted many times.
Could you rebase and post? I was thinking they were somehow conflicting with Matthias' RPC remote code cleanup (which seems mostly finished anyway), and that's why I thought they had to wait; I probably misunderstood!

Daniel

On Mon, Jun 20, 2011 at 01:41:41PM +0800, Daniel Veillard wrote:
On Fri, Jun 17, 2011 at 01:26:06PM +0100, Daniel P. Berrange wrote:
On Fri, Jun 17, 2011 at 12:46:33PM +0200, Jiri Denemark wrote:
On Fri, Jun 17, 2011 at 10:55:43 +0100, Daniel P. Berrange wrote:
[...]
What I do think we need is an administrative interface to libvirtd. We ought to be able to connect to libvirtd without opening a hypervisor connection, and do various admin operations. In particular I'd like to be able to change logging filters + outputs on the fly & kill off active client connections. We could also make it possible to increase the thread limits on the fly.
Yes, that would be a welcome addition, agreed.
Another thing is that if we had the fine-grained access control system I described in another thread last week, then we could create some kind of "super admin" account which could issue API calls ignoring the request and thread limits, as a means to recover a system or abort a troublesome migration job etc., while still controlling normal libvirt users.

Daniel

On Fri, Jun 17, 2011 at 10:55:43AM +0100, Daniel P. Berrange wrote:
On Thu, Jun 16, 2011 at 04:03:36PM -0400, Dave Allan wrote:
On Thu, Jun 16, 2011 at 06:29:09PM +0100, Daniel P. Berrange wrote:
On Thu, Jun 16, 2011 at 04:29:55PM +0200, Michal Privoznik wrote:
Up to now, we've created new worker threads only on new connections. This patch monitors worker threads for liveness and dynamically creates a new one if all are stuck waiting for the hypervisor to reply. This situation can happen; all one needs to do is send a STOP signal to qemu.
We will also need to consider the case of qemu processes in uninterruptible sleep, e.g., in the case of a failed NFS mount.
That is no different as far as libvirt is concerned. The end result is always simply that libvirt sends a monitor command & does not get a response in an appropriate timeframe.
Except that locking behavior cannot depend on being able to kill the process.
The amount of time after which we consider a thread stuck is defined in the WORKER_TIMEOUT macro.
With this approach we don't need to create new worker threads on incoming connections. However, as the number of active worker threads grows, we may need to size up the pool of worker threads and hence exceed the max_workers configuration value.
This is really not desirable. The max_workers limit is in the configuration as a static limit, to prevent client applications from making libvirtd spawn an unlimited number of threads. We must *always* respect the max_workers limit.
I don't think automatically spawning workers is the right way to deal with the QEMU issue anyway. As mentioned before, we need to improve the QEMU monitor driver so that we can safely allow monitor commands to time out.
Dan, can you suggest some possible strategies here? I don't have a strong opinion on the implementation, although I agree with your concern about spawning unlimited numbers of threads.
As I mentioned, we need to make the QEMU monitor time out after waiting for some period of time, and ensure that the monitor for that VM cannot be used thereafter.
Daniel

On 16.06.2011 19:29, Daniel P. Berrange wrote:
On Thu, Jun 16, 2011 at 04:29:55PM +0200, Michal Privoznik wrote:
Up to now, we've created new worker threads only on new connections. This patch monitors worker threads for liveness and dynamically creates a new one if all are stuck waiting for the hypervisor to reply. This situation can happen; all one needs to do is send a STOP signal to qemu. The amount of time after which we consider a thread stuck is defined in the WORKER_TIMEOUT macro.
With this approach we don't need to create new worker threads on incoming connections. However, as the number of active worker threads grows, we may need to size up the pool of worker threads and hence exceed the max_workers configuration value.
This is really not desirable. The max_workers limit is in the configuration as a static limit, to prevent client applications from making libvirtd spawn an unlimited number of threads. We must *always* respect the max_workers limit.
I don't think automatically spawning workers is the right way to deal with the QEMU issue anyway. As mentioned before, we need to improve the QEMU monitor driver so that we can safely allow monitor commands to time out.
Regards, Daniel
Well, what about a compromise: don't exceed the max_workers limit, but create workers on API calls instead of only on new connections? Even with timeouts on the qemu monitor we can be unresponsive for min(QEMU_JOB_WAIT_TIME, monitor_timeout) even without an obvious reason.

Michal
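A sketch of what that compromise could look like, reusing the names from the patch above; it assumes the workers array is sized to max_workers up front (which the nactiveworkers < nworkers check in the removed hunk suggests), so the handler may start an idle worker but never grows the pool.

static void qemudWorkerTimeoutCapped(int timerid, void *data) {
    struct qemud_server *server = (struct qemud_server *)data;
    int i;

    virMutexLock(&server->lock);

    /* Only act if a call is actually queued for dispatch */
    for (i = 0; i < server->nclients; i++) {
        if (server->clients[i]->dx)
            break;
    }
    if (i == server->nclients) {
        virEventUpdateTimeout(timerid, -1);
        goto cleanup;
    }

    /* Unlike the original patch, never grow past the configured pool:
     * if every worker slot is active, leave the timer armed and wait
     * for a worker to unblock (or for a monitor timeout to free one) */
    if (server->nactiveworkers >= server->nworkers) {
        VIR_DEBUG("All %zu workers busy, respecting max_workers",
                  server->nworkers);
        goto cleanup;
    }

    /* Find a free worker slot and start it */
    for (i = 0; i < server->nworkers; i++) {
        if (!server->workers[i].hasThread) {
            if (qemudStartWorker(server, &server->workers[i]) < 0)
                VIR_DEBUG("Could not start new worker");
            else
                server->nactiveworkers++;
            break;
        }
    }

cleanup:
    virMutexUnlock(&server->lock);
}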
participants (5)

- Daniel P. Berrange
- Daniel Veillard
- Dave Allan
- Jiri Denemark
- Michal Privoznik