On Tue, Jan 19, 2016 at 04:18:46PM +0000, Richard W.M. Jones wrote:
Libvirt has a fixed 15 second timeout for qemu to exit. If qemu is
writing to a slow USB key, it can hang (in D state) for much longer
than this - many minutes usually.
The solution is to check specifically for the libvirt EBUSY error when
this happens, and retry the virDomainDestroyFlags operation
(indefinitely).
See also the description here:
https://www.redhat.com/archives/libvir-list/2016-January/msg00767.html
Similar to the following OpenStack Nova commit:
http://git.openstack.org/cgit/openstack/nova/commit/?id=3907867
Thanks: Kashyap Chamarthy and Daniel Berrange.
---
src/launch-libvirt.c | 45 +++++++++++++++++++++++++++++++++++----------
1 file changed, 35 insertions(+), 10 deletions(-)
diff --git a/src/launch-libvirt.c b/src/launch-libvirt.c
index 8215e02..90b6c49 100644
--- a/src/launch-libvirt.c
+++ b/src/launch-libvirt.c
@@ -25,6 +25,7 @@
#include <unistd.h>
#include <fcntl.h>
#include <grp.h>
+#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <assert.h>
@@ -2015,6 +2016,8 @@ ignore_errors (void *ignore, virErrorPtr ignore2)
/* empty */
}
+static int destroy_domain (guestfs_h *g, virDomainPtr dom, int check_for_errors);
+
static int
shutdown_libvirt (guestfs_h *g, void *datav, int check_for_errors)
{
@@ -2023,23 +2026,14 @@ shutdown_libvirt (guestfs_h *g, void *datav, int
check_for_errors)
virDomainPtr dom = data->dom;
size_t i;
int ret = 0;
- int flags;
/* Note that we can be called back very early in launch (specifically
* from launch_libvirt itself), when conn and dom might be NULL.
*/
-
if (dom != NULL) {
- flags = check_for_errors ? VIR_DOMAIN_DESTROY_GRACEFUL : 0;
- debug (g, "calling virDomainDestroy \"%s\" flags=%s",
- data->name, check_for_errors ? "VIR_DOMAIN_DESTROY_GRACEFUL" :
"0");
- if (virDomainDestroyFlags (dom, flags) == -1) {
- libvirt_error (g, _("could not destroy libvirt domain"));
- ret = -1;
- }
+ ret = destroy_domain (g, dom, check_for_errors);
virDomainFree (dom);
}
-
if (conn != NULL)
virConnectClose (conn);
@@ -2068,6 +2062,37 @@ shutdown_libvirt (guestfs_h *g, void *datav, int
check_for_errors)
return ret;
}
+/* Wrapper around virDomainDestroy which handles errors and retries.. */
+static int
+destroy_domain (guestfs_h *g, virDomainPtr dom, int check_for_errors)
+{
+ const int flags = check_for_errors ? VIR_DOMAIN_DESTROY_GRACEFUL : 0;
+ virErrorPtr err;
+
+ again:
+ debug (g, "calling virDomainDestroy flags=%s",
+ check_for_errors ? "VIR_DOMAIN_DESTROY_GRACEFUL" : "0");
+ if (virDomainDestroyFlags (dom, flags) == -1) {
+ err = virGetLastError ();
+
+ /* Second chance if we're just waiting for qemu to shut down. See:
+ *
https://www.redhat.com/archives/libvir-list/2016-January/msg00767.html
+ */
+ if ((flags & VIR_DOMAIN_DESTROY_GRACEFUL) &&
+ err && err->code == VIR_ERR_SYSTEM_ERROR && err->int1 ==
EBUSY)
+ goto again;
NB, you could get that error even if you don't specify
VIR_DOMAIN_DESTROY_GRACEFUL, because even SIGKILL can
fail to kill QEMU if it is in an uninterruptable sleep
on a dead NFS server for example and hard,nointr is used
as mount options.
So perhaps you don't want to check flags here ?
+
+ /* "Domain not found" is not treated as an error. */
+ if (err && err->code == VIR_ERR_NO_DOMAIN)
+ return 0;
+
+ libvirt_error (g, _("could not destroy libvirt domain"));
+ return -1;
+ }
+
+ return 0;
+}
I don't know the use context from libguestfs POV, but is there any
scenario in which a user would want a way to time out this, instead
of making libguestfs wait forever ?
Regards,
Daniel
--
|: