[libvirt] [PATCH 2/2] Don't kill QEMU process when a monitor I/O parsing error occurs

11 May 2011

Currently whenever there is any failure with parsing the monitor,
this is treated in the same was as end-of-file (ie QEMU quit).
The domain is terminated, if not already dead.

With this change, failures in parsing the monitor stream do not
result in the death of QEMU. The guest continues running unchanged,
but all further use of the monitor will be disabled.

The VMM_FAILURE event will be emitted, and the mgmt application
can decide when to kill/restart the guest to re-gain control

* src/qemu/qemu_monitor.c, src/qemu/qemu_monitor.h: Run a
  different callback for monitor EOF vs error conditions.
* src/qemu/qemu_process.c: Emit VMM_FAILURE event when monitor
  fails
---
 src/qemu/qemu_monitor.c |   45 +++++++++++++++++++++++++++------------------
 src/qemu/qemu_monitor.h |    7 ++++---
 src/qemu/qemu_process.c |   46 +++++++++++++++++++++++++++++++++++++---------
 3 files changed, 68 insertions(+), 30 deletions(-)

diff --git a/src/qemu/qemu_monitor.c b/src/qemu/qemu_monitor.c
index 9f0f20d..6e7e3d6 100644
--- a/src/qemu/qemu_monitor.c
+++ b/src/qemu/qemu_monitor.c
@@ -517,7 +517,8 @@ static void qemuMonitorUpdateWatch(qemuMonitorPtr mon)
 static void
 qemuMonitorIO(int watch, int fd, int events, void *opaque) {
     qemuMonitorPtr mon = opaque;
-    int quit = 0, failed = 0;
+    bool error = false;
+    bool eof = false;
 
     /* lock access to the monitor and protect fd */
     qemuMonitorLock(mon);
@@ -528,27 +529,27 @@ qemuMonitorIO(int watch, int fd, int events, void *opaque) {
 
     if (mon->fd != fd || mon->watch != watch) {
         VIR_ERROR(_("event from unexpected fd %d!=%d / watch %d!=%d"), mon->fd, fd, mon->watch, watch);
-        failed = 1;
+        error = true;
     } else {
         if (!mon->lastErrno &&
             events & VIR_EVENT_HANDLE_WRITABLE) {
             int done = qemuMonitorIOWrite(mon);
             if (done < 0)
-                failed = 1;
+                error = 1;
             events &= ~VIR_EVENT_HANDLE_WRITABLE;
         }
         if (!mon->lastErrno &&
             events & VIR_EVENT_HANDLE_READABLE) {
             int got = qemuMonitorIORead(mon);
             if (got < 0)
-                failed = 1;
+                error = true;
             /* Ignore hangup/error events if we read some data, to
              * give time for that data to be consumed */
             if (got > 0) {
                 events = 0;
 
                 if (qemuMonitorIOProcess(mon) < 0)
-                    failed = 1;
+                    error = true;
             } else
                 events &= ~VIR_EVENT_HANDLE_READABLE;
         }
@@ -572,36 +573,44 @@ qemuMonitorIO(int watch, int fd, int events, void *opaque) {
                 mon->msg->lastErrno = EIO;
                 virCondSignal(&mon->notify);
             }
-            quit = 1;
+            eof = 1;
         } else if (events) {
             VIR_ERROR(_("unhandled fd event %d for monitor fd %d"),
                       events, mon->fd);
-            failed = 1;
+            error = 1;
         }
     }
 
+    if (eof || error)
+        mon->lastErrno = EIO;
+
+    qemuMonitorUpdateWatch(mon);
+
     /* We have to unlock to avoid deadlock against command thread,
      * but is this safe ?  I think it is, because the callback
      * will try to acquire the virDomainObjPtr mutex next */
-    if (failed || quit) {
-        void (*eofNotify)(qemuMonitorPtr, virDomainObjPtr, int)
+    if (eof) {
+        void (*eofNotify)(qemuMonitorPtr, virDomainObjPtr)
             = mon->cb->eofNotify;
         virDomainObjPtr vm = mon->vm;
 
-        /* If qemu quited unexpectedly, and we may try to send monitor
-         * command later. But we have no chance to wake up it. So set
-         * mon->lastErrno to EIO, and check it before sending monitor
-         * command.
-         */
-        if (!mon->lastErrno)
-            mon->lastErrno = EIO;
+        /* Make sure anyone waiting wakes up now */
+        virCondSignal(&mon->notify);
+        if (qemuMonitorUnref(mon) > 0)
+            qemuMonitorUnlock(mon);
+        VIR_DEBUG0("Triggering EOF callback");
+        (eofNotify)(mon, vm);
+    } else if (error) {
+        void (*errorNotify)(qemuMonitorPtr, virDomainObjPtr)
+            = mon->cb->errorNotify;
+        virDomainObjPtr vm = mon->vm;
 
         /* Make sure anyone waiting wakes up now */
         virCondSignal(&mon->notify);
         if (qemuMonitorUnref(mon) > 0)
             qemuMonitorUnlock(mon);
-        VIR_DEBUG("Triggering EOF callback error? %d", failed);
-        (eofNotify)(mon, vm, failed);
+        VIR_DEBUG0("Triggering error callback");
+        (errorNotify)(mon, vm);
     } else {
         if (qemuMonitorUnref(mon) > 0)
             qemuMonitorUnlock(mon);
diff --git a/src/qemu/qemu_monitor.h b/src/qemu/qemu_monitor.h
index b84e230..47e0f4f 100644
--- a/src/qemu/qemu_monitor.h
+++ b/src/qemu/qemu_monitor.h
@@ -67,9 +67,10 @@ struct _qemuMonitorCallbacks {
                     virDomainObjPtr vm);
 
     void (*eofNotify)(qemuMonitorPtr mon,
-                      virDomainObjPtr vm,
-                      int withError);
-    /* XXX we'd really like to avoid virCOnnectPtr here
+                      virDomainObjPtr vm);
+    void (*errorNotify)(qemuMonitorPtr mon,
+                        virDomainObjPtr vm);
+    /* XXX we'd really like to avoid virConnectPtr here
      * It is required so the callback can find the active
      * secret driver. Need to change this to work like the
      * security drivers do, to avoid this
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index de728a2..4293f59 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -100,12 +100,13 @@ extern struct qemud_driver *qemu_driver;
  */
 static void
 qemuProcessHandleMonitorEOF(qemuMonitorPtr mon ATTRIBUTE_UNUSED,
-                            virDomainObjPtr vm,
-                            int hasError)
+                            virDomainObjPtr vm)
 {
     struct qemud_driver *driver = qemu_driver;
     virDomainEventPtr event = NULL;
     qemuDomainObjPrivatePtr priv;
+    int eventReason = VIR_DOMAIN_EVENT_STOPPED_SHUTDOWN;
+    const char *auditReason = "shutdown";
 
     VIR_DEBUG("Received EOF on %p '%s'", vm, vm->def->name);
 
@@ -120,20 +121,18 @@ qemuProcessHandleMonitorEOF(qemuMonitorPtr mon ATTRIBUTE_UNUSED,
     }
 
     priv = vm->privateData;
-    if (!hasError && priv->monJSON && !priv->gotShutdown) {
+    if (priv->monJSON && !priv->gotShutdown) {
         VIR_DEBUG("Monitor connection to '%s' closed without SHUTDOWN event; "
                   "assuming the domain crashed", vm->def->name);
-        hasError = 1;
+        eventReason = VIR_DOMAIN_EVENT_STOPPED_FAILED;
+        auditReason = "failed";
     }
 
     event = virDomainEventNewFromObj(vm,
-                                     VIR_DOMAIN_EVENT_STOPPED,
-                                     hasError ?
-                                     VIR_DOMAIN_EVENT_STOPPED_FAILED :
-                                     VIR_DOMAIN_EVENT_STOPPED_SHUTDOWN);
+                                     VIR_DOMAIN_EVENT_STOPPED, eventReason);
 
     qemuProcessStop(driver, vm, 0);
-    qemuAuditDomainStop(vm, hasError ? "failed" : "shutdown");
+    qemuAuditDomainStop(vm, auditReason);
 
     if (!vm->persistent)
         virDomainRemoveInactive(&driver->domains, vm);
@@ -147,6 +146,34 @@ qemuProcessHandleMonitorEOF(qemuMonitorPtr mon ATTRIBUTE_UNUSED,
 }
 
 
+/*
+ * This is invoked when there is some kind of error
+ * parsing data to/from the monitor. The VM can continue
+ * to run, but no further monitor commands will be
+ * allowed
+ */
+static void
+qemuProcessHandleMonitorError(qemuMonitorPtr mon ATTRIBUTE_UNUSED,
+                              virDomainObjPtr vm)
+{
+    struct qemud_driver *driver = qemu_driver;
+    virDomainEventPtr event = NULL;
+
+    VIR_DEBUG("Received error on %p '%s'", vm, vm->def->name);
+
+    qemuDriverLock(driver);
+    virDomainObjLock(vm);
+
+    event = virDomainEventVMMErrorNewFromObj(vm,
+                                             VIR_DOMAIN_EVENT_VMM_ERROR_CONTROL);
+    if (event)
+        qemuDomainEventQueue(driver, event);
+
+    virDomainObjUnlock(vm);
+    qemuDriverUnlock(driver);
+}
+
+
 static virDomainDiskDefPtr
 qemuProcessFindDomainDiskByPath(virDomainObjPtr vm,
                                 const char *path)
@@ -623,6 +650,7 @@ static void qemuProcessHandleMonitorDestroy(qemuMonitorPtr mon,
 static qemuMonitorCallbacks monitorCallbacks = {
     .destroy = qemuProcessHandleMonitorDestroy,
     .eofNotify = qemuProcessHandleMonitorEOF,
+    .errorNotify = qemuProcessHandleMonitorError,
     .diskSecretLookup = qemuProcessFindVolumeQcowPassphrase,
     .domainShutdown = qemuProcessHandleShutdown,
     .domainStop = qemuProcessHandleStop,
-- 
1.7.4.4

    

[libvirt] [PATCH 2/2] Don't kill QEMU process when a monitor I/O parsing error occurs

Daniel P. Berrange