[libvirt] [PATCHv2 0/4] Improve passthrough of early errors from qemu log

Version 2 fixed according to review feedback of Jan. Peter Krempa (4): qemu_process: Make qemuProcessReadLog() more versatile and reusable qemu: monitor: Add infrastructure to access VM logs for better err msgs qemu: monitor: Produce better errors on monitor hangup qemu: Wire up better early error reporting src/qemu/qemu_monitor.c | 93 +++++++++++++++++++++++++++++++++++++++++++++---- src/qemu/qemu_monitor.h | 2 ++ src/qemu/qemu_process.c | 54 +++++++++++++++++++--------- src/qemu/qemu_process.h | 2 ++ 4 files changed, 129 insertions(+), 22 deletions(-) -- 1.8.3.2

Teach the function to skip character device definitions printed by qemu at startup in addition to libvirt log messages and make it usable from outside of qemu_process.c. Also add documentation about the func. --- src/qemu/qemu_process.c | 24 +++++++++++++++++++----- src/qemu/qemu_process.h | 2 ++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index dd16f6c..846cee6 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -1450,8 +1450,20 @@ error: return ret; } -static int -qemuProcessReadLog(int fd, char *buf, int buflen, int off) + +/** + * qemuProcessReadLog: Read log file of a qemu VM + * @fd: File descriptor of the log file + * @buf: buffer to store the read messages + * @buflen: allocated space available in @buf + * @off: Offset to start reading from + * @skipchar: Skip messages about created character devices + * + * Reads log of a qemu VM. Skips messages not produced by qemu or irrelevant + * messages. Returns length of the message stored in @buf, or -1 on error. + */ +int +qemuProcessReadLog(int fd, char *buf, int buflen, int off, bool skipchar) { char *filter_next = buf; ssize_t bytes; @@ -1471,7 +1483,9 @@ qemuProcessReadLog(int fd, char *buf, int buflen, int off) /* Filter out debug messages from intermediate libvirt process */ while ((eol = strchr(filter_next, '\n'))) { *eol = '\0'; - if (virLogProbablyLogMessage(filter_next)) { + if (virLogProbablyLogMessage(filter_next) || + (skipchar && + STRPREFIX(filter_next, "char device redirected to"))) { memmove(filter_next, eol + 1, off - (eol - buf)); off -= eol + 1 - filter_next; } else { @@ -1514,7 +1528,7 @@ qemuProcessReadLogOutput(virDomainObjPtr vm, isdead = kill(vm->pid, 0) == -1 && errno == ESRCH; - got = qemuProcessReadLog(fd, buf, buflen, got); + got = qemuProcessReadLog(fd, buf, buflen, got, false); if (got < 0) { virReportSystemError(errno, _("Failure while reading %s log output"), @@ -1832,7 +1846,7 @@ cleanup: } len = strlen(buf); - qemuProcessReadLog(logfd, buf + len, buf_size - len - 1, 0); + qemuProcessReadLog(logfd, buf + len, buf_size - len - 1, 0, true); virReportError(VIR_ERR_INTERNAL_ERROR, _("process exited while connecting to monitor: %s"), buf); diff --git a/src/qemu/qemu_process.h b/src/qemu/qemu_process.h index 405c73d..4176815 100644 --- a/src/qemu/qemu_process.h +++ b/src/qemu/qemu_process.h @@ -101,4 +101,6 @@ bool qemuProcessAutoDestroyActive(virQEMUDriverPtr driver, virBitmapPtr qemuPrepareCpumap(virQEMUDriverPtr driver, virBitmapPtr nodemask); +int qemuProcessReadLog(int fd, char *buf, int buflen, int off, bool skipchar); + #endif /* __QEMU_PROCESS_H__ */ -- 1.8.3.2

Early VM startup errors usually produce a better error message in the machine log file. Currently we were accessing it only when the process exited during certain phases of startup. This will help adding a more comprehensive error extraction for early qemu startup phases. This patch adds infrastructure to keep a file descriptor for the machine log file that will be used in case an error happens. --- Notes: Version 2: - no changes, just explained motivation in reply to previous version src/qemu/qemu_monitor.c | 27 +++++++++++++++++++++++++++ src/qemu/qemu_monitor.h | 2 ++ 2 files changed, 29 insertions(+) diff --git a/src/qemu/qemu_monitor.c b/src/qemu/qemu_monitor.c index ecec8e2..c3701fe 100644 --- a/src/qemu/qemu_monitor.c +++ b/src/qemu/qemu_monitor.c @@ -88,6 +88,9 @@ struct _qemuMonitor { /* If found, path to the virtio memballoon driver */ char *balloonpath; bool ballooninit; + + /* Log file fd of the qemu process to dig for usable info */ + int logfd; }; static virClassPtr qemuMonitorClass; @@ -254,6 +257,7 @@ static void qemuMonitorDispose(void *obj) VIR_FREE(mon->buffer); virJSONValueFree(mon->options); VIR_FREE(mon->balloonpath); + VIR_FORCE_CLOSE(mon->logfd); } @@ -715,6 +719,7 @@ qemuMonitorOpenInternal(virDomainObjPtr vm, return NULL; mon->fd = -1; + mon->logfd = -1; if (virCondInit(&mon->notify) < 0) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("cannot initialize monitor condition")); @@ -3843,3 +3848,25 @@ qemuMonitorGetDeviceAliases(qemuMonitorPtr mon, return qemuMonitorJSONGetDeviceAliases(mon, aliases); } + + +/** + * qemuMonitorSetDomainLog: + * Set the file descriptor of the open VM log file to report potential + * early startup errors of qemu. + * + * @mon: Monitor object to set the log file reading on + * @logfd: File descriptor of the already open log file + */ +int +qemuMonitorSetDomainLog(qemuMonitorPtr mon, int logfd) +{ + VIR_FORCE_CLOSE(mon->logfd); + if (logfd >= 0 && + (mon->logfd = dup(logfd)) < 0) { + virReportSystemError(errno, "%s", _("failed to duplicate log fd")); + return -1; + } + + return 0; +} diff --git a/src/qemu/qemu_monitor.h b/src/qemu/qemu_monitor.h index 5fe33db..06ba7e8 100644 --- a/src/qemu/qemu_monitor.h +++ b/src/qemu/qemu_monitor.h @@ -761,6 +761,8 @@ int qemuMonitorDetachCharDev(qemuMonitorPtr mon, int qemuMonitorGetDeviceAliases(qemuMonitorPtr mon, char ***aliases); +int qemuMonitorSetDomainLog(qemuMonitorPtr mon, int logfd); + /** * When running two dd process and using <> redirection, we need a * shell that will not truncate files. These two strings serve that -- 1.8.3.2

Change the monitor error code to add the ability to access the qemu log file using a file descriptor so that we can dig in it for a more useful error message. The error is now logged on monitor hangups and overwrites a possible lesser error. A hangup on the monitor usualy means that qemu has crashed and there's a significant chance it produced a useful error message. The functionality will be latent until the next patch. --- Notes: Version 2: - fixed typos - avoided unnecessary reset of error - removed unneeded strlen to check returned log string src/qemu/qemu_monitor.c | 66 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 60 insertions(+), 6 deletions(-) diff --git a/src/qemu/qemu_monitor.c b/src/qemu/qemu_monitor.c index c3701fe..a601ee0 100644 --- a/src/qemu/qemu_monitor.c +++ b/src/qemu/qemu_monitor.c @@ -32,6 +32,8 @@ #include "qemu_monitor.h" #include "qemu_monitor_text.h" #include "qemu_monitor_json.h" +#include "qemu_domain.h" +#include "qemu_process.h" #include "virerror.h" #include "viralloc.h" #include "virlog.h" @@ -331,6 +333,35 @@ qemuMonitorOpenPty(const char *monitor) } +/* Get a possible error from qemu's log. This function closes the + * corresponding log fd */ +static char * +qemuMonitorGetErrorFromLog(qemuMonitorPtr mon) +{ + int len; + char *logbuf = NULL; + int orig_errno = errno; + + if (mon->logfd < 0) + return NULL; + + if (VIR_ALLOC_N_QUIET(logbuf, 4096) < 0) + goto error; + + if ((len = qemuProcessReadLog(mon->logfd, logbuf, 4096 - 1, 0, true)) <= 0) + goto error; + +cleanup: + errno = orig_errno; + VIR_FORCE_CLOSE(mon->logfd); + return logbuf; + +error: + VIR_FREE(logbuf); + goto cleanup; +} + + /* This method processes data that has been received * from the monitor. Looking for async events and * replies/errors. @@ -564,6 +595,7 @@ qemuMonitorIO(int watch, int fd, int events, void *opaque) { qemuMonitorPtr mon = opaque; bool error = false; bool eof = false; + bool hangup = false; virObjectRef(mon); @@ -614,12 +646,14 @@ qemuMonitorIO(int watch, int fd, int events, void *opaque) { } } - if (!error && - events & VIR_EVENT_HANDLE_HANGUP) { - virReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("End of file from monitor")); - eof = true; - events &= ~VIR_EVENT_HANDLE_HANGUP; + if (events & VIR_EVENT_HANDLE_HANGUP) { + hangup = true; + if (!error) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("End of file from monitor")); + eof = true; + events &= ~VIR_EVENT_HANDLE_HANGUP; + } } if (!error && !eof && @@ -638,6 +672,26 @@ qemuMonitorIO(int watch, int fd, int events, void *opaque) { } if (error || eof) { + if (hangup) { + /* Check if an error message from qemu is available and if so, use + * it to overwrite the actual message. It's done only in early + * startup phases where the message from qemu is certainly more + * interesting than a "connection reset by peer" message. + */ + char *qemuMessage; + + if ((qemuMessage = qemuMonitorGetErrorFromLog(mon))) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("early end of file from monitor: " + "possible problem:\n%s"), + qemuMessage); + virCopyLastError(&mon->lastError); + virResetLastError(); + } + + VIR_FREE(qemuMessage); + } + if (mon->lastError.code != VIR_ERR_OK) { /* Already have an error, so clear any new error */ virResetLastError(); -- 1.8.3.2

The previous patches added infrastructure to report better errors from monitor in some cases. This patch finalizes this "feature" by enabling this enhanced error reporting on early phases of VM startup. In these phases the possibility of qemu producing a useful error message is really high compared to running it during the whole life cycle. After the start up is complete, the feature is disabled to provide the usual error messages so that users are not confused by possibly irrelevant messages that may be in the domain log. The original motivation to do this enhancement is to capture errors when using VFIO device passthrough, where qemu reports errors after the monitor is initialized and the existing error catching code couldn't catch this producing a unhelpful message: # virsh start test error: Failed to start domain test error: Unable to read from monitor: Connection reset by peer With this change, the message is changed to: # virsh start test error: Failed to start domain test error: internal error: early end of file from monitor: possible problem: qemu-system-x86_64: -device vfio-pci,host=00:1a.0,id=hostdev0,bus=pci.0,addr=0x5: vfio: error, group 8 is not viable, please ensure all devices within the iommu_group are bound to their vfio bus driver. qemu-system-x86_64: -device vfio-pci,host=00:1a.0,id=hostdev0,bus=pci.0,addr=0x5: vfio: failed to get group 8 qemu-system-x86_64: -device vfio-pci,host=00:1a.0,id=hostdev0,bus=pci.0,addr=0x5: Device 'vfio-pci' could not be initialized --- Notes: Version 2: - don't leak mon in case setting of the logfd fails, -ignore errors if setting of the logfd fails src/qemu/qemu_process.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 846cee6..2b9bfe3 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -1389,7 +1389,7 @@ static qemuMonitorCallbacks monitorCallbacks = { }; static int -qemuConnectMonitor(virQEMUDriverPtr driver, virDomainObjPtr vm) +qemuConnectMonitor(virQEMUDriverPtr driver, virDomainObjPtr vm, int logfd) { qemuDomainObjPrivatePtr priv = vm->privateData; int ret = -1; @@ -1415,6 +1415,9 @@ qemuConnectMonitor(virQEMUDriverPtr driver, virDomainObjPtr vm) &monitorCallbacks, driver); + if (mon) + ignore_value(qemuMonitorSetDomainLog(mon, logfd)); + virObjectLock(vm); priv->monStart = 0; @@ -1794,11 +1797,11 @@ qemuProcessWaitForMonitor(virQEMUDriverPtr driver, virHashTablePtr paths = NULL; qemuDomainObjPrivatePtr priv; - if (!virQEMUCapsUsedQMP(qemuCaps) - && pos != -1) { - if ((logfd = qemuDomainOpenLog(driver, vm, pos)) < 0) - return -1; + if (pos != -1 && + (logfd = qemuDomainOpenLog(driver, vm, pos)) < 0) + return -1; + if (logfd != -1 && !virQEMUCapsUsedQMP(qemuCaps)) { if (VIR_ALLOC_N(buf, buf_size) < 0) goto closelog; @@ -1809,9 +1812,8 @@ qemuProcessWaitForMonitor(virQEMUDriverPtr driver, } VIR_DEBUG("Connect monitor to %p '%s'", vm, vm->def->name); - if (qemuConnectMonitor(driver, vm) < 0) { + if (qemuConnectMonitor(driver, vm, logfd) < 0) goto cleanup; - } /* Try to get the pty path mappings again via the monitor. This is much more * reliable if it's available. @@ -1838,14 +1840,15 @@ cleanup: /* VM is dead, any other error raised in the interim is probably * not as important as the qemu cmdline output */ if (virQEMUCapsUsedQMP(qemuCaps)) { - if ((logfd = qemuDomainOpenLog(driver, vm, pos)) < 0) - return -1; - if (VIR_ALLOC_N(buf, buf_size) < 0) goto closelog; } len = strlen(buf); + /* best effor seek - we need to reset to the original position, so that + * a possible read of the fd in the monitor code doesn't influence this + * error delivery option */ + lseek(logfd, pos, SEEK_SET); qemuProcessReadLog(logfd, buf + len, buf_size - len - 1, 0, true); virReportError(VIR_ERR_INTERNAL_ERROR, _("process exited while connecting to monitor: %s"), @@ -3080,7 +3083,7 @@ qemuProcessReconnect(void *opaque) virObjectRef(obj); /* XXX check PID liveliness & EXE path */ - if (qemuConnectMonitor(driver, obj) < 0) + if (qemuConnectMonitor(driver, obj, -1) < 0) goto error; /* Failure to connect to agent shouldn't be fatal */ @@ -4047,6 +4050,9 @@ int qemuProcessStart(virConnectPtr conn, goto cleanup; } + /* unset reporting errors from qemu log */ + qemuMonitorSetDomainLog(priv->mon, -1); + virCommandFree(cmd); VIR_FORCE_CLOSE(logfile); virObjectUnref(cfg); @@ -4062,6 +4068,8 @@ cleanup: virBitmapFree(nodemask); virCommandFree(cmd); VIR_FORCE_CLOSE(logfile); + if (priv->mon) + qemuMonitorSetDomainLog(priv->mon, -1); qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED, stop_flags); virObjectUnref(cfg); virObjectUnref(caps); -- 1.8.3.2

On 09/24/2013 11:14 AM, Peter Krempa wrote: ...
src/qemu/qemu_process.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-)
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 846cee6..2b9bfe3 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -1389,7 +1389,7 @@ static qemuMonitorCallbacks monitorCallbacks = { };
static int -qemuConnectMonitor(virQEMUDriverPtr driver, virDomainObjPtr vm) +qemuConnectMonitor(virQEMUDriverPtr driver, virDomainObjPtr vm, int logfd) { qemuDomainObjPrivatePtr priv = vm->privateData; int ret = -1; @@ -1415,6 +1415,9 @@ qemuConnectMonitor(virQEMUDriverPtr driver, virDomainObjPtr vm) &monitorCallbacks, driver);
+ if (mon) + ignore_value(qemuMonitorSetDomainLog(mon, logfd)); + virObjectLock(vm); priv->monStart = 0;
@@ -1794,11 +1797,11 @@ qemuProcessWaitForMonitor(virQEMUDriverPtr driver, virHashTablePtr paths = NULL; qemuDomainObjPrivatePtr priv;
- if (!virQEMUCapsUsedQMP(qemuCaps) - && pos != -1) { - if ((logfd = qemuDomainOpenLog(driver, vm, pos)) < 0) - return -1; + if (pos != -1 && + (logfd = qemuDomainOpenLog(driver, vm, pos)) < 0) + return -1;
+ if (logfd != -1 && !virQEMUCapsUsedQMP(qemuCaps)) { if (VIR_ALLOC_N(buf, buf_size) < 0) goto closelog;
@@ -1809,9 +1812,8 @@ qemuProcessWaitForMonitor(virQEMUDriverPtr driver, }
VIR_DEBUG("Connect monitor to %p '%s'", vm, vm->def->name); - if (qemuConnectMonitor(driver, vm) < 0) { + if (qemuConnectMonitor(driver, vm, logfd) < 0) goto cleanup; - }
/* Try to get the pty path mappings again via the monitor. This is much more * reliable if it's available. @@ -1838,14 +1840,15 @@ cleanup: /* VM is dead, any other error raised in the interim is probably * not as important as the qemu cmdline output */ if (virQEMUCapsUsedQMP(qemuCaps)) { - if ((logfd = qemuDomainOpenLog(driver, vm, pos)) < 0) - return -1; - if (VIR_ALLOC_N(buf, buf_size) < 0) goto closelog; }
len = strlen(buf); + /* best effor seek - we need to reset to the original position, so that + * a possible read of the fd in the monitor code doesn't influence this + * error delivery option */ + lseek(logfd, pos, SEEK_SET);
Coverity complains: 1850 * error delivery option */ (18) Event check_return: Calling function "lseek(logfd, pos, 0)" without checking return value. This library function may fail and return an error code. (19) Event unchecked_value: No check of the return value of "lseek(logfd, pos, 0)". 1851 lseek(logfd, pos, SEEK_SET); John
qemuProcessReadLog(logfd, buf + len, buf_size - len - 1, 0, true); virReportError(VIR_ERR_INTERNAL_ERROR, _("process exited while connecting to monitor: %s"), @@ -3080,7 +3083,7 @@ qemuProcessReconnect(void *opaque) virObjectRef(obj);
/* XXX check PID liveliness & EXE path */ - if (qemuConnectMonitor(driver, obj) < 0) + if (qemuConnectMonitor(driver, obj, -1) < 0) goto error;
/* Failure to connect to agent shouldn't be fatal */ @@ -4047,6 +4050,9 @@ int qemuProcessStart(virConnectPtr conn, goto cleanup; }
+ /* unset reporting errors from qemu log */ + qemuMonitorSetDomainLog(priv->mon, -1); + virCommandFree(cmd); VIR_FORCE_CLOSE(logfile); virObjectUnref(cfg); @@ -4062,6 +4068,8 @@ cleanup: virBitmapFree(nodemask); virCommandFree(cmd); VIR_FORCE_CLOSE(logfile); + if (priv->mon) + qemuMonitorSetDomainLog(priv->mon, -1); qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED, stop_flags); virObjectUnref(cfg); virObjectUnref(caps);

On 09/24/2013 05:14 PM, Peter Krempa wrote:
Version 2 fixed according to review feedback of Jan.
Peter Krempa (4): qemu_process: Make qemuProcessReadLog() more versatile and reusable qemu: monitor: Add infrastructure to access VM logs for better err msgs qemu: monitor: Produce better errors on monitor hangup qemu: Wire up better early error reporting
src/qemu/qemu_monitor.c | 93 +++++++++++++++++++++++++++++++++++++++++++++---- src/qemu/qemu_monitor.h | 2 ++ src/qemu/qemu_process.c | 54 +++++++++++++++++++--------- src/qemu/qemu_process.h | 2 ++ 4 files changed, 129 insertions(+), 22 deletions(-)
ACK

On 09/25/13 12:45, Ján Tomko wrote:
On 09/24/2013 05:14 PM, Peter Krempa wrote:
Version 2 fixed according to review feedback of Jan.
Peter Krempa (4): qemu_process: Make qemuProcessReadLog() more versatile and reusable qemu: monitor: Add infrastructure to access VM logs for better err msgs qemu: monitor: Produce better errors on monitor hangup qemu: Wire up better early error reporting
src/qemu/qemu_monitor.c | 93 +++++++++++++++++++++++++++++++++++++++++++++---- src/qemu/qemu_monitor.h | 2 ++ src/qemu/qemu_process.c | 54 +++++++++++++++++++--------- src/qemu/qemu_process.h | 2 ++ 4 files changed, 129 insertions(+), 22 deletions(-)
ACK
Pushed; Thanks for the review. Peter
participants (3)
-
John Ferlan
-
Ján Tomko
-
Peter Krempa