[libvirt] [PATCH 0/2] qemu: Improve virDomainGetControlInfo

Peter Krempa (2): qemu: Properly report error state in qemuDomainGetControlInfo() qemu: Allow inactive domains in qemuDomainGetControlInfo() include/libvirt/libvirt-domain.h | 27 +++++++++++++++++++++++++-- src/qemu/qemu_driver.c | 21 +++++++++++++-------- tools/virsh-domain-monitor.c | 19 +++++++++++++++++++ 3 files changed, 57 insertions(+), 10 deletions(-) -- 2.2.2

Previously when a domain would get stuck in a domain job due to a programming mistake we'd report the following control state: $ virsh domcontrol domain occupied (1424343406.150s) The timestamp is invalid as the monitor was not entered for that domain. We can use that to detect that the domain has an active job and report a better error instead: $ virsh domcontrol domain error: internal (locking) error --- include/libvirt/libvirt-domain.h | 27 +++++++++++++++++++++++++-- src/qemu/qemu_driver.c | 15 +++++++++++++-- tools/virsh-domain-monitor.c | 19 +++++++++++++++++++ 3 files changed, 57 insertions(+), 4 deletions(-) diff --git a/include/libvirt/libvirt-domain.h b/include/libvirt/libvirt-domain.h index 4dbd7f5..a9d3efd 100644 --- a/include/libvirt/libvirt-domain.h +++ b/include/libvirt/libvirt-domain.h @@ -182,7 +182,9 @@ typedef enum { monitored by virDomainGetJobInfo); only limited set of commands may be allowed */ VIR_DOMAIN_CONTROL_OCCUPIED = 2, /* occupied by a running command */ - VIR_DOMAIN_CONTROL_ERROR = 3, /* unusable, domain cannot be fully operated */ + VIR_DOMAIN_CONTROL_ERROR = 3, /* unusable, domain cannot be fully + operated, possible reason is provided + in the details field */ # ifdef VIR_ENUM_SENTINELS VIR_DOMAIN_CONTROL_LAST @@ -190,6 +192,26 @@ typedef enum { } virDomainControlState; /** + * virDomainControlErrorReason: + * + * Reason for the error state. + */ +typedef enum { + VIR_DOMAIN_CONTROL_ERROR_REASON_NONE = 0, /* server didn't provide a + reason */ + VIR_DOMAIN_CONTROL_ERROR_REASON_UNKNOWN = 1, /* unknown reason for the + error */ + VIR_DOMAIN_CONTROL_ERROR_REASON_MONITOR = 2, /* monitor connection is + broken */ + VIR_DOMAIN_CONTROL_ERROR_REASON_INTERNAL = 3, /* error caused due to + internal failure in libvirt + */ +# ifdef VIR_ENUM_SENTINELS + VIR_DOMAIN_CONTROL_ERROR_REASON_LAST +# endif +} virDomainControlErrorReason; + +/** * virDomainControlInfo: * * Structure filled in by virDomainGetControlInfo and providing details about @@ -198,7 +220,8 @@ typedef enum { typedef struct _virDomainControlInfo virDomainControlInfo; struct _virDomainControlInfo { unsigned int state; /* control state, one of virDomainControlState */ - unsigned int details; /* state details, currently 0 */ + unsigned int details; /* state details, currently 0 except for ERROR + state (one of virDomainControlErrorReason) */ unsigned long long stateTime; /* for how long (in msec) control interface has been in current state (except for OK and ERROR states) */ diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 8f0cf2b..dda9ab7 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -2690,6 +2690,7 @@ qemuDomainGetControlInfo(virDomainPtr dom, if (priv->monError) { info->state = VIR_DOMAIN_CONTROL_ERROR; + info->details = VIR_DOMAIN_CONTROL_ERROR_REASON_MONITOR; } else if (priv->job.active) { if (virTimeMillisNow(&info->stateTime) < 0) goto cleanup; @@ -2697,8 +2698,18 @@ qemuDomainGetControlInfo(virDomainPtr dom, info->state = VIR_DOMAIN_CONTROL_JOB; info->stateTime -= priv->job.current->started; } else { - info->state = VIR_DOMAIN_CONTROL_OCCUPIED; - info->stateTime -= priv->monStart; + if (priv->monStart > 0) { + info->state = VIR_DOMAIN_CONTROL_OCCUPIED; + info->stateTime -= priv->monStart; + } else { + /* At this point the domain has an active job, but monitor was + * not entered and the domain object lock is not held thus we + * are stuck in the job forever due to a programming error. + */ + info->state = VIR_DOMAIN_CONTROL_ERROR; + info->details = VIR_DOMAIN_CONTROL_ERROR_REASON_INTERNAL; + info->stateTime = 0; + } } } else { info->state = VIR_DOMAIN_CONTROL_OK; diff --git a/tools/virsh-domain-monitor.c b/tools/virsh-domain-monitor.c index 925eb1b..f774d30 100644 --- a/tools/virsh-domain-monitor.c +++ b/tools/virsh-domain-monitor.c @@ -128,6 +128,21 @@ vshDomainControlStateToString(int state) return str ? _(str) : _("unknown"); } +VIR_ENUM_DECL(vshDomainControlErrorReason) +VIR_ENUM_IMPL(vshDomainControlErrorReason, + VIR_DOMAIN_CONTROL_ERROR_REASON_LAST, + "", + N_("unknown"), + N_("monitor failure"), + N_("internal (locking) error")) + +static const char * +vshDomainControlErrorReasonToString(int reason) +{ + const char *ret = vshDomainControlErrorReasonTypeToString(reason); + return ret ? _(ret) : _("unknown"); +} + VIR_ENUM_DECL(vshDomainState) VIR_ENUM_IMPL(vshDomainState, VIR_DOMAIN_LAST, @@ -814,6 +829,10 @@ cmdDomControl(vshControl *ctl, const vshCmd *cmd) vshPrint(ctl, "%s (%0.3fs)\n", vshDomainControlStateToString(info.state), info.stateTime / 1000.0); + } else if (info.state == VIR_DOMAIN_CONTROL_ERROR && info.details > 0) { + vshPrint(ctl, "%s: %s\n", + vshDomainControlStateToString(info.state), + vshDomainControlErrorReasonToString(info.details)); } else { vshPrint(ctl, "%s\n", vshDomainControlStateToString(info.state)); -- 2.2.2

On Thu, Feb 19, 2015 at 12:02:27PM +0100, Peter Krempa wrote:
Previously when a domain would get stuck in a domain job due to a programming mistake we'd report the following control state:
$ virsh domcontrol domain occupied (1424343406.150s)
The timestamp is invalid as the monitor was not entered for that domain. We can use that to detect that the domain has an active job and report a better error instead:
$ virsh domcontrol domain error: internal (locking) error
I don't really think that is much better as it still doesn't give us any clue as to how/why we got into this broken state. I think it'd be desirable to modify our JobEnter APIs so that they get passed the __FUNC__ __FILE__ and __LINE__ of the code which starts the job. Then include this info when we report a job error. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On Thu, Feb 19, 2015 at 11:07:46 +0000, Daniel Berrange wrote:
On Thu, Feb 19, 2015 at 12:02:27PM +0100, Peter Krempa wrote:
Previously when a domain would get stuck in a domain job due to a programming mistake we'd report the following control state:
$ virsh domcontrol domain occupied (1424343406.150s)
The timestamp is invalid as the monitor was not entered for that domain. We can use that to detect that the domain has an active job and report a better error instead:
$ virsh domcontrol domain error: internal (locking) error
I don't really think that is much better as it still doesn't give us any clue as to how/why we got into this broken state.
Well, yes, it doesn't give a clue how we got into a broken state but fixes the output in case we are already in the broken state for the virDomainGetControlInfo API. In the example above you can see that the API returns state "occupied" which usually means that the domain is in a monitor call and the elapsed time is totally incorrect as it's calculated as CURRENT_TIME - monitorStartTime. In case the domain is _not_ in a monitor call, the monitorStartTime variable is set to 0 so we basically state that the domain is in a monitor call since start of the epoch.
I think it'd be desirable to modify our JobEnter APIs so that they get passed the __FUNC__ __FILE__ and __LINE__ of the code which starts the job. Then include this info when we report a job error.
I agree though that this is necessary. Jiri set out to improve the error reporting a while ago in a very similar manner but still didn't get to finish it AFAIK. Peter

On Thu, Feb 19, 2015 at 01:46:26PM +0100, Peter Krempa wrote:
On Thu, Feb 19, 2015 at 11:07:46 +0000, Daniel Berrange wrote:
On Thu, Feb 19, 2015 at 12:02:27PM +0100, Peter Krempa wrote:
Previously when a domain would get stuck in a domain job due to a programming mistake we'd report the following control state:
$ virsh domcontrol domain occupied (1424343406.150s)
The timestamp is invalid as the monitor was not entered for that domain. We can use that to detect that the domain has an active job and report a better error instead:
$ virsh domcontrol domain error: internal (locking) error
I don't really think that is much better as it still doesn't give us any clue as to how/why we got into this broken state.
Well, yes, it doesn't give a clue how we got into a broken state but fixes the output in case we are already in the broken state for the virDomainGetControlInfo API.
In the example above you can see that the API returns state "occupied" which usually means that the domain is in a monitor call and the elapsed time is totally incorrect as it's calculated as CURRENT_TIME - monitorStartTime. In case the domain is _not_ in a monitor call, the monitorStartTime variable is set to 0 so we basically state that the domain is in a monitor call since start of the epoch.
Ok, ack to it then Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

Inactive domains can still be stuck in a job or other problems. Add a way to detect it. --- src/qemu/qemu_driver.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index dda9ab7..484419e 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -2678,12 +2678,6 @@ qemuDomainGetControlInfo(virDomainPtr dom, if (virDomainGetControlInfoEnsureACL(dom->conn, vm->def) < 0) goto cleanup; - if (!virDomainObjIsActive(vm)) { - virReportError(VIR_ERR_OPERATION_INVALID, - "%s", _("domain is not running")); - goto cleanup; - } - priv = vm->privateData; memset(info, 0, sizeof(*info)); -- 2.2.2
participants (2)
-
Daniel P. Berrange
-
Peter Krempa