[libvirt] [PATCH] Improve error reporting in virsh

# HG changeset patch # User john.levon@sun.com # Date 1233791330 28800 # Node ID 6588879d8cc9a3af1147a5edd624aee5155493ae # Parent da162569b5f7e132c4ccbfc56fc3670fb5ecee10 Improve error reporting in virsh Rather than verbosely printing every error, save the last error and report that only if the entire command fails. Additionally, avoid over-writing an existing error in various places in the library. Signed-off-by: John Levon <john.levon@sun.com> diff --git a/include/libvirt/virterror.h b/include/libvirt/virterror.h --- a/include/libvirt/virterror.h +++ b/include/libvirt/virterror.h @@ -187,6 +187,8 @@ void virConnSetErrorFunc (virConnectPt virErrorFunc handler); int virConnCopyLastError (virConnectPtr conn, virErrorPtr to); + +virErrorPtr virCloneError (virErrorPtr from); #ifdef __cplusplus } #endif diff --git a/src/domain_conf.c b/src/domain_conf.c --- a/src/domain_conf.c +++ b/src/domain_conf.c @@ -2432,8 +2432,7 @@ catchXMLError (void *ctx, const char *ms if (ctxt) { virConnectPtr conn = ctxt->_private; - if (conn && - conn->err.code == VIR_ERR_NONE && + if (virGetLastError() == NULL && ctxt->lastError.level == XML_ERR_FATAL && ctxt->lastError.message != NULL) { virDomainReportError (conn, VIR_ERR_XML_DETAIL, @@ -2466,7 +2465,7 @@ virDomainDefPtr virDomainDefParseString( XML_PARSE_NOENT | XML_PARSE_NONET | XML_PARSE_NOWARNING); if (!xml) { - if (conn && conn->err.code == VIR_ERR_NONE) + if (virGetLastError() == NULL) virDomainReportError(conn, VIR_ERR_XML_ERROR, "%s", _("failed to parse xml document")); goto cleanup; @@ -2507,7 +2506,7 @@ virDomainDefPtr virDomainDefParseFile(vi XML_PARSE_NOENT | XML_PARSE_NONET | XML_PARSE_NOWARNING); if (!xml) { - if (conn && conn->err.code == VIR_ERR_NONE) + if (virGetLastError() == NULL) virDomainReportError(conn, VIR_ERR_XML_ERROR, "%s", _("failed to parse xml document")); goto cleanup; diff --git a/src/libvirt_public.syms b/src/libvirt_public.syms --- a/src/libvirt_public.syms +++ b/src/libvirt_public.syms @@ -244,6 +244,7 @@ LIBVIRT_0.6.0 { virStoragePoolRef; virStorageVolRef; virNodeDeviceRef; + virCloneError; } LIBVIRT_0.5.0; diff --git a/src/virsh.c b/src/virsh.c --- a/src/virsh.c +++ b/src/virsh.c @@ -93,22 +93,6 @@ typedef enum { } vshErrorLevel; /* - * The error handler for virsh - */ -static void -virshErrorHandler(void *unused, virErrorPtr error) -{ - if ((unused != NULL) || (error == NULL)) - return; - - /* Suppress the VIR_ERR_NO_XEN error which fails as non-root */ - if ((error->code == VIR_ERR_NO_XEN) || (error->code == VIR_ERR_OK)) - return; - - virDefaultErrorFunc(error); -} - -/* * virsh command line grammar: * * command_line = <command>\n | <command>; <command>; ... @@ -319,6 +303,41 @@ static int namesorter(const void *a, con const char **sb = (const char**)b; return strcasecmp(*sa, *sb); +} + +static virErrorPtr last_error; + +/* + * Quieten libvirt until we're done with the command. + */ +static void +virshErrorHandler(void *unused, virErrorPtr error) +{ + last_error = virCloneError(error); + if (getenv("VIRSH_DEBUG") != NULL) + virDefaultErrorFunc(error); +} + +/* + * Report an error when a command finishes. This is better than before + * (when correct operation would report errors), but it has some + * problems: we lose the smarter formatting of virDefaultErrorFunc(), + * and it can become harder to debug problems, if errors get reported + * twice during one command. This case shouldn't really happen anyway, + * and it's IMHO a bug that libvirt does that sometimes. + */ +static void +virshReportError(vshControl *ctl) +{ + if (last_error == NULL) + return; + + if (last_error->code == VIR_ERR_OK) { + vshError(ctl, TRUE, "%s", _("unknown error")); + return; + } + + vshError(ctl, TRUE, "%s", last_error->message); } @@ -6102,6 +6121,9 @@ vshCommandRun(vshControl *ctl, const vsh if (ctl->timing) GETTIMEOFDAY(&after); + + if (ret == FALSE) + virshReportError(ctl); if (STREQ(cmd->def->name, "quit")) /* hack ... */ return ret; diff --git a/src/virterror.c b/src/virterror.c --- a/src/virterror.c +++ b/src/virterror.c @@ -259,6 +259,32 @@ virGetLastError(void) } /** + * virCloneError: + * @from: source error to copy from + * + * Allocate a new error object and deep-copy it from the given error + * object. The destination error is reset before the copy. If NULL is + * passed, a new error object is allocated. + * + * Returns a pointer to the copied error or NULL if allocation failed. + */ +virErrorPtr +virCloneError(virErrorPtr from) +{ + virErrorPtr to; + + if (VIR_ALLOC(to) < 0) + return NULL; + + if (from) + virCopyError(from, to); + else + virResetError(to); + + return to; +} + +/** * virCopyLastError: * @to: target to receive the copy * diff --git a/src/xend_internal.c b/src/xend_internal.c --- a/src/xend_internal.c +++ b/src/xend_internal.c @@ -105,12 +105,16 @@ virDomainDevID(virDomainPtr domain, int devid_len); #endif -#define virXendError(conn, code, fmt...) \ - virReportErrorHelper(conn, VIR_FROM_XEND, code, __FILE__, \ - __FUNCTION__, __LINE__, fmt) - +#define virXendError(conn, codeval, fmt...) \ + do { \ + if (virGetLastError() == NULL) { \ + virReportErrorHelper(conn, VIR_FROM_XEND, codeval, __FILE__, \ + __FUNCTION__, __LINE__, fmt); \ + } \ + } while (0) + #define virXendErrorInt(conn, code, ival) \ - virXendError(conn, code, "%d", ival) + virXendError(conn, code, "%d", ival) /** * do_connect:

On Wed, Feb 04, 2009 at 03:56:26PM -0800, john.levon@sun.com wrote:
# HG changeset patch # User john.levon@sun.com # Date 1233791330 28800 # Node ID 6588879d8cc9a3af1147a5edd624aee5155493ae # Parent da162569b5f7e132c4ccbfc56fc3670fb5ecee10 Improve error reporting in virsh
Rather than verbosely printing every error, save the last error and report that only if the entire command fails. Additionally, avoid over-writing an existing error in various places in the library.
Signed-off-by: John Levon <john.levon@sun.com>
diff --git a/src/domain_conf.c b/src/domain_conf.c --- a/src/domain_conf.c +++ b/src/domain_conf.c @@ -2432,8 +2432,7 @@ catchXMLError (void *ctx, const char *ms if (ctxt) { virConnectPtr conn = ctxt->_private;
- if (conn && - conn->err.code == VIR_ERR_NONE && + if (virGetLastError() == NULL && ctxt->lastError.level == XML_ERR_FATAL && ctxt->lastError.message != NULL) { virDomainReportError (conn, VIR_ERR_XML_DETAIL, @@ -2466,7 +2465,7 @@ virDomainDefPtr virDomainDefParseString( XML_PARSE_NOENT | XML_PARSE_NONET | XML_PARSE_NOWARNING); if (!xml) { - if (conn && conn->err.code == VIR_ERR_NONE) + if (virGetLastError() == NULL) virDomainReportError(conn, VIR_ERR_XML_ERROR, "%s", _("failed to parse xml document")); goto cleanup; @@ -2507,7 +2506,7 @@ virDomainDefPtr virDomainDefParseFile(vi XML_PARSE_NOENT | XML_PARSE_NONET | XML_PARSE_NOWARNING); if (!xml) { - if (conn && conn->err.code == VIR_ERR_NONE) + if (virGetLastError() == NULL) virDomainReportError(conn, VIR_ERR_XML_ERROR, "%s", _("failed to parse xml document")); goto cleanup;
ACK to those bits.
@@ -319,6 +303,41 @@ static int namesorter(const void *a, con const char **sb = (const char**)b;
return strcasecmp(*sa, *sb); +} + +static virErrorPtr last_error; + +/* + * Quieten libvirt until we're done with the command. + */ +static void +virshErrorHandler(void *unused, virErrorPtr error) +{ + last_error = virCloneError(error); + if (getenv("VIRSH_DEBUG") != NULL) + virDefaultErrorFunc(error); +} + +/* + * Report an error when a command finishes. This is better than before + * (when correct operation would report errors), but it has some + * problems: we lose the smarter formatting of virDefaultErrorFunc(), + * and it can become harder to debug problems, if errors get reported + * twice during one command. This case shouldn't really happen anyway, + * and it's IMHO a bug that libvirt does that sometimes. + */ +static void +virshReportError(vshControl *ctl) +{ + if (last_error == NULL) + return; + + if (last_error->code == VIR_ERR_OK) { + vshError(ctl, TRUE, "%s", _("unknown error")); + return; + } + + vshError(ctl, TRUE, "%s", last_error->message); }
Since you only ever print out the 'last_error->message' field here, I think its better to just do strdup() of that field, instead of adding a new virCloneError API. Also, it is neccessary to free the error message/object after reporting it to avoid a memory leak.
diff --git a/src/xend_internal.c b/src/xend_internal.c --- a/src/xend_internal.c +++ b/src/xend_internal.c @@ -105,12 +105,16 @@ virDomainDevID(virDomainPtr domain, int devid_len); #endif
-#define virXendError(conn, code, fmt...) \ - virReportErrorHelper(conn, VIR_FROM_XEND, code, __FILE__, \ - __FUNCTION__, __LINE__, fmt) - +#define virXendError(conn, codeval, fmt...) \ + do { \ + if (virGetLastError() == NULL) { \ + virReportErrorHelper(conn, VIR_FROM_XEND, codeval, __FILE__, \ + __FUNCTION__, __LINE__, fmt); \ + } \ + } while (0) + #define virXendErrorInt(conn, code, ival) \ - virXendError(conn, code, "%d", ival) + virXendError(conn, code, "%d", ival)
I don't like this change - we are trying to remove all driver specific error reporting macros. If some part of the Xen driver is overwriting an existing error message during the failure path, either that is explicit, or a mistake that needs to be fixed. Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

On Thu, Feb 05, 2009 at 12:34:32PM +0000, Daniel P. Berrange wrote:
+static void +virshReportError(vshControl *ctl) +{ + if (last_error == NULL) + return; + + if (last_error->code == VIR_ERR_OK) { + vshError(ctl, TRUE, "%s", _("unknown error")); + return; + } + + vshError(ctl, TRUE, "%s", last_error->message); }
Since you only ever print out the 'last_error->message' field here, I think its better to just do strdup() of that field, instead of adding a new virCloneError API.
Very much disagree. That happens to be true in virsh's case, but it's not in general. Currently there's no way to save off a virErrorPtr, and that's a significant hole in the API
Also, it is neccessary to free the error message/object after reporting it to avoid a memory leak.
We're passing TRUE to vshError() so immediately exiting - there's no place at which we can free it.
+#define virXendError(conn, codeval, fmt...) \ + do { \ + if (virGetLastError() == NULL) { \ + virReportErrorHelper(conn, VIR_FROM_XEND, codeval, __FILE__, \ + __FUNCTION__, __LINE__, fmt); \ + } \ + } while (0) + #define virXendErrorInt(conn, code, ival) \ - virXendError(conn, code, "%d", ival) + virXendError(conn, code, "%d", ival)
I don't like this change - we are trying to remove all driver specific error reporting macros.
Can you explain further? How do you expect to report driver-specific issues without doing it at the point of the error? Why does my change affect this?
If some part of the Xen driver is overwriting an existing error message during the failure path, either that is explicit, or a mistake that needs to be fixed.
I just fixed this to be like the other places where we do this? They're all bugs too? regards john

On Thu, Feb 05, 2009 at 05:05:53PM +0000, John Levon wrote:
On Thu, Feb 05, 2009 at 12:34:32PM +0000, Daniel P. Berrange wrote:
+static void +virshReportError(vshControl *ctl) +{ + if (last_error == NULL) + return; + + if (last_error->code == VIR_ERR_OK) { + vshError(ctl, TRUE, "%s", _("unknown error")); + return; + } + + vshError(ctl, TRUE, "%s", last_error->message); }
Since you only ever print out the 'last_error->message' field here, I think its better to just do strdup() of that field, instead of adding a new virCloneError API.
Very much disagree. That happens to be true in virsh's case, but it's not in general. Currently there's no way to save off a virErrorPtr, and that's a significant hole in the API
Ok, how about a slightly different virSaveLastError() call, avoiding the need to do the pair of call virGetLastEror() + virCloneError()
Also, it is neccessary to free the error message/object after reporting it to avoid a memory leak.
We're passing TRUE to vshError() so immediately exiting - there's no place at which we can free it.
Doesn't this mean that virsh will exit whenever any libvirt command raises an error ? Not any difference it using it in single-command mode, but if using it interactively you don't want it to exit like this, just because you, or example, typoed with a domain name that doesn't exist
+#define virXendError(conn, codeval, fmt...) \ + do { \ + if (virGetLastError() == NULL) { \ + virReportErrorHelper(conn, VIR_FROM_XEND, codeval, __FILE__, \ + __FUNCTION__, __LINE__, fmt); \ + } \ + } while (0) + #define virXendErrorInt(conn, code, ival) \ - virXendError(conn, code, "%d", ival) + virXendError(conn, code, "%d", ival)
I don't like this change - we are trying to remove all driver specific error reporting macros.
Can you explain further? How do you expect to report driver-specific issues without doing it at the point of the error? Why does my change affect this?
Just have the virXendError macro call virReportErrorHelper directly, as it already does - there is no need to wrap it in the conditional check 'if(virGetLastError() == NULL)' Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

On Thu, Feb 05, 2009 at 05:34:20PM +0000, Daniel P. Berrange wrote:
Ok, how about a slightly different virSaveLastError() call, avoiding the need to do the pair of call virGetLastEror() + virCloneError()
Sounds reasonable to me.
Also, it is neccessary to free the error message/object after reporting it to avoid a memory leak.
We're passing TRUE to vshError() so immediately exiting - there's no place at which we can free it.
Doesn't this mean that virsh will exit whenever any libvirt command raises an error ? Not any difference it using it in single-command mode, but if using it interactively you don't want it to exit like this, just because you, or example, typoed with a domain name that doesn't exist
D'oh, quite right.
+#define virXendError(conn, codeval, fmt...) \ + do { \ + if (virGetLastError() == NULL) { \ + virReportErrorHelper(conn, VIR_FROM_XEND, codeval, __FILE__, \ + __FUNCTION__, __LINE__, fmt); \ + } \ + } while (0) + #define virXendErrorInt(conn, code, ival) \ - virXendError(conn, code, "%d", ival) + virXendError(conn, code, "%d", ival)
I don't like this change - we are trying to remove all driver specific error reporting macros.
Can you explain further? How do you expect to report driver-specific issues without doing it at the point of the error? Why does my change affect this?
Just have the virXendError macro call virReportErrorHelper directly, as it already does - there is no need to wrap it in the conditional check 'if(virGetLastError() == NULL)'
I'm trying to connect this comment to your other ones, but can't, so I'll have to attempt to guess what you're saying :) I think you're just NAKing the entire change, with your reasoning being that the code must be fixed to only set an error once. Unfortunately fixing code your way seems close to impossible. To give just one example, xenDaemonFormatSxprNet(): 5335 virXendError(conn, VIR_ERR_INTERNAL_ERROR, 5336 _("unsupported network type %d"), def->type); This part of it is "lowest level", so should always report an error. but it also calls virNetworkGetBridgeName() which *isn't*. This CAN report an error, but we immediately over-write it: 5374 bridge = virNetworkGetBridgeName(network); 5375 virNetworkFree(network); 5376 if (!bridge) { 5377 virXendError(conn, VIR_ERR_NO_SOURCE, "%s", 5378 def->data.network.name); Also, there's at least two call stacks that reach this point: xenDaemonDomainDefineXML() xenDaemonFormatSxpr() xenDaemonFormatSxprNet() xenDaemonAttachDevice() xenDaemonFormatSxprNet() In the second case, we are OK. In the first, we're already over-writing the error from xenDaemonFormatSxprNet(). Or for another example, it took just a couple of seconds looking at the xenDaemonDomainDefineXML() code paths to find one where virDomainDefParseString() would not set an error. I do not think it's really possible to get things right the way you're suggesting. The fact that the code is broken everywhere backs me up pretty strongly here. How do you propose to reasonably test such changes? In contrast, "only set an error if one isn't set already" is simple, immediately understandable, and covers almost all the common cases. Anyway, I'm not having much luck in arguing my case with you, but I thought I'd explain this at least, because the code is broken and I don't believe it's possible to fix it your way. regards john:w

On Thu, Feb 05, 2009 at 06:09:10PM +0000, John Levon wrote:
On Thu, Feb 05, 2009 at 05:34:20PM +0000, Daniel P. Berrange wrote:
Just have the virXendError macro call virReportErrorHelper directly, as it already does - there is no need to wrap it in the conditional check 'if(virGetLastError() == NULL)'
I'm trying to connect this comment to your other ones, but can't, so I'll have to attempt to guess what you're saying :)
I think you're just NAKing the entire change, with your reasoning being that the code must be fixed to only set an error once. Unfortunately fixing code your way seems close to impossible. To give just one example, xenDaemonFormatSxprNet():
5335 virXendError(conn, VIR_ERR_INTERNAL_ERROR, 5336 _("unsupported network type %d"), def->type);
This part of it is "lowest level", so should always report an error. but it also calls virNetworkGetBridgeName() which *isn't*. This CAN report an error, but we immediately over-write it:
5374 bridge = virNetworkGetBridgeName(network); 5375 virNetworkFree(network); 5376 if (!bridge) { 5377 virXendError(conn, VIR_ERR_NO_SOURCE, "%s", 5378 def->data.network.name);
Also, there's at least two call stacks that reach this point:
xenDaemonDomainDefineXML() xenDaemonFormatSxpr() xenDaemonFormatSxprNet()
xenDaemonAttachDevice() xenDaemonFormatSxprNet()
In the second case, we are OK. In the first, we're already over-writing the error from xenDaemonFormatSxprNet().
Or for another example, it took just a couple of seconds looking at the xenDaemonDomainDefineXML() code paths to find one where virDomainDefParseString() would not set an error.
I do not think it's really possible to get things right the way you're suggesting. The fact that the code is broken everywhere backs me up pretty strongly here. How do you propose to reasonably test such changes?
In contrast, "only set an error if one isn't set already" is simple, immediately understandable, and covers almost all the common cases.
I'm not totally against this idea, but with a few caveats. First, we'd need to make some changes to the libvirtd because it calls into a number of APIs, both public entry points & some of the internal helpers directly, but does not reset the error in betweeen these calls. Of course it also doesn't actually do anything with errors that these calls may raise - they just happen to be printed on stderr. If we only set when not already set, then most won't even appear on stderr in the libvirtd context. Not sure of the best / least error prone way to fix this really. I'd still like to try and find a way to make the error raising process more robust. The idea of having calls to virRaiseError, which may or may not raise an error, depending on whether the thing we just called remembered to raise an error is not a great situation to be in. As you point out it is pretty hard to get this right and/or identify places where it is wrong. As a starting point, though adding code to virRaiseError which prints a stack trace to stderr() whenever it is called with an error already set, would help our awareness. Obviously this would only want to be turned on in dev-builds. Given my experiance with the OCAML/CIL stuff on threading, I think it might also be practical to perform static analysis of the callpaths and identify a fairly large proportion of flaws this way. I think it would be possible to just track exit paths, and keep note of whether there are any virRaiseError calls in each method, and match against call locations. Ok, not easy, but I'll have a stab at this next time I have a spare moment. Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

On Thu, Feb 05, 2009 at 08:37:29PM +0000, Daniel P. Berrange wrote:
On Thu, Feb 05, 2009 at 06:09:10PM +0000, John Levon wrote:
In contrast, "only set an error if one isn't set already" is simple, immediately understandable, and covers almost all the common cases.
I'm not totally against this idea, but with a few caveats.
First, we'd need to make some changes to the libvirtd because it calls into a number of APIs, both public entry points & some of the internal helpers directly, but does not reset the error in betweeen these calls. Of course it also doesn't actually do anything with errors that these calls may raise - they just happen to be printed on stderr. If we only set when not already set, then most won't even appear on stderr in the libvirtd context. Not sure of the best / least error prone way to fix this really.
I'd still like to try and find a way to make the error raising process more robust. The idea of having calls to virRaiseError, which may or may not raise an error, depending on whether the thing we just called remembered to raise an error is not a great situation to be in.
As you point out it is pretty hard to get this right and/or identify places where it is wrong. As a starting point, though adding code to virRaiseError which prints a stack trace to stderr() whenever it is called with an error already set, would help our awareness. Obviously this would only want to be turned on in dev-builds.
Given my experiance with the OCAML/CIL stuff on threading, I think it might also be practical to perform static analysis of the callpaths and identify a fairly large proportion of flaws this way. I think it would be possible to just track exit paths, and keep note of whether there are any virRaiseError calls in each method, and match against call locations. Ok, not easy, but I'll have a stab at this next time I have a spare moment.
Opps, missed the final one I should have mentioned. The driver open process is actually quite annoying wrt this particular problem, because we explicitly expect a number of calls to fail during probing of drivers. If any of those raise errors, we do explicitly want to override them in many cases. This could possibly be addressed by just adding an explicit call to virResetError at the point we wish to override the early error. Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

On Thu, Feb 05, 2009 at 08:39:55PM +0000, Daniel P. Berrange wrote:
First, we'd need to make some changes to the libvirtd because it calls into a number of APIs, both public entry points & some of the internal helpers directly, but does not reset the error in betweeen these calls. Of course it also doesn't actually do anything with errors that these calls may raise - they just happen to be printed on stderr. If we only
Right, I think I mentioned there were some cases where we would need special handling. Hopefully they're easier to find and fix than the reverse case, though I don't have any good ideas how.
As you point out it is pretty hard to get this right and/or identify places where it is wrong. As a starting point, though adding code to virRaiseError which prints a stack trace to stderr() whenever it is called with an error already set, would help our awareness. Obviously this would only want to be turned on in dev-builds.
That would help. The problem is this though: the worst possible errors to lose are those rare one-off events that happen in production, and they're precisely the ones least likely to be caught during testing. I wonder if we need to abandon the notion of a single error altogether, and maintain a stack of them across each top-level call. regards john
participants (3)
-
Daniel P. Berrange
-
John Levon
-
john.levon@sun.com