[libvirt] [PATCH v2 0/3] Add virDomainGetDiskErrors API

We already provide ways to detect when a domain has been paused as a result of I/O error, but there was no way of getting the exact error or even the device that experienced it. This new API may be used for both. Changes in version 2: - the API is now called virDomainGetDiskErrors - it returns a list (allocated by caller) of disks with errors instead so that users don't have to call it multiple times to get all errors - there's no python API yet since it can't be autogenerated; I'll send a patch for it tomorrow Jiri Denemark (3): virDomainGetDiskErrors public API and remote protocol qemu: Implement virDomainGetDiskErrors virsh: Implement domblkerror command daemon/remote.c | 40 +++++++++++++++++++ include/libvirt/libvirt.h.in | 32 +++++++++++++++ python/generator.py | 3 +- src/driver.h | 7 +++ src/libvirt.c | 52 ++++++++++++++++++++++++ src/libvirt_public.syms | 1 + src/qemu/qemu_conf.h | 1 + src/qemu/qemu_driver.c | 83 +++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_monitor.c | 40 +++++++++++++++++++ src/qemu/qemu_monitor.h | 1 + src/qemu/qemu_monitor_json.c | 8 ++++ src/qemu/qemu_monitor_text.c | 15 +++++++ src/remote/remote_driver.c | 34 ++++++++++++++++ src/remote/remote_protocol.x | 22 ++++++++++- src/remote_protocol-structs | 16 +++++++ src/rpc/gendispatch.pl | 47 ++++++++++++++++++++++ tools/virsh.c | 89 ++++++++++++++++++++++++++++++++++++++++++ tools/virsh.pod | 7 +++ 18 files changed, 496 insertions(+), 2 deletions(-) -- 1.7.8.4

We already provide ways to detect when a domain has been paused as a result of I/O error, but there was no way of getting the exact error or even the device that experienced it. This new API may be used for both. --- daemon/remote.c | 40 ++++++++++++++++++++++++++++++++ include/libvirt/libvirt.h.in | 32 +++++++++++++++++++++++++ python/generator.py | 3 +- src/driver.h | 7 +++++ src/libvirt.c | 52 ++++++++++++++++++++++++++++++++++++++++++ src/libvirt_public.syms | 1 + src/remote/remote_driver.c | 34 +++++++++++++++++++++++++++ src/remote/remote_protocol.x | 22 +++++++++++++++++- src/remote_protocol-structs | 16 +++++++++++++ src/rpc/gendispatch.pl | 47 +++++++++++++++++++++++++++++++++++++ 10 files changed, 252 insertions(+), 2 deletions(-) diff --git a/daemon/remote.c b/daemon/remote.c index cb8423a..31bf1de 100644 --- a/daemon/remote.c +++ b/daemon/remote.c @@ -98,6 +98,12 @@ remoteDeserializeTypedParameters(remote_typed_param *args_params_val, int limit, int *nparams); +static int +remoteSerializeDomainDiskErrors(virDomainDiskErrorPtr errors, + int nerrors, + remote_domain_disk_error **ret_errors_val, + u_int *ret_errors_len); + #include "remote_dispatch.h" #include "qemu_dispatch.h" @@ -3725,3 +3731,37 @@ make_nonnull_domain_snapshot(remote_nonnull_domain_snapshot *snapshot_dst, virDo snapshot_dst->name = strdup(snapshot_src->name); make_nonnull_domain(&snapshot_dst->dom, snapshot_src->domain); } + +static int +remoteSerializeDomainDiskErrors(virDomainDiskErrorPtr errors, + int nerrors, + remote_domain_disk_error **ret_errors_val, + u_int *ret_errors_len) +{ + remote_domain_disk_error *val = NULL; + int i = 0; + + if (VIR_ALLOC_N(val, nerrors) < 0) + goto no_memory; + + for (i = 0; i < nerrors; i++) { + if (!(val[i].disk = strdup(errors[i].disk))) + goto no_memory; + val[i].error = errors[i].error; + } + + *ret_errors_len = nerrors; + *ret_errors_val = val; + + return 0; + +no_memory: + if (val) { + int j; + for (j = 0; j < i; j++) + VIR_FREE(val[j].disk); + VIR_FREE(val); + } + virReportOOMError(); + return -1; +} diff --git a/include/libvirt/libvirt.h.in b/include/libvirt/libvirt.h.in index 0a7b324..272d142 100644 --- a/include/libvirt/libvirt.h.in +++ b/include/libvirt/libvirt.h.in @@ -1967,6 +1967,38 @@ virDomainGetBlockIoTune(virDomainPtr dom, int *nparams, unsigned int flags); +/** + * virDomainDiskErrorCode: + * + * Disk I/O error. + */ +typedef enum { + VIR_DOMAIN_DISK_ERROR_NONE = 0, /* no error */ + VIR_DOMAIN_DISK_ERROR_UNSPEC = 1, /* unspecified I/O error */ + VIR_DOMAIN_DISK_ERROR_NO_SPACE = 2, /* no space left on the device */ + +#ifdef VIR_ENUM_SENTINELS + VIR_DOMAIN_DISK_ERROR_LAST +#endif +} virDomainDiskErrorCode; + +/** + * virDomainDiskError: + * + */ +typedef struct _virDomainDiskError virDomainDiskError; +typedef virDomainDiskError *virDomainDiskErrorPtr; + +struct _virDomainDiskError { + char *disk; /* disk target */ + int error; /* virDomainDiskErrorCode */ +}; + +int virDomainGetDiskErrors(virDomainPtr dom, + virDomainDiskErrorPtr errors, + int maxerrors, + unsigned int flags); + /* * NUMA support diff --git a/python/generator.py b/python/generator.py index 6f813ae..b514af5 100755 --- a/python/generator.py +++ b/python/generator.py @@ -423,7 +423,8 @@ skip_impl = ( 'virDomainGetBlockIoTune', 'virDomainSetInterfaceParameters', 'virDomainGetInterfaceParameters', - 'virDomainGetCPUStats' # not implemented now. + 'virDomainGetCPUStats', # not implemented now. + 'virDomainGetDiskErrors', ) qemu_skip_impl = ( diff --git a/src/driver.h b/src/driver.h index ba7dbc4..f3ded19 100644 --- a/src/driver.h +++ b/src/driver.h @@ -810,6 +810,12 @@ typedef int unsigned int ncpus, unsigned int flags); +typedef int + (*virDrvDomainGetDiskErrors)(virDomainPtr dom, + virDomainDiskErrorPtr errors, + int maxerrors, + unsigned int flags); + /** * _virDriver: * @@ -981,6 +987,7 @@ struct _virDriver { virDrvDomainSetBlockIoTune domainSetBlockIoTune; virDrvDomainGetBlockIoTune domainGetBlockIoTune; virDrvDomainGetCPUStats domainGetCPUStats; + virDrvDomainGetDiskErrors domainGetDiskErrors; }; typedef int diff --git a/src/libvirt.c b/src/libvirt.c index e702a34..345c95b 100644 --- a/src/libvirt.c +++ b/src/libvirt.c @@ -18281,3 +18281,55 @@ error: virDispatchError(domain->conn); return -1; } + +/** + * virDomainGetDiskErrors: + * @dom: a domain object + * @errors: array to populate on output + * @maxerrors: size of @errors array + * @flags: extra flags; not used yet, so callers should always pass 0 + * + * The function populates @errors array with all disks that encountered an + * I/O error. Each disk is identified by its target (the dev attribute of + * target subelement in domain XML), such as "vda", and accompanied with + * the error that was seen on it. + * + * Returns number of disks with errors filled in the @errors array or -1 on + * error. + */ +int +virDomainGetDiskErrors(virDomainPtr dom, + virDomainDiskErrorPtr errors, + int maxerrors, + unsigned int flags) +{ + VIR_DOMAIN_DEBUG(dom, "errors=%p, maxerrors=%d, flags=%x", + errors, maxerrors, flags); + + virResetLastError(); + + if (!VIR_IS_DOMAIN(dom)) { + virLibDomainError(VIR_ERR_INVALID_DOMAIN, __FUNCTION__); + virDispatchError(NULL); + return -1; + } + + if (dom->conn->flags & VIR_CONNECT_RO) { + virLibDomainError(VIR_ERR_OPERATION_DENIED, __FUNCTION__); + goto error; + } + + if (dom->conn->driver->domainGetDiskErrors) { + int ret = dom->conn->driver->domainGetDiskErrors(dom, errors, + maxerrors, flags); + if (ret < 0) + goto error; + return ret; + } + + virLibConnError(VIR_ERR_NO_SUPPORT, __FUNCTION__); + +error: + virDispatchError(dom->conn); + return -1; +} diff --git a/src/libvirt_public.syms b/src/libvirt_public.syms index 1c4e0a3..ced9fb3 100644 --- a/src/libvirt_public.syms +++ b/src/libvirt_public.syms @@ -519,6 +519,7 @@ LIBVIRT_0.9.9 { LIBVIRT_0.9.10 { global: virDomainGetCPUStats; + virDomainGetDiskErrors; virDomainPMSuspendForDuration; virDomainShutdownFlags; virStorageVolResize; diff --git a/src/remote/remote_driver.c b/src/remote/remote_driver.c index 61b96e9..33b468a 100644 --- a/src/remote/remote_driver.c +++ b/src/remote/remote_driver.c @@ -1427,6 +1427,39 @@ cleanup: } static int +remoteDeserializeDomainDiskErrors(remote_domain_disk_error *ret_errors_val, + u_int ret_errors_len, + int limit, + virDomainDiskErrorPtr errors, + int maxerrors) +{ + int i = 0; + int j; + + if (ret_errors_len > limit || ret_errors_len > maxerrors) { + remoteError(VIR_ERR_RPC, "%s", + _("returned number of disk errors exceeds limit")); + goto error; + } + + for (i = 0; i < ret_errors_len; i++) { + if (!(errors[i].disk = strdup(ret_errors_val[i].disk))) { + virReportOOMError(); + goto error; + } + errors[i].error = ret_errors_val[i].error; + } + + return 0; + +error: + for (j = 0; j < i; j++) + VIR_FREE(errors[i].disk); + + return -1; +} + +static int remoteDomainBlockStatsFlags(virDomainPtr domain, const char *path, virTypedParameterPtr params, @@ -4849,6 +4882,7 @@ static virDriver remote_driver = { .domainSetNumaParameters = remoteDomainSetNumaParameters, /* 0.9.9 */ .domainGetNumaParameters = remoteDomainGetNumaParameters, /* 0.9.9 */ .domainGetCPUStats = remoteDomainGetCPUStats, /* 0.9.10 */ + .domainGetDiskErrors = remoteDomainGetDiskErrors, /* 0.9.10 */ }; static virNetworkDriver network_driver = { diff --git a/src/remote/remote_protocol.x b/src/remote/remote_protocol.x index b58925a..547c108 100644 --- a/src/remote/remote_protocol.x +++ b/src/remote/remote_protocol.x @@ -219,6 +219,11 @@ const REMOTE_DOMAIN_GET_CPU_STATS_NCPUS_MAX = 128; */ const REMOTE_DOMAIN_GET_CPU_STATS_MAX = 2048; +/* + * Upper limit on number of disks with errors + */ +const REMOTE_DOMAIN_DISK_ERRORS_MAX = 256; + /* UUID. VIR_UUID_BUFLEN definition comes from libvirt.h */ typedef opaque remote_uuid[VIR_UUID_BUFLEN]; @@ -359,6 +364,10 @@ struct remote_node_get_memory_stats { unsigned hyper value; }; +struct remote_domain_disk_error { + remote_nonnull_string disk; + int error; +}; /*----- Calls. -----*/ @@ -2397,6 +2406,16 @@ struct remote_domain_shutdown_flags_args { unsigned int flags; }; +struct remote_domain_get_disk_errors_args { + remote_nonnull_domain dom; + int maxerrors; + unsigned int flags; +}; + +struct remote_domain_get_disk_errors_ret { + remote_domain_disk_error errors<REMOTE_DOMAIN_DISK_ERRORS_MAX>; /* insert@1 */ +}; + /*----- Protocol. -----*/ @@ -2708,7 +2727,8 @@ enum remote_procedure { REMOTE_PROC_STORAGE_VOL_RESIZE = 260, /* autogen autogen */ REMOTE_PROC_DOMAIN_PM_SUSPEND_FOR_DURATION = 261, /* autogen autogen */ - REMOTE_PROC_DOMAIN_GET_CPU_STATS = 262 /* skipgen skipgen */ + REMOTE_PROC_DOMAIN_GET_CPU_STATS = 262, /* skipgen skipgen */ + REMOTE_PROC_DOMAIN_GET_DISK_ERRORS = 263 /* autogen autogen */ /* * Notice how the entries are grouped in sets of 10 ? diff --git a/src/remote_protocol-structs b/src/remote_protocol-structs index 5eac9bf..e031d04 100644 --- a/src/remote_protocol-structs +++ b/src/remote_protocol-structs @@ -94,6 +94,10 @@ struct remote_node_get_memory_stats { remote_nonnull_string field; uint64_t value; }; +struct remote_domain_disk_error { + remote_nonnull_string disk; + int error; +}; struct remote_open_args { remote_string name; u_int flags; @@ -1866,6 +1870,17 @@ struct remote_domain_shutdown_flags_args { remote_nonnull_domain dom; u_int flags; }; +struct remote_domain_get_disk_errors_args { + remote_nonnull_domain dom; + int maxerrors; + u_int flags; +}; +struct remote_domain_get_disk_errors_ret { + struct { + u_int errors_len; + remote_domain_disk_error * errors_val; + } errors; +}; enum remote_procedure { REMOTE_PROC_OPEN = 1, REMOTE_PROC_CLOSE = 2, @@ -2129,4 +2144,5 @@ enum remote_procedure { REMOTE_PROC_STORAGE_VOL_RESIZE = 260, REMOTE_PROC_DOMAIN_PM_SUSPEND_FOR_DURATION = 261, REMOTE_PROC_DOMAIN_GET_CPU_STATS = 262, + REMOTE_PROC_DOMAIN_GET_DISK_ERRORS = 263, }; diff --git a/src/rpc/gendispatch.pl b/src/rpc/gendispatch.pl index 446f229..72c1159 100755 --- a/src/rpc/gendispatch.pl +++ b/src/rpc/gendispatch.pl @@ -539,6 +539,39 @@ elsif ($opt_b) { } elsif ($ret_member =~ m/^remote_nonnull_string (\S+)<\S+>;/) { # error out on unannotated arrays die "remote_nonnull_string array without insert@<offset> annotation: $ret_member"; + } elsif ($ret_member =~ m/^remote_domain_disk_error (\S+)<(\S+)>;\s*\/\*\s*insert@(\d+)\s*\*\//) { + push(@vars_list, + "virDomainDiskErrorPtr $1 = NULL", + "int len"); + splice(@args_list, int($3), 0, ("$1")); + push(@getters_list, + " if (args->max$1 > $2) {", + " virNetError(VIR_ERR_INTERNAL_ERROR, \"%s\",", + " _(\"max$1 too large\"));", + " goto cleanup;", + " }", + "", + " if (VIR_ALLOC_N($1, args->max$1) < 0) {", + " virReportOOMError();", + " goto cleanup;", + " }\n"); + $single_ret_var = "len"; + $single_ret_check = " < 0"; + push(@ret_list, + "if (remoteSerializeDomainDiskErrors($1, len,", + " &ret->$1.$1_val,", + " &ret->$1.$1_len) < 0)", + " goto cleanup;\n"); + push(@free_list, + " if ($1) {", + " int i;", + " for (i = 0; i < len; i++)", + " VIR_FREE(${1}[i].disk);", + " }", + " VIR_FREE($1);"); + } elsif ($ret_member =~ m/^remote_domain_disk_error (\S+)<\S+>;/) { + # error out on unannotated arrays + die "remote_domain_disk_error array without insert@<offset> annotation: $ret_member"; } elsif ($ret_member =~ m/^remote_nonnull_string (\S+);/) { if ($call->{ProcName} eq "GetType") { # SPECIAL: virConnectGetType returns a constant string that must @@ -1297,6 +1330,20 @@ elsif ($opt_k) { } elsif ($ret_member =~ m/^remote_typed_param (\S+)<\S+>;/) { # error out on unannotated arrays die "remote_typed_param array without insert@<offset> annotation: $ret_member"; + } elsif ($ret_member =~ m/^remote_domain_disk_error (\S+)<(\S+)>;\s*\/\*\s*insert@(\d+)\s*\*\//) { + splice(@args_list, int($3), 0, ("virDomainDiskErrorPtr $1")); + push(@ret_list, "rv = ret.$1.$1_len;"); + push(@ret_list2, + "if (remoteDeserializeDomainDiskErrors(ret.$1.$1_val,", + " ret.$1.$1_len,", + " $2,", + " $1,", + " max$1) < 0)", + " goto cleanup;"); + $single_ret_cleanup = 1; + } elsif ($ret_member =~ m/^remote_domain_disk_error (\S+)<\S+>;/) { + # error out on unannotated arrays + die "remote_domain_disk_error array without insert@<offset> annotation: $ret_member"; } elsif ($ret_member =~ m/^int (\S+);/) { my $arg_name = $1; -- 1.7.8.4

On 01/30/2012 09:00 AM, Jiri Denemark wrote:
We already provide ways to detect when a domain has been paused as a result of I/O error, but there was no way of getting the exact error or even the device that experienced it. This new API may be used for both. --- daemon/remote.c | 40 ++++++++++++++++++++++++++++++++ include/libvirt/libvirt.h.in | 32 +++++++++++++++++++++++++ python/generator.py | 3 +- src/driver.h | 7 +++++ src/libvirt.c | 52 ++++++++++++++++++++++++++++++++++++++++++ src/libvirt_public.syms | 1 + src/remote/remote_driver.c | 34 +++++++++++++++++++++++++++ src/remote/remote_protocol.x | 22 +++++++++++++++++- src/remote_protocol-structs | 16 +++++++++++++ src/rpc/gendispatch.pl | 47 +++++++++++++++++++++++++++++++++++++ 10 files changed, 252 insertions(+), 2 deletions(-)
This is big enough that I might have split it into public API (include, python, src/{driver.h,libvirt.c,libvirt_public.syms}) and RPC implementation (daemon, src/remote, src/rpc). But I'll go ahead and review this, whether or not you split it.
+++ b/include/libvirt/libvirt.h.in @@ -1967,6 +1967,38 @@ virDomainGetBlockIoTune(virDomainPtr dom, int *nparams, unsigned int flags);
+/** + * virDomainDiskErrorCode: + * + * Disk I/O error. + */ +typedef enum { + VIR_DOMAIN_DISK_ERROR_NONE = 0, /* no error */ + VIR_DOMAIN_DISK_ERROR_UNSPEC = 1, /* unspecified I/O error */ + VIR_DOMAIN_DISK_ERROR_NO_SPACE = 2, /* no space left on the device */ + +#ifdef VIR_ENUM_SENTINELS + VIR_DOMAIN_DISK_ERROR_LAST +#endif +} virDomainDiskErrorCode;
Looks reasonable and extensible.
+ +/** + * virDomainDiskError: + * + */ +typedef struct _virDomainDiskError virDomainDiskError; +typedef virDomainDiskError *virDomainDiskErrorPtr; + +struct _virDomainDiskError { + char *disk; /* disk target */ + int error; /* virDomainDiskErrorCode */ +}; + +int virDomainGetDiskErrors(virDomainPtr dom, + virDomainDiskErrorPtr errors, + int maxerrors,
I guess inertia is on our side for using int maxerrors, even though the value should never be negative.
+++ b/python/generator.py @@ -423,7 +423,8 @@ skip_impl = ( 'virDomainGetBlockIoTune', 'virDomainSetInterfaceParameters', 'virDomainGetInterfaceParameters', - 'virDomainGetCPUStats' # not implemented now. + 'virDomainGetCPUStats', # not implemented now. + 'virDomainGetDiskErrors',
Oops - blame me for not using a trailing comma, so that your hunk had to touch more lines than one :)
+/** + * virDomainGetDiskErrors: + * @dom: a domain object + * @errors: array to populate on output + * @maxerrors: size of @errors array + * @flags: extra flags; not used yet, so callers should always pass 0 + * + * The function populates @errors array with all disks that encountered an + * I/O error. Each disk is identified by its target (the dev attribute of + * target subelement in domain XML), such as "vda", and accompanied with + * the error that was seen on it.
Suppose I have a domain with 3 disks, and disk A and C have both encountered an error. It is not clear to me whether calling virDomainGetDiskErrors(dom, array, 3, 0) will return 2 (info on A and C, since those were the only ones with errors) or 3 (info on A, B, and C, with B explicitly stating that there were no errors). I guess virDomainDiskErrorPtr has a reserved value for no error, so I'm assuming the latter, but I'm wondering how best to word this. Maybe: s~array with all disks that encountered an I/O error~array with information on all disks, and whether they have encountered an I/O error~ Or, if you really did mean to return only the disks with errors while omitting working disks, then we should add a shortcut, where calling with errors==NULL and maxerrors==0 returns how many errors would be returned, in order for the caller to know how big to allocate things. For that matter, even if we return no error, it would _still_ be handy to support the special case as a way to count how many disks are being tracked for errors.
+ * Returns number of disks with errors filled in the @errors array or -1 on + * error.
Please also document that the caller is responsible for calling free() on each disk name returned.
+int +virDomainGetDiskErrors(virDomainPtr dom, + virDomainDiskErrorPtr errors, + int maxerrors, + unsigned int flags) +{ + VIR_DOMAIN_DEBUG(dom, "errors=%p, maxerrors=%d, flags=%x", + errors, maxerrors, flags); + + virResetLastError(); + + if (!VIR_IS_DOMAIN(dom)) { + virLibDomainError(VIR_ERR_INVALID_DOMAIN, __FUNCTION__); + virDispatchError(NULL); + return -1; + } + + if (dom->conn->flags & VIR_CONNECT_RO) { + virLibDomainError(VIR_ERR_OPERATION_DENIED, __FUNCTION__); + goto error; + }
Missing a sanity check that errors is non-NULL and maxerrors is > 0. It's a common check, so it should be here rather than making each driver repeat it, and we have precedence for that sort of sanity check in this file.
+struct remote_domain_get_disk_errors_ret { + remote_domain_disk_error errors<REMOTE_DOMAIN_DISK_ERRORS_MAX>; /* insert@1 */ +};
If we implement the shorthand of virDomainGetDiskErrors(dom, NULL, 0, 0) as a way to tell how big to allocate an array for the next call, then this struct also needs to have a return length independent from the array. Quite a few of the virTypedParameters functions should serve as a model.
+
/*----- Protocol. -----*/
@@ -2708,7 +2727,8 @@ enum remote_procedure { REMOTE_PROC_STORAGE_VOL_RESIZE = 260, /* autogen autogen */
REMOTE_PROC_DOMAIN_PM_SUSPEND_FOR_DURATION = 261, /* autogen autogen */ - REMOTE_PROC_DOMAIN_GET_CPU_STATS = 262 /* skipgen skipgen */ + REMOTE_PROC_DOMAIN_GET_CPU_STATS = 262, /* skipgen skipgen */ + REMOTE_PROC_DOMAIN_GET_DISK_ERRORS = 263 /* autogen autogen */
Nice that you figured out how to autogen the serialize/deserialize calls. We should probably do the same for a lot of the virTypedParameter clients someday (then again, I think the reason the virTypedParameter clients don't autogen is because we don't yet have the shorthand for determining allocation limits wired up in the generator, so I may have just broken that for you by requesting the shorthand). Since adding a shorthand for returning the allocation size would imply an ABI break for RPC, we need to do it now, before the 0.9.10 freeze, so I think it's best to post a v3. -- Eric Blake eblake@redhat.com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On Mon, Jan 30, 2012 at 15:56:11 -0700, Eric Blake wrote:
On 01/30/2012 09:00 AM, Jiri Denemark wrote:
We already provide ways to detect when a domain has been paused as a result of I/O error, but there was no way of getting the exact error or even the device that experienced it. This new API may be used for both. --- daemon/remote.c | 40 ++++++++++++++++++++++++++++++++ include/libvirt/libvirt.h.in | 32 +++++++++++++++++++++++++ python/generator.py | 3 +- src/driver.h | 7 +++++ src/libvirt.c | 52 ++++++++++++++++++++++++++++++++++++++++++ src/libvirt_public.syms | 1 + src/remote/remote_driver.c | 34 +++++++++++++++++++++++++++ src/remote/remote_protocol.x | 22 +++++++++++++++++- src/remote_protocol-structs | 16 +++++++++++++ src/rpc/gendispatch.pl | 47 +++++++++++++++++++++++++++++++++++++ 10 files changed, 252 insertions(+), 2 deletions(-)
This is big enough that I might have split it into public API (include, python, src/{driver.h,libvirt.c,libvirt_public.syms}) and RPC implementation (daemon, src/remote, src/rpc). But I'll go ahead and review this, whether or not you split it.
I did so and addressed the other comments in v3. ...
/*----- Protocol. -----*/
@@ -2708,7 +2727,8 @@ enum remote_procedure { REMOTE_PROC_STORAGE_VOL_RESIZE = 260, /* autogen autogen */
REMOTE_PROC_DOMAIN_PM_SUSPEND_FOR_DURATION = 261, /* autogen autogen */ - REMOTE_PROC_DOMAIN_GET_CPU_STATS = 262 /* skipgen skipgen */ + REMOTE_PROC_DOMAIN_GET_CPU_STATS = 262, /* skipgen skipgen */ + REMOTE_PROC_DOMAIN_GET_DISK_ERRORS = 263 /* autogen autogen */
Nice that you figured out how to autogen the serialize/deserialize calls.
Yeah, although I didn't try so hard after making this a multi-return call, so v3 now uses skipgen skipgen :-)
We should probably do the same for a lot of the virTypedParameter clients someday The generator already seems to know how to deal with virTypedParameter but only for APIs with only one item in their *_ret structure.
Jirka

--- src/qemu/qemu_conf.h | 1 + src/qemu/qemu_driver.c | 83 ++++++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_monitor.c | 40 ++++++++++++++++++++ src/qemu/qemu_monitor.h | 1 + src/qemu/qemu_monitor_json.c | 8 ++++ src/qemu/qemu_monitor_text.c | 15 ++++++++ 6 files changed, 148 insertions(+), 0 deletions(-) diff --git a/src/qemu/qemu_conf.h b/src/qemu/qemu_conf.h index 7d79823..0b65d7d 100644 --- a/src/qemu/qemu_conf.h +++ b/src/qemu/qemu_conf.h @@ -175,6 +175,7 @@ struct qemuDomainDiskInfo { bool removable; bool locked; bool tray_open; + int io_status; }; #endif /* __QEMUD_CONF_H */ diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 196fd23..6dc6a53 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -11740,6 +11740,88 @@ cleanup: return ret; } +static int +qemuDomainGetDiskErrors(virDomainPtr dom, + virDomainDiskErrorPtr errors, + int nerrors, + unsigned int flags) +{ + struct qemud_driver *driver = dom->conn->privateData; + virDomainObjPtr vm = NULL; + qemuDomainObjPrivatePtr priv; + char uuidstr[VIR_UUID_STRING_BUFLEN]; + virHashTablePtr table = NULL; + int ret = -1; + int i; + int n = 0; + + virCheckFlags(0, -1); + + qemuDriverLock(driver); + virUUIDFormat(dom->uuid, uuidstr); + vm = virDomainFindByUUID(&driver->domains, dom->uuid); + qemuDriverUnlock(driver); + + if (!vm) { + qemuReportError(VIR_ERR_NO_DOMAIN, + _("no domain with matching uuid '%s'"), uuidstr); + goto cleanup; + } + priv = vm->privateData; + + if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_QUERY) < 0) + goto cleanup; + + if (!virDomainObjIsActive(vm)) { + qemuReportError(VIR_ERR_OPERATION_INVALID, + "%s", _("domain is not running")); + goto endjob; + } + + qemuDomainObjEnterMonitor(driver, vm); + table = qemuMonitorGetBlockInfo(priv->mon); + qemuDomainObjExitMonitor(driver, vm); + if (!table) + goto endjob; + + for (i = n = 0; i < vm->def->ndisks; i++) { + struct qemuDomainDiskInfo *info; + virDomainDiskDefPtr disk = vm->def->disks[i]; + + if ((info = virHashLookup(table, disk->info.alias)) && + info->io_status != VIR_DOMAIN_DISK_ERROR_NONE) { + if (n >= nerrors) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("too many disks with error")); + goto endjob; + } + + if (!(errors[n].disk = strdup(disk->dst))) { + virReportOOMError(); + goto endjob; + } + errors[n].error = info->io_status; + n++; + } + } + + ret = n; + +endjob: + if (qemuDomainObjEndJob(driver, vm) == 0) + vm = NULL; + +cleanup: + if (vm) + virDomainObjUnlock(vm); + virHashFree(table); + if (ret < 0) { + for (i = 0; i < n; i++) + VIR_FREE(errors[i].disk); + } + return ret; +} + static virDriver qemuDriver = { .no = VIR_DRV_QEMU, .name = "QEMU", @@ -11893,6 +11975,7 @@ static virDriver qemuDriver = { .domainGetNumaParameters = qemuDomainGetNumaParameters, /* 0.9.9 */ .domainGetInterfaceParameters = qemuDomainGetInterfaceParameters, /* 0.9.9 */ .domainSetInterfaceParameters = qemuDomainSetInterfaceParameters, /* 0.9.9 */ + .domainGetDiskErrors = qemuDomainGetDiskErrors, /* 0.9.10 */ }; diff --git a/src/qemu/qemu_monitor.c b/src/qemu/qemu_monitor.c index dda0521..7872e3d 100644 --- a/src/qemu/qemu_monitor.c +++ b/src/qemu/qemu_monitor.c @@ -87,6 +87,20 @@ VIR_ENUM_IMPL(qemuMonitorVMStatus, "postmigrate", "prelaunch", "finish-migrate", "restore-vm", "running", "save-vm", "shutdown", "watchdog") +typedef enum { + QEMU_MONITOR_BLOCK_IO_STATUS_OK, + QEMU_MONITOR_BLOCK_IO_STATUS_FAILED, + QEMU_MONITOR_BLOCK_IO_STATUS_NOSPACE, + + QEMU_MONITOR_BLOCK_IO_STATUS_LAST +} qemuMonitorBlockIOStatus; + +VIR_ENUM_DECL(qemuMonitorBlockIOStatus) + +VIR_ENUM_IMPL(qemuMonitorBlockIOStatus, + QEMU_MONITOR_BLOCK_IO_STATUS_LAST, + "ok", "failed", "nospace") + char *qemuMonitorEscapeArg(const char *in) { int len = 0; @@ -1227,6 +1241,32 @@ int qemuMonitorGetMemoryStats(qemuMonitorPtr mon, return ret; } +int +qemuMonitorBlockIOStatusToError(const char *status) +{ + int st = qemuMonitorBlockIOStatusTypeFromString(status); + + if (st < 0) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + _("unknown block IO status: %s"), status); + return -1; + } + + switch ((qemuMonitorBlockIOStatus) st) { + case QEMU_MONITOR_BLOCK_IO_STATUS_OK: + return VIR_DOMAIN_DISK_ERROR_NONE; + case QEMU_MONITOR_BLOCK_IO_STATUS_FAILED: + return VIR_DOMAIN_DISK_ERROR_UNSPEC; + case QEMU_MONITOR_BLOCK_IO_STATUS_NOSPACE: + return VIR_DOMAIN_DISK_ERROR_NO_SPACE; + + /* unreachable */ + case QEMU_MONITOR_BLOCK_IO_STATUS_LAST: + break; + } + return -1; +} + virHashTablePtr qemuMonitorGetBlockInfo(qemuMonitorPtr mon) { diff --git a/src/qemu/qemu_monitor.h b/src/qemu/qemu_monitor.h index a9471fe..5945d3e 100644 --- a/src/qemu/qemu_monitor.h +++ b/src/qemu/qemu_monitor.h @@ -236,6 +236,7 @@ int qemuMonitorGetMemoryStats(qemuMonitorPtr mon, virDomainMemoryStatPtr stats, unsigned int nr_stats); +int qemuMonitorBlockIOStatusToError(const char *status); virHashTablePtr qemuMonitorGetBlockInfo(qemuMonitorPtr mon); struct qemuDomainDiskInfo * qemuMonitorBlockInfoLookup(virHashTablePtr blockInfo, diff --git a/src/qemu/qemu_monitor_json.c b/src/qemu/qemu_monitor_json.c index 0bd9f46..b0ae570 100644 --- a/src/qemu/qemu_monitor_json.c +++ b/src/qemu/qemu_monitor_json.c @@ -1389,6 +1389,7 @@ int qemuMonitorJSONGetBlockInfo(qemuMonitorPtr mon, virJSONValuePtr dev = virJSONValueArrayGet(devices, i); struct qemuDomainDiskInfo *info; const char *thisdev; + const char *status; if (!dev || dev->type != VIR_JSON_TYPE_OBJECT) { qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", @@ -1434,6 +1435,13 @@ int qemuMonitorJSONGetBlockInfo(qemuMonitorPtr mon, */ ignore_value(virJSONValueObjectGetBoolean(dev, "tray-open", &info->tray_open)); + + /* Missing io-status indicates no error */ + if ((status = virJSONValueObjectGetString(dev, "io-status"))) { + info->io_status = qemuMonitorBlockIOStatusToError(status); + if (info->io_status < 0) + goto cleanup; + } } ret = 0; diff --git a/src/qemu/qemu_monitor_text.c b/src/qemu/qemu_monitor_text.c index edeb435..f051a86 100644 --- a/src/qemu/qemu_monitor_text.c +++ b/src/qemu/qemu_monitor_text.c @@ -839,6 +839,21 @@ int qemuMonitorTextGetBlockInfo(qemuMonitorPtr mon, VIR_DEBUG("error reading tray_open: %s", p); else info->tray_open = (tmp != 0); + } else if (STRPREFIX(p, "io-status=")) { + char *end; + char c; + + p += strlen("io-status="); + end = strchr(p, ' '); + if (!end || end > eol) + end = eol; + + c = *end; + *end = '\0'; + info->io_status = qemuMonitorBlockIOStatusToError(p); + *end = c; + if (info->io_status < 0) + goto cleanup; } else { /* ignore because we don't parse all options */ } -- 1.7.8.4

On 01/30/2012 09:01 AM, Jiri Denemark wrote:
--- src/qemu/qemu_conf.h | 1 + src/qemu/qemu_driver.c | 83 ++++++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_monitor.c | 40 ++++++++++++++++++++ src/qemu/qemu_monitor.h | 1 + src/qemu/qemu_monitor_json.c | 8 ++++ src/qemu/qemu_monitor_text.c | 15 ++++++++ 6 files changed, 148 insertions(+), 0 deletions(-)
+static int +qemuDomainGetDiskErrors(virDomainPtr dom, + virDomainDiskErrorPtr errors, + int nerrors, + unsigned int flags) +{
+ qemuDomainObjEnterMonitor(driver, vm); + table = qemuMonitorGetBlockInfo(priv->mon); + qemuDomainObjExitMonitor(driver, vm); + if (!table) + goto endjob; + + for (i = n = 0; i < vm->def->ndisks; i++) { + struct qemuDomainDiskInfo *info; + virDomainDiskDefPtr disk = vm->def->disks[i]; + + if ((info = virHashLookup(table, disk->info.alias)) && + info->io_status != VIR_DOMAIN_DISK_ERROR_NONE) {
Ah, so this only returns disks with errors. Document that in the public API.
+ if (n >= nerrors) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("too many disks with error"));
Do we really want to error out if we can't tell them about _all_ known errors? Let's think back to my request in 1/3 - if the user can query how many errors are present, and at the moment we answer the query, there is only 1 disk with an error, but before they can allocate and repeat the request, we encounter a second error. Either they should _always_ pass us space for _all_ disks tied to the domain (and fail if nerrors < vm->def->ndisks), or we should allow for races with the user passing in a too-small array, where we just give back as much information as they gave us room for, rather than erroring out because they didn't give us enough room. In fact, if you go with my request to allow the user shorthand, I'm okay with special case nerrors == 0 to just blindly return vm->def->ndisks, even if there aren't that many errors, so that the user will be unlikely to ever pass us too small of an array (the user should be prepared for the return value to be less than maxerrors on input, in fact, a return value of 0 when there are no errors would be reasonable in the case where the array is non-NULL). The virTypedParameter cases must not reject a too-small array, but then again, that is for back-compat reasons (it is likely that a newer server introduces attributes not expected by an older client, so if the older client hard-coded the array size, the newer server should still deliver only the older stats that the older client was expecting); but for the disk error case, the number of disks is dependent on how many disks are attached to the domain, and not something where newer servers will see more disks than older clients, so for this API, I can live with an error if the user's incoming array is too small.
+++ b/src/qemu/qemu_monitor_text.c @@ -839,6 +839,21 @@ int qemuMonitorTextGetBlockInfo(qemuMonitorPtr mon, VIR_DEBUG("error reading tray_open: %s", p);
Do we still need to implement this in qemu_monitor_text, now that we require JSON for any qemu version new enough to actually provide this information in the first place? -- Eric Blake eblake@redhat.com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On Mon, Jan 30, 2012 at 16:13:37 -0700, Eric Blake wrote:
On 01/30/2012 09:01 AM, Jiri Denemark wrote:
--- src/qemu/qemu_conf.h | 1 + src/qemu/qemu_driver.c | 83 ++++++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_monitor.c | 40 ++++++++++++++++++++ src/qemu/qemu_monitor.h | 1 + src/qemu/qemu_monitor_json.c | 8 ++++ src/qemu/qemu_monitor_text.c | 15 ++++++++ 6 files changed, 148 insertions(+), 0 deletions(-)
...
+++ b/src/qemu/qemu_monitor_text.c @@ -839,6 +839,21 @@ int qemuMonitorTextGetBlockInfo(qemuMonitorPtr mon, VIR_DEBUG("error reading tray_open: %s", p);
Do we still need to implement this in qemu_monitor_text, now that we require JSON for any qemu version new enough to actually provide this information in the first place?
I guess we don't but v1 predates your patch and I tested the code worked so I just didn't remove it :-) Jirka

This command lists all disk devices with errors --- tools/virsh.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ tools/virsh.pod | 7 ++++ 2 files changed, 96 insertions(+), 0 deletions(-) diff --git a/tools/virsh.c b/tools/virsh.c index 3a59746..1b93852 100644 --- a/tools/virsh.c +++ b/tools/virsh.c @@ -16037,6 +16037,94 @@ cleanup: } /* + * "domioerror" command + */ +static const char * +vshDomainIOErrorToString(int error) +{ + switch ((virDomainDiskErrorCode) error) { + case VIR_DOMAIN_DISK_ERROR_NONE: + return _("no error"); + case VIR_DOMAIN_DISK_ERROR_UNSPEC: + return _("unspecified error"); + case VIR_DOMAIN_DISK_ERROR_NO_SPACE: + return _("no space"); + case VIR_DOMAIN_DISK_ERROR_LAST: + ; + } + + return _("unknown error"); +} + +static const vshCmdInfo info_domblkerror[] = { + {"help", N_("Show errors on block devices")}, + {"desc", N_("Show block device errors")}, + {NULL, NULL} +}; + +static const vshCmdOptDef opts_domblkerror[] = { + {"domain", VSH_OT_DATA, VSH_OFLAG_REQ, N_("domain name, id, or uuid")}, + {NULL, 0, 0, NULL} +}; + +static bool +cmdDomBlkError(vshControl *ctl, const vshCmd *cmd) +{ + virDomainPtr dom; + char *xml; + xmlDocPtr xmldoc = NULL; + xmlXPathContextPtr ctxt = NULL; + virDomainDiskErrorPtr disks = NULL; + int ndisks; + int i; + int count; + bool ret = false; + + if (!vshConnectionUsability(ctl, ctl->conn)) + return false; + + if (!(dom = vshCommandOptDomain(ctl, cmd, NULL))) + return false; + + if (!(xml = virDomainGetXMLDesc(dom, 0))) + goto cleanup; + + xmldoc = virXMLParseStringCtxt(xml, _("(domain_definition)"), &ctxt); + VIR_FREE(xml); + if (!xmldoc) + goto cleanup; + + if (virXPathInt("count(./devices/disk)", ctxt, &ndisks) < 0) + goto cleanup; + + if (ndisks < 1) { + vshError(ctl, _("no disks attached to this domain")); + goto cleanup; + } + + if (VIR_ALLOC_N(disks, ndisks) < 0) + goto cleanup; + + if ((count = virDomainGetDiskErrors(dom, disks, ndisks, 0)) == -1) + goto cleanup; + + for (i = 0; i < count; i++) { + vshPrint(ctl, "%s: %s\n", + disks[i].disk, + vshDomainIOErrorToString(disks[i].error)); + } + + ret = true; + +cleanup: + VIR_FREE(disks); + xmlXPathFreeContext(ctxt); + xmlFreeDoc(xmldoc); + virDomainFree(dom); + return ret; +} + +/* * "qemu-monitor-command" command */ static const vshCmdInfo info_qemu_monitor_command[] = { @@ -16240,6 +16328,7 @@ static const vshCmdDef domMonitoringCmds[] = { {"domblklist", cmdDomblklist, opts_domblklist, info_domblklist, 0}, {"domblkstat", cmdDomblkstat, opts_domblkstat, info_domblkstat, 0}, {"domcontrol", cmdDomControl, opts_domcontrol, info_domcontrol, 0}, + {"domblkerror", cmdDomBlkError, opts_domblkerror, info_domblkerror, 0}, {"domif-getlink", cmdDomIfGetLink, opts_domif_getlink, info_domif_getlink, 0}, {"domiflist", cmdDomiflist, opts_domiflist, info_domiflist, 0}, {"domifstat", cmdDomIfstat, opts_domifstat, info_domifstat, 0}, diff --git a/tools/virsh.pod b/tools/virsh.pod index 4bc25bf..e9598ac 100644 --- a/tools/virsh.pod +++ b/tools/virsh.pod @@ -507,6 +507,13 @@ on hypervisor. Get memory stats for a running domain. +=item B<domblkerror> I<domain-id> + +Show errors on block devices. This command usually comes handy when +B<domstate> command says that a domain was paused due to I/O error. +The B<domblkerror> command lists all block devices in error state and +the error seen on each of them. + =item B<domblkinfo> I<domain> I<block-device> Get block device size info for a domain. A I<block-device> corresponds -- 1.7.8.4

On 01/30/2012 09:01 AM, Jiri Denemark wrote:
This command lists all disk devices with errors --- tools/virsh.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ tools/virsh.pod | 7 ++++ 2 files changed, 96 insertions(+), 0 deletions(-)
diff --git a/tools/virsh.c b/tools/virsh.c index 3a59746..1b93852 100644 --- a/tools/virsh.c +++ b/tools/virsh.c @@ -16037,6 +16037,94 @@ cleanup: }
/* + * "domioerror" command + */
+ +static const vshCmdInfo info_domblkerror[] = {
Looks like you missed a rename; s/domioerror/domblkerror/ in the comment.
+ {"help", N_("Show errors on block devices")}, + {"desc", N_("Show block device errors")}, + {NULL, NULL} +}; + +static const vshCmdOptDef opts_domblkerror[] = { + {"domain", VSH_OT_DATA, VSH_OFLAG_REQ, N_("domain name, id, or uuid")}, + {NULL, 0, 0, NULL}
Should we also allow this additional usage: virsh domblkerror dom vda which lists the status of just vda (no error, no space left, ...)? That would mean adding: {"disk", VSH_OT_DATA, VSH_OFLAG_OPT, N_("particular disk to check")} then filtering through the libvirt API to match just that disk? You can say "no, that's overkill" and not implement it, and I won't be hurt.
+ if (!(xml = virDomainGetXMLDesc(dom, 0))) + goto cleanup; + + xmldoc = virXMLParseStringCtxt(xml, _("(domain_definition)"), &ctxt); + VIR_FREE(xml); + if (!xmldoc) + goto cleanup; + + if (virXPathInt("count(./devices/disk)", ctxt, &ndisks) < 0) + goto cleanup;
Guess what - if we made the API take NULL,0 as a shorthand to query how big to make our array, you wouldn't have to resort to this XML parsing.
+ if ((count = virDomainGetDiskErrors(dom, disks, ndisks, 0)) == -1) + goto cleanup; + + for (i = 0; i < count; i++) { + vshPrint(ctl, "%s: %s\n", + disks[i].disk, + vshDomainIOErrorToString(disks[i].error)); + }
Are we okay that if there are no disk errors (count is 0), we have no output, not even mentioning that the command succeeded?
@@ -16240,6 +16328,7 @@ static const vshCmdDef domMonitoringCmds[] = { {"domblklist", cmdDomblklist, opts_domblklist, info_domblklist, 0}, {"domblkstat", cmdDomblkstat, opts_domblkstat, info_domblkstat, 0}, {"domcontrol", cmdDomControl, opts_domcontrol, info_domcontrol, 0}, + {"domblkerror", cmdDomBlkError, opts_domblkerror, info_domblkerror, 0},
Sort this line earlier. Overall, I like the series, and want this API in 0.9.10; can we get a v3 prior to RC1? -- Eric Blake eblake@redhat.com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On Mon, Jan 30, 2012 at 16:20:07 -0700, Eric Blake wrote:
On 01/30/2012 09:01 AM, Jiri Denemark wrote:
This command lists all disk devices with errors --- tools/virsh.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ tools/virsh.pod | 7 ++++ 2 files changed, 96 insertions(+), 0 deletions(-) ... +static const vshCmdOptDef opts_domblkerror[] = { + {"domain", VSH_OT_DATA, VSH_OFLAG_REQ, N_("domain name, id, or uuid")}, + {NULL, 0, 0, NULL}
Should we also allow this additional usage:
virsh domblkerror dom vda
which lists the status of just vda (no error, no space left, ...)? That would mean adding:
{"disk", VSH_OT_DATA, VSH_OFLAG_OPT, N_("particular disk to check")}
then filtering through the libvirt API to match just that disk? You can say "no, that's overkill" and not implement it, and I won't be hurt.
I think grep/sed/awk and similar friends can do similar job here if anyone wants to get the error status of just one specific disk.
+ if ((count = virDomainGetDiskErrors(dom, disks, ndisks, 0)) == -1) + goto cleanup; + + for (i = 0; i < count; i++) { + vshPrint(ctl, "%s: %s\n", + disks[i].disk, + vshDomainIOErrorToString(disks[i].error)); + }
Are we okay that if there are no disk errors (count is 0), we have no output, not even mentioning that the command succeeded?
I guess we are not and I changed that. I also made this command consistent with python API which reports no disk errors for domain without disks instead of complaining that the domain has no disks. Jirka
participants (2)
-
Eric Blake
-
Jiri Denemark