[libvirt] [PATCH] rbd: Do not error out on a single image during pool refresh

It could happen that rbd_list() returns X names, but that while refreshing the pool one of those RBD images is removed from Ceph through a different route then libvirt. We do not need to error out in such case, we can simply ignore the volume and continue. error : volStorageBackendRBDRefreshVolInfo:289 : failed to open the RBD image 'vol-998': No such file or directory It could also be that one or more Placement Groups (PGs) inside Ceph are inactive due to a system failure. If that happens it could be that some RBD images can not be refreshed and a timeout will be raised by librados. error : volStorageBackendRBDRefreshVolInfo:289 : failed to open the RBD image 'vol-893': Connection timed out Ignore the error and continue to refresh the rest of the pool's contents. Signed-off-by: Wido den Hollander <wido@widodh.nl> --- src/storage/storage_backend_rbd.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/src/storage/storage_backend_rbd.c b/src/storage/storage_backend_rbd.c index 80684eb..888e3be 100644 --- a/src/storage/storage_backend_rbd.c +++ b/src/storage/storage_backend_rbd.c @@ -279,18 +279,17 @@ static int volStorageBackendRBDRefreshVolInfo(virStorageVolDefPtr vol, virStoragePoolObjPtr pool, virStorageBackendRBDStatePtr ptr) { - int ret = -1; - int r = 0; - rbd_image_t image; + int r = -1; + rbd_image_t image = NULL; + rbd_image_info_t info; r = rbd_open(ptr->ioctx, vol->name, &image, NULL); if (r < 0) { virReportSystemError(-r, _("failed to open the RBD image '%s'"), vol->name); - return ret; + goto cleanup; } - rbd_image_info_t info; r = rbd_stat(image, &info, sizeof(info)); if (r < 0) { virReportSystemError(-r, _("failed to stat the RBD image '%s'"), @@ -308,6 +307,8 @@ static int volStorageBackendRBDRefreshVolInfo(virStorageVolDefPtr vol, vol->type = VIR_STORAGE_VOL_NETWORK; vol->target.format = VIR_STORAGE_FILE_RAW; + r = -1; + VIR_FREE(vol->target.path); if (virAsprintf(&vol->target.path, "%s/%s", pool->def->source.name, @@ -320,11 +321,13 @@ static int volStorageBackendRBDRefreshVolInfo(virStorageVolDefPtr vol, vol->name) == -1) goto cleanup; - ret = 0; + r = 0; cleanup: - rbd_close(image); - return ret; + if (image) + rbd_close(image); + + return r; } static int virStorageBackendRBDRefreshPool(virConnectPtr conn, @@ -399,7 +402,23 @@ static int virStorageBackendRBDRefreshPool(virConnectPtr conn, name += strlen(name) + 1; - if (volStorageBackendRBDRefreshVolInfo(vol, pool, &ptr) < 0) { + r = volStorageBackendRBDRefreshVolInfo(vol, pool, &ptr); + + /* It could be that a volume has been deleted through a different route + * then libvirt and that will cause a -ENOENT to be returned. + * + * Another possibility is that there is something wrong with the placement + * group (PG) that RBD image's header is in and that causes -ETIMEDOUT + * to be returned. + * + * Do not error out and simply ignore the volume + */ + if (r == -ENOENT || r == -ETIMEDOUT) { + virStorageVolDefFree(vol); + continue; + } + + if (r < 0) { virStorageVolDefFree(vol); goto cleanup; } -- 1.9.1

On 12/29/2015 08:35 AM, Wido den Hollander wrote:
It could happen that rbd_list() returns X names, but that while refreshing the pool one of those RBD images is removed from Ceph through a different route then libvirt.
We do not need to error out in such case, we can simply ignore the volume and continue.
error : volStorageBackendRBDRefreshVolInfo:289 : failed to open the RBD image 'vol-998': No such file or directory
It could also be that one or more Placement Groups (PGs) inside Ceph are inactive due to a system failure.
If that happens it could be that some RBD images can not be refreshed and a timeout will be raised by librados.
error : volStorageBackendRBDRefreshVolInfo:289 : failed to open the RBD image 'vol-893': Connection timed out
Ignore the error and continue to refresh the rest of the pool's contents.
Signed-off-by: Wido den Hollander <wido@widodh.nl> --- src/storage/storage_backend_rbd.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-)
Seems reasonable - one nit... This should be two patches..
diff --git a/src/storage/storage_backend_rbd.c b/src/storage/storage_backend_rbd.c index 80684eb..888e3be 100644 --- a/src/storage/storage_backend_rbd.c +++ b/src/storage/storage_backend_rbd.c @@ -279,18 +279,17 @@ static int volStorageBackendRBDRefreshVolInfo(virStorageVolDefPtr vol, virStoragePoolObjPtr pool, virStorageBackendRBDStatePtr ptr) { - int ret = -1; - int r = 0; - rbd_image_t image; + int r = -1; + rbd_image_t image = NULL; + rbd_image_info_t info;
r = rbd_open(ptr->ioctx, vol->name, &image, NULL); if (r < 0) { virReportSystemError(-r, _("failed to open the RBD image '%s'"), vol->name); - return ret; + goto cleanup; }
- rbd_image_info_t info; r = rbd_stat(image, &info, sizeof(info)); if (r < 0) { virReportSystemError(-r, _("failed to stat the RBD image '%s'"), @@ -308,6 +307,8 @@ static int volStorageBackendRBDRefreshVolInfo(virStorageVolDefPtr vol, vol->type = VIR_STORAGE_VOL_NETWORK; vol->target.format = VIR_STORAGE_FILE_RAW;
+ r = -1; + VIR_FREE(vol->target.path); if (virAsprintf(&vol->target.path, "%s/%s", pool->def->source.name, @@ -320,11 +321,13 @@ static int volStorageBackendRBDRefreshVolInfo(virStorageVolDefPtr vol, vol->name) == -1) goto cleanup;
- ret = 0; + r = 0;
cleanup: - rbd_close(image); - return ret; + if (image) + rbd_close(image);
Ahhh a bug fix - should be a separate patch...
+ + return r;
Changes in this function should have been their own patch since they fix a bug. Also, keep "ret = -1;' at the top and then use ret as the return rather than adjusting 'r'. John
}
static int virStorageBackendRBDRefreshPool(virConnectPtr conn, @@ -399,7 +402,23 @@ static int virStorageBackendRBDRefreshPool(virConnectPtr conn,
name += strlen(name) + 1;
- if (volStorageBackendRBDRefreshVolInfo(vol, pool, &ptr) < 0) { + r = volStorageBackendRBDRefreshVolInfo(vol, pool, &ptr); + + /* It could be that a volume has been deleted through a different route + * then libvirt and that will cause a -ENOENT to be returned. + * + * Another possibility is that there is something wrong with the placement + * group (PG) that RBD image's header is in and that causes -ETIMEDOUT + * to be returned. + * + * Do not error out and simply ignore the volume + */ + if (r == -ENOENT || r == -ETIMEDOUT) { + virStorageVolDefFree(vol); + continue; + } + + if (r < 0) { virStorageVolDefFree(vol); goto cleanup; }

It could be that we error out while the RBD image has not been opened yet. This would cause us to call rbd_close() on pointer which has not been initialized. Set it to NULL by default and only close if it is not NULL. Signed-off-by: Wido den Hollander <wido@widodh.nl> --- src/storage/storage_backend_rbd.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/storage/storage_backend_rbd.c b/src/storage/storage_backend_rbd.c index 80684eb..8e2d51b 100644 --- a/src/storage/storage_backend_rbd.c +++ b/src/storage/storage_backend_rbd.c @@ -281,13 +281,13 @@ static int volStorageBackendRBDRefreshVolInfo(virStorageVolDefPtr vol, { int ret = -1; int r = 0; - rbd_image_t image; + rbd_image_t image = NULL; r = rbd_open(ptr->ioctx, vol->name, &image, NULL); if (r < 0) { virReportSystemError(-r, _("failed to open the RBD image '%s'"), vol->name); - return ret; + goto cleanup; } rbd_image_info_t info; @@ -323,7 +323,8 @@ static int volStorageBackendRBDRefreshVolInfo(virStorageVolDefPtr vol, ret = 0; cleanup: - rbd_close(image); + if (image) + rbd_close(image); return ret; } -- 1.9.1

It could happen that rbd_list() returns X names, but that while refreshing the pool one of those RBD images is removed from Ceph through a different route then libvirt. We do not need to error out in such case, we can simply ignore the volume and continue. error : volStorageBackendRBDRefreshVolInfo:289 : failed to open the RBD image 'vol-998': No such file or directory It could also be that one or more Placement Groups (PGs) inside Ceph are inactive due to a system failure. If that happens it could be that some RBD images can not be refreshed and a timeout will be raised by librados. error : volStorageBackendRBDRefreshVolInfo:289 : failed to open the RBD image 'vol-893': Connection timed out Ignore the error and continue to refresh the rest of the pool's contents. Signed-off-by: Wido den Hollander <wido@widodh.nl> --- src/storage/storage_backend_rbd.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/storage/storage_backend_rbd.c b/src/storage/storage_backend_rbd.c index 8e2d51b..8dcb9be 100644 --- a/src/storage/storage_backend_rbd.c +++ b/src/storage/storage_backend_rbd.c @@ -285,6 +285,7 @@ static int volStorageBackendRBDRefreshVolInfo(virStorageVolDefPtr vol, r = rbd_open(ptr->ioctx, vol->name, &image, NULL); if (r < 0) { + ret = -r; virReportSystemError(-r, _("failed to open the RBD image '%s'"), vol->name); goto cleanup; @@ -293,6 +294,7 @@ static int volStorageBackendRBDRefreshVolInfo(virStorageVolDefPtr vol, rbd_image_info_t info; r = rbd_stat(image, &info, sizeof(info)); if (r < 0) { + ret = -r; virReportSystemError(-r, _("failed to stat the RBD image '%s'"), vol->name); goto cleanup; @@ -400,7 +402,21 @@ static int virStorageBackendRBDRefreshPool(virConnectPtr conn, name += strlen(name) + 1; - if (volStorageBackendRBDRefreshVolInfo(vol, pool, &ptr) < 0) { + r = volStorageBackendRBDRefreshVolInfo(vol, pool, &ptr); + + /* It could be that a volume has been deleted through a different route + * then libvirt and that will cause a -ENOENT to be returned. + * + * Another possibility is that there is something wrong with the placement + * group (PG) that RBD image's header is in and that causes -ETIMEDOUT + * to be returned. + * + * Do not error out and simply ignore the volume + */ + if (r < 0) { + if (r == -ENOENT || r == -ETIMEDOUT) + continue; + virStorageVolDefFree(vol); goto cleanup; } -- 1.9.1

On 01/06/2016 04:25 AM, Wido den Hollander wrote:
It could happen that rbd_list() returns X names, but that while refreshing the pool one of those RBD images is removed from Ceph through a different route then libvirt.
We do not need to error out in such case, we can simply ignore the volume and continue.
error : volStorageBackendRBDRefreshVolInfo:289 : failed to open the RBD image 'vol-998': No such file or directory
It could also be that one or more Placement Groups (PGs) inside Ceph are inactive due to a system failure.
If that happens it could be that some RBD images can not be refreshed and a timeout will be raised by librados.
error : volStorageBackendRBDRefreshVolInfo:289 : failed to open the RBD image 'vol-893': Connection timed out
Ignore the error and continue to refresh the rest of the pool's contents.
Signed-off-by: Wido den Hollander <wido@widodh.nl> --- src/storage/storage_backend_rbd.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-)
ACK and pushed both patches - tks, John
participants (2)
-
John Ferlan
-
Wido den Hollander