RBD supports cloning by creating a snapshot, protecting it and then creating
a child image based on that snapshot.
The RBD storage driver will try to find a snapshot with zero deltas between
the current state of the original volume and the snapshot.
If such a snapshot is found a clone/child image will be created using
the rbd_clone2() function from librbd.
It will use the same features, stripe unit and stripe count as the parent image.
As long as the parent image does not change, this implementation will only
create a single snapshot on it. That should improve performance when removing
the parent image at some point.
During build the decision will be made to use rbd_diff_iterate() or rbd_diff_iterate2().
The latter is faster, but only available on Ceph versions after 0.94 (Hammer).
Cloning is only supported if RBD format 2 is used. All images created by libvirt
are already format 2.
If an RBD format 1 image is used as the original volume, libvirt will return
VIR_ERR_OPERATION_UNSUPPORTED.
Signed-off-by: Wido den Hollander <wido(a)widodh.nl>
---
src/storage/storage_backend_rbd.c | 340 ++++++++++++++++++++++++++++++++++++++
1 file changed, 340 insertions(+)
diff --git a/src/storage/storage_backend_rbd.c b/src/storage/storage_backend_rbd.c
index c0001d0..e353be9 100644
--- a/src/storage/storage_backend_rbd.c
+++ b/src/storage/storage_backend_rbd.c
@@ -33,6 +33,7 @@
#include "viruuid.h"
#include "virstring.h"
#include "virutil.h"
+#include "time.h"
#include "rados/librados.h"
#include "rbd/librbd.h"
@@ -662,6 +663,344 @@ virStorageBackendRBDBuildVol(virConnectPtr conn,
return ret;
}
+/*
+ * Query the properties of an open RBD image that a clone has to inherit.
+ *
+ * @image: open RBD image handle
+ * @volname: name of the image, used only in error messages
+ * @features: receives the result of rbd_get_features()
+ * @stripe_unit: receives the result of rbd_get_stripe_unit()
+ * @stripe_count: receives the result of rbd_get_stripe_count()
+ *
+ * Old-format images are rejected with VIR_ERR_OPERATION_UNSUPPORTED.
+ *
+ * Returns 0 on success or a negative errno-style value on failure. An
+ * error has been reported for every failure.
+ */
+static int virStorageBackendRBDImageInfo(rbd_image_t image, char *volname,
+                                         uint64_t *features,
+                                         uint64_t *stripe_unit,
+                                         uint64_t *stripe_count)
+{
+    int r = -1;
+    uint8_t oldformat;
+
+    r = rbd_get_old_format(image, &oldformat);
+    if (r < 0) {
+        virReportSystemError(-r, _("failed to get the format of RBD image %s"),
+                             volname);
+        goto cleanup;
+    }
+
+    if (oldformat != 0) {
+        virReportError(VIR_ERR_OPERATION_UNSUPPORTED,
+                       _("RBD image %s is old format. Does not support "
+                         "extended features and striping"),
+                       volname);
+        /*
+         * Callers test for r < 0, so the failure has to be a negative
+         * value. VIR_ERR_OPERATION_UNSUPPORTED is a positive error code
+         * and would be mistaken for success.
+         */
+        r = -ENOTSUP;
+        goto cleanup;
+    }
+
+    r = rbd_get_features(image, features);
+    if (r < 0) {
+        virReportSystemError(-r, _("failed to get the features of RBD image %s"),
+                             volname);
+        goto cleanup;
+    }
+
+    r = rbd_get_stripe_unit(image, stripe_unit);
+    if (r < 0) {
+        virReportSystemError(-r, _("failed to get the stripe unit of RBD image %s"),
+                             volname);
+        goto cleanup;
+    }
+
+    r = rbd_get_stripe_count(image, stripe_count);
+    if (r < 0) {
+        virReportSystemError(-r, _("failed to get the stripe count of RBD image %s"),
+                             volname);
+        goto cleanup;
+    }
+
+    r = 0;
+
+ cleanup:
+    return r;
+}
+
+/*
+ * Callback function for rbd_diff_iterate() / rbd_diff_iterate2().
+ *
+ * offset, length and exists are ignored. arg is written through as an
+ * int pointer, so it has to point at the caller's int "diff" flag.
+ *
+ * Always returns -1.
+ * NOTE(review): presumably a negative return makes librbd stop iterating
+ * after the first reported extent - confirm against the librbd
+ * documentation, and check whether librbd hands this -1 back as the
+ * result of the iterate call.
+ */
+static int virStorageBackendRBDIterateCb(uint64_t offset ATTRIBUTE_UNUSED,
+ size_t length ATTRIBUTE_UNUSED,
+ int exists ATTRIBUTE_UNUSED,
+ void *arg)
+{
+ /*
+ * Just set that there is a diff for this snapshot, we do not care where
+ * the difference is located or how large it is.
+ */
+ *(int*) arg = 1;
+ return -1;
+}
+}
+
+/*
+ * Look for an existing snapshot of an RBD image which has no differences
+ * compared to the current state of that image.
+ *
+ * @image: open RBD image handle
+ * @imgname: name of the image, used only in log and error messages
+ * @snapname: buffer to which the name of the matching snapshot is appended
+ *
+ * Returns 0 when a matching snapshot was found, -ENOENT when the image has
+ * no snapshots or all of them have deltas, and another negative value on
+ * failure (an error has been reported in that case).
+ */
+static int virStorageBackendRBDSnapshotFindNoDiff(rbd_image_t image, char *imgname,
+                                                  virBufferPtr snapname)
+{
+    int r = -1;
+    int snap_count;
+    int max_snaps = 128;
+    size_t i;
+    int diff;
+    rbd_snap_info_t *snaps = NULL;
+    rbd_image_info_t info;
+
+    r = rbd_stat(image, &info, sizeof(info));
+    if (r < 0) {
+        virReportSystemError(-r, _("failed to stat the RBD image %s"),
+                             imgname);
+        goto cleanup;
+    }
+
+    /*
+     * rbd_snap_list() is retried for as long as it answers -ERANGE; it is
+     * handed &max_snaps on every call so the next allocation uses whatever
+     * value it left there.
+     */
+    do {
+        if (VIR_ALLOC_N(snaps, max_snaps) < 0) {
+            /* r still holds the success value of rbd_stat() at this point */
+            r = -ENOMEM;
+            goto cleanup;
+        }
+
+        snap_count = rbd_snap_list(image, snaps, &max_snaps);
+        if (snap_count <= 0)
+            VIR_FREE(snaps);
+
+    } while (snap_count == -ERANGE);
+
+    /* A listing failure is not the same thing as "no snapshot found" */
+    if (snap_count < 0) {
+        virReportSystemError(-snap_count,
+                             _("failed to list the snapshots of RBD image %s"),
+                             imgname);
+        r = snap_count;
+        goto cleanup;
+    }
+
+    VIR_DEBUG("Found %d snapshots for RBD image %s", snap_count, imgname);
+
+    if (snap_count == 0) {
+        r = -ENOENT;
+        goto cleanup;
+    }
+
+    for (i = 0; i < (size_t)snap_count; i++) {
+        VIR_DEBUG("Quering diff for RBD snapshot %s@%s", imgname,
+                  snaps[i].name);
+
+        /* The callback will set diff to non-zero if there is a diff */
+        diff = 0;
+
+/*
+ * rbd_diff_iterate2() is available in versions above Ceph 0.94 (Hammer)
+ * It uses a object map inside Ceph which is faster than rbd_diff_iterate()
+ * which iterates all objects.
+ */
+#if LIBRBD_VERSION_CODE > 266
+        r = rbd_diff_iterate2(image, snaps[i].name, 0, info.size, 0, 1,
+#else
+        r = rbd_diff_iterate(image, snaps[i].name, 0, info.size,
+#endif
+                             virStorageBackendRBDIterateCb, (void *)&diff);
+
+        /*
+         * The callback returns a negative value once it has flagged a
+         * difference. Should librbd pass that value through as the result
+         * of the iteration it must not be mistaken for a failure, so a
+         * negative result only counts as an error when no difference was
+         * flagged.
+         */
+        if (r < 0 && diff == 0) {
+            virReportSystemError(-r, _("failed to iterate RBD snapshot %s@%s"),
+                                 imgname, snaps[i].name);
+            goto cleanup;
+        }
+
+        if (diff == 0) {
+            VIR_DEBUG("RBD snapshot %s@%s has no delta", imgname,
+                      snaps[i].name);
+            virBufferAsprintf(snapname, "%s", snaps[i].name);
+            r = 0;
+            goto cleanup;
+        }
+
+        VIR_DEBUG("RBD snapshot %s@%s has deltas", imgname,
+                  snaps[i].name);
+    }
+
+    r = -ENOENT;
+
+ cleanup:
+    /* snaps is only non-NULL here after a listing with at least one entry */
+    if (snaps)
+        rbd_snap_list_end(snaps);
+
+    VIR_FREE(snaps);
+
+    return r;
+}
+
+/*
+ * Create the snapshot @snapname on the open RBD image @image.
+ *
+ * @imgname is only used in log and error messages.
+ *
+ * Returns the result of rbd_snap_create(); when that is negative an error
+ * has been reported.
+ */
+static int virStorageBackendRBDSnapshotCreate(rbd_image_t image, char *imgname,
+                                              char *snapname)
+{
+    int rc;
+
+    VIR_DEBUG("Creating RBD snapshot %s@%s", imgname, snapname);
+
+    rc = rbd_snap_create(image, snapname);
+    if (rc < 0) {
+        virReportSystemError(-rc, _("failed to create RBD snapshot %s@%s"),
+                             imgname, snapname);
+    }
+
+    return rc;
+}
+
+/*
+ * Make sure the snapshot @snapname of the open RBD image @image is
+ * protected, protecting it when it is not protected yet.
+ *
+ * @imgname is only used in log and error messages.
+ *
+ * Returns the result of the last librbd call; when that is negative an
+ * error has been reported.
+ */
+static int virStorageBackendRBDSnapshotProtect(rbd_image_t image, char *imgname,
+                                               char *snapname)
+{
+    int rc;
+    int is_protected;
+
+    VIR_DEBUG("Quering if RBD snapshot %s@%s is protected", imgname,
+              snapname);
+
+    rc = rbd_snap_is_protected(image, snapname, &is_protected);
+    if (rc < 0) {
+        virReportSystemError(-rc, _("failed verify if RBD snapshot %s@%s "
+                                    "is protected"), imgname, snapname);
+        return rc;
+    }
+
+    /* Nothing left to do when the snapshot is protected already */
+    if (is_protected != 0) {
+        VIR_DEBUG("RBD Snapshot %s@%s is already protected", imgname,
+                  snapname);
+        return rc;
+    }
+
+    VIR_DEBUG("RBD Snapshot %s@%s is not protected, protecting",
+              imgname, snapname);
+
+    rc = rbd_snap_protect(image, snapname);
+    if (rc < 0) {
+        virReportSystemError(-rc, _("failed protect RBD snapshot %s@%s"),
+                             imgname, snapname);
+    }
+
+    return rc;
+}
+
+/*
+ * Clone the RBD image @origvol into a new image @newvol using the I/O
+ * context @io for both the parent and the child.
+ *
+ * A snapshot of @origvol without deltas is reused when one exists,
+ * otherwise a new snapshot named "libvirt-<unix time>" is created. The
+ * snapshot is protected and then used as the parent of the clone, which
+ * gets the features, stripe unit and stripe count of @origvol.
+ *
+ * Returns the non-negative result of rbd_clone2() on success or a negative
+ * value on failure. An error has been reported for every failure.
+ */
+static int virStorageBackendRBDCloneImage(rados_ioctx_t io, char *origvol,
+                                          char *newvol)
+{
+    int r = -1;
+    int order = 0;
+    uint64_t features = 0;
+    uint64_t stripe_count = 0;
+    uint64_t stripe_unit = 0;
+    virBuffer snapname = VIR_BUFFER_INITIALIZER;
+    /* Has to start out as NULL: cleanup frees it on the early error paths */
+    char *snapname_buff = NULL;
+    rbd_image_t image = NULL;
+
+    r = rbd_open(io, origvol, &image, NULL);
+    if (r < 0) {
+        virReportSystemError(-r, _("failed to open the RBD image %s"),
+                             origvol);
+        goto cleanup;
+    }
+
+    /*
+     * The virStorageBackendRBD* helpers called below report their own
+     * errors. Reporting once more here would replace the specific error
+     * with a generic one.
+     */
+    r = virStorageBackendRBDImageInfo(image, origvol, &features, &stripe_unit,
+                                      &stripe_count);
+    if (r < 0)
+        goto cleanup;
+
+    /*
+     * First we attempt to find a snapshot which has no differences between
+     * the current state of the RBD image.
+     *
+     * This prevents us from creating a new snapshot for every clone operation
+     * while it could be that the original volume has not changed
+     */
+    r = virStorageBackendRBDSnapshotFindNoDiff(image, origvol, &snapname);
+    if (r < 0 && r != -ENOENT)
+        goto cleanup;
+
+    /*
+     * No such snapshot could be found, so we will create a new snapshot
+     * and use that for cloning
+     */
+    if (r == -ENOENT) {
+        VIR_DEBUG("No RBD snapshot with zero delta could be found for image %s",
+                  origvol);
+
+        virBufferAsprintf(&snapname, "libvirt-%d", (int)time(NULL));
+
+        if (virBufferCheckError(&snapname) < 0) {
+            /* Do not leave the -ENOENT of the snapshot search in r */
+            r = -ENOMEM;
+            goto cleanup;
+        }
+
+        snapname_buff = virBufferContentAndReset(&snapname);
+    } else {
+        if (virBufferCheckError(&snapname) < 0) {
+            r = -ENOMEM;
+            goto cleanup;
+        }
+
+        snapname_buff = virBufferContentAndReset(&snapname);
+    }
+
+    /* Never hand a missing snapshot name to librbd */
+    if (!snapname_buff) {
+        virReportError(VIR_ERR_INTERNAL_ERROR,
+                       _("no snapshot name available for cloning RBD image %s"),
+                       origvol);
+        r = -EINVAL;
+        goto cleanup;
+    }
+
+    if (r == -ENOENT) {
+        r = virStorageBackendRBDSnapshotCreate(image, origvol, snapname_buff);
+        if (r < 0)
+            goto cleanup;
+    } else {
+        VIR_DEBUG("Found RBD snapshot %s with zero delta for image %s",
+                  snapname_buff, origvol);
+    }
+
+    VIR_DEBUG("Using snapshot name %s for cloning RBD image %s to %s",
+              snapname_buff, origvol, newvol);
+
+    /*
+     * RBD snapshots have to be 'protected' before they can be used
+     * as a parent snapshot for a child image
+     */
+    r = virStorageBackendRBDSnapshotProtect(image, origvol, snapname_buff);
+    if (r < 0)
+        goto cleanup;
+
+    VIR_DEBUG("Performing RBD clone from %s to %s", origvol, newvol);
+
+    r = rbd_clone2(io, origvol, snapname_buff, io, newvol, features, &order,
+                   stripe_unit, stripe_count);
+    if (r < 0) {
+        virReportSystemError(-r, _("failed to clone RBD volume %s to %s"),
+                             origvol, newvol);
+        goto cleanup;
+    }
+
+    VIR_DEBUG("Cloned RBD image %s to %s", origvol, newvol);
+
+ cleanup:
+    virBufferFreeAndReset(&snapname);
+    VIR_FREE(snapname_buff);
+
+    if (image)
+        rbd_close(image);
+
+    return r;
+}
+
+/*
+ * buildVolFrom callback of the RBD storage backend: create the volume
+ * @newvol as a clone of @origvol inside @pool.
+ *
+ * @conn: connection handed on to virStorageBackendRBDOpenRADOSConn()
+ * @pool: pool holding both volumes
+ * @newvol: definition of the volume to create; only its name is used
+ * @origvol: definition of the volume to clone; only its name is used
+ * @flags: no flags are supported, has to be 0
+ *
+ * Returns the non-negative result of the clone on success or a negative
+ * value on failure.
+ */
+static int virStorageBackendRBDBuildVolFrom(virConnectPtr conn,
+                                            virStoragePoolObjPtr pool,
+                                            virStorageVolDefPtr newvol,
+                                            virStorageVolDefPtr origvol,
+                                            unsigned int flags)
+{
+    virStorageBackendRBDState ptr;
+    int r = -1;
+
+    /* Cleared up front so the close call in cleanup sees defined values */
+    ptr.cluster = NULL;
+    ptr.ioctx = NULL;
+
+    VIR_DEBUG("Creating clone of RBD image %s/%s with name %s",
+              pool->def->source.name, origvol->name, newvol->name);
+
+    virCheckFlags(0, -1);
+
+    if (virStorageBackendRBDOpenRADOSConn(&ptr, conn, &pool->def->source) < 0)
+        goto cleanup;
+
+    if (virStorageBackendRBDOpenIoCTX(&ptr, pool) < 0)
+        goto cleanup;
+
+    /*
+     * virStorageBackendRBDCloneImage() reports its own errors. Reporting
+     * again here would replace the specific error with a generic one.
+     */
+    r = virStorageBackendRBDCloneImage(ptr.ioctx, origvol->name, newvol->name);
+
+ cleanup:
+    virStorageBackendRBDCloseRADOSConn(&ptr);
+    return r;
+}
+
static int virStorageBackendRBDRefreshVol(virConnectPtr conn,
virStoragePoolObjPtr pool ATTRIBUTE_UNUSED,
virStorageVolDefPtr vol)
@@ -890,6 +1229,7 @@ virStorageBackend virStorageBackendRBD = {
.refreshPool = virStorageBackendRBDRefreshPool,
.createVol = virStorageBackendRBDCreateVol,
.buildVol = virStorageBackendRBDBuildVol,
+ .buildVolFrom = virStorageBackendRBDBuildVolFrom,
.refreshVol = virStorageBackendRBDRefreshVol,
.deleteVol = virStorageBackendRBDDeleteVol,
.wipeVol = virStorageBackendRBDVolWipe,
--
1.9.1