[libvirt] [PATCHv8 00/11] blockjob: storage migration via block-copy

v7 of these patches is here: https://www.redhat.com/archives/libvir-list/2012-September/msg01122.html Since then, I have rebased the series on top of my block-commit work, which changed (and simplified) several patches. Also, upstream qemu is closer to taking Paolo's patches that actually add the 'drive-mirror' monitor command, although it still isn't official there; so I'm still a bit reluctant to push patches 3-11 into libvirt.git until qemu.git actually gains support for block copy. Patches 1 and 2 are new, and can be applied now if they get a good review. I've done some light testing with the qemu-kvm-rhev of RHEV 3.0, so I'm fairly confident that my rebase work is correct. Also available at: http://repo.or.cz/w/libvirt/ericb.git/shortlog/refs/heads/blockjob git fetch git://repo.or.cz/libvirt/ericb.git blockjob Eric Blake (11): storage: let format probing work on root-squash NFS blockjob: sanity check when creating snapshot blockjob: add qemu capabilities related to block jobs blockjob: react to active block copy blockjob: return appropriate event and info blockjob: support pivot operation on cancel blockjob: make drive-reopen safer blockjob: implement block copy for qemu blockjob: allow for existing files in block-copy blockjob: allow mirroring under SELinux and cgroup blockjob: relabel entire existing chain include/libvirt/libvirt.h.in | 1 + src/conf/domain_conf.c | 12 ++ src/conf/domain_conf.h | 1 + src/libvirt.c | 7 +- src/libvirt_private.syms | 1 + src/qemu/qemu_capabilities.c | 6 + src/qemu/qemu_capabilities.h | 2 + src/qemu/qemu_driver.c | 431 ++++++++++++++++++++++++++++++++++++++- src/qemu/qemu_hotplug.c | 7 + src/qemu/qemu_monitor.c | 52 +++++ src/qemu/qemu_monitor.h | 14 ++ src/qemu/qemu_monitor_json.c | 96 ++++++++- src/qemu/qemu_monitor_json.h | 21 +- src/qemu/qemu_process.c | 3 + src/storage/storage_backend_fs.c | 2 +- src/util/storage_file.c | 4 +- src/util/storage_file.h | 2 +- 17 files changed, 646 insertions(+), 16 deletions(-) -- 1.7.11.7

Yet another instance of where using plain open() mishandles files that live on root-squash NFS, and where improving the API can improve the chance of a successful probe. * src/util/storage_file.h (virStorageFileProbeFormat): Alter signature. * src/util/storage_file.c (virStorageFileProbeFormat): Use better method for opening file. * src/qemu/qemu_driver.c (qemuDomainGetBlockInfo): Update caller. * src/storage/storage_backend_fs.c (virStorageBackendProbeTarget): Likewise. --- v8: new patch src/qemu/qemu_driver.c | 3 ++- src/storage/storage_backend_fs.c | 2 +- src/util/storage_file.c | 4 ++-- src/util/storage_file.h | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index feda4d9..dc9c62c 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -9309,7 +9309,8 @@ static int qemuDomainGetBlockInfo(virDomainPtr dom, format = disk->format; } else { if (driver->allowDiskFormatProbing) { - if ((format = virStorageFileProbeFormat(disk->src)) < 0) + if ((format = virStorageFileProbeFormat(disk->src, driver->user, + driver->group)) < 0) goto cleanup; } else { virReportError(VIR_ERR_INTERNAL_ERROR, diff --git a/src/storage/storage_backend_fs.c b/src/storage/storage_backend_fs.c index cecc2d2..30e203c 100644 --- a/src/storage/storage_backend_fs.c +++ b/src/storage/storage_backend_fs.c @@ -104,7 +104,7 @@ virStorageBackendProbeTarget(virStorageVolTargetPtr target, meta->backingStore = NULL; if (meta->backingStoreFormat == VIR_STORAGE_FILE_AUTO && meta->backingStoreIsFile) { - if ((ret = virStorageFileProbeFormat(*backingStore)) < 0) { + if ((ret = virStorageFileProbeFormat(*backingStore, -1, -1)) < 0) { /* If the backing file is currently unavailable, only log an error, * but continue. Returning -1 here would disable the whole storage * pool, making it unavailable for even maintenance. */ diff --git a/src/util/storage_file.c b/src/util/storage_file.c index 882df6e..e0b4178 100644 --- a/src/util/storage_file.c +++ b/src/util/storage_file.c @@ -830,11 +830,11 @@ cleanup: * Best option: Don't use this function */ int -virStorageFileProbeFormat(const char *path) +virStorageFileProbeFormat(const char *path, uid_t uid, gid_t gid) { int fd, ret; - if ((fd = open(path, O_RDONLY)) < 0) { + if ((fd = virFileOpenAs(path, O_RDONLY, 0, uid, gid, 0)) < 0) { virReportSystemError(errno, _("cannot open file '%s'"), path); return -1; } diff --git a/src/util/storage_file.h b/src/util/storage_file.h index 9b00dba..abfaca9 100644 --- a/src/util/storage_file.h +++ b/src/util/storage_file.h @@ -66,7 +66,7 @@ struct _virStorageFileMetadata { # define DEV_BSIZE 512 # endif -int virStorageFileProbeFormat(const char *path); +int virStorageFileProbeFormat(const char *path, uid_t uid, gid_t gid); int virStorageFileProbeFormatFromFD(const char *path, int fd); -- 1.7.11.7

On Sat, Oct 20, 2012 at 03:47:10PM -0600, Eric Blake wrote:
Yet another instance of where using plain open() mishandles files that live on root-squash NFS, and where improving the API can improve the chance of a successful probe.
* src/util/storage_file.h (virStorageFileProbeFormat): Alter signature. * src/util/storage_file.c (virStorageFileProbeFormat): Use better method for opening file. * src/qemu/qemu_driver.c (qemuDomainGetBlockInfo): Update caller. * src/storage/storage_backend_fs.c (virStorageBackendProbeTarget): Likewise. ---
v8: new patch
src/qemu/qemu_driver.c | 3 ++- src/storage/storage_backend_fs.c | 2 +- src/util/storage_file.c | 4 ++-- src/util/storage_file.h | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index feda4d9..dc9c62c 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -9309,7 +9309,8 @@ static int qemuDomainGetBlockInfo(virDomainPtr dom, format = disk->format; } else { if (driver->allowDiskFormatProbing) { - if ((format = virStorageFileProbeFormat(disk->src)) < 0) + if ((format = virStorageFileProbeFormat(disk->src, driver->user, + driver->group)) < 0) goto cleanup; } else { virReportError(VIR_ERR_INTERNAL_ERROR, diff --git a/src/storage/storage_backend_fs.c b/src/storage/storage_backend_fs.c index cecc2d2..30e203c 100644 --- a/src/storage/storage_backend_fs.c +++ b/src/storage/storage_backend_fs.c @@ -104,7 +104,7 @@ virStorageBackendProbeTarget(virStorageVolTargetPtr target, meta->backingStore = NULL; if (meta->backingStoreFormat == VIR_STORAGE_FILE_AUTO && meta->backingStoreIsFile) { - if ((ret = virStorageFileProbeFormat(*backingStore)) < 0) { + if ((ret = virStorageFileProbeFormat(*backingStore, -1, -1)) < 0) { /* If the backing file is currently unavailable, only log an error, * but continue. Returning -1 here would disable the whole storage * pool, making it unavailable for even maintenance. */ diff --git a/src/util/storage_file.c b/src/util/storage_file.c index 882df6e..e0b4178 100644 --- a/src/util/storage_file.c +++ b/src/util/storage_file.c @@ -830,11 +830,11 @@ cleanup: * Best option: Don't use this function */ int -virStorageFileProbeFormat(const char *path) +virStorageFileProbeFormat(const char *path, uid_t uid, gid_t gid) { int fd, ret;
- if ((fd = open(path, O_RDONLY)) < 0) { + if ((fd = virFileOpenAs(path, O_RDONLY, 0, uid, gid, 0)) < 0) { virReportSystemError(errno, _("cannot open file '%s'"), path); return -1; } diff --git a/src/util/storage_file.h b/src/util/storage_file.h index 9b00dba..abfaca9 100644 --- a/src/util/storage_file.h +++ b/src/util/storage_file.h @@ -66,7 +66,7 @@ struct _virStorageFileMetadata { # define DEV_BSIZE 512 # endif
-int virStorageFileProbeFormat(const char *path); +int virStorageFileProbeFormat(const char *path, uid_t uid, gid_t gid); int virStorageFileProbeFormatFromFD(const char *path, int fd);
I don't see how passing more context informations could be a bad idea and usually we can't predict when something may break various NFS scenario except by testing the code, ACK, Daniel -- Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/ daniel@veillard.com | Rpmfind RPM search engine http://rpmfind.net/ http://veillard.com/ | virtualization library http://libvirt.org/

On 10/22/2012 07:33 AM, Daniel Veillard wrote:
On Sat, Oct 20, 2012 at 03:47:10PM -0600, Eric Blake wrote:
Yet another instance of where using plain open() mishandles files that live on root-squash NFS, and where improving the API can improve the chance of a successful probe.
* src/util/storage_file.h (virStorageFileProbeFormat): Alter signature. * src/util/storage_file.c (virStorageFileProbeFormat): Use better method for opening file. * src/qemu/qemu_driver.c (qemuDomainGetBlockInfo): Update caller. * src/storage/storage_backend_fs.c (virStorageBackendProbeTarget): Likewise. ---
v8: new patch
I don't see how passing more context informations could be a bad idea and usually we can't predict when something may break various NFS scenario except by testing the code, ACK,
Thanks; pushed. I'll still wait on the rest of the series until both of the following have happened: positive review, and qemu.git having an actual commit for drive-mirror. -- Eric Blake eblake@redhat.com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On Mon, Oct 22, 2012 at 09:07:54AM -0600, Eric Blake wrote:
Thanks; pushed. I'll still wait on the rest of the series until both of the following have happened: positive review, and qemu.git having an actual commit for drive-mirror.
Well 2/11 ought to be pushed, it was looking sane to me but I lacked context for the first chunk to really ACK it ! Daniel -- Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/ daniel@veillard.com | Rpmfind RPM search engine http://rpmfind.net/ http://veillard.com/ | virtualization library http://libvirt.org/

On 10/22/2012 09:11 AM, Daniel Veillard wrote:
On Mon, Oct 22, 2012 at 09:07:54AM -0600, Eric Blake wrote:
Thanks; pushed. I'll still wait on the rest of the series until both of the following have happened: positive review, and qemu.git having an actual commit for drive-mirror.
Well 2/11 ought to be pushed, it was looking sane to me but I lacked context for the first chunk to really ACK it !
Indeed - 2/11 is independent; the missing context is that if the user passed the REUSE_EXT flag, but stat() fails with ENOENT, then there is no existing file to reuse and we can't honor the user's request. I'll resubmit that patch independently, anyways, since I got some off-list feedback about the name 'allow_reuse' being a bit confusing compared to the API flag name. -- Eric Blake eblake@redhat.com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On 10/20/12 23:47, Eric Blake wrote:
Yet another instance of where using plain open() mishandles files that live on root-squash NFS, and where improving the API can improve the chance of a successful probe.
* src/util/storage_file.h (virStorageFileProbeFormat): Alter signature. * src/util/storage_file.c (virStorageFileProbeFormat): Use better method for opening file. * src/qemu/qemu_driver.c (qemuDomainGetBlockInfo): Update caller. * src/storage/storage_backend_fs.c (virStorageBackendProbeTarget): Likewise. ---
v8: new patch
src/qemu/qemu_driver.c | 3 ++- src/storage/storage_backend_fs.c | 2 +- src/util/storage_file.c | 4 ++-- src/util/storage_file.h | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index feda4d9..dc9c62c 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -9309,7 +9309,8 @@ static int qemuDomainGetBlockInfo(virDomainPtr dom, format = disk->format; } else { if (driver->allowDiskFormatProbing) { - if ((format = virStorageFileProbeFormat(disk->src)) < 0) + if ((format = virStorageFileProbeFormat(disk->src, driver->user, + driver->group)) < 0)
I know it's late now, and this patch has been pushed, but we will probably need a followup patch that changes this part to values set in the DAC seclabels in the domain configuration. The DAC driver gives us the ability to specify the user and group of the machine separately, so we should use that information to access the images. Peter

On 10/22/2012 09:18 AM, Peter Krempa wrote:
On 10/20/12 23:47, Eric Blake wrote:
Yet another instance of where using plain open() mishandles files that live on root-squash NFS, and where improving the API can improve the chance of a successful probe.
* src/util/storage_file.h (virStorageFileProbeFormat): Alter signature. * src/util/storage_file.c (virStorageFileProbeFormat): Use better method for opening file. * src/qemu/qemu_driver.c (qemuDomainGetBlockInfo): Update caller. * src/storage/storage_backend_fs.c (virStorageBackendProbeTarget): Likewise. ---
v8: new patch
I know it's late now, and this patch has been pushed, but we will probably need a followup patch that changes this part to values set in the DAC seclabels in the domain configuration. The DAC driver gives us the ability to specify the user and group of the machine separately, so we should use that information to access the images.
In that case, qemu_driver.c:qemuOpenFile() also needs to be fixed to honor VM DAC labeling, as it also passes driver->user and driver->group down to virFileOpenAs. That is, if I'm understanding your complaint, the new DAC labeling allows us to run a single qemu guest process under a different uid:gid than the defaults specified in qemu.conf, and if we have that turned on, then we should be favoring per-guest user and group over the driver user/group default. Sounds like we need a helper function, which when given the qemu driver and the vm definition, returns the appropriate user:group id to use for that vm. -- Eric Blake eblake@redhat.com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

The snapshot code when reusing an existing file had hard-to-read logic, as well as a missing sanity check: REUSE_EXT should require the destination to already be present. * src/qemu/qemu_driver.c (qemuDomainSnapshotDiskPrepare): Require destination on REUSE_EXT. --- v8: new, split out of v7's 7/9 src/qemu/qemu_driver.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index dc9c62c..202cb4e 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -10758,8 +10758,13 @@ qemuDomainSnapshotDiskPrepare(virDomainObjPtr vm, virDomainSnapshotDefPtr def, _("unable to stat for disk %s: %s"), disk->name, disk->file); goto cleanup; + } else if (allow_reuse) { + virReportSystemError(errno, + _("missing existing file for disk %s: %s"), + disk->name, disk->file); + goto cleanup; } - } else if (!(S_ISBLK(st.st_mode) || !st.st_size || allow_reuse)) { + } else if (!S_ISBLK(st.st_mode) && st.st_size && !allow_reuse) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, _("external snapshot file for disk %s already " "exists and is not a block device: %s"), -- 1.7.11.7

Upstream qemu 1.3 is adding two new monitor commands, 'drive-mirror' and 'block-job-complete'[1], which can drive live block copy and storage migration. Additionally, RHEL 6.3 had backported an earlier version of most of the same functionality, but under the names '__com.redhat_drive-mirror' and '__com.redhat_drive-reopen' and with slightly different JSON arguments, which we can support without too much extra effort. The libvirt API virDomainBlockRebase as already committed for 0.9.12 is flexible enough to expose the basics of block copy, but some additional features in the 'drive-mirror' qemu command, such as setting error policy, setting granularity, or using a persistent bitmap, may later require a new libvirt API virDomainBlockCopy. I will wait to add that API until we know more about what qemu 1.3 will finally provide. The use of two capability bits allows the bulk of libvirt code to handle both RHEL 6.3 and upstream qemu 1.3 without caring about the difference in monitor commands used; one bit that says whether the commands are present, and another saying whether the RHEL 6.3 limitations are in effect. The choice of XML names for the capability bits is essential for interoperability with the RHEL 6.3 version of libvirt. [1]https://lists.gnu.org/archive/html/qemu-devel/2012-10/msg03256.html * src/qemu/qemu_capabilities.h (QEMU_CAPS_DRIVE_MIRROR) (QEMU_CAPS_DRIVE_REOPEN): New bits. * src/qemu/qemu_capabilities.c (qemuCaps): Name them. * src/qemu/qemu_monitor_json.c (qemuMonitorJSONCheckCommands): Set them. (qemuMonitorJSONDriveMirror, qemuMonitorDrivePivot): New functions. * src/qemu/qemu_monitor_json.h (qemuMonitorJSONDriveMirror) (qemuMonitorDrivePivot): Declare them. * src/qemu/qemu_monitor.c (qemuMonitorDriveMirror) (qemuMonitorDrivePivot): New passthroughs. * src/qemu/qemu_monitor.h (qemuMonitorDriveMirror) (qemuMonitorDrivePivot): Declare them. --- v8: update qemu feature URL, move commit earlier in series, rebase on top of block-commit src/qemu/qemu_capabilities.c | 6 ++++ src/qemu/qemu_capabilities.h | 2 ++ src/qemu/qemu_monitor.c | 52 ++++++++++++++++++++++++++++++ src/qemu/qemu_monitor.h | 14 ++++++++ src/qemu/qemu_monitor_json.c | 77 ++++++++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_monitor_json.h | 21 ++++++++++-- 6 files changed, 170 insertions(+), 2 deletions(-) diff --git a/src/qemu/qemu_capabilities.c b/src/qemu/qemu_capabilities.c index 7adfc38..c9b40d7 100644 --- a/src/qemu/qemu_capabilities.c +++ b/src/qemu/qemu_capabilities.c @@ -188,6 +188,8 @@ VIR_ENUM_IMPL(qemuCaps, QEMU_CAPS_LAST, "dump-guest-core", "seamless-migration", "block-commit", + "drive-mirror", + "drive-reopen", ); struct _qemuCaps { @@ -1884,6 +1886,10 @@ qemuCapsProbeQMPCommands(qemuCapsPtr caps, qemuCapsSet(caps, QEMU_CAPS_KVM); else if (strstr(name, "block-commit")) qemuCapsSet(caps, QEMU_CAPS_BLOCK_COMMIT); + else if (strstr(name, "drive-mirror")) + qemuCapsSet(caps, QEMU_CAPS_DRIVE_MIRROR); + else if (STREQ(name, "__com.redhat_drive-reopen")) + qemuCapsSet(caps, QEMU_CAPS_DRIVE_REOPEN); VIR_FREE(name); } VIR_FREE(commands); diff --git a/src/qemu/qemu_capabilities.h b/src/qemu/qemu_capabilities.h index 6939c45..9c19381 100644 --- a/src/qemu/qemu_capabilities.h +++ b/src/qemu/qemu_capabilities.h @@ -151,6 +151,8 @@ enum qemuCapsFlags { QEMU_CAPS_DUMP_GUEST_CORE = 111, /* dump-guest-core-parameter */ QEMU_CAPS_SEAMLESS_MIGRATION = 112, /* seamless-migration for SPICE */ QEMU_CAPS_BLOCK_COMMIT = 113, /* block-commit */ + QEMU_CAPS_DRIVE_MIRROR = 114, /* drive-mirror monitor command */ + QEMU_CAPS_DRIVE_REOPEN = 115, /* drive-reopen instead of block-job-complete */ QEMU_CAPS_LAST, /* this must always be the last item */ }; diff --git a/src/qemu/qemu_monitor.c b/src/qemu/qemu_monitor.c index 2d9c44c..3a544fd 100644 --- a/src/qemu/qemu_monitor.c +++ b/src/qemu/qemu_monitor.c @@ -2780,6 +2780,39 @@ qemuMonitorDiskSnapshot(qemuMonitorPtr mon, virJSONValuePtr actions, return ret; } +/* Start a drive-mirror block job. bandwidth is in MB/sec. */ +int +qemuMonitorDriveMirror(qemuMonitorPtr mon, + const char *device, const char *file, + const char *format, unsigned long bandwidth, + bool reopen, unsigned int flags) +{ + int ret = -1; + unsigned long long speed; + + VIR_DEBUG("mon=%p, device=%s, file=%s, format=%s, bandwidth=%ld, " + "reopen=%d, flags=%x", + mon, device, file, NULLSTR(format), bandwidth, reopen, flags); + + /* Convert bandwidth MiB to bytes */ + speed = bandwidth; + if (speed > ULLONG_MAX / 1024 / 1024) { + virReportError(VIR_ERR_OVERFLOW, + _("bandwidth must be less than %llu"), + ULLONG_MAX / 1024 / 1024); + return -1; + } + speed <<= 20; + + if (mon->json) + ret = qemuMonitorJSONDriveMirror(mon, device, file, format, speed, + reopen, flags); + else + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("drive-mirror requires JSON monitor")); + return ret; +} + /* Use the transaction QMP command to run atomic snapshot commands. */ int qemuMonitorTransaction(qemuMonitorPtr mon, virJSONValuePtr actions) @@ -2826,6 +2859,25 @@ qemuMonitorBlockCommit(qemuMonitorPtr mon, const char *device, return ret; } +/* Use the block-job-complete or drive-reopen monitor command to pivot + * a block copy job. */ +int +qemuMonitorDrivePivot(qemuMonitorPtr mon, const char *device, + const char *file, const char *format, bool reopen) +{ + int ret = -1; + + VIR_DEBUG("mon=%p, device=%s, file=%s, format=%s, reopen=%d", + mon, device, file, NULLSTR(format), reopen); + + if (mon->json) + ret = qemuMonitorJSONDrivePivot(mon, device, file, format, reopen); + else + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("drive pivot requires JSON monitor")); + return ret; +} + int qemuMonitorArbitraryCommand(qemuMonitorPtr mon, const char *cmd, char **reply, diff --git a/src/qemu/qemu_monitor.h b/src/qemu/qemu_monitor.h index 8856d9f..8a33b6f 100644 --- a/src/qemu/qemu_monitor.h +++ b/src/qemu/qemu_monitor.h @@ -523,6 +523,20 @@ int qemuMonitorDiskSnapshot(qemuMonitorPtr mon, bool reuse); int qemuMonitorTransaction(qemuMonitorPtr mon, virJSONValuePtr actions) ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2); +int qemuMonitorDriveMirror(qemuMonitorPtr mon, + const char *device, + const char *file, + const char *format, + unsigned long bandwidth, + bool reopen, + unsigned int flags) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) ATTRIBUTE_NONNULL(3); +int qemuMonitorDrivePivot(qemuMonitorPtr mon, + const char *device, + const char *file, + const char *format, + bool reopen) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) ATTRIBUTE_NONNULL(3); int qemuMonitorBlockCommit(qemuMonitorPtr mon, const char *device, diff --git a/src/qemu/qemu_monitor_json.c b/src/qemu/qemu_monitor_json.c index 423849b..ccaeb95 100644 --- a/src/qemu/qemu_monitor_json.c +++ b/src/qemu/qemu_monitor_json.c @@ -3249,6 +3249,52 @@ cleanup: return ret; } +/* speed is in bytes/sec */ +int +qemuMonitorJSONDriveMirror(qemuMonitorPtr mon, + const char *device, const char *file, + const char *format, unsigned long long speed, + bool reopen, unsigned int flags) +{ + int ret = -1; + virJSONValuePtr cmd; + virJSONValuePtr reply = NULL; + bool shallow = (flags & VIR_DOMAIN_BLOCK_REBASE_SHALLOW) != 0; + bool reuse = (flags & VIR_DOMAIN_BLOCK_REBASE_REUSE_EXT) != 0; + + if (reopen) + cmd = qemuMonitorJSONMakeCommand("__com.redhat_drive-mirror", + "s:device", device, + "s:target", file, + "U:speed", speed, + "b:full", !shallow, + "s:mode", + reuse ? "existing" : "absolute-paths", + format ? "s:format" : NULL, format, + NULL); + else + cmd = qemuMonitorJSONMakeCommand("drive-mirror", + "s:device", device, + "s:target", file, + "U:speed", speed, + "s:sync", shallow ? "top" : "full", + "s:mode", + reuse ? "existing" : "absolute-paths", + format ? "s:format" : NULL, format, + NULL); + if (!cmd) + return -1; + + if ((ret = qemuMonitorJSONCommand(mon, cmd, &reply)) < 0) + goto cleanup; + ret = qemuMonitorJSONCheckError(cmd, reply); + +cleanup: + virJSONValueFree(cmd); + virJSONValueFree(reply); + return ret; +} + int qemuMonitorJSONTransaction(qemuMonitorPtr mon, virJSONValuePtr actions) { @@ -3322,6 +3368,37 @@ cleanup: return ret; } +int +qemuMonitorJSONDrivePivot(qemuMonitorPtr mon, const char *device, + const char *file, const char *format, bool reopen) +{ + int ret; + virJSONValuePtr cmd; + virJSONValuePtr reply = NULL; + + if (reopen) + cmd = qemuMonitorJSONMakeCommand("__com.redhat_drive-reopen", + "s:device", device, + "s:new-image-file", file, + format ? "s:format" : NULL, format, + NULL); + else + cmd = qemuMonitorJSONMakeCommand("block-job-complete", + "s:device", device, + NULL); + if (!cmd) + return -1; + + if ((ret = qemuMonitorJSONCommand(mon, cmd, &reply)) < 0) + goto cleanup; + ret = qemuMonitorJSONCheckError(cmd, reply); + +cleanup: + virJSONValueFree(cmd); + virJSONValueFree(reply); + return ret; +} + int qemuMonitorJSONArbitraryCommand(qemuMonitorPtr mon, const char *cmd_str, diff --git a/src/qemu/qemu_monitor_json.h b/src/qemu/qemu_monitor_json.h index 61127a7..3dfe420 100644 --- a/src/qemu/qemu_monitor_json.h +++ b/src/qemu/qemu_monitor_json.h @@ -232,8 +232,25 @@ int qemuMonitorJSONDiskSnapshot(qemuMonitorPtr mon, const char *device, const char *file, const char *format, - bool reuse); -int qemuMonitorJSONTransaction(qemuMonitorPtr mon, virJSONValuePtr actions); + bool reuse) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(3) + ATTRIBUTE_NONNULL(4) ATTRIBUTE_NONNULL(5); +int qemuMonitorJSONTransaction(qemuMonitorPtr mon, virJSONValuePtr actions) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2); +int qemuMonitorJSONDriveMirror(qemuMonitorPtr mon, + const char *device, + const char *file, + const char *format, + unsigned long long speed, + bool reopen, + unsigned int flags) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) ATTRIBUTE_NONNULL(3); +int qemuMonitorJSONDrivePivot(qemuMonitorPtr mon, + const char *device, + const char *file, + const char *format, + bool reopen) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) ATTRIBUTE_NONNULL(3); int qemuMonitorJSONBlockCommit(qemuMonitorPtr mon, const char *device, -- 1.7.11.7

On 10/20/2012 03:47 PM, Eric Blake wrote:
Upstream qemu 1.3 is adding two new monitor commands, 'drive-mirror' and 'block-job-complete'[1], which can drive live block copy and storage migration. Additionally, RHEL 6.3 had backported an earlier version of most of the same functionality, but under the names '__com.redhat_drive-mirror' and '__com.redhat_drive-reopen' and with slightly different JSON arguments, which we can support without too much extra effort.
@@ -1884,6 +1886,10 @@ qemuCapsProbeQMPCommands(qemuCapsPtr caps, qemuCapsSet(caps, QEMU_CAPS_KVM); else if (strstr(name, "block-commit")) qemuCapsSet(caps, QEMU_CAPS_BLOCK_COMMIT); + else if (strstr(name, "drive-mirror")) + qemuCapsSet(caps, QEMU_CAPS_DRIVE_MIRROR);
Off-list, it was pointed out to me that strstr() is a bit weak; I'm changing this to: if (STREQ(name, "drive-mirror") || STREQ(name, "__com.redhat_drive-mirror")) as that is more robust to any future QMP commands that happen to contain the substring drive-mirror. -- Eric Blake eblake@redhat.com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

For now, disk migration via block copy job is not implemented in libvirt. But when we do implement it, we have to deal with the fact that qemu does not yet provide an easy way to re-start a qemu process with mirroring still intact. Paolo has proposed an idea for a persistent dirty bitmap that might make this possible, but until that design is complete, it's hard to say what changes libvirt would need. Even something like 'virDomainSave' becomes hairy, if you realize the implications that 'virDomainRestore' would be stuck with recreating the same mirror layout. But if we step back and look at the bigger picture, we realize that the initial client of live storage migration via disk mirroring is oVirt, which always uses transient domains, and that if a transient domain is destroyed while a mirror exists, oVirt can easily restart the storage migration by creating a new domain that visits just the source storage, with no loss in data. We can make life a lot easier by being cowards for now, forbidding certain operations on a domain. This patch guarantees that we never get in a state where we would have to restart a domain with a mirroring block copy, by preventing saves, snapshots, hot unplug of a disk in use, and conversion to a persistent domain (thankfully, it is still relatively easy to 'virsh undefine' a running domain to temporarily make it transient, run tests on 'virsh blockcopy', then 'virsh define' to restore the persistence). Later, if the qemu design is enhanced, we can relax our code. The change to qemudDomainDefine looks a bit odd for undoing an assignment, rather than probing up front to avoid the assignment, but this is because of how virDomainAssignDef combines both a lookup and assignment into a single function call. * src/conf/domain_conf.h (virDomainHasDiskMirror): New prototype. * src/conf/domain_conf.c (virDomainHasDiskMirror): New function. * src/libvirt_private.syms (domain_conf.h): Export it. * src/qemu/qemu_driver.c (qemuDomainSaveInternal) (qemuDomainSnapshotCreateXML, qemuDomainRevertToSnapshot) (qemuDomainBlockJobImpl, qemudDomainDefine): Prevent dangerous actions while block copy is already in action. * src/qemu/qemu_hotplug.c (qemuDomainDetachDiskDevice): Likewise. --- v8: no real changes src/conf/domain_conf.c | 12 ++++++++++++ src/conf/domain_conf.h | 1 + src/libvirt_private.syms | 1 + src/qemu/qemu_driver.c | 29 +++++++++++++++++++++++++++++ src/qemu/qemu_hotplug.c | 7 +++++++ 5 files changed, 50 insertions(+) diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 0933c08..c163477 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -7980,6 +7980,18 @@ virDomainDiskRemoveByName(virDomainDefPtr def, const char *name) return virDomainDiskRemove(def, i); } +/* Return true if VM has at least one disk involved in a current block + * copy job (that is, with a <mirror> element in the disk xml). */ +bool +virDomainHasDiskMirror(virDomainObjPtr vm) +{ + int i; + for (i = 0; i < vm->def->ndisks; i++) + if (vm->def->disks[i]->mirror) + return true; + return false; +} + int virDomainNetInsert(virDomainDefPtr def, virDomainNetDefPtr net) { if (VIR_REALLOC_N(def->nets, def->nnets + 1) < 0) diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index 4151bc2..4349a31 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -2008,6 +2008,7 @@ virDomainDiskDefPtr virDomainDiskRemove(virDomainDefPtr def, size_t i); virDomainDiskDefPtr virDomainDiskRemoveByName(virDomainDefPtr def, const char *name); +bool virDomainHasDiskMirror(virDomainObjPtr vm); int virDomainNetIndexByMac(virDomainDefPtr def, const virMacAddrPtr mac); int virDomainNetInsert(virDomainDefPtr def, virDomainNetDefPtr net); diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 60f9c7f..4b86437 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -396,6 +396,7 @@ virDomainGraphicsSpiceZlibCompressionTypeFromString; virDomainGraphicsSpiceZlibCompressionTypeToString; virDomainGraphicsTypeFromString; virDomainGraphicsTypeToString; +virDomainHasDiskMirror; virDomainHostdevDefAlloc; virDomainHostdevDefClear; virDomainHostdevDefFree; diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 202cb4e..3b83068 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -2783,6 +2783,11 @@ qemuDomainSaveInternal(struct qemud_driver *driver, virDomainPtr dom, "%s", _("domain is marked for auto destroy")); goto cleanup; } + if (virDomainHasDiskMirror(vm)) { + virReportError(VIR_ERR_BLOCK_COPY_ACTIVE, "%s", + _("domain has active block copy job")); + goto cleanup; + } memset(&header, 0, sizeof(header)); memcpy(header.magic, QEMUD_SAVE_PARTIAL, sizeof(header.magic)); @@ -5653,6 +5658,12 @@ static virDomainPtr qemudDomainDefine(virConnectPtr conn, const char *xml) { } } def = NULL; + if (virDomainHasDiskMirror(vm)) { + virReportError(VIR_ERR_BLOCK_COPY_ACTIVE, "%s", + _("domain has active block copy job")); + virDomainObjAssignDef(vm, NULL, false); + goto cleanup; + } vm->persistent = 1; if (virDomainSaveConfig(driver->configDir, @@ -11200,6 +11211,12 @@ qemuDomainSnapshotCreateXML(virDomainPtr domain, "%s", _("domain is marked for auto destroy")); goto cleanup; } + if (virDomainHasDiskMirror(vm)) { + virReportError(VIR_ERR_BLOCK_COPY_ACTIVE, "%s", + _("domain has active block copy job")); + goto cleanup; + } + if (!vm->persistent && (flags & VIR_DOMAIN_SNAPSHOT_CREATE_HALT)) { virReportError(VIR_ERR_OPERATION_INVALID, "%s", _("cannot halt after transient domain snapshot")); @@ -11789,6 +11806,11 @@ static int qemuDomainRevertToSnapshot(virDomainSnapshotPtr snapshot, _("no domain with matching uuid '%s'"), uuidstr); goto cleanup; } + if (virDomainHasDiskMirror(vm)) { + virReportError(VIR_ERR_BLOCK_COPY_ACTIVE, "%s", + _("domain has active block copy job")); + goto cleanup; + } snap = virDomainSnapshotFindByName(vm->snapshots, snapshot->name); if (!snap) { @@ -12567,6 +12589,13 @@ qemuDomainBlockJobImpl(virDomainPtr dom, const char *path, const char *base, goto cleanup; disk = vm->def->disks[idx]; + if (mode == BLOCK_JOB_PULL && disk->mirror) { + virReportError(VIR_ERR_BLOCK_COPY_ACTIVE, + _("disk '%s' already in active block copy job"), + disk->dst); + goto cleanup; + } + if (qemuDomainObjBeginJobWithDriver(driver, vm, QEMU_JOB_MODIFY) < 0) goto cleanup; diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c index 7381921..e1aeb49 100644 --- a/src/qemu/qemu_hotplug.c +++ b/src/qemu/qemu_hotplug.c @@ -2049,6 +2049,13 @@ int qemuDomainDetachDiskDevice(struct qemud_driver *driver, detach = vm->def->disks[i]; + if (detach->mirror) { + virReportError(VIR_ERR_BLOCK_COPY_ACTIVE, + _("disk '%s' is in an active block copy job"), + detach->dst); + goto cleanup; + } + if (qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_DEVICES)) { if (virCgroupForDomain(driver->cgroup, vm->def->name, &cgroup, 0) != 0) { virReportError(VIR_ERR_INTERNAL_ERROR, -- 1.7.11.7

Handle the new type of block copy event and info. Of course, this patch does nothing until a later patch actually allows the creation/abort of a block copy job. The new event is available in qemu 1.3, but not in RHEL 6.3; rather than doing polling ourselves to synthesize the event in RHEL 6.3, we just document that the event might not occur. * include/libvirt/libvirt.h.in (VIR_DOMAIN_BLOCK_JOB_READY): New block job status. * src/libvirt.c (virDomainBlockRebase): Document the event. * src/qemu/qemu_monitor_json.c (eventHandlers): New event. (qemuMonitorJSONHandleBlockJobReady): New function. (qemuMonitorJSONGetBlockJobInfoOne): Translate new job type. (qemuMonitorJSONHandleBlockJobImpl): Handle new event and job type. * src/qemu/qemu_process.c (qemuProcessHandleBlockJob): Recognize the event to minimize snooping. * src/qemu/qemu_driver.c (qemuDomainBlockJobImpl): Snoop a successful info query to save effort on a pivot request. --- v8: no real changes include/libvirt/libvirt.h.in | 1 + src/libvirt.c | 7 ++++--- src/qemu/qemu_driver.c | 6 ++++++ src/qemu/qemu_monitor_json.c | 19 +++++++++++++++++-- src/qemu/qemu_process.c | 3 +++ 5 files changed, 31 insertions(+), 5 deletions(-) diff --git a/include/libvirt/libvirt.h.in b/include/libvirt/libvirt.h.in index 52555f8..a84db1e 100644 --- a/include/libvirt/libvirt.h.in +++ b/include/libvirt/libvirt.h.in @@ -4023,6 +4023,7 @@ typedef enum { VIR_DOMAIN_BLOCK_JOB_COMPLETED = 0, VIR_DOMAIN_BLOCK_JOB_FAILED = 1, VIR_DOMAIN_BLOCK_JOB_CANCELED = 2, + VIR_DOMAIN_BLOCK_JOB_READY = 3, #ifdef VIR_ENUM_SENTINELS VIR_DOMAIN_BLOCK_JOB_LAST diff --git a/src/libvirt.c b/src/libvirt.c index 6d65965..d50c098 100644 --- a/src/libvirt.c +++ b/src/libvirt.c @@ -19212,9 +19212,10 @@ error: * mirror all further changes to both source and destination. The user * must call virDomainBlockJobAbort() to end the mirroring while choosing * whether to revert to source or pivot to the destination. An event is - * issued when the job ends, and in the future, an event may be added when - * the job transitions from pulling to mirroring. If the job is aborted, - * a new job will have to start over from the beginning of the first phase. + * issued when the job ends, and depending on the hypervisor, an event may + * also be issued when the job transitions from pulling to mirroring. If + * the job is aborted, a new job will have to start over from the beginning + * of the first phase. * * Some hypervisors will restrict certain actions, such as virDomainSave() * or virDomainDetachDevice(), while a copy job is active; they may diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 3b83068..97b25a2 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -12615,6 +12615,12 @@ qemuDomainBlockJobImpl(virDomainPtr dom, const char *path, const char *base, if (ret < 0) goto endjob; + /* Snoop block copy operations, so future cancel operations can + * avoid checking if pivot is safe. */ + if (mode == BLOCK_JOB_INFO && ret == 1 && disk->mirror && + info->cur == info->end && info->type == VIR_DOMAIN_BLOCK_JOB_TYPE_COPY) + disk->mirroring = true; + /* With synchronous block cancel, we must synthesize an event, and * we silently ignore the ABORT_ASYNC flag. With asynchronous * block cancel, the event will come from qemu, but without the diff --git a/src/qemu/qemu_monitor_json.c b/src/qemu/qemu_monitor_json.c index ccaeb95..9e8fd19 100644 --- a/src/qemu/qemu_monitor_json.c +++ b/src/qemu/qemu_monitor_json.c @@ -69,6 +69,7 @@ static void qemuMonitorJSONHandlePMWakeup(qemuMonitorPtr mon, virJSONValuePtr da static void qemuMonitorJSONHandlePMSuspend(qemuMonitorPtr mon, virJSONValuePtr data); static void qemuMonitorJSONHandleBlockJobCompleted(qemuMonitorPtr mon, virJSONValuePtr data); static void qemuMonitorJSONHandleBlockJobCanceled(qemuMonitorPtr mon, virJSONValuePtr data); +static void qemuMonitorJSONHandleBlockJobReady(qemuMonitorPtr mon, virJSONValuePtr data); static void qemuMonitorJSONHandleBalloonChange(qemuMonitorPtr mon, virJSONValuePtr data); static void qemuMonitorJSONHandlePMSuspendDisk(qemuMonitorPtr mon, virJSONValuePtr data); @@ -82,6 +83,7 @@ static qemuEventHandler eventHandlers[] = { { "BLOCK_IO_ERROR", qemuMonitorJSONHandleIOError, }, { "BLOCK_JOB_CANCELLED", qemuMonitorJSONHandleBlockJobCanceled, }, { "BLOCK_JOB_COMPLETED", qemuMonitorJSONHandleBlockJobCompleted, }, + { "BLOCK_JOB_READY", qemuMonitorJSONHandleBlockJobReady, }, { "DEVICE_TRAY_MOVED", qemuMonitorJSONHandleTrayChange, }, { "POWERDOWN", qemuMonitorJSONHandlePowerdown, }, { "RESET", qemuMonitorJSONHandleReset, }, @@ -807,6 +809,8 @@ qemuMonitorJSONHandleBlockJobImpl(qemuMonitorPtr mon, type = VIR_DOMAIN_BLOCK_JOB_TYPE_PULL; else if (STREQ(type_str, "commit")) type = VIR_DOMAIN_BLOCK_JOB_TYPE_COMMIT; + else if (STREQ(type_str, "mirror")) + type = VIR_DOMAIN_BLOCK_JOB_TYPE_COPY; switch ((virConnectDomainEventBlockJobStatus) event) { case VIR_DOMAIN_BLOCK_JOB_COMPLETED: @@ -815,11 +819,12 @@ qemuMonitorJSONHandleBlockJobImpl(qemuMonitorPtr mon, event = VIR_DOMAIN_BLOCK_JOB_FAILED; break; case VIR_DOMAIN_BLOCK_JOB_CANCELED: + case VIR_DOMAIN_BLOCK_JOB_READY: break; case VIR_DOMAIN_BLOCK_JOB_FAILED: case VIR_DOMAIN_BLOCK_JOB_LAST: - VIR_DEBUG("should not get here"); - break; + VIR_DEBUG("should not get here"); + break; } out: @@ -883,6 +888,14 @@ qemuMonitorJSONHandleBlockJobCanceled(qemuMonitorPtr mon, } static void +qemuMonitorJSONHandleBlockJobReady(qemuMonitorPtr mon, + virJSONValuePtr data) +{ + qemuMonitorJSONHandleBlockJobImpl(mon, data, + VIR_DOMAIN_BLOCK_JOB_READY); +} + +static void qemuMonitorJSONHandleBalloonChange(qemuMonitorPtr mon, virJSONValuePtr data) { @@ -3518,6 +3531,8 @@ static int qemuMonitorJSONGetBlockJobInfoOne(virJSONValuePtr entry, info->type = VIR_DOMAIN_BLOCK_JOB_TYPE_PULL; else if (STREQ(type, "commit")) info->type = VIR_DOMAIN_BLOCK_JOB_TYPE_COMMIT; + else if (STREQ(type, "mirror")) + info->type = VIR_DOMAIN_BLOCK_JOB_TYPE_COPY; else info->type = VIR_DOMAIN_BLOCK_JOB_TYPE_UNKNOWN; diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 9a35b29..3c1c7bb 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -920,6 +920,9 @@ qemuProcessHandleBlockJob(qemuMonitorPtr mon ATTRIBUTE_UNUSED, type == VIR_DOMAIN_BLOCK_JOB_TYPE_COMMIT) && status == VIR_DOMAIN_BLOCK_JOB_COMPLETED) qemuDomainDetermineDiskChain(driver, disk, true); + if (disk->mirror && type == VIR_DOMAIN_BLOCK_JOB_TYPE_COPY && + status == VIR_DOMAIN_BLOCK_JOB_READY) + disk->mirroring = true; } virDomainObjUnlock(vm); -- 1.7.11.7

This is the bare minimum to end a copy job (of course, until a later patch adds the ability to start a copy job, this patch doesn't do much in isolation; I've just split the patches to ease the review). This patch intentionally avoids SELinux, lock manager, and audit actions. Also, if libvirtd restarts at the exact moment that a 'block-job-complete' is in flight, the proposed proper way to detect the outcome of that would be with a persistent bitmap and some additional query commands for qemu 1.3 when libvirtd restarts (RHEL 6.3 is out of luck). This patch is enough to test the common case of success when used correctly, while saving the subtleties of proper cleanup for worst-case errors for later. When a mirror job is started, cancelling the job safely reverts back to the source disk, regardless of whether the destination is in phase 1 (streaming, in which case the destination is worthless) or phase 2 (mirroring, in which case the destination is synced up to the source at the time of the cancel). Our existing code does just fine in either phase, other than some bookkeeping cleanup; this implements live block copy. Pivoting the job requires the qemu 1.3 'block-job-complete' (safe) or the RHEL 6.3 '__com.redhat_drive-reopen' command (where failure of the command is potentially catastrophic to the domain, since it rips out the old disk before attempting to open the new one). Ideas for future enhancements via new flags: Since qemu 1.3 is safer than RHEL 6.3, it may be worth adding a VIR_DOMAIN_REBASE_COPY_ATOMIC flag that fails up front if we detect an older qemu with the risky pivot operation. Interesting side note: while snapshot-create --disk-only creates a copy of the disk at a point in time by moving the domain on to a new file (the copy is the file now in the just-extended backing chain), blockjob --abort of a copy job creates a copy of the disk while keeping the domain on the original file. There may be potential improvements to the snapshot code to exploit block copy over multiple disks all at one point in time. And, if 'block-job-cancel' were made part of 'transaction', you could copy multiple disks at the same point in time without pausing the domain. This also implies we may want to add a --quiesce flag to virDomainBlockJobAbort, so that when breaking a mirror (whether by cancel or pivot), the side of the mirror that we are abandoning is at least in a stable state with regards to guest I/O. * src/qemu/qemu_driver.c (qemuDomainBlockJobAbort): Accept new flag. (qemuDomainBlockPivot): New helper function. (qemuDomainBlockJobImpl): Implement it. --- v8: invalidate cached chain information on pivot src/qemu/qemu_driver.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 107 insertions(+), 1 deletion(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 97b25a2..cd17fbc 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -12535,6 +12535,88 @@ cleanup: return ret; } +/* Called while holding the VM job lock, to implement a block job + * abort with pivot; this updates the VM definition as appropriate, on + * either success or failure (although there are some forms of + * catastrophic failure that will leave the VM unusable). */ +static int +qemuDomainBlockPivot(struct qemud_driver *driver, virDomainObjPtr vm, + const char *device, virDomainDiskDefPtr disk) +{ + int ret = -1; + qemuDomainObjPrivatePtr priv = vm->privateData; + virDomainBlockJobInfo info; + bool reopen = qemuCapsGet(priv->caps, QEMU_CAPS_DRIVE_REOPEN); + const char *format = virStorageFileFormatTypeToString(disk->mirrorFormat); + + /* Probe the status, if needed. */ + if (!disk->mirroring) { + qemuDomainObjEnterMonitorWithDriver(driver, vm); + ret = qemuMonitorBlockJob(priv->mon, device, NULL, 0, &info, + BLOCK_JOB_INFO, true); + qemuDomainObjExitMonitorWithDriver(driver, vm); + if (ret < 0) + goto cleanup; + if (!virDomainObjIsActive(vm)) { + virReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("domain is not running")); + goto cleanup; + } + if (ret == 1 && info.cur == info.end && + info.type == VIR_DOMAIN_BLOCK_JOB_TYPE_COPY) + disk->mirroring = true; + } + + if (!disk->mirroring) { + virReportError(VIR_ERR_BLOCK_COPY_ACTIVE, + _("disk '%s' not ready for pivot yet"), + disk->dst); + goto cleanup; + } + + /* Attempt the pivot. */ + qemuDomainObjEnterMonitorWithDriver(driver, vm); + ret = qemuMonitorDrivePivot(priv->mon, device, disk->mirror, format, + reopen); + qemuDomainObjExitMonitorWithDriver(driver, vm); + + /* Note that RHEL 6.3 'drive-reopen' has the remote risk of a + * catastrophic failure, where the it fails but can't recover by + * reopening the source. Not much we can do about it. qemu 1.3 + * 'block-job-complete' is safer by design. */ + + if (ret == 0) { + /* XXX We want to revoke security labels and disk lease, as + * well as audit that revocation, before dropping the original + * source. But it gets tricky if both source and mirror share + * common backing files (we want to only revoke the non-shared + * portion of the chain, and is made more difficult by the + * fact that we aren't tracking the full chain ourselves; so + * for now, we leak the access to the original. */ + VIR_FREE(disk->src); + disk->src = disk->mirror; + disk->format = disk->mirrorFormat; + disk->mirror = NULL; + disk->mirrorFormat = VIR_STORAGE_FILE_NONE; + disk->mirroring = false; + qemuDomainDetermineDiskChain(driver, disk, true); + } else { + /* On failure, qemu abandons the mirror, and attempts to + * revert back to the source disk. Hopefully it was able to + * reopen things. */ + /* XXX should we be parsing the exact qemu error, or calling + * 'query-block', to see what state we really got left in + * before killing the mirroring job? And just as on the + * success case, there's security labeling to worry about. */ + VIR_FREE(disk->mirror); + disk->mirrorFormat = VIR_STORAGE_FILE_NONE; + disk->mirroring = false; + } + +cleanup: + return ret; +} + static int qemuDomainBlockJobImpl(virDomainPtr dom, const char *path, const char *base, unsigned long bandwidth, virDomainBlockJobInfoPtr info, @@ -12595,6 +12677,14 @@ qemuDomainBlockJobImpl(virDomainPtr dom, const char *path, const char *base, disk->dst); goto cleanup; } + if (mode == BLOCK_JOB_ABORT && + (flags & VIR_DOMAIN_BLOCK_JOB_ABORT_PIVOT) && + !(async && disk->mirror)) { + virReportError(VIR_ERR_OPERATION_INVALID, + _("pivot of disk '%s' requires an active copy job"), + disk->dst); + goto cleanup; + } if (qemuDomainObjBeginJobWithDriver(driver, vm, QEMU_JOB_MODIFY) < 0) goto cleanup; @@ -12605,6 +12695,12 @@ qemuDomainBlockJobImpl(virDomainPtr dom, const char *path, const char *base, goto endjob; } + if (disk->mirror && mode == BLOCK_JOB_ABORT && + (flags & VIR_DOMAIN_BLOCK_JOB_ABORT_PIVOT)) { + ret = qemuDomainBlockPivot(driver, vm, device, disk); + goto endjob; + } + qemuDomainObjEnterMonitorWithDriver(driver, vm); /* XXX - libvirt should really be tracking the backing file chain * itself, and validating that base is on the chain, rather than @@ -12621,6 +12717,15 @@ qemuDomainBlockJobImpl(virDomainPtr dom, const char *path, const char *base, info->cur == info->end && info->type == VIR_DOMAIN_BLOCK_JOB_TYPE_COPY) disk->mirroring = true; + /* A successful block job cancelation stops any mirroring. */ + if (mode == BLOCK_JOB_ABORT && disk->mirror) { + /* XXX We should also revoke security labels and disk lease on + * the mirror, and audit that fact, before dropping things. */ + VIR_FREE(disk->mirror); + disk->mirrorFormat = VIR_STORAGE_FILE_NONE; + disk->mirroring = false; + } + /* With synchronous block cancel, we must synthesize an event, and * we silently ignore the ABORT_ASYNC flag. With asynchronous * block cancel, the event will come from qemu, but without the @@ -12685,7 +12790,8 @@ cleanup: static int qemuDomainBlockJobAbort(virDomainPtr dom, const char *path, unsigned int flags) { - virCheckFlags(VIR_DOMAIN_BLOCK_JOB_ABORT_ASYNC, -1); + virCheckFlags(VIR_DOMAIN_BLOCK_JOB_ABORT_ASYNC | + VIR_DOMAIN_BLOCK_JOB_ABORT_PIVOT, -1); return qemuDomainBlockJobImpl(dom, path, NULL, 0, NULL, BLOCK_JOB_ABORT, flags); } -- 1.7.11.7

Since libvirt drops locks between issuing a monitor command and getting a response, it is possible for libvirtd to be restarted before getting a response on a drive-reopen command; worse, it is also possible for the guest to shut itself down during the window while libvirtd is down, ending the qemu process. A management app needs to know if the pivot happened (and the destination file contains guest contents not in the source) or failed (and the source file contains guest contents not in the destination), but since the job is finished, 'query-block-jobs' no longer tracks the status of the job, and if the qemu process itself has disappeared, even 'query-block' cannot be checked to ask qemu its current state. This is only a problem for the RHEL 6.3 drive-reopen command; which partly explains why upstream qemu 1.3 abandoned that command and went with block-job-complete plus persistent bitmap instead. At the time of this patch, the design for persistent bitmap has not been clarified, so a followup patch will be needed once we actually figure out how to use the qemu 1.3 interface. If we surround 'drive-reopen' with a pause/resume pair, then we can guarantee that the guest cannot modify either source or destination files in the window of libvirtd uncertainty, and the management app is guaranteed that either libvirt knows the outcome and reported it correctly; or that on libvirtd restart, the guest will still be paused and that the qemu process cannot have disappeared due to guest shutdown; and use that as a clue that the management app must implement recovery protocol, with both source and destination files still being in sync and with 'query-block' still being an option as part of that recovery. My testing of the RHEL 6.3 implementation of 'drive-reopen' show that the pause window will typically be only a fraction of a second. * src/qemu/qemu_driver.c (qemuDomainBlockPivot): Pause around drive-reopen. (qemuDomainBlockJobImpl): Update caller. --- v8: clarify that persistent bitmap (still to be designed in qemu) is required to avoid pausing the guest src/qemu/qemu_driver.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index cd17fbc..5669b2f 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -12540,7 +12540,8 @@ cleanup: * either success or failure (although there are some forms of * catastrophic failure that will leave the VM unusable). */ static int -qemuDomainBlockPivot(struct qemud_driver *driver, virDomainObjPtr vm, +qemuDomainBlockPivot(virConnectPtr conn, + struct qemud_driver *driver, virDomainObjPtr vm, const char *device, virDomainDiskDefPtr disk) { int ret = -1; @@ -12548,6 +12549,7 @@ qemuDomainBlockPivot(struct qemud_driver *driver, virDomainObjPtr vm, virDomainBlockJobInfo info; bool reopen = qemuCapsGet(priv->caps, QEMU_CAPS_DRIVE_REOPEN); const char *format = virStorageFileFormatTypeToString(disk->mirrorFormat); + bool resume = false; /* Probe the status, if needed. */ if (!disk->mirroring) { @@ -12574,6 +12576,29 @@ qemuDomainBlockPivot(struct qemud_driver *driver, virDomainObjPtr vm, goto cleanup; } + /* If we are using the older 'drive-reopen', we want to make sure + * that management apps can tell whether the command succeeded, + * even if libvirtd is restarted at the wrong time. To accomplish + * that, we pause the guest before drive-reopen, and resume it + * only when we know the outcome; if libvirtd restarts, then + * management will see the guest still paused, and know that no + * guest I/O has caused the source and mirror to diverge. XXX + * With the newer 'block-job-complete', we need to use a + * persistent bitmap to make things safe; so for now, we just + * blindly pause the guest. */ + if (virDomainObjGetState(vm, NULL) == VIR_DOMAIN_RUNNING) { + if (qemuProcessStopCPUs(driver, vm, VIR_DOMAIN_PAUSED_SAVE, + QEMU_ASYNC_JOB_NONE) < 0) + goto cleanup; + + resume = true; + if (!virDomainObjIsActive(vm)) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("guest unexpectedly quit")); + goto cleanup; + } + } + /* Attempt the pivot. */ qemuDomainObjEnterMonitorWithDriver(driver, vm); ret = qemuMonitorDrivePivot(priv->mon, device, disk->mirror, format, @@ -12614,6 +12639,14 @@ qemuDomainBlockPivot(struct qemud_driver *driver, virDomainObjPtr vm, } cleanup: + if (resume && virDomainObjIsActive(vm) && + qemuProcessStartCPUs(driver, vm, conn, + VIR_DOMAIN_RUNNING_UNPAUSED, + QEMU_ASYNC_JOB_NONE) < 0 && + virGetLastError() == NULL) { + virReportError(VIR_ERR_OPERATION_FAILED, "%s", + _("resuming after drive-reopen failed")); + } return ret; } @@ -12697,7 +12730,7 @@ qemuDomainBlockJobImpl(virDomainPtr dom, const char *path, const char *base, if (disk->mirror && mode == BLOCK_JOB_ABORT && (flags & VIR_DOMAIN_BLOCK_JOB_ABORT_PIVOT)) { - ret = qemuDomainBlockPivot(driver, vm, device, disk); + ret = qemuDomainBlockPivot(dom->conn, driver, vm, device, disk); goto endjob; } -- 1.7.11.7

Minimal patch to wire up all the pieces in the previous patches to actually enable a block copy job. By minimal, I mean that qemu creates the file (that is, no REUSE_EXT flag support yet), SELinux must be disabled, a lock manager is not informed, the audit logs aren't updated, and bandwidth is not supported. But those will be added as improvements in future patches. * src/qemu/qemu_driver.c (qemuDomainBlockCopy): New function. (qemuDomainBlockRebase): Call it when appropriate. --- v8: use qemuDomObjFromDomain for less code src/qemu/qemu_driver.c | 141 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 139 insertions(+), 2 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 5669b2f..bcbece2 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -12848,10 +12848,145 @@ qemuDomainBlockJobSetSpeed(virDomainPtr dom, const char *path, } static int +qemuDomainBlockCopy(virDomainPtr dom, const char *path, + const char *dest, const char *format, + unsigned long bandwidth, unsigned int flags) +{ + struct qemud_driver *driver = dom->conn->privateData; + virDomainObjPtr vm; + qemuDomainObjPrivatePtr priv; + char *device = NULL; + virDomainDiskDefPtr disk; + int ret = -1; + int idx; + bool reopen; + + /* Preliminaries: find the disk we are editing, sanity checks */ + virCheckFlags(VIR_DOMAIN_BLOCK_REBASE_SHALLOW, -1); + + if (!(vm = qemuDomObjFromDomain(dom))) + goto cleanup; + priv = vm->privateData; + + if (!virDomainObjIsActive(vm)) { + virReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("domain is not running")); + goto cleanup; + } + + device = qemuDiskPathToAlias(vm, path, &idx); + if (!device) { + goto cleanup; + } + disk = vm->def->disks[idx]; + if (disk->mirror) { + virReportError(VIR_ERR_BLOCK_COPY_ACTIVE, + _("disk '%s' already in active block copy job"), + disk->dst); + goto cleanup; + } + + reopen = qemuCapsGet(priv->caps, QEMU_CAPS_DRIVE_REOPEN); + if (!(qemuCapsGet(priv->caps, QEMU_CAPS_DRIVE_MIRROR) && + qemuCapsGet(priv->caps, QEMU_CAPS_BLOCKJOB_ASYNC))) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("block copy is not supported with this QEMU binary")); + goto cleanup; + } + if (vm->persistent) { + /* XXX if qemu ever lets us start a new domain with mirroring + * already active, we can relax this; but for now, the risk of + * 'managedsave' due to libvirt-guests means we can't risk + * this on persistent domains. */ + virReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("domain is not transient")); + goto cleanup; + } + + if (qemuDomainObjBeginJobWithDriver(driver, vm, QEMU_JOB_MODIFY) < 0) + goto cleanup; + + if (!virDomainObjIsActive(vm)) { + virReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("domain is not running")); + goto endjob; + } + if (qemuDomainDetermineDiskChain(driver, disk, false) < 0) + goto endjob; + + if ((flags & VIR_DOMAIN_BLOCK_REBASE_SHALLOW) && + STREQ_NULLABLE(format, "raw") && + disk->backingChain->backingStore) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, + _("disk '%s' has backing file, so raw shallow copy " + "is not possible"), + disk->dst); + goto endjob; + } + + /* Prepare the destination file. */ + /* XXX We also need to add security labeling, lock manager lease, + * and auditing of those events, as well as to support reuse of + * existing images, including probing the existing format of an + * existing image. */ + if (!format) { + disk->mirrorFormat = disk->format; + if (disk->mirrorFormat > 0) + format = virStorageFileFormatTypeToString(disk->mirrorFormat); + } else { + disk->mirrorFormat = virStorageFileFormatTypeFromString(format); + if (disk->mirrorFormat <= 0) { + virReportError(VIR_ERR_INVALID_ARG, _("unrecognized format '%s'"), + format); + goto endjob; + } + } + if (!(disk->mirror = strdup(dest))) { + virReportOOMError(); + goto endjob; + } + + /* Actually start the mirroring */ + qemuDomainObjEnterMonitorWithDriver(driver, vm); + ret = qemuMonitorDriveMirror(priv->mon, device, dest, format, bandwidth, + reopen, flags); + qemuDomainObjExitMonitorWithDriver(driver, vm); + +endjob: + if (ret < 0) { + VIR_FREE(disk->mirror); + disk->mirrorFormat = VIR_STORAGE_FILE_NONE; + } + if (qemuDomainObjEndJob(driver, vm) == 0) { + vm = NULL; + goto cleanup; + } + +cleanup: + VIR_FREE(device); + if (vm) + virDomainObjUnlock(vm); + qemuDriverUnlock(driver); + return ret; +} + +static int qemuDomainBlockRebase(virDomainPtr dom, const char *path, const char *base, unsigned long bandwidth, unsigned int flags) { - virCheckFlags(0, -1); + virCheckFlags(VIR_DOMAIN_BLOCK_REBASE_SHALLOW | + VIR_DOMAIN_BLOCK_REBASE_COPY | + VIR_DOMAIN_BLOCK_REBASE_COPY_RAW, -1); + + if (flags & VIR_DOMAIN_BLOCK_REBASE_COPY) { + const char *format = NULL; + if (flags & VIR_DOMAIN_BLOCK_REBASE_COPY_RAW) + format = "raw"; + flags &= ~(VIR_DOMAIN_BLOCK_REBASE_COPY | + VIR_DOMAIN_BLOCK_REBASE_COPY_RAW); + return qemuDomainBlockCopy(dom, path, base, format, bandwidth, flags); + } + return qemuDomainBlockJobImpl(dom, path, base, bandwidth, NULL, BLOCK_JOB_PULL, flags); } @@ -12860,7 +12995,9 @@ static int qemuDomainBlockPull(virDomainPtr dom, const char *path, unsigned long bandwidth, unsigned int flags) { - return qemuDomainBlockRebase(dom, path, NULL, bandwidth, flags); + virCheckFlags(0, -1); + return qemuDomainBlockJobImpl(dom, path, NULL, bandwidth, NULL, + BLOCK_JOB_PULL, flags); } -- 1.7.11.7

Support the REUSE_EXT flag, in part by copying sanity checks from snapshot code. This code introduces a case of probing an external file for its type; such an action would be a security risk if the existing file is supposed to be raw but the contents resemble some other format; however, since the virDomainBlockRebase API has a flag to force treating the file as raw rather than probe, we can assume that probing is safe in all other instances. * src/qemu/qemu_driver.c (qemuDomainBlockRebase): Allow REUSE_EXT flag. (qemuDomainBlockCopy): Wire up flag, and add some sanity checks. --- v8: use backing chain for better analysis of external file src/qemu/qemu_driver.c | 40 +++++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index bcbece2..13cba61 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -12860,9 +12860,11 @@ qemuDomainBlockCopy(virDomainPtr dom, const char *path, int ret = -1; int idx; bool reopen; + struct stat st; /* Preliminaries: find the disk we are editing, sanity checks */ - virCheckFlags(VIR_DOMAIN_BLOCK_REBASE_SHALLOW, -1); + virCheckFlags(VIR_DOMAIN_BLOCK_REBASE_SHALLOW | + VIR_DOMAIN_BLOCK_REBASE_REUSE_EXT, -1); if (!(vm = qemuDomObjFromDomain(dom))) goto cleanup; @@ -12925,12 +12927,39 @@ qemuDomainBlockCopy(virDomainPtr dom, const char *path, } /* Prepare the destination file. */ + if (stat(dest, &st) < 0) { + if (errno != ENOENT) { + virReportSystemError(errno, _("unable to stat for disk %s: %s"), + disk->dst, dest); + goto endjob; + } else if (flags & VIR_DOMAIN_BLOCK_REBASE_REUSE_EXT) { + virReportSystemError(errno, + _("missing destination file for disk %s: %s"), + disk->dst, dest); + goto endjob; + } + } else if (!S_ISBLK(st.st_mode) && st.st_size && + !(flags & VIR_DOMAIN_BLOCK_REBASE_REUSE_EXT)) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, + _("external destination file for disk %s already " + "exists and is not a block device: %s"), + disk->dst, dest); + goto endjob; + } + /* XXX We also need to add security labeling, lock manager lease, - * and auditing of those events, as well as to support reuse of - * existing images, including probing the existing format of an - * existing image. */ + * and auditing of those events. */ if (!format) { - disk->mirrorFormat = disk->format; + if (flags & VIR_DOMAIN_BLOCK_REBASE_REUSE_EXT) { + /* If the user passed the REUSE_EXT flag, then either they + * also passed the RAW flag (and format is non-NULL), or + * it is safe for us to probe the format from the file + * that we will be using. */ + disk->mirrorFormat = virStorageFileProbeFormat(dest, driver->user, + driver->group); + } else { + disk->mirrorFormat = disk->format; + } if (disk->mirrorFormat > 0) format = virStorageFileFormatTypeToString(disk->mirrorFormat); } else { @@ -12975,6 +13004,7 @@ qemuDomainBlockRebase(virDomainPtr dom, const char *path, const char *base, unsigned long bandwidth, unsigned int flags) { virCheckFlags(VIR_DOMAIN_BLOCK_REBASE_SHALLOW | + VIR_DOMAIN_BLOCK_REBASE_REUSE_EXT | VIR_DOMAIN_BLOCK_REBASE_COPY | VIR_DOMAIN_BLOCK_REBASE_COPY_RAW, -1); -- 1.7.11.7

Use the recent addition of qemuDomainPrepareDiskChainElement to set the SELinux label, obtain locking manager lease, permit a block device through cgroups, and audit the fact that we hand a new file over to qemu. Alas, releasing the lease and label at the end of the mirroring is a trickier prospect (we would have to trace the backing chain of both source and destination, and be sure not to revoke rights to any part of the chain that is shared), so for now, virDomainBlockJobAbort still leaves things with additional access granted (cleaning this up in the future should also improve block-pull and block-commit). * src/qemu/qemu_driver.c (qemuDomainBlockCopy): Set up labeling. --- v8: simplify, based on additions in block-commit series src/qemu/qemu_driver.c | 71 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 52 insertions(+), 19 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 13cba61..f2ff931 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -12861,6 +12861,9 @@ qemuDomainBlockCopy(virDomainPtr dom, const char *path, int idx; bool reopen; struct stat st; + bool need_unlink = false; + char *mirror = NULL; + virCgroupPtr cgroup = NULL; /* Preliminaries: find the disk we are editing, sanity checks */ virCheckFlags(VIR_DOMAIN_BLOCK_REBASE_SHALLOW | @@ -12875,6 +12878,13 @@ qemuDomainBlockCopy(virDomainPtr dom, const char *path, _("domain is not running")); goto cleanup; } + if (qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_DEVICES) && + virCgroupForDomain(driver->cgroup, vm->def->name, &cgroup, 0) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("Unable to find cgroup for %s"), + vm->def->name); + goto cleanup; + } device = qemuDiskPathToAlias(vm, path, &idx); if (!device) { @@ -12947,51 +12957,74 @@ qemuDomainBlockCopy(virDomainPtr dom, const char *path, goto endjob; } - /* XXX We also need to add security labeling, lock manager lease, - * and auditing of those events. */ - if (!format) { - if (flags & VIR_DOMAIN_BLOCK_REBASE_REUSE_EXT) { - /* If the user passed the REUSE_EXT flag, then either they - * also passed the RAW flag (and format is non-NULL), or - * it is safe for us to probe the format from the file - * that we will be using. */ - disk->mirrorFormat = virStorageFileProbeFormat(dest, driver->user, - driver->group); - } else { + if (!(flags & VIR_DOMAIN_BLOCK_REBASE_REUSE_EXT)) { + int fd = qemuOpenFile(driver, dest, O_WRONLY | O_TRUNC | O_CREAT, + &need_unlink, NULL); + if (fd < 0) + goto endjob; + VIR_FORCE_CLOSE(fd); + if (!format) disk->mirrorFormat = disk->format; - } - if (disk->mirrorFormat > 0) - format = virStorageFileFormatTypeToString(disk->mirrorFormat); - } else { + } else if (format) { disk->mirrorFormat = virStorageFileFormatTypeFromString(format); if (disk->mirrorFormat <= 0) { virReportError(VIR_ERR_INVALID_ARG, _("unrecognized format '%s'"), format); goto endjob; } - } + } else { + /* If the user passed the REUSE_EXT flag, then either they + * also passed the RAW flag (and format is non-NULL), or it is + * safe for us to probe the format from the file that we will + * be using. */ + disk->mirrorFormat = virStorageFileProbeFormat(dest, driver->user, + driver->group); + } + if (!format && disk->mirrorFormat > 0) + format = virStorageFileFormatTypeToString(disk->mirrorFormat); if (!(disk->mirror = strdup(dest))) { virReportOOMError(); goto endjob; } + if (qemuDomainPrepareDiskChainElement(driver, vm, cgroup, disk, dest, + VIR_DISK_CHAIN_READ_WRITE) < 0) { + qemuDomainPrepareDiskChainElement(driver, vm, cgroup, disk, dest, + VIR_DISK_CHAIN_NO_ACCESS); + goto endjob; + } + /* Actually start the mirroring */ qemuDomainObjEnterMonitorWithDriver(driver, vm); ret = qemuMonitorDriveMirror(priv->mon, device, dest, format, bandwidth, reopen, flags); + virDomainAuditDisk(vm, NULL, dest, "mirror", ret >= 0); qemuDomainObjExitMonitorWithDriver(driver, vm); + if (ret < 0) { + qemuDomainPrepareDiskChainElement(driver, vm, cgroup, disk, dest, + VIR_DISK_CHAIN_NO_ACCESS); + goto endjob; + } + + /* Update vm in place to match changes. */ + need_unlink = false; + disk->mirror = mirror; + mirror = NULL; endjob: - if (ret < 0) { - VIR_FREE(disk->mirror); + if (need_unlink && unlink(dest)) + VIR_WARN("unable to unlink just-created %s", dest); + if (ret < 0) disk->mirrorFormat = VIR_STORAGE_FILE_NONE; - } + VIR_FREE(mirror); if (qemuDomainObjEndJob(driver, vm) == 0) { vm = NULL; goto cleanup; } cleanup: + if (cgroup) + virCgroupFree(&cgroup); VIR_FREE(device); if (vm) virDomainObjUnlock(vm); -- 1.7.11.7

When using block copy to pivot over to a new chain, the backing files for the new chain might still need labeling (particularly if the user passes --reuse-ext with a relative backing file name). Relabeling a file that is already labeled won't hurt, so this just labels the entire chain at the point of the pivot. But to do the relabel, we must probe the file type of an external file (the API already promised to do this, and the probe is safe; we are just moving the probe from qemu into libvirt). * src/qemu/qemu_driver.c (qemuDomainBlockPivot): Relabel chain before asking qemu to pivot. --- v8: rebase on top of block-commit work src/qemu/qemu_driver.c | 49 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 4 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index f2ff931..3316e4b 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -12550,6 +12550,10 @@ qemuDomainBlockPivot(virConnectPtr conn, bool reopen = qemuCapsGet(priv->caps, QEMU_CAPS_DRIVE_REOPEN); const char *format = virStorageFileFormatTypeToString(disk->mirrorFormat); bool resume = false; + virCgroupPtr cgroup = NULL; + char *oldsrc = NULL; + int oldformat; + virStorageFileMetadataPtr oldchain = NULL; /* Probe the status, if needed. */ if (!disk->mirroring) { @@ -12599,6 +12603,39 @@ qemuDomainBlockPivot(virConnectPtr conn, } } + /* We previously labeled only the top-level image; but if the + * image includes a relative backing file, the pivot may result in + * qemu needing to open the entire backing chain, so we need to + * label the entire chain. This action is safe even if the + * backing chain has already been labeled; but only necessary when + * we know for sure that there is a backing chain. */ + if (disk->mirrorFormat && disk->mirrorFormat != VIR_STORAGE_FILE_RAW && + qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_DEVICES) && + virCgroupForDomain(driver->cgroup, vm->def->name, &cgroup, 0) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("Unable to find cgroup for %s"), + vm->def->name); + goto cleanup; + } + oldsrc = disk->src; + oldformat = disk->format; + oldchain = disk->backingChain; + disk->src = disk->mirror; + disk->format = disk->mirrorFormat; + disk->backingChain = NULL; + if (qemuDomainDetermineDiskChain(driver, disk, false) < 0) + goto cleanup; + if (disk->mirrorFormat && disk->mirrorFormat != VIR_STORAGE_FILE_RAW && + (virDomainLockDiskAttach(driver->lockManager, driver->uri, + vm, disk) < 0 || + (cgroup && qemuSetupDiskCgroup(vm, cgroup, disk) < 0) || + virSecurityManagerSetImageLabel(driver->securityManager, vm->def, + disk) < 0)) { + disk->src = oldsrc; + disk->format = oldformat; + goto cleanup; + } + /* Attempt the pivot. */ qemuDomainObjEnterMonitorWithDriver(driver, vm); ret = qemuMonitorDrivePivot(priv->mon, device, disk->mirror, format, @@ -12618,13 +12655,11 @@ qemuDomainBlockPivot(virConnectPtr conn, * portion of the chain, and is made more difficult by the * fact that we aren't tracking the full chain ourselves; so * for now, we leak the access to the original. */ - VIR_FREE(disk->src); - disk->src = disk->mirror; - disk->format = disk->mirrorFormat; + VIR_FREE(oldsrc); + virStorageFileFreeMetadata(oldchain); disk->mirror = NULL; disk->mirrorFormat = VIR_STORAGE_FILE_NONE; disk->mirroring = false; - qemuDomainDetermineDiskChain(driver, disk, true); } else { /* On failure, qemu abandons the mirror, and attempts to * revert back to the source disk. Hopefully it was able to @@ -12633,12 +12668,18 @@ qemuDomainBlockPivot(virConnectPtr conn, * 'query-block', to see what state we really got left in * before killing the mirroring job? And just as on the * success case, there's security labeling to worry about. */ + disk->src = oldsrc; + disk->format = oldformat; + virStorageFileFreeMetadata(disk->backingChain); + disk->backingChain = oldchain; VIR_FREE(disk->mirror); disk->mirrorFormat = VIR_STORAGE_FILE_NONE; disk->mirroring = false; } cleanup: + if (cgroup) + virCgroupFree(&cgroup); if (resume && virDomainObjIsActive(vm) && qemuProcessStartCPUs(driver, vm, conn, VIR_DOMAIN_RUNNING_UNPAUSED, -- 1.7.11.7
participants (3)
-
Daniel Veillard
-
Eric Blake
-
Peter Krempa