If a guest is paused, we were silently ignoring the quiesce flag,
which results in unclean snapshots, contrary to the intent of the
flag. Since we can't quiesce without guest agent support, we should
instead fail if the guest is not running.
Meanwhile, if we attempt a quiesce command, but the guest agent
doesn't respond, and we time out, we may have left the command
pending on the guest's queue, and when the guest resumes parsing
commands, it will freeze even though our command is no longer
around to issue a thaw. To be safe, we must _always_ pair every
quiesce call with a counterpart thaw, even if the quiesce call
failed due to a timeout, so that if a guest wakes up and starts
processing a command backlog, it will not get stuck in a frozen
state.
* src/qemu/qemu_driver.c (qemuDomainSnapshotCreateDiskActive):
Always issue thaw after a quiesce, even if quiesce failed.
(qemuDomainSnapshotFSThaw): Add a parameter.
---
See also:
https://bugzilla.redhat.com/show_bug.cgi?id=804210
https://www.redhat.com/archives/libvir-list/2012-March/msg00708.html
src/qemu/qemu_driver.c | 51 +++++++++++++++++++++++++++++++++--------------
1 files changed, 36 insertions(+), 15 deletions(-)
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
index 2c467ab..13ca92f 100644
--- a/src/qemu/qemu_driver.c
+++ b/src/qemu/qemu_driver.c
@@ -9584,24 +9584,36 @@ qemuDomainSnapshotFSFreeze(struct qemud_driver *driver,
static int
qemuDomainSnapshotFSThaw(struct qemud_driver *driver,
- virDomainObjPtr vm) {
+ virDomainObjPtr vm, bool report)
+{
qemuDomainObjPrivatePtr priv = vm->privateData;
int thawed;
+ virErrorPtr err = NULL;
if (priv->agentError) {
- qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s",
- _("QEMU guest agent is not "
- "available due to an error"));
+ if (report)
+ qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+ _("QEMU guest agent is not "
+ "available due to an error"));
return -1;
}
if (!priv->agent) {
- qemuReportError(VIR_ERR_ARGUMENT_UNSUPPORTED, "%s",
- _("QEMU guest agent is not configured"));
+ if (report)
+ qemuReportError(VIR_ERR_ARGUMENT_UNSUPPORTED, "%s",
+ _("QEMU guest agent is not configured"));
return -1;
}
qemuDomainObjEnterAgent(driver, vm);
+ if (!report)
+ err = virGetLastError();
thawed = qemuAgentFSThaw(priv->agent);
+ if (!report) {
+ if (err)
+ virResetError(err);
+ else
+ virResetLastError();
+ }
qemuDomainObjExitAgent(driver, vm);
return thawed;
@@ -9907,6 +9919,7 @@ qemuDomainSnapshotCreateDiskActive(virConnectPtr conn,
int ret = -1;
int i;
bool persist = false;
+ int thaw = 0; /* 1 if freeze succeeded, -1 if freeze failed */
if (qemuDomainObjBeginJobWithDriver(driver, vm, QEMU_JOB_MODIFY) < 0)
return -1;
@@ -9917,14 +9930,21 @@ qemuDomainSnapshotCreateDiskActive(virConnectPtr conn,
goto endjob;
}
-
- if (virDomainObjGetState(vm, NULL) == VIR_DOMAIN_RUNNING) {
- if ((flags & VIR_DOMAIN_SNAPSHOT_CREATE_QUIESCE) &&
- (qemuDomainSnapshotFSFreeze(driver, vm) < 0)) {
+ /* If quiesce was requested, then issue a freeze command, and a
+ * counterpart thaw command, no matter what. The command will
+ * fail if the guest is paused or the guest agent is not
+ * running. */
+ if (flags & VIR_DOMAIN_SNAPSHOT_CREATE_QUIESCE) {
+ if (qemuDomainSnapshotFSFreeze(driver, vm) < 0) {
/* helper reported the error */
+ thaw = -1;
goto endjob;
+ } else {
+ thaw = 1;
}
+ }
+ if (virDomainObjGetState(vm, NULL) == VIR_DOMAIN_RUNNING) {
/* In qemu, snapshot_blkdev on a single disk will pause cpus,
* but this confuses libvirt since notifications are not given
* when qemu resumes. And for multiple disks, libvirt must
@@ -10001,11 +10021,6 @@ cleanup:
_("resuming after snapshot failed"));
goto endjob;
}
-
- if ((flags & VIR_DOMAIN_SNAPSHOT_CREATE_QUIESCE) &&
- qemuDomainSnapshotFSThaw(driver, vm) < 0) {
- /* helper reported the error */
- }
}
if (vm) {
@@ -10016,6 +10031,12 @@ cleanup:
}
endjob:
+ if (vm && thaw != 0 &&
+ qemuDomainSnapshotFSThaw(driver, vm, thaw > 0) < 0) {
+ /* helper reported the error, if it was needed */
+ if (thaw > 0)
+ ret = -1;
+ }
if (vm && (qemuDomainObjEndJob(driver, vm) == 0)) {
/* Only possible if a transient vm quit while our locks were down,
* in which case we don't want to save snapshot metadata. */
--
1.7.7.6