On Tue, Aug 23, 2022 at 18:32:24 +0200, Pavel Hrdina wrote:
When deleting a snapshot we start a block-commit job over all disks
that are part of the snapshot.

This operation may fail as it writes data changes to the backing qcow2
image, so we need to wait for all the disks to finish the operation and
wait for the correct signal from QEMU. When deleting the active
snapshot we will get a `ready` signal; for inactive snapshots we need
to disable autofinalize in order to get a `pending` signal.

At this point, if the commit for any disk fails for some reason and we
abort, the VM is still in a consistent state and the user can fix the
reason why the deletion failed.

After that we do `pivot` (for the active snapshot) or `finalize`
(otherwise) to finish the block job. It may still fail, but there is
nothing else we can do about it.
Signed-off-by: Pavel Hrdina <phrdina@redhat.com>
---
src/qemu/qemu_snapshot.c | 163 +++++++++++++++++++++++++++++++++++----
1 file changed, 147 insertions(+), 16 deletions(-)
diff --git a/src/qemu/qemu_snapshot.c b/src/qemu/qemu_snapshot.c
index dade5dcea4..64ee395230 100644
--- a/src/qemu/qemu_snapshot.c
+++ b/src/qemu/qemu_snapshot.c
@@ -2380,6 +2380,114 @@ qemuSnapshotChildrenReparent(void *payload,
}
+static int
+qemuSnapshotJobRunning(virDomainObj *vm,
+                       qemuBlockJobData *job)
+{
+    qemuBlockJobUpdate(vm, job, VIR_ASYNC_JOB_NONE);
+
+    while (job->state != QEMU_BLOCKJOB_STATE_READY &&
+           job->state != QEMU_BLOCKJOB_STATE_PENDING &&
+           job->state != QEMU_BLOCKJOB_STATE_FAILED &&
+           job->state != QEMU_BLOCKJOB_STATE_CANCELLED &&
+           job->state != QEMU_BLOCKJOB_STATE_COMPLETED) {
Preferably extract this condition into a separate function (e.g.
qemuSnapshotJobIsRunning) which uses a switch statement, so that we are
sure to update it when the list of supported states changes.
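Something like this perhaps (just a sketch; ideally you'd enumerate the
remaining job states explicitly instead of the 'default:' arm so that
the compiler warns whenever a new state is added):

static bool
qemuSnapshotJobIsRunning(qemuBlockJobData *job)
{
    switch (job->state) {
    /* states in which there's nothing left to wait for */
    case QEMU_BLOCKJOB_STATE_READY:
    case QEMU_BLOCKJOB_STATE_PENDING:
    case QEMU_BLOCKJOB_STATE_FAILED:
    case QEMU_BLOCKJOB_STATE_CANCELLED:
    case QEMU_BLOCKJOB_STATE_COMPLETED:
        return false;

    default:
        return true;
    }
}

with the waiting loop then being just:

    while (qemuSnapshotJobIsRunning(job)) {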
+        if (qemuDomainObjWait(vm) < 0)
+            return -1;
+        qemuBlockJobUpdate(vm, job, VIR_ASYNC_JOB_NONE);
+    }
+
+    return 0;
+}
+
+
+static int
+qemuSnapshotJobFinishing(virDomainObj *vm,
+                         qemuBlockJobData *job)
Both functions need a documentation comment outlining the expectations.
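E.g. something along these lines (wording is just a sketch):

/**
 * qemuSnapshotJobRunning:
 * @vm: domain object
 * @job: block job data, expected to be in sync mode
 *
 * Waits until the block job @job reaches one of the READY, PENDING,
 * FAILED, CANCELLED or COMPLETED states.
 *
 * Returns 0 on success, -1 if waiting failed (e.g. the domain died
 * in the meantime).
 */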
+{
+    qemuBlockJobUpdate(vm, job, VIR_ASYNC_JOB_NONE);
+
+    while (job->state != QEMU_BLOCKJOB_STATE_READY &&
+           job->state != QEMU_BLOCKJOB_STATE_FAILED &&
+           job->state != QEMU_BLOCKJOB_STATE_CANCELLED &&
+           job->state != QEMU_BLOCKJOB_STATE_COMPLETED) {
+        if (qemuDomainObjWait(vm) < 0)
+            return -1;
+        qemuBlockJobUpdate(vm, job, VIR_ASYNC_JOB_NONE);
+    }
+
+    return 0;
+}
+
+
+static int
+qemuSnapshotDiscardExternal(virDomainObj *vm,
+                            virQEMUDriver *driver,
+                            GPtrArray *externalData)
+{
+    ssize_t i;
+
+    for (i = 0; i < externalData->len; i++) {
As I've noted previously, you are not randomly accessing the list of
data, so you can use a G(S)List and avoid having to deal with the
length.
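i.e. (sketch; assumes externalData is converted to a GSList of
qemuSnapshotDeleteExternalData pointers):

    GSList *next;

    for (next = externalData; next; next = next->next) {
        qemuSnapshotDeleteExternalData *data = next->data;

        /* ... same loop body, no index bookkeeping needed ... */
    }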
+        qemuSnapshotDeleteExternalData *data = g_ptr_array_index(externalData, i);
+        virTristateBool autofinalize = VIR_TRISTATE_BOOL_NO;
+        unsigned int commitFlags = VIR_DOMAIN_BLOCK_COMMIT_DELETE;
+
+        if (data->domDisk->src == data->diskSrc) {
+            commitFlags |= VIR_DOMAIN_BLOCK_COMMIT_ACTIVE;
+            autofinalize = VIR_TRISTATE_BOOL_YES;
+        }
+
+        if (qemuBlockCommitImpl(vm, driver,
+                                data->domDisk,
+                                data->parentDiskSrc,
+                                data->diskSrc,
+                                data->prevDiskSrc,
+                                0, true, autofinalize, commitFlags) < 0) {
+            return -1;
+        }
+
+        data->job = qemuBlockJobDiskGetJob(data->domDisk);
I think it would be better to return the job data struct from
qemuBlockCommitImpl, as there it's guaranteed to be present ...
+        if (!data->job) {
+            virReportError(VIR_ERR_OPERATION_INVALID,
+                           _("disk '%s' does not have an active block job"),
+                           data->domDisk->dst);
+            return -1;
+        }
... so you'll avoid an error path.
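i.e. something like (with a hypothetically modified qemuBlockCommitImpl
that returns the qemuBlockJobData pointer, NULL on failure):

        if (!(data->job = qemuBlockCommitImpl(vm, driver, data->domDisk,
                                              data->parentDiskSrc,
                                              data->diskSrc,
                                              data->prevDiskSrc,
                                              0, true, autofinalize,
                                              commitFlags)))
            return -1;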
+    }
+
+    for (i = 0; i < externalData->len; i++) {
+        qemuSnapshotDeleteExternalData *data = g_ptr_array_index(externalData, i);
+
+        if (qemuSnapshotJobRunning(vm, data->job) < 0)
+            return -1;
+
+        if (data->job->state == QEMU_BLOCKJOB_STATE_FAILED)
+            return -1;
You can't do this without reporting an error; the transition of the
block job to the failed state doesn't report any error itself.
Additionally, in this case the VM keeps on running, so I'd expect some
form of recovery here, e.g. cancelling the remaining jobs.
Without that you'll end up with a bunch of block jobs in unfinished
states after the API fails. What's even worse is that the block jobs
are in sync mode, so nothing will ever be able to update them.
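At the very least something like (sketch; the exact cleanup depends on
how you want to recover):

        if (data->job->state == QEMU_BLOCKJOB_STATE_FAILED) {
            virReportError(VIR_ERR_OPERATION_FAILED,
                           _("block commit failed while deleting snapshot of disk '%s'"),
                           data->domDisk->dst);
            goto error; /* hypothetical label: abort the remaining jobs
                         * and qemuBlockJobSyncEnd() all of them */
        }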
+    }
+
+    for (i = 0; i < externalData->len; i++) {
+        qemuSnapshotDeleteExternalData *data = g_ptr_array_index(externalData, i);
+
+        if (data->job->state == QEMU_BLOCKJOB_STATE_READY) {
+            if (qemuBlockPivot(vm, data->job, NULL) < 0)
+                return -1;
+        } else if (data->job->state == QEMU_BLOCKJOB_STATE_PENDING) {
+            if (qemuBlockFinalize(vm, data->job) < 0)
+                return -1;
+        }
+
+        if (qemuSnapshotJobFinishing(vm, data->job) < 0)
+            return -1;
+
+        if (data->job->state == QEMU_BLOCKJOB_STATE_FAILED)
+            return -1;
Same as above.
+
+        qemuBlockJobSyncEnd(vm, data->job, VIR_ASYNC_JOB_NONE);
+    }
+
+    return 0;
+}
+
+