[libvirt] [PATCH 0/2] qemu: migration: bugfixes for cancelling drive mirror

Nikolay Shirokovskiy (2): qemu: take current async job into account in qemuBlockNodeNamesDetect qemu: migration: fix race on cancelling drive mirror src/qemu/qemu_block.c | 6 ++++-- src/qemu/qemu_block.h | 4 +++- src/qemu/qemu_blockjob.c | 9 ++++++--- src/qemu/qemu_blockjob.h | 4 ++++ src/qemu/qemu_driver.c | 11 ++++++----- src/qemu/qemu_migration.c | 43 +++++++++++++++++++++++++++++++------------ src/qemu/qemu_process.c | 2 +- 7 files changed, 55 insertions(+), 24 deletions(-) -- 1.8.3.1

Becase it can be called during migration out (namely on cancelling blockjobs). --- src/qemu/qemu_block.c | 6 ++++-- src/qemu/qemu_block.h | 4 +++- src/qemu/qemu_blockjob.c | 9 ++++++--- src/qemu/qemu_blockjob.h | 4 ++++ src/qemu/qemu_driver.c | 11 ++++++----- src/qemu/qemu_migration.c | 28 ++++++++++++++++------------ src/qemu/qemu_process.c | 2 +- 7 files changed, 40 insertions(+), 24 deletions(-) diff --git a/src/qemu/qemu_block.c b/src/qemu/qemu_block.c index 586d568..29b5c47 100644 --- a/src/qemu/qemu_block.c +++ b/src/qemu/qemu_block.c @@ -336,7 +336,8 @@ qemuBlockDiskDetectNodes(virDomainDiskDefPtr disk, int qemuBlockNodeNamesDetect(virQEMUDriverPtr driver, - virDomainObjPtr vm) + virDomainObjPtr vm, + qemuDomainAsyncJob asyncJob) { qemuDomainObjPrivatePtr priv = vm->privateData; virHashTablePtr disktable = NULL; @@ -350,7 +351,8 @@ qemuBlockNodeNamesDetect(virQEMUDriverPtr driver, if (!virQEMUCapsGet(priv->qemuCaps, QEMU_CAPS_QUERY_NAMED_BLOCK_NODES)) return 0; - qemuDomainObjEnterMonitor(driver, vm); + if (qemuDomainObjEnterMonitorAsync(driver, vm, asyncJob) < 0) + return -1; disktable = qemuMonitorGetBlockInfo(qemuDomainGetMonitor(vm)); data = qemuMonitorQueryNamedBlockNodes(qemuDomainGetMonitor(vm)); diff --git a/src/qemu/qemu_block.h b/src/qemu/qemu_block.h index 9d6a246..2af15a6 100644 --- a/src/qemu/qemu_block.h +++ b/src/qemu/qemu_block.h @@ -22,6 +22,7 @@ # include "internal.h" # include "qemu_conf.h" +# include "qemu_domain.h" # include "virhash.h" # include "virjson.h" @@ -46,7 +47,8 @@ qemuBlockNodeNameGetBackingChain(virJSONValuePtr data); int qemuBlockNodeNamesDetect(virQEMUDriverPtr driver, - virDomainObjPtr vm); + virDomainObjPtr vm, + qemuDomainAsyncJob asyncJob); virHashTablePtr qemuBlockGetNodeData(virJSONValuePtr data); diff --git a/src/qemu/qemu_blockjob.c b/src/qemu/qemu_blockjob.c index 0601e68..415768d 100644 --- a/src/qemu/qemu_blockjob.c +++ b/src/qemu/qemu_blockjob.c @@ -55,13 +55,14 @@ VIR_LOG_INIT("qemu.qemu_blockjob"); int qemuBlockJobUpdate(virQEMUDriverPtr driver, virDomainObjPtr vm, + qemuDomainAsyncJob asyncJob, virDomainDiskDefPtr disk) { qemuDomainDiskPrivatePtr diskPriv = QEMU_DOMAIN_DISK_PRIVATE(disk); int status = diskPriv->blockJobStatus; if (status != -1) { - qemuBlockJobEventProcess(driver, vm, disk, + qemuBlockJobEventProcess(driver, vm, disk, asyncJob, diskPriv->blockJobType, diskPriv->blockJobStatus); diskPriv->blockJobStatus = -1; @@ -87,6 +88,7 @@ void qemuBlockJobEventProcess(virQEMUDriverPtr driver, virDomainObjPtr vm, virDomainDiskDefPtr disk, + qemuDomainAsyncJob asyncJob, int type, int status) { @@ -167,7 +169,7 @@ qemuBlockJobEventProcess(virQEMUDriverPtr driver, disk->mirrorJob = VIR_DOMAIN_BLOCK_JOB_TYPE_UNKNOWN; ignore_value(qemuDomainDetermineDiskChain(driver, vm, disk, true, true)); - ignore_value(qemuBlockNodeNamesDetect(driver, vm)); + ignore_value(qemuBlockNodeNamesDetect(driver, vm, asyncJob)); diskPriv->blockjob = false; break; @@ -247,9 +249,10 @@ qemuBlockJobSyncBegin(virDomainDiskDefPtr disk) void qemuBlockJobSyncEnd(virQEMUDriverPtr driver, virDomainObjPtr vm, + qemuDomainAsyncJob asyncJob, virDomainDiskDefPtr disk) { VIR_DEBUG("disk=%s", disk->dst); - qemuBlockJobUpdate(driver, vm, disk); + qemuBlockJobUpdate(driver, vm, asyncJob, disk); QEMU_DOMAIN_DISK_PRIVATE(disk)->blockJobSync = false; } diff --git a/src/qemu/qemu_blockjob.h b/src/qemu/qemu_blockjob.h index 775ce95..47aa4c1 100644 --- a/src/qemu/qemu_blockjob.h +++ b/src/qemu/qemu_blockjob.h @@ -24,19 +24,23 @@ # include "internal.h" # include "qemu_conf.h" +# include "qemu_domain.h" int qemuBlockJobUpdate(virQEMUDriverPtr driver, virDomainObjPtr vm, + qemuDomainAsyncJob asyncJob, virDomainDiskDefPtr disk); void qemuBlockJobEventProcess(virQEMUDriverPtr driver, virDomainObjPtr vm, virDomainDiskDefPtr disk, + qemuDomainAsyncJob asyncJob, int type, int status); void qemuBlockJobSyncBegin(virDomainDiskDefPtr disk); void qemuBlockJobSyncEnd(virQEMUDriverPtr driver, virDomainObjPtr vm, + qemuDomainAsyncJob asyncJob, virDomainDiskDefPtr disk); #endif /* __QEMU_BLOCKJOB_H__ */ diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 388af4f..6b7370f 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -4513,7 +4513,7 @@ processBlockJobEvent(virQEMUDriverPtr driver, } if ((disk = qemuProcessFindDomainDiskByAlias(vm, diskAlias))) - qemuBlockJobEventProcess(driver, vm, disk, type, status); + qemuBlockJobEventProcess(driver, vm, disk, QEMU_ASYNC_JOB_NONE, type, status); endjob: qemuDomainObjEndJob(driver, vm); @@ -16234,24 +16234,25 @@ qemuDomainBlockJobAbort(virDomainPtr dom, * event to pull and let qemuBlockJobEventProcess() handle * the rest as usual */ qemuBlockJobEventProcess(driver, vm, disk, + QEMU_ASYNC_JOB_NONE, VIR_DOMAIN_BLOCK_JOB_TYPE_PULL, VIR_DOMAIN_BLOCK_JOB_CANCELED); } else { qemuDomainDiskPrivatePtr diskPriv = QEMU_DOMAIN_DISK_PRIVATE(disk); - qemuBlockJobUpdate(driver, vm, disk); + qemuBlockJobUpdate(driver, vm, QEMU_ASYNC_JOB_NONE, disk); while (diskPriv->blockjob) { if (virDomainObjWait(vm) < 0) { ret = -1; goto endjob; } - qemuBlockJobUpdate(driver, vm, disk); + qemuBlockJobUpdate(driver, vm, QEMU_ASYNC_JOB_NONE, disk); } } } endjob: if (disk) - qemuBlockJobSyncEnd(driver, vm, disk); + qemuBlockJobSyncEnd(driver, vm, QEMU_ASYNC_JOB_NONE, disk); qemuDomainObjEndJob(driver, vm); cleanup: @@ -20399,7 +20400,7 @@ qemuDomainSetBlockThreshold(virDomainPtr dom, goto endjob; if (!src->nodebacking && - qemuBlockNodeNamesDetect(driver, vm) < 0) + qemuBlockNodeNamesDetect(driver, vm, QEMU_ASYNC_JOB_NONE) < 0) goto endjob; if (!src->nodebacking) { diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 68e72b3..d7ff415 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -600,7 +600,8 @@ qemuMigrationStopNBDServer(virQEMUDriverPtr driver, */ static int qemuMigrationDriveMirrorReady(virQEMUDriverPtr driver, - virDomainObjPtr vm) + virDomainObjPtr vm, + qemuDomainAsyncJob asyncJob) { size_t i; size_t notReady = 0; @@ -613,7 +614,7 @@ qemuMigrationDriveMirrorReady(virQEMUDriverPtr driver, if (!diskPriv->migrating) continue; - status = qemuBlockJobUpdate(driver, vm, disk); + status = qemuBlockJobUpdate(driver, vm, asyncJob, disk); if (status == VIR_DOMAIN_BLOCK_JOB_FAILED) { virReportError(VIR_ERR_OPERATION_FAILED, _("migration of disk %s failed"), @@ -648,6 +649,7 @@ qemuMigrationDriveMirrorReady(virQEMUDriverPtr driver, static int qemuMigrationDriveMirrorCancelled(virQEMUDriverPtr driver, virDomainObjPtr vm, + qemuDomainAsyncJob asyncJob, bool check) { size_t i; @@ -662,7 +664,7 @@ qemuMigrationDriveMirrorCancelled(virQEMUDriverPtr driver, if (!diskPriv->migrating) continue; - status = qemuBlockJobUpdate(driver, vm, disk); + status = qemuBlockJobUpdate(driver, vm, asyncJob, disk); switch (status) { case VIR_DOMAIN_BLOCK_JOB_FAILED: if (check) { @@ -674,7 +676,7 @@ qemuMigrationDriveMirrorCancelled(virQEMUDriverPtr driver, /* fallthrough */ case VIR_DOMAIN_BLOCK_JOB_CANCELED: case VIR_DOMAIN_BLOCK_JOB_COMPLETED: - qemuBlockJobSyncEnd(driver, vm, disk); + qemuBlockJobSyncEnd(driver, vm, asyncJob, disk); diskPriv->migrating = false; break; @@ -722,7 +724,7 @@ qemuMigrationCancelOneDriveMirror(virQEMUDriverPtr driver, int status; int rv; - status = qemuBlockJobUpdate(driver, vm, disk); + status = qemuBlockJobUpdate(driver, vm, asyncJob, disk); switch (status) { case VIR_DOMAIN_BLOCK_JOB_FAILED: case VIR_DOMAIN_BLOCK_JOB_CANCELED: @@ -799,12 +801,13 @@ qemuMigrationCancelDriveMirror(virQEMUDriverPtr driver, err = virSaveLastError(); failed = true; } - qemuBlockJobSyncEnd(driver, vm, disk); + qemuBlockJobSyncEnd(driver, vm, asyncJob, disk); diskPriv->migrating = false; } } - while ((rv = qemuMigrationDriveMirrorCancelled(driver, vm, check)) != 1) { + while ((rv = qemuMigrationDriveMirrorCancelled(driver, vm, asyncJob, + check)) != 1) { if (check && !failed && dconn && virConnectIsAlive(dconn) <= 0) { virReportError(VIR_ERR_OPERATION_FAILED, "%s", @@ -930,7 +933,7 @@ qemuMigrationDriveMirror(virQEMUDriverPtr driver, VIR_FREE(nbd_dest); if (qemuDomainObjExitMonitor(driver, vm) < 0 || mon_ret < 0) { - qemuBlockJobSyncEnd(driver, vm, disk); + qemuBlockJobSyncEnd(driver, vm, QEMU_ASYNC_JOB_MIGRATION_OUT, disk); goto cleanup; } diskPriv->migrating = true; @@ -941,7 +944,8 @@ qemuMigrationDriveMirror(virQEMUDriverPtr driver, } } - while ((rv = qemuMigrationDriveMirrorReady(driver, vm)) != 1) { + while ((rv = qemuMigrationDriveMirrorReady(driver, vm, + QEMU_ASYNC_JOB_MIGRATION_OUT)) != 1) { if (rv < 0) goto cleanup; @@ -1475,7 +1479,7 @@ qemuMigrationCompleted(virQEMUDriverPtr driver, goto error; if (flags & QEMU_MIGRATION_COMPLETED_CHECK_STORAGE && - qemuMigrationDriveMirrorReady(driver, vm) < 0) + qemuMigrationDriveMirrorReady(driver, vm, asyncJob) < 0) goto error; if (flags & QEMU_MIGRATION_COMPLETED_ABORT_ON_ERROR && @@ -5573,7 +5577,7 @@ qemuMigrationCancel(virQEMUDriverPtr driver, VIR_DEBUG("Drive mirror on disk %s is still running", disk->dst); } else { VIR_DEBUG("Drive mirror on disk %s is gone", disk->dst); - qemuBlockJobSyncEnd(driver, vm, disk); + qemuBlockJobSyncEnd(driver, vm, QEMU_ASYNC_JOB_NONE, disk); diskPriv->migrating = false; } } @@ -5595,7 +5599,7 @@ qemuMigrationCancel(virQEMUDriverPtr driver, qemuDomainDiskPrivatePtr diskPriv = QEMU_DOMAIN_DISK_PRIVATE(disk); if (diskPriv->migrating) { - qemuBlockJobSyncEnd(driver, vm, disk); + qemuBlockJobSyncEnd(driver, vm, QEMU_ASYNC_JOB_NONE, disk); diskPriv->migrating = false; } } diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 8323a18..a479f44 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3489,7 +3489,7 @@ qemuProcessReconnect(void *opaque) if (qemuProcessRefreshDisks(driver, obj, QEMU_ASYNC_JOB_NONE) < 0) goto error; - if (qemuBlockNodeNamesDetect(driver, obj) < 0) + if (qemuBlockNodeNamesDetect(driver, obj, QEMU_ASYNC_JOB_NONE) < 0) goto error; if (qemuRefreshVirtioChannelState(driver, obj, QEMU_ASYNC_JOB_NONE) < 0) -- 1.8.3.1

0feebab2 adds calling qemuBlockNodeNamesDetect for completed job on updating block jobs. This affects cancelling drive mirror logic as this function drops vm lock. Now we have to recheck all disks before the disk with the completed block job before going to wait for block job events. --- src/qemu/qemu_migration.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index d7ff415..769de97 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -654,9 +654,11 @@ qemuMigrationDriveMirrorCancelled(virQEMUDriverPtr driver, { size_t i; size_t active = 0; + size_t completed = 0; int status; bool failed = false; + retry: for (i = 0; i < vm->def->ndisks; i++) { virDomainDiskDefPtr disk = vm->def->disks[i]; qemuDomainDiskPrivatePtr diskPriv = QEMU_DOMAIN_DISK_PRIVATE(disk); @@ -683,6 +685,19 @@ qemuMigrationDriveMirrorCancelled(virQEMUDriverPtr driver, default: active++; } + + if (status == VIR_DOMAIN_BLOCK_JOB_COMPLETED) + completed++; + } + + /* Updating completed block job drops the lock thus we have to recheck + * block jobs for disks that reside before the disk(s) with completed + * block job. + */ + if (completed > 0) { + completed = 0; + active = 0; + goto retry; } if (failed) { -- 1.8.3.1

ping On 07.04.2017 14:06, Nikolay Shirokovskiy wrote:
Nikolay Shirokovskiy (2): qemu: take current async job into account in qemuBlockNodeNamesDetect qemu: migration: fix race on cancelling drive mirror
src/qemu/qemu_block.c | 6 ++++-- src/qemu/qemu_block.h | 4 +++- src/qemu/qemu_blockjob.c | 9 ++++++--- src/qemu/qemu_blockjob.h | 4 ++++ src/qemu/qemu_driver.c | 11 ++++++----- src/qemu/qemu_migration.c | 43 +++++++++++++++++++++++++++++++------------ src/qemu/qemu_process.c | 2 +- 7 files changed, 55 insertions(+), 24 deletions(-)

On Fri, Apr 07, 2017 at 14:06:23 +0300, Nikolay Shirokovskiy wrote:
Nikolay Shirokovskiy (2): qemu: take current async job into account in qemuBlockNodeNamesDetect qemu: migration: fix race on cancelling drive mirror
src/qemu/qemu_block.c | 6 ++++-- src/qemu/qemu_block.h | 4 +++- src/qemu/qemu_blockjob.c | 9 ++++++--- src/qemu/qemu_blockjob.h | 4 ++++ src/qemu/qemu_driver.c | 11 ++++++----- src/qemu/qemu_migration.c | 43 +++++++++++++++++++++++++++++++------------ src/qemu/qemu_process.c | 2 +- 7 files changed, 55 insertions(+), 24 deletions(-)
ACK series and pushed. Thanks, Jirka

On 27.04.2017 15:40, Jiri Denemark wrote:
On Fri, Apr 07, 2017 at 14:06:23 +0300, Nikolay Shirokovskiy wrote:
Nikolay Shirokovskiy (2): qemu: take current async job into account in qemuBlockNodeNamesDetect qemu: migration: fix race on cancelling drive mirror
src/qemu/qemu_block.c | 6 ++++-- src/qemu/qemu_block.h | 4 +++- src/qemu/qemu_blockjob.c | 9 ++++++--- src/qemu/qemu_blockjob.h | 4 ++++ src/qemu/qemu_driver.c | 11 ++++++----- src/qemu/qemu_migration.c | 43 +++++++++++++++++++++++++++++++------------ src/qemu/qemu_process.c | 2 +- 7 files changed, 55 insertions(+), 24 deletions(-)
ACK series and pushed.
Thanks,
Jirka
Thanks for review! Nikolay
participants (2)
-
Jiri Denemark
-
Nikolay Shirokovskiy