https://bugzilla.redhat.com/show_bug.cgi?id=1448268
When migrating to a file (e.g. when doing 'virsh save file'),
couple of things are happening in the thread that is executing
the API:
1) the domain obj is locked
2) iohelper is spawned as a separate process to handle all I/O
3) the thread waits for iohelper to finish
4) the domain obj is unlocked
Now, the problem is that while the thread waits in step 3 for
iohelper to finish this may take ages because iohelper calls
fdatasync(). And unfortunately, we are waiting the whole time
with the domain locked. So if another thread wants to jump in and
say copy the domain name ('virsh list' for instance), they are
stuck.
The solution is to unlock the domain whenever waiting for I/O and
lock it back again when it finished.
Signed-off-by: Michal Privoznik <mprivozn(a)redhat.com>
---
diff to v1:
- check after locking the domain obj back if the domain is still alive,
since it was unlocked it might have changed its state.
src/qemu/qemu_driver.c | 33 +++++++++++++++++++++++++++++----
1 file changed, 29 insertions(+), 4 deletions(-)
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
index e1a0dd553..6095a5ec5 100644
--- a/src/qemu/qemu_driver.c
+++ b/src/qemu/qemu_driver.c
@@ -3216,6 +3216,31 @@ qemuOpenFileAs(uid_t fallback_uid, gid_t fallback_gid,
goto cleanup;
}
+
+static int
+qemuFileWrapperFDClose(virDomainObjPtr vm,
+ virFileWrapperFdPtr fd)
+{
+ int ret;
+
+ /* virFileWrapperFd uses iohelper to write data onto disk.
+ * However, iohelper calls fdatasync() which may take ages to
+ * finish. Therefore, we shouldn't be waiting with the domain
+ * object locked. */
+
+ virObjectUnlock(vm);
+ ret = virFileWrapperFdClose(fd);
+ virObjectLock(vm);
+ if (!virDomainObjIsActive(vm)) {
+ if (!virGetLastError())
+ virReportError(VIR_ERR_OPERATION_FAILED, "%s",
+ _("domain is no longer running"));
+ ret = -1;
+ }
+ return ret;
+}
+
+
/* Helper function to execute a migration to file with a correct save header
* the caller needs to make sure that the processors are stopped and do all other
* actions besides saving memory */
@@ -3276,7 +3301,7 @@ qemuDomainSaveMemory(virQEMUDriverPtr driver,
goto cleanup;
}
- if (virFileWrapperFdClose(wrapperFd) < 0)
+ if (qemuFileWrapperFDClose(vm, wrapperFd) < 0)
goto cleanup;
if ((fd = qemuOpenFile(driver, vm, path, O_WRONLY, NULL, NULL)) < 0 ||
@@ -3827,7 +3852,7 @@ doCoreDump(virQEMUDriverPtr driver,
path);
goto cleanup;
}
- if (virFileWrapperFdClose(wrapperFd) < 0)
+ if (qemuFileWrapperFDClose(vm, wrapperFd) < 0)
goto cleanup;
ret = 0;
@@ -6687,7 +6712,7 @@ qemuDomainRestoreFlags(virConnectPtr conn,
ret = qemuDomainSaveImageStartVM(conn, driver, vm, &fd, data, path,
false, QEMU_ASYNC_JOB_START);
- if (virFileWrapperFdClose(wrapperFd) < 0)
+ if (qemuFileWrapperFDClose(vm, wrapperFd) < 0)
VIR_WARN("Failed to close %s", path);
qemuProcessEndJob(driver, vm);
@@ -6962,7 +6987,7 @@ qemuDomainObjRestore(virConnectPtr conn,
ret = qemuDomainSaveImageStartVM(conn, driver, vm, &fd, data, path,
start_paused, asyncJob);
- if (virFileWrapperFdClose(wrapperFd) < 0)
+ if (qemuFileWrapperFDClose(vm, wrapperFd) < 0)
VIR_WARN("Failed to close %s", path);
cleanup:
--
2.13.5