At 07/29/2011 07:47 AM, Eric Blake wrote:
Currently, we attempt to run sync job and async job at the same time.
It
means that the monitor commands for two jobs can be run in any order.
In the function qemuDomainObjEnterMonitorInternal():
if (priv->job.active == QEMU_JOB_NONE && priv->job.asyncJob) {
if (qemuDomainObjBeginNestedJob(driver, obj) < 0)
We check whether the caller is an async job by priv->job.active and
priv->job.asyncJob. But when an async job is running, and a sync job is
also running at the time of the check, then priv->job.active is not
QEMU_JOB_NONE. So we cannot check whether the caller is an async job
in the function qemuDomainObjEnterMonitorInternal(), and must instead
put the burden on the caller to tell us when an async command wants
to do a nested job.
* src/qemu/THREADS.txt: Reflect new rules.
* src/qemu/qemu_domain.h (qemuDomainObjEnterMonitorAsync): New
prototype.
* src/qemu/qemu_process.h (qemuProcessStartCPUs)
(qemuProcessStopCPUs): Add parameter.
* src/qemu/qemu_migration.h (qemuMigrationToFile): Likewise.
(qemuMigrationWaitForCompletion): Make static.
* src/qemu/qemu_domain.c (qemuDomainObjEnterMonitorInternal): Add
parameter.
(qemuDomainObjEnterMonitorAsync): New function.
(qemuDomainObjEnterMonitor, qemuDomainObjEnterMonitorWithDriver):
Update callers.
* src/qemu/qemu_driver.c (qemuDomainSaveInternal)
(qemudDomainCoreDump, doCoreDump, processWatchdogEvent)
(qemudDomainSuspend, qemudDomainResume, qemuDomainSaveImageStartVM)
(qemuDomainSnapshotCreateActive, qemuDomainRevertToSnapshot):
Likewise.
* src/qemu/qemu_process.c (qemuProcessStopCPUs)
(qemuProcessFakeReboot, qemuProcessRecoverMigration)
(qemuProcessRecoverJob, qemuProcessStart): Likewise.
* src/qemu/qemu_migration.c (qemuMigrationToFile)
(qemuMigrationWaitForCompletion, qemuMigrationUpdateJobStatus)
(qemuMigrationJobStart, qemuDomainMigrateGraphicsRelocate)
(doNativeMigrate, doTunnelMigrate, qemuMigrationPerformJob)
(qemuMigrationPerformPhase, qemuMigrationFinish)
(qemuMigrationConfirm): Likewise.
---
My initial smoke testing shows that this fixes 'virsh managedsave',
but I still have more testing to do before I'm convinced I got
everything (for example, I need to test migration still).
I tested this patch with save via virt-manager, and found that it causes
libvirt to deadlock.
With this patch, we can ignore the return value of qemuDomainObjEnterMonitor(WithDriver),
because these two functions always return 0. But we cannot ignore the
return value of qemuDomainObjEnterMonitorAsync().
If qemuDomainObjEnterMonitorAsync() failed, we do nothing in this function.
So it's very dangerous to call qemuDomainObjExitMonitorWithDriver() when
qemuDomainObjEnterMonitorAsync() failed.
I think this problem already exists before this patch.
src/qemu/THREADS.txt | 8 ++++--
src/qemu/qemu_domain.c | 43 +++++++++++++++++++++++++++-------
src/qemu/qemu_domain.h | 4 +++
src/qemu/qemu_driver.c | 39 +++++++++++++++++++++----------
src/qemu/qemu_migration.c | 55 ++++++++++++++++++++++++++++----------------
src/qemu/qemu_migration.h | 5 +--
src/qemu/qemu_process.c | 32 ++++++++++++++++++--------
src/qemu/qemu_process.h | 7 ++++-
8 files changed, 133 insertions(+), 60 deletions(-)
diff --git a/src/qemu/THREADS.txt b/src/qemu/THREADS.txt
index e73076c..125bd5d 100644
--- a/src/qemu/THREADS.txt
+++ b/src/qemu/THREADS.txt
@@ -374,7 +374,7 @@ Design patterns
qemuDriverUnlock(driver);
- * Running asynchronous job
+ * Running asynchronous job with driver lock held
virDomainObjPtr obj;
qemuDomainObjPrivatePtr priv;
@@ -387,7 +387,8 @@ Design patterns
...do prep work...
- if (qemuDomainObjEnterMonitorWithDriver(driver, obj) < 0) {
+ if (qemuDomainObjEnterMonitorAsync(driver, obj,
+ QEMU_ASYNC_JOB_TYPE) < 0) {
/* domain died in the meantime */
goto error;
}
@@ -395,7 +396,8 @@ Design patterns
qemuDomainObjExitMonitorWithDriver(driver, obj);
while (!finished) {
- if (qemuDomainObjEnterMonitorWithDriver(driver, obj) < 0) {
+ if (qemuDomainObjEnterMonitorAsync(driver, obj,
+ QEMU_ASYNC_JOB_TYPE) < 0) {
/* domain died in the meantime */
goto error;
}
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index 2eaaf3a..4cf6888 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -863,14 +863,20 @@ qemuDomainObjEndAsyncJob(struct qemud_driver *driver,
virDomainObjPtr obj)
return virDomainObjUnref(obj);
}
-static int ATTRIBUTE_NONNULL(1)
+static int
qemuDomainObjEnterMonitorInternal(struct qemud_driver *driver,
bool driver_locked,
- virDomainObjPtr obj)
+ virDomainObjPtr obj,
+ enum qemuDomainAsyncJob asyncJob)
{
qemuDomainObjPrivatePtr priv = obj->privateData;
- if (priv->job.active == QEMU_JOB_NONE && priv->job.asyncJob) {
+ if (asyncJob != QEMU_ASYNC_JOB_NONE) {
+ if (asyncJob != priv->job.asyncJob) {
When we recover an async job after libvirtd restart, priv->job.asyncJob is
QEMU_ASYNC_JOB_NONE,
because we call qemuDomainObjRestoreJob() to reset priv->job in the function
qemuProcessReconnect().
+ qemuReportError(VIR_ERR_INTERNAL_ERROR,
+ _("unexpected async job %d"), asyncJob);
+ return -1;
+ }
if (qemuDomainObjBeginJobInternal(driver, driver_locked, obj,
QEMU_JOB_ASYNC_NESTED,
QEMU_ASYNC_JOB_NONE) < 0)
return -1;
if (!virDomainObjIsActive(obj)) {
qemuReportError(VIR_ERR_OPERATION_FAILED, "%s",
_("domain is no longer running"));
return -1;
}
if the domain is not active after calling qemuDomainObjBeginJobInternal(), we set
priv->job.active to QEMU_JOB_ASYNC_NESTED, but we do not clear it and notify the other
thread which is waiting on priv->job.cond.
@@ -930,15 +936,15 @@ qemuDomainObjExitMonitorInternal(struct
qemud_driver *driver,
*
* To be called immediately before any QEMU monitor API call
* Must have already either called qemuDomainObjBeginJob() and checked
- * that the VM is still active or called qemuDomainObjBeginAsyncJob, in which
- * case this will start a nested job.
+ * that the VM is still active; may not be used for nested async jobs.
*
* To be followed with qemuDomainObjExitMonitor() once complete
*/
<snip>
@@ -2424,7 +2430,8 @@ qemuProcessRecoverJob(struct qemud_driver
*driver,
reason == VIR_DOMAIN_PAUSED_SAVE) ||
reason == VIR_DOMAIN_PAUSED_UNKNOWN)) {
if (qemuProcessStartCPUs(driver, vm, conn,
- VIR_DOMAIN_RUNNING_UNPAUSED) < 0) {
+ VIR_DOMAIN_RUNNING_UNPAUSED,
+ job->asyncJob) < 0) {
As mentioned above, we set priv->job.asyncJob to QEMU_ASYNC_JOB_NONE, and
priv->job.active is QEMU_JOB_MODIFY. I think we can use QEMU_ASYNC_JOB_NONE
instead of job->asyncJob safely.
Thanks.
Wen Congyang
VIR_WARN("Could not resume domain %s
after", vm->def->name);
}
}
@@ -2974,6 +2981,10 @@ int qemuProcessStart(virConnectPtr conn,
goto cleanup;
}
+ /* Technically, qemuProcessStart can be called from inside
+ * QEMU_ASYNC_JOB_MIGRATION_IN, but we are okay treating this like
+ * a sync job since no other job can call into the domain until
+ * migration completes. */
VIR_DEBUG("Setting initial memory amount");
cur_balloon = vm->def->mem.cur_balloon;
ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm));
@@ -2987,7 +2998,8 @@ int qemuProcessStart(virConnectPtr conn,
VIR_DEBUG("Starting domain CPUs");
/* Allow the CPUS to start executing */
if (qemuProcessStartCPUs(driver, vm, conn,
- VIR_DOMAIN_RUNNING_BOOTED) < 0) {
+ VIR_DOMAIN_RUNNING_BOOTED,
+ QEMU_ASYNC_JOB_NONE) < 0) {
if (virGetLastError() == NULL)
qemuReportError(VIR_ERR_INTERNAL_ERROR,
"%s", _("resume operation
failed"));
diff --git a/src/qemu/qemu_process.h b/src/qemu/qemu_process.h
index 449d7f1..e9b910d 100644
--- a/src/qemu/qemu_process.h
+++ b/src/qemu/qemu_process.h
@@ -23,6 +23,7 @@
# define __QEMU_PROCESS_H__
# include "qemu_conf.h"
+# include "qemu_domain.h"
int qemuProcessPrepareMonitorChr(struct qemud_driver *driver,
virDomainChrSourceDefPtr monConfig,
@@ -31,10 +32,12 @@ int qemuProcessPrepareMonitorChr(struct qemud_driver *driver,
int qemuProcessStartCPUs(struct qemud_driver *driver,
virDomainObjPtr vm,
virConnectPtr conn,
- virDomainRunningReason reason);
+ virDomainRunningReason reason,
+ enum qemuDomainAsyncJob asyncJob);
int qemuProcessStopCPUs(struct qemud_driver *driver,
virDomainObjPtr vm,
- virDomainPausedReason reason);
+ virDomainPausedReason reason,
+ enum qemuDomainAsyncJob asyncJob);
void qemuProcessAutostartAll(struct qemud_driver *driver);
void qemuProcessReconnectAll(virConnectPtr conn, struct qemud_driver *driver);