
At 07/29/2011 07:47 AM, Eric Blake wrote:
Currently, we attempt to run a sync job and an async job at the same time, which means that the monitor commands for the two jobs can be run in any order.

In the function qemuDomainObjEnterMonitorInternal():

    if (priv->job.active == QEMU_JOB_NONE && priv->job.asyncJob) {
        if (qemuDomainObjBeginNestedJob(driver, obj) < 0)

we check whether the caller is an async job by looking at priv->job.active and priv->job.asyncJob. But when an async job is running and a sync job is also running at the time of the check, priv->job.active is not QEMU_JOB_NONE. So we cannot detect an async-job caller inside qemuDomainObjEnterMonitorInternal(), and must instead put the burden on the caller to tell us when an async command wants to do a nested job.
* src/qemu/THREADS.txt: Reflect new rules.
* src/qemu/qemu_domain.h (qemuDomainObjEnterMonitorAsync): New prototype.
* src/qemu/qemu_process.h (qemuProcessStartCPUs)
  (qemuProcessStopCPUs): Add parameter.
* src/qemu/qemu_migration.h (qemuMigrationToFile): Likewise.
  (qemuMigrationWaitForCompletion): Make static.
* src/qemu/qemu_domain.c (qemuDomainObjEnterMonitorInternal): Add parameter.
  (qemuDomainObjEnterMonitorAsync): New function.
  (qemuDomainObjEnterMonitor, qemuDomainObjEnterMonitorWithDriver): Update
  callers.
* src/qemu/qemu_driver.c (qemuDomainSaveInternal)
  (qemudDomainCoreDump, doCoreDump, processWatchdogEvent)
  (qemudDomainSuspend, qemudDomainResume, qemuDomainSaveImageStartVM)
  (qemuDomainSnapshotCreateActive, qemuDomainRevertToSnapshot): Likewise.
* src/qemu/qemu_process.c (qemuProcessStopCPUs)
  (qemuProcessFakeReboot, qemuProcessRecoverMigration)
  (qemuProcessRecoverJob, qemuProcessStart): Likewise.
* src/qemu/qemu_migration.c (qemuMigrationToFile)
  (qemuMigrationWaitForCompletion, qemuMigrationUpdateJobStatus)
  (qemuMigrationJobStart, qemuDomainMigrateGraphicsRelocate)
  (doNativeMigrate, doTunnelMigrate, qemuMigrationPerformJob)
  (qemuMigrationPerformPhase, qemuMigrationFinish)
  (qemuMigrationConfirm): Likewise.
---
My initial smoke testing shows that this fixes 'virsh managedsave', but I still have more testing to do before I'm convinced I got everything (for example, I need to test migration still).
I tested this patch with a save from virt-manager, and found that it causes libvirt to deadlock. With this patch we can ignore the return value of qemuDomainObjEnterMonitor(WithDriver), because those two functions always return 0, but we cannot ignore the return value of qemuDomainObjEnterMonitorAsync(). If qemuDomainObjEnterMonitorAsync() fails, we do nothing in this function, so it is very dangerous to call qemuDomainObjExitMonitorWithDriver() after qemuDomainObjEnterMonitorAsync() has failed. I think this problem already existed before this patch.
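To illustrate the point, here is a caller-side sketch of the pattern in question; the async job constant and the monitor command are placeholders I picked for the example, not call sites taken from the patch:

    if (qemuDomainObjEnterMonitorAsync(driver, vm, QEMU_ASYNC_JOB_SAVE) < 0) {
        /* Entering the monitor failed: the return value must be checked,
         * and the matching qemuDomainObjExitMonitorWithDriver() must be
         * skipped on this path, otherwise the job state is left
         * unbalanced and later jobs block on priv->job.cond. */
        goto endjob;
    }
    rc = qemuMonitorSomeCommand(priv->mon);   /* placeholder monitor call */
    qemuDomainObjExitMonitorWithDriver(driver, vm);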
 src/qemu/THREADS.txt      |    8 ++++--
 src/qemu/qemu_domain.c    |   43 +++++++++++++++++++++++++++-------
 src/qemu/qemu_domain.h    |    4 +++
 src/qemu/qemu_driver.c    |   39 +++++++++++++++++++++----------
 src/qemu/qemu_migration.c |   55 ++++++++++++++++++++++++++++----------------
 src/qemu/qemu_migration.h |    5 +--
 src/qemu/qemu_process.c   |   32 ++++++++++++++++++--------
 src/qemu/qemu_process.h   |    7 ++++-
 8 files changed, 133 insertions(+), 60 deletions(-)
diff --git a/src/qemu/THREADS.txt b/src/qemu/THREADS.txt
index e73076c..125bd5d 100644
--- a/src/qemu/THREADS.txt
+++ b/src/qemu/THREADS.txt
@@ -374,7 +374,7 @@ Design patterns
      qemuDriverUnlock(driver);
- * Running asynchronous job
+ * Running asynchronous job with driver lock held
     virDomainObjPtr obj;
     qemuDomainObjPrivatePtr priv;
@@ -387,7 +387,8 @@ Design patterns
...do prep work...
-    if (qemuDomainObjEnterMonitorWithDriver(driver, obj) < 0) {
+    if (qemuDomainObjEnterMonitorAsync(driver, obj,
+                                       QEMU_ASYNC_JOB_TYPE) < 0) {
         /* domain died in the meantime */
         goto error;
     }
@@ -395,7 +396,8 @@ Design patterns
     qemuDomainObjExitMonitorWithDriver(driver, obj);
     while (!finished) {
-        if (qemuDomainObjEnterMonitorWithDriver(driver, obj) < 0) {
+        if (qemuDomainObjEnterMonitorAsync(driver, true, obj,
+                                           QEMU_ASYNC_JOB_TYPE) < 0) {
             /* domain died in the meantime */
             goto error;
         }
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index 2eaaf3a..4cf6888 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -863,14 +863,20 @@ qemuDomainObjEndAsyncJob(struct qemud_driver *driver, virDomainObjPtr obj)
     return virDomainObjUnref(obj);
 }
-static int ATTRIBUTE_NONNULL(1)
+static int
 qemuDomainObjEnterMonitorInternal(struct qemud_driver *driver,
                                   bool driver_locked,
-                                  virDomainObjPtr obj)
+                                  virDomainObjPtr obj,
+                                  enum qemuDomainAsyncJob asyncJob)
 {
     qemuDomainObjPrivatePtr priv = obj->privateData;
-    if (priv->job.active == QEMU_JOB_NONE && priv->job.asyncJob) {
+    if (asyncJob != QEMU_ASYNC_JOB_NONE) {
+        if (asyncJob != priv->job.asyncJob) {
When we recover an async job after libvirtd restarts, priv->job.asyncJob is QEMU_ASYNC_JOB_NONE, because qemuProcessReconnect() calls qemuDomainObjRestoreJob() to reset priv->job (see the sketch after this quoted hunk).
+            qemuReportError(VIR_ERR_INTERNAL_ERROR,
+                            _("unepxected async job %d"), asyncJob);
+            return -1;
+        }
         if (qemuDomainObjBeginJobInternal(driver, driver_locked, obj,
                                           QEMU_JOB_ASYNC_NESTED,
                                           QEMU_ASYNC_JOB_NONE) < 0)
             return -1;
         if (!virDomainObjIsActive(obj)) {
             qemuReportError(VIR_ERR_OPERATION_FAILED, "%s",
                             _("domain is no longer running"));
             return -1;
         }
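To make the recovery scenario above concrete, this is roughly the flow I am describing (a simplified sketch, not a literal excerpt; the struct name and the exact signatures are from memory and may not match the sources):

    /* qemuProcessReconnect(): save the interrupted job, reset priv->job */
    struct qemuDomainJobObj oldjob;
    qemuDomainObjRestoreJob(vm, &oldjob);
    /* priv->job.asyncJob is now QEMU_ASYNC_JOB_NONE */

    /* qemuProcessRecoverJob() later passes the *saved* async job down */
    qemuProcessStartCPUs(driver, vm, conn, VIR_DOMAIN_RUNNING_UNPAUSED,
                         oldjob.asyncJob);
    /* ...so qemuDomainObjEnterMonitorAsync() sees
     * asyncJob != priv->job.asyncJob and fails with "unexpected async job" */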
If the domain is no longer active after qemuDomainObjBeginJobInternal() returns, priv->job.active has been set to QEMU_JOB_ASYNC_NESTED, but we return without clearing it and without notifying the other threads waiting on priv->job.cond.
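A minimal sketch of the kind of cleanup this error path would need; the exact helper used to end the nested job is my assumption, not part of the posted patch:

        if (!virDomainObjIsActive(obj)) {
            qemuReportError(VIR_ERR_OPERATION_FAILED, "%s",
                            _("domain is no longer running"));
            /* assumed fix: end the nested job so priv->job.active returns
             * to QEMU_JOB_NONE and threads waiting on priv->job.cond wake */
            qemuDomainObjEndJob(driver, obj);
            return -1;
        }

Without something along these lines, a later qemuDomainObjBeginJob() waits forever, which would explain the deadlock I saw with the virt-manager save.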
@@ -930,15 +936,15 @@ qemuDomainObjExitMonitorInternal(struct qemud_driver *driver,
  *
  * To be called immediately before any QEMU monitor API call
  * Must have already either called qemuDomainObjBeginJob() and checked
- * that the VM is still active or called qemuDomainObjBeginAsyncJob, in which
- * case this will start a nested job.
+ * that the VM is still active; may not be used for nested async jobs.
  *
  * To be followed with qemuDomainObjExitMonitor() once complete
  */
<snip>
@@ -2424,7 +2430,8 @@ qemuProcessRecoverJob(struct qemud_driver *driver,
                   reason == VIR_DOMAIN_PAUSED_SAVE) ||
                  reason == VIR_DOMAIN_PAUSED_UNKNOWN)) {
             if (qemuProcessStartCPUs(driver, vm, conn,
-                                     VIR_DOMAIN_RUNNING_UNPAUSED) < 0) {
+                                     VIR_DOMAIN_RUNNING_UNPAUSED,
+                                     job->asyncJob) < 0) {
As mentioned above, priv->job.asyncJob has already been reset to QEMU_ASYNC_JOB_NONE at this point, and priv->job.active is QEMU_JOB_MODIFY, so I think we can safely pass QEMU_ASYNC_JOB_NONE here instead of job->asyncJob.

Thanks.
Wen Congyang
VIR_WARN("Could not resume domain %s after", vm->def->name); } } @@ -2974,6 +2981,10 @@ int qemuProcessStart(virConnectPtr conn, goto cleanup; }
+    /* Technically, qemuProcessStart can be called from inside
+     * QEMU_ASYNC_JOB_MIGRATION_IN, but we are okay treating this like
+     * a sync job since no other job can call into the domain until
+     * migration completes. */
     VIR_DEBUG("Setting initial memory amount");
     cur_balloon = vm->def->mem.cur_balloon;
     ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm));
@@ -2987,7 +2998,8 @@ int qemuProcessStart(virConnectPtr conn,
     VIR_DEBUG("Starting domain CPUs");
     /* Allow the CPUS to start executing */
     if (qemuProcessStartCPUs(driver, vm, conn,
-                             VIR_DOMAIN_RUNNING_BOOTED) < 0) {
+                             VIR_DOMAIN_RUNNING_BOOTED,
+                             QEMU_ASYNC_JOB_NONE) < 0) {
         if (virGetLastError() == NULL)
             qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                             _("resume operation failed"));
diff --git a/src/qemu/qemu_process.h b/src/qemu/qemu_process.h
index 449d7f1..e9b910d 100644
--- a/src/qemu/qemu_process.h
+++ b/src/qemu/qemu_process.h
@@ -23,6 +23,7 @@
 # define __QEMU_PROCESS_H__
# include "qemu_conf.h" +# include "qemu_domain.h"
 int qemuProcessPrepareMonitorChr(struct qemud_driver *driver,
                                  virDomainChrSourceDefPtr monConfig,
@@ -31,10 +32,12 @@ int qemuProcessPrepareMonitorChr(struct qemud_driver *driver,
 int qemuProcessStartCPUs(struct qemud_driver *driver,
                          virDomainObjPtr vm,
                          virConnectPtr conn,
-                         virDomainRunningReason reason);
+                         virDomainRunningReason reason,
+                         enum qemuDomainAsyncJob asyncJob);
 int qemuProcessStopCPUs(struct qemud_driver *driver,
                         virDomainObjPtr vm,
-                        virDomainPausedReason reason);
+                        virDomainPausedReason reason,
+                        enum qemuDomainAsyncJob asyncJob);
 void qemuProcessAutostartAll(struct qemud_driver *driver);
 void qemuProcessReconnectAll(virConnectPtr conn, struct qemud_driver *driver);