[libvirt] [PATCH 1/2] qemu: Track job owner for better debugging

In case an API fails with "cannot acquire state change lock", searching for the API that possibly forgot to end its job is not always easy. Let's keep track of the job owner and print it out for easier identification. --- src/qemu/qemu_domain.c | 53 +++++++++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_domain.h | 4 +++ src/qemu/qemu_migration.c | 1 + src/qemu/qemu_process.c | 6 +--- 4 files changed, 60 insertions(+), 4 deletions(-) diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index da1f57f..7f1f8ee 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -147,6 +147,7 @@ qemuDomainObjResetJob(qemuDomainObjPrivatePtr priv) struct qemuDomainJobObj *job = &priv->job; job->active = QEMU_JOB_NONE; + job->owner = 0; } static void @@ -155,6 +156,7 @@ qemuDomainObjResetAsyncJob(qemuDomainObjPrivatePtr priv) struct qemuDomainJobObj *job = &priv->job; job->asyncJob = QEMU_ASYNC_JOB_NONE; + job->asyncOwner = 0; job->phase = 0; job->mask = DEFAULT_JOB_MASK; job->start = 0; @@ -169,13 +171,25 @@ qemuDomainObjRestoreJob(virDomainObjPtr obj, memset(job, 0, sizeof(*job)); job->active = priv->job.active; + job->owner = priv->job.owner; job->asyncJob = priv->job.asyncJob; + job->asyncOwner = priv->job.asyncOwner; job->phase = priv->job.phase; qemuDomainObjResetJob(priv); qemuDomainObjResetAsyncJob(priv); } +void +qemuDomainObjTransferJob(virDomainObjPtr obj) +{ + qemuDomainObjPrivatePtr priv = obj->privateData; + + VIR_DEBUG("Changing job owner from %d to %d", + priv->job.owner, virThreadSelfID()); + priv->job.owner = virThreadSelfID(); +} + static void qemuDomainObjFreeJob(qemuDomainObjPrivatePtr priv) { @@ -664,11 +678,23 @@ qemuDomainObjSetJobPhase(struct qemud_driver *driver, int phase) { qemuDomainObjPrivatePtr priv = obj->privateData; + int me = virThreadSelfID(); if (!priv->job.asyncJob) return; + VIR_DEBUG("Setting '%s' phase to '%s'", + qemuDomainAsyncJobTypeToString(priv->job.asyncJob), + qemuDomainAsyncJobPhaseToString(priv->job.asyncJob, phase)); + + if (priv->job.asyncOwner && me != priv->job.asyncOwner) { + VIR_WARN("'%s' async job is owned by thread %d", + qemuDomainAsyncJobTypeToString(priv->job.asyncJob), + priv->job.asyncOwner); + } + priv->job.phase = phase; + priv->job.asyncOwner = me; qemuDomainObjSaveJob(driver, obj); } @@ -695,6 +721,22 @@ qemuDomainObjDiscardAsyncJob(struct qemud_driver *driver, virDomainObjPtr obj) qemuDomainObjSaveJob(driver, obj); } +void +qemuDomainObjReleaseAsyncJob(virDomainObjPtr obj) +{ + qemuDomainObjPrivatePtr priv = obj->privateData; + + VIR_DEBUG("Releasing ownership of '%s' async job", + qemuDomainAsyncJobTypeToString(priv->job.asyncJob)); + + if (priv->job.asyncOwner != virThreadSelfID()) { + VIR_WARN("'%s' async job is owned by thread %d", + qemuDomainAsyncJobTypeToString(priv->job.asyncJob), + priv->job.asyncOwner); + } + priv->job.asyncOwner = 0; +} + static bool qemuDomainNestedJobAllowed(qemuDomainObjPrivatePtr priv, enum qemuDomainJob job) { @@ -764,11 +806,13 @@ retry: qemuDomainJobTypeToString(job), qemuDomainAsyncJobTypeToString(priv->job.asyncJob)); priv->job.active = job; + priv->job.owner = virThreadSelfID(); } else { VIR_DEBUG("Starting async job: %s", qemuDomainAsyncJobTypeToString(asyncJob)); qemuDomainObjResetAsyncJob(priv); priv->job.asyncJob = asyncJob; + priv->job.asyncOwner = virThreadSelfID(); priv->job.start = now; } @@ -784,6 +828,15 @@ retry: return 0; error: + VIR_WARN("Cannot start job (%s, %s) for domain %s;" + " current job is (%s, %s) owned by (%d, %d)", + qemuDomainJobTypeToString(job), + qemuDomainAsyncJobTypeToString(asyncJob), + obj->def->name, + qemuDomainJobTypeToString(priv->job.active), + qemuDomainAsyncJobTypeToString(priv->job.asyncJob), + priv->job.owner, priv->job.asyncOwner); + if (errno == ETIMEDOUT) qemuReportError(VIR_ERR_OPERATION_TIMEOUT, "%s", _("cannot acquire state change lock")); diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index b3eecd3..ce52569 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -96,9 +96,11 @@ VIR_ENUM_DECL(qemuDomainAsyncJob) struct qemuDomainJobObj { virCond cond; /* Use to coordinate jobs */ enum qemuDomainJob active; /* Currently running job */ + int owner; /* Thread which set current job */ virCond asyncCond; /* Use to coordinate with async jobs */ enum qemuDomainAsyncJob asyncJob; /* Currently active async job */ + int asyncOwner; /* Thread which set current async job */ int phase; /* Job phase (mainly for migrations) */ unsigned long long mask; /* Jobs allowed during async job */ unsigned long long start; /* When the async job started */ @@ -203,8 +205,10 @@ void qemuDomainObjSetAsyncJobMask(virDomainObjPtr obj, unsigned long long allowedJobs); void qemuDomainObjRestoreJob(virDomainObjPtr obj, struct qemuDomainJobObj *job); +void qemuDomainObjTransferJob(virDomainObjPtr obj); void qemuDomainObjDiscardAsyncJob(struct qemud_driver *driver, virDomainObjPtr obj); +void qemuDomainObjReleaseAsyncJob(virDomainObjPtr obj); void qemuDomainObjEnterMonitor(struct qemud_driver *driver, virDomainObjPtr obj) diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index ff6f273..c1bb93a 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -3242,6 +3242,7 @@ qemuMigrationJobStartPhase(struct qemud_driver *driver, int qemuMigrationJobContinue(virDomainObjPtr vm) { + qemuDomainObjReleaseAsyncJob(vm); return virDomainObjUnref(vm); } diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 96f39e8..19569cf 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3038,8 +3038,8 @@ qemuProcessReconnect(void *opaque) priv = obj->privateData; - /* Set fake job so that EnterMonitor* doesn't want to start a new one */ - priv->job.active = QEMU_JOB_MODIFY; + /* Job was started by the caller for us */ + qemuDomainObjTransferJob(obj); /* Hold an extra reference because we can't allow 'vm' to be * deleted if qemuConnectMonitor() failed */ @@ -3119,8 +3119,6 @@ qemuProcessReconnect(void *opaque) if (qemuProcessRecoverJob(driver, obj, conn, &oldjob) < 0) goto error; - priv->job.active = QEMU_JOB_NONE; - /* update domain state XML with possibly updated state in virDomainObj */ if (virDomainSaveStatus(driver->caps, driver->stateDir, obj) < 0) goto error; -- 1.7.8.5

qemuDomainObjEnterMonitor{,WithDriver} should not be called from async jobs, only EnterMonitorAsync variant is allowed. --- src/qemu/qemu_domain.c | 6 ++++++ 1 files changed, 6 insertions(+), 0 deletions(-) diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index 7f1f8ee..4dda2e0 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -975,6 +975,9 @@ qemuDomainObjEnterMonitorInternal(struct qemud_driver *driver, _("unexpected async job %d"), asyncJob); return -1; } + if (priv->job.asyncOwner != virThreadSelfID()) + VIR_WARN("This thread doesn't seem to be the async job owner: %d", + priv->job.asyncOwner); if (qemuDomainObjBeginJobInternal(driver, driver_locked, obj, QEMU_JOB_ASYNC_NESTED, QEMU_ASYNC_JOB_NONE) < 0) @@ -986,6 +989,9 @@ qemuDomainObjEnterMonitorInternal(struct qemud_driver *driver, ignore_value(qemuDomainObjEndJob(driver, obj)); return -1; } + } else if (priv->job.asyncOwner == virThreadSelfID()) { + VIR_WARN("This thread seems to be the async job owner; entering" + " monitor without asking for a nested job is dangerous"); } qemuMonitorLock(priv->mon); -- 1.7.8.5

On 04/10/2012 09:05 AM, Jiri Denemark wrote:
qemuDomainObjEnterMonitor{,WithDriver} should not be called from async jobs, only EnterMonitorAsync variant is allowed. --- src/qemu/qemu_domain.c | 6 ++++++ 1 files changed, 6 insertions(+), 0 deletions(-)
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index 7f1f8ee..4dda2e0 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -975,6 +975,9 @@ qemuDomainObjEnterMonitorInternal(struct qemud_driver *driver, _("unexpected async job %d"), asyncJob); return -1; } + if (priv->job.asyncOwner != virThreadSelfID()) + VIR_WARN("This thread doesn't seem to be the async job owner: %d", + priv->job.asyncOwner);
virThreadSelfID() is not bullet-proof (it returns int, but there are platforms with 64-bit thread ids); but it is adequate for our purposes (you only check it for issuing a warning, rather than actually erroring out, and on our common development platform of Linux, it does the right thing). ACK series. -- Eric Blake eblake@redhat.com +1-919-301-3266 Libvirt virtualization library http://libvirt.org
participants (2)
-
Eric Blake
-
Jiri Denemark