On 12/05/14 15:03, Martin Kletzander wrote:
There is one problem that causes various errors in the daemon. When
domain is waiting for a job, it is unlocked while waiting on the
condition. However, if that domain is for example transient and being
removed in another API (e.g. cancelling incoming migration), it get's
unref'd. If the first call, that was waiting, fails to get the job, it
unref's the domain object, and because it was the last reference, it
causes clearing of the whole domain object. However, when finishing the
call, the domain must be unlocked, but there is no way for the API to
know whether it was cleaned or not (unless there is some ugly temporary
variable, but let's scratch that).
The root cause is that our APIs don't ref the objects they are using and
all use the implicit reference that the object has when it is in the
domain list. That reference can be removed when the API is waiting for
a job. And because each domain doesn't do its ref'ing, it results in
the ugly checking of the return value of virObjectUnref() that we have
everywhere.
This patch changes qemuDomObjFromDomain() to ref the domain (using
virDomainObjListFindByUUIDRef()) and adds qemuDomObjEndAPI() which
should be the only function in which the return value of
virObjectUnref() is checked. This makes all reference counting
deterministic and makes the code a bit clearer.
Signed-off-by: Martin Kletzander <mkletzan(a)redhat.com>
---
src/qemu/THREADS.txt | 40 ++-
src/qemu/qemu_domain.c | 29 +-
src/qemu/qemu_domain.h | 12 +-
src/qemu/qemu_driver.c | 708 ++++++++++++++++------------------------------
src/qemu/qemu_migration.c | 108 +++----
src/qemu/qemu_migration.h | 10 +-
src/qemu/qemu_process.c | 87 +++---
7 files changed, 359 insertions(+), 635 deletions(-)
diff --git a/src/qemu/THREADS.txt b/src/qemu/THREADS.txt
index 50a0cf9..53a3415 100644
--- a/src/qemu/THREADS.txt
+++ b/src/qemu/THREADS.txt
...
@@ -109,7 +117,6 @@ To lock the virDomainObjPtr
To acquire the normal job condition
qemuDomainObjBeginJob()
*
- - Increments ref count on virDomainObjPtr
- Waits until the job is compatible with current async job or no
async job is running
- Waits for job.cond condition 'job.active != 0' using virDomainObjPtr
@@ -122,14 +129,12 @@ To acquire the normal job condition
qemuDomainObjEndJob()
- Sets job.active to 0
- Signals on job.cond condition
- - Decrements ref count on virDomainObjPtr
To acquire the asynchronous job condition
qemuDomainObjBeginAsyncJob()
- - Increments ref count on virDomainObjPtr
Should we mention that having a ref already is pre-requisite for calling
Begin*Job?
- Waits until no async job is running
- Waits for job.cond condition 'job.active != 0' using virDomainObjPtr
mutex
...
@@ -179,12 +183,10 @@ To acquire the QEMU monitor lock as part of an
asynchronous job
To keep a domain alive while waiting on a remote command
qemuDomainObjEnterRemote()
Same here ?
- - Increments ref count on virDomainObjPtr
- Releases the virDomainObjPtr lock
qemuDomainObjExitRemote()
- Acquires the virDomainObjPtr lock
- - Decrements ref count on virDomainObjPtr
Design patterns
@@ -195,18 +197,18 @@ Design patterns
virDomainObjPtr obj;
- obj = virDomainFindByUUID(driver->domains, dom->uuid);
+ obj = qemuDomObjFromDomain(dom);
...do work...
- virDomainObjUnlock(obj);
+ qemuDomObjEndAPI(obj);
&obj to match the code.
* Updating something directly to do with a virDomainObjPtr
virDomainObjPtr obj;
- obj = virDomainFindByUUID(driver->domains, dom->uuid);
+ obj = qemuDomObjFromDomain(dom);
qemuDomainObjBeginJob(obj, QEMU_JOB_TYPE);
@@ -214,18 +216,15 @@ Design patterns
qemuDomainObjEndJob(obj);
- virDomainObjUnlock(obj);
-
-
+ qemuDomObjEndAPI(obj);
&obj ...
* Invoking a monitor command on a virDomainObjPtr
-
virDomainObjPtr obj;
qemuDomainObjPrivatePtr priv;
- obj = virDomainFindByUUID(driver->domains, dom->uuid);
+ obj = qemuDomObjFromDomain(dom);
qemuDomainObjBeginJob(obj, QEMU_JOB_TYPE);
@@ -240,8 +239,7 @@ Design patterns
...do final work...
qemuDomainObjEndJob(obj);
- virDomainObjUnlock(obj);
-
+ qemuDomObjEndAPI(obj);
&obj ...
* Running asynchronous job
@@ -249,7 +247,7 @@ Design patterns
virDomainObjPtr obj;
qemuDomainObjPrivatePtr priv;
- obj = virDomainFindByUUID(driver->domains, dom->uuid);
+ obj = qemuDomObjFromDomain(dom);
qemuDomainObjBeginAsyncJob(obj, QEMU_ASYNC_JOB_TYPE);
qemuDomainObjSetAsyncJobMask(obj, allowedJobs);
@@ -281,7 +279,7 @@ Design patterns
...do final work...
qemuDomainObjEndAsyncJob(obj);
- virDomainObjUnlock(obj);
+ qemuDomObjEndAPI(obj);
&obj ...
* Coordinating with a remote server for migration
@@ -289,7 +287,7 @@ Design patterns
virDomainObjPtr obj;
qemuDomainObjPrivatePtr priv;
- obj = virDomainFindByUUID(driver->domains, dom->uuid);
+ obj = qemuDomObjFromDomain(dom);
qemuDomainObjBeginAsyncJob(obj, QEMU_ASYNC_JOB_TYPE);
@@ -309,4 +307,4 @@ Design patterns
...do final work...
qemuDomainObjEndAsyncJob(obj);
- virDomainObjUnlock(obj);
+ qemuDomObjEndAPI(obj);
&obj ...
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index 220304f..d4a6021 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -1318,8 +1318,6 @@ qemuDomainObjBeginJobInternal(virQEMUDriverPtr driver,
priv->jobs_queued++;
then = now + QEMU_JOB_WAIT_TIME;
- virObjectRef(obj);
-
retry:
if (cfg->maxQueuedJobs &&
priv->jobs_queued > cfg->maxQueuedJobs) {
@@ -1398,7 +1396,6 @@ qemuDomainObjBeginJobInternal(virQEMUDriverPtr driver,
cleanup:
priv->jobs_queued--;
- virObjectUnref(obj);
virObjectUnref(cfg);
return ret;
}
@@ -1467,7 +1464,8 @@ qemuDomainObjBeginNestedJob(virQEMUDriverPtr driver,
* Returns true if @obj was still referenced, false if it was
* disposed of.
Comment is no longer valid, also mentioning that caller should hold a
reference would be helpful.
*/
-bool qemuDomainObjEndJob(virQEMUDriverPtr driver, virDomainObjPtr obj)
+void
+qemuDomainObjEndJob(virQEMUDriverPtr driver, virDomainObjPtr obj)
{
qemuDomainObjPrivatePtr priv = obj->privateData;
qemuDomainJob job = priv->job.active;
...
@@ -1541,7 +1535,7 @@
qemuDomainObjEnterMonitorInternal(virQEMUDriverPtr driver,
virReportError(VIR_ERR_OPERATION_FAILED, "%s",
_("domain is no longer running"));
/* Still referenced by the containing async job. */
This comment is no longer valid.
- ignore_value(qemuDomainObjEndJob(driver, obj));
+ qemuDomainObjEndJob(driver, obj);
return -1;
}
} else if (priv->job.asyncOwner == virThreadSelfID()) {
...
@@ -2795,3 +2786,11 @@
qemuDomainAgentAvailable(qemuDomainObjPrivatePtr priv,
}
return true;
}
+
Adding some docs what this exactly does would be cool.
+void
+qemuDomObjEndAPI(virDomainObjPtr *vm)
+{
+ virObjectUnlock(*vm);
+ virObjectUnref(*vm);
+ *vm = NULL;
+}
[...]
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
index 9152cf5..2e0b7fc 100644
--- a/src/qemu/qemu_driver.c
+++ b/src/qemu/qemu_driver.c
@@ -205,11 +205,11 @@ struct qemuAutostartData {
* qemuDomObjFromDomain:
* @domain: Domain pointer that has to be looked up
*
- * This function looks up @domain and returns the appropriate
- * virDomainObjPtr.
+ * This function looks up @domain and returns the appropriate virDomainObjPtr
+ * that has to be cleaned by calling qemuDomObjEndAPI().
s/cleaned/released/ ?
*
- * Returns the domain object which is locked on success, NULL
- * otherwise.
+ * Returns the domain object with incremented reference counter which is locked
+ * on success, NULL otherwise.
*/
static virDomainObjPtr
qemuDomObjFromDomain(virDomainPtr domain)
[...]
@@ -1446,7 +1445,7 @@ static virDomainPtr
qemuDomainLookupByID(virConnectPtr conn,
virDomainObjPtr vm;
virDomainPtr dom = NULL;
- vm = virDomainObjListFindByID(driver->domains, id);
+ vm = virDomainObjListFindByID(driver->domains, id);
Unrelated whitespace change ..
if (!vm) {
virReportError(VIR_ERR_NO_DOMAIN,
@@ -1461,8 +1460,7 @@ static virDomainPtr qemuDomainLookupByID(virConnectPtr conn,
if (dom) dom->id = vm->def->id;
cleanup:
- if (vm)
- virObjectUnlock(vm);
and possible crash. If virDomainObjListFindByID fails, the code jumps to
the cleanup label with "vm == NULL".
Both changes also don't do anything with the reference count and thus
should be dropped from this patch as a separate cleanup.
Additional changes would be required in case you want to make the API
more robust by adding a similar "ref then lock" scheme also for the ID
lookup func. Without that change, this leaves a vector that will allow
to lock up the domains hash in case a VM is unresponsive.
+ virObjectUnlock(vm);
return dom;
}
@@ -1473,7 +1471,7 @@ static virDomainPtr qemuDomainLookupByUUID(virConnectPtr conn,
virDomainObjPtr vm;
virDomainPtr dom = NULL;
- vm = virDomainObjListFindByUUID(driver->domains, uuid);
+ vm = virDomainObjListFindByUUIDRef(driver->domains, uuid);
if (!vm) {
char uuidstr[VIR_UUID_STRING_BUFLEN];
@@ -1490,8 +1488,7 @@ static virDomainPtr qemuDomainLookupByUUID(virConnectPtr conn,
if (dom) dom->id = vm->def->id;
cleanup:
- if (vm)
- virObjectUnlock(vm);
+ qemuDomObjEndAPI(&vm);
Control flow allows to reach this part with "vm == NULL" which would
induce a crash.
return dom;
}
@@ -1517,8 +1514,7 @@ static virDomainPtr qemuDomainLookupByName(virConnectPtr conn,
if (dom) dom->id = vm->def->id;
cleanup:
- if (vm)
- virObjectUnlock(vm);
+ virObjectUnlock(vm);
Aaand the same as above ;)
return dom;
}
@@ -1537,8 +1533,7 @@ static int qemuDomainIsActive(virDomainPtr dom)
ret = virDomainObjIsActive(obj);
cleanup:
- if (obj)
- virObjectUnlock(obj);
+ qemuDomObjEndAPI(&obj);
Same as above. if obj is NULL we jump here.
return ret;
}
@@ -1556,8 +1551,7 @@ static int qemuDomainIsPersistent(virDomainPtr dom)
ret = obj->persistent;
cleanup:
- if (obj)
- virObjectUnlock(obj);
+ qemuDomObjEndAPI(&obj);
Ditto.
return ret;
}
@@ -1575,8 +1569,7 @@ static int qemuDomainIsUpdated(virDomainPtr dom)
ret = obj->updated;
cleanup:
- if (obj)
- virObjectUnlock(obj);
+ qemuDomObjEndAPI(&obj);
Ditto.
return ret;
}
@@ -1717,12 +1710,11 @@ static virDomainPtr qemuDomainCreateXML(virConnectPtr conn,
VIR_DOMAIN_OBJ_LIST_ADD_CHECK_LIVE,
NULL)))
goto cleanup;
-
+ virObjectRef(vm);
def = NULL;
if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_MODIFY) < 0) {
qemuDomainRemoveInactive(driver, vm);
- vm = NULL;
goto cleanup;
}
@@ -1731,9 +1723,8 @@ static virDomainPtr qemuDomainCreateXML(virConnectPtr conn,
VIR_NETDEV_VPORT_PROFILE_OP_CREATE,
start_flags) < 0) {
virDomainAuditStart(vm, "booted", false);
- if (qemuDomainObjEndJob(driver, vm))
- qemuDomainRemoveInactive(driver, vm);
- vm = NULL;
+ qemuDomainObjEndJob(driver, vm);
+ qemuDomainRemoveInactive(driver, vm);
goto cleanup;
}
@@ -1756,13 +1747,11 @@ static virDomainPtr qemuDomainCreateXML(virConnectPtr conn,
if (dom)
dom->id = vm->def->id;
- if (!qemuDomainObjEndJob(driver, vm))
- vm = NULL;
+ qemuDomainObjEndJob(driver, vm);
cleanup:
virDomainDefFree(def);
- if (vm)
- virObjectUnlock(vm);
+ qemuDomObjEndAPI(&vm);
if (event) {
qemuDomainEventQueue(driver, event);
if (event2)
@@ -1842,12 +1831,10 @@ static int qemuDomainSuspend(virDomainPtr dom)
ret = 0;
endjob:
- if (!qemuDomainObjEndJob(driver, vm))
- vm = NULL;
+ qemuDomainObjEndJob(driver, vm);
cleanup:
- if (vm)
- virObjectUnlock(vm);
+ qemuDomObjEndAPI(&vm);
Aaand same here. I'm starting to think that
void
qemuDomObjEndAPI(virDomainObjPtr *vm)
{
virObjectUnlock(*vm);
virObjectUnref(*vm);
*vm = NULL;
}
should look more like:
void
qemuDomObjEndAPI(virDomainObjPtr *vm)
{
if (!*vm)
return;
virObjectUnlock(*vm);
virObjectUnref(*vm);
*vm = NULL;
}
I'm stopping to report this issue assuming the tweak above.
if (event)
qemuDomainEventQueue(driver, event);
[...]
@@ -3333,11 +3282,9 @@ qemuDomainSaveFlags(virDomainPtr dom, const
char *path, const char *dxml,
ret = qemuDomainSaveInternal(driver, dom, vm, path, compressed,
dxml, flags);
This function consumes @vm with it's reference and lock, thus ...
- vm = NULL;
... you cannot remove this line.
cleanup:
- if (vm)
- virObjectUnlock(vm);
+ qemuDomObjEndAPI(&vm);
virObjectUnref(cfg);
return ret;
}
@@ -3416,15 +3363,13 @@ qemuDomainManagedSave(virDomainPtr dom, unsigned int flags)
VIR_INFO("Saving state of domain '%s' to '%s'",
vm->def->name, name);
- if ((ret = qemuDomainSaveInternal(driver, dom, vm, name, compressed,
- NULL, flags)) == 0)
+ ret = qemuDomainSaveInternal(driver, dom, vm, name,
+ compressed, NULL, flags);
Same as above. qemuDomainSaveInternal consumes VM ...
> + if (ret == 0)
> vm->hasManagedSave = true;
>
- vm = NULL;
... so this needs to stay in the code.
-
cleanup:
- if (vm)
- virObjectUnlock(vm);
+ qemuDomObjEndAPI(&vm);
VIR_FREE(name);
virObjectUnref(cfg);
[...]
@@ -6744,7 +6647,7 @@ static virDomainPtr
qemuDomainDefineXML(virConnectPtr conn, const char *xml)
driver->xmlopt,
0, &oldDef)))
goto cleanup;
-
Don't remove the empty line for clarity.
+ virObjectRef(vm);
def = NULL;
if (virDomainHasDiskMirror(vm)) {
virReportError(VIR_ERR_BLOCK_COPY_ACTIVE, "%s",
[...]
| Migrations |
--------------
qemuDomainMigratePerform uses qemuDomObjFromDomain but doesn't unlock
the object if ACL check fails. I'll post a patch for that. You'll need
to rebase on top of that.
qemuDomainMigratePerform3Params does unlock on the ACL check, but you
forgot to tweak the check.
@@ -12638,8 +12491,7 @@ qemuDomainMigrateGetMaxSpeed(virDomainPtr
dom,
ret = 0;
cleanup:
- if (vm)
- virObjectUnlock(vm);
+ qemuDomObjEndAPI(&vm);
return ret;
}
Uff, I'm not able to finish this review, so I'm sending this as a part 1
and will continue tomorrow.
Peter