On Thu, Apr 07, 2011 at 10:33:03AM +0100, Daniel P. Berrange wrote:
On Wed, Apr 06, 2011 at 03:19:51PM +0800, Hu Tao wrote:
> This patch also eliminates a dead-lock bug in
> qemuDomainObjBeginJobWithDriver: if virCondWaitUntil() timeouts, the
> thread tries to acquire qemu driver lock while holding virDomainObj
> lock.
> ---
> src/conf/domain_conf.c | 56 ++++----
> src/conf/domain_conf.h | 6 +-
> src/openvz/openvz_conf.c | 8 +-
> src/qemu/qemu_domain.c | 32 ++---
> src/qemu/qemu_domain.h | 2 +-
> src/qemu/qemu_driver.c | 304 ++++++++++++++++++++-------------------------
> src/qemu/qemu_migration.c | 45 +++----
> src/qemu/qemu_process.c | 33 ++---
> src/vmware/vmware_conf.c | 2 +-
> 9 files changed, 215 insertions(+), 273 deletions(-)
>
> diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
> index 90a1317..fc76a00 100644
> --- a/src/conf/domain_conf.c
> +++ b/src/conf/domain_conf.c
> @@ -47,6 +47,7 @@
> #include "storage_file.h"
> #include "files.h"
> #include "bitmap.h"
> +#include "virobject.h"
>
> #define VIR_FROM_THIS VIR_FROM_DOMAIN
>
> @@ -395,9 +396,7 @@ static void
> virDomainObjListDataFree(void *payload, const void *name ATTRIBUTE_UNUSED)
> {
> virDomainObjPtr obj = payload;
> - virDomainObjLock(obj);
> - if (virDomainObjUnref(obj) > 0)
> - virDomainObjUnlock(obj);
> + virDomainObjUnref(obj);
> }
>
> int virDomainObjListInit(virDomainObjListPtr doms)
> @@ -437,7 +436,7 @@ virDomainObjPtr virDomainFindByID(const virDomainObjListPtr
doms,
> virDomainObjPtr obj;
> obj = virHashSearch(doms->objs, virDomainObjListSearchID, &id);
> if (obj)
> - virDomainObjLock(obj);
> + virDomainObjRef(obj);
> return obj;
> }
>
> @@ -452,7 +451,7 @@ virDomainObjPtr virDomainFindByUUID(const virDomainObjListPtr
doms,
>
> obj = virHashLookup(doms->objs, uuidstr);
> if (obj)
> - virDomainObjLock(obj);
> + virDomainObjRef(obj);
> return obj;
> }
>
> @@ -476,7 +475,7 @@ virDomainObjPtr virDomainFindByName(const virDomainObjListPtr
doms,
> virDomainObjPtr obj;
> obj = virHashSearch(doms->objs, virDomainObjListSearchName, name);
> if (obj)
> - virDomainObjLock(obj);
> + virDomainObjRef(obj);
> return obj;
> }
This is a major change in semantics, which makes pretty much
every single caller non-thread safe, unless the callers are
all changed todo virDomainObjLock immediately after calling
this. So I don't really see the point in this - it just means
more code duplication.
If we do this:
obj = virHashSearch(doms->objs, virDomainObjListSearchName, name);
if (obj)
virDomainObjLock(obj);
return obj;
And at the meantime another thread removes the same obj from doms->objs
and frees it, than we are accessing a freed obj.
lock doesn't help prevent object from being freed.
>
> @@ -967,6 +966,12 @@ static void virDomainObjFree(virDomainObjPtr dom)
> {
> if (!dom)
> return;
> + virDomainObjUnref(dom);
> +}
> +
> +static void doDomainObjFree(virObjectPtr obj)
> +{
> + virDomainObjPtr dom = (virDomainObjPtr)obj;
>
> VIR_DEBUG("obj=%p", dom);
> virDomainDefFree(dom->def);
> @@ -984,21 +989,13 @@ static void virDomainObjFree(virDomainObjPtr dom)
>
> void virDomainObjRef(virDomainObjPtr dom)
> {
> - dom->refs++;
> - VIR_DEBUG("obj=%p refs=%d", dom, dom->refs);
> + virObjectRef(&dom->obj);
> }
>
>
> -int virDomainObjUnref(virDomainObjPtr dom)
> +void virDomainObjUnref(virDomainObjPtr dom)
> {
> - dom->refs--;
> - VIR_DEBUG("obj=%p refs=%d", dom, dom->refs);
> - if (dom->refs == 0) {
> - virDomainObjUnlock(dom);
> - virDomainObjFree(dom);
> - return 0;
> - }
> - return dom->refs;
> + virObjectUnref(&dom->obj);
> }
>
> static virDomainObjPtr virDomainObjNew(virCapsPtr caps)
> @@ -1010,6 +1007,11 @@ static virDomainObjPtr virDomainObjNew(virCapsPtr caps)
> return NULL;
> }
>
> + if (virObjectInit(&domain->obj, doDomainObjFree)) {
> + VIR_FREE(domain);
> + return NULL;
> + }
> +
> if (caps->privateDataAllocFunc &&
> !(domain->privateData = (caps->privateDataAllocFunc)())) {
> virReportOOMError();
> @@ -1027,9 +1029,7 @@ static virDomainObjPtr virDomainObjNew(virCapsPtr caps)
> return NULL;
> }
>
> - virDomainObjLock(domain);
> domain->state = VIR_DOMAIN_SHUTOFF;
> - domain->refs = 1;
>
> virDomainSnapshotObjListInit(&domain->snapshots);
>
> @@ -1075,8 +1075,10 @@ virDomainObjPtr virDomainAssignDef(virCapsPtr caps,
> domain->def = def;
>
> virUUIDFormat(def->uuid, uuidstr);
> + virDomainObjRef(domain);
> if (virHashAddEntry(doms->objs, uuidstr, domain) < 0) {
> - VIR_FREE(domain);
> + virDomainObjUnref(domain);
> + virDomainObjFree(domain);
> return NULL;
> }
Simiarly here, you're now requiring all callers to manually obtain
a lock.
That is the desired result, lock right before do read/write and unlock
it as soon as possible.
> @@ -1239,39 +1243,57 @@ static virDomainPtr qemudDomainCreate(virConnectPtr conn,
const char *xml,
>
> qemuDriverLock(driver);
> if (!(def = virDomainDefParseString(driver->caps, xml,
> - VIR_DOMAIN_XML_INACTIVE)))
> + VIR_DOMAIN_XML_INACTIVE))) {
> + qemuDriverUnlock(driver);
> goto cleanup;
> + }
>
> - if (virSecurityManagerVerify(driver->securityManager, def) < 0)
> + if (virSecurityManagerVerify(driver->securityManager, def) < 0) {
> + qemuDriverUnlock(driver);
> goto cleanup;
> + }
>
> - if (virDomainObjIsDuplicate(&driver->domains, def, 1) < 0)
> + if (virDomainObjIsDuplicate(&driver->domains, def, 1) < 0) {
> + qemuDriverUnlock(driver);
> goto cleanup;
> + }
>
> - if (qemudCanonicalizeMachine(driver, def) < 0)
> + if (qemudCanonicalizeMachine(driver, def) < 0) {
> + qemuDriverUnlock(driver);
> goto cleanup;
> + }
>
> - if (qemuDomainAssignPCIAddresses(def) < 0)
> + if (qemuDomainAssignPCIAddresses(def) < 0) {
> + qemuDriverUnlock(driver);
> goto cleanup;
> + }
>
> if (!(vm = virDomainAssignDef(driver->caps,
> &driver->domains,
> - def, false)))
> + def, false))) {
> + qemuDriverUnlock(driver);
> goto cleanup;
> + }
> +
> + qemuDriverUnlock(driver);
driver is now unlocked....
>
> def = NULL;
>
> - if (qemuDomainObjBeginJobWithDriver(driver, vm) < 0)
> + if (qemuDomainObjBeginJobWithDriver(driver, vm) < 0) {
> + virDomainObjUnref(vm);
...but this method *requires* driver to be locked.
> goto cleanup; /* XXXX free the 'vm' we created ? */
> + }
>
> if (qemuProcessStart(conn, driver, vm, NULL,
> (flags & VIR_DOMAIN_START_PAUSED) != 0,
> -1, NULL, VIR_VM_OP_CREATE) < 0) {
> qemuAuditDomainStart(vm, "booted", false);
...and this method writes to 'driver', so it is now unsafe.
> - if (qemuDomainObjEndJob(vm) > 0)
> - virDomainRemoveInactive(&driver->domains,
> - vm);
> - vm = NULL;
> + qemuDomainObjEndJob(vm);
> + qemuDriverLock(driver);
> + virDomainRemoveInactive(&driver->domains,
> + vm);
> + qemuDriverUnlock(driver);
> + virDomainObjUnref(vm);
> goto cleanup;
> }
>
> @@ -1283,17 +1305,13 @@ static virDomainPtr qemudDomainCreate(virConnectPtr conn,
const char *xml,
> dom = virGetDomain(conn, vm->def->name, vm->def->uuid);
> if (dom) dom->id = vm->def->id;
>
> - if (vm &&
> - qemuDomainObjEndJob(vm) == 0)
> - vm = NULL;
> + qemuDomainObjEndJob(vm);
> + virDomainObjUnref(vm);
>
> cleanup:
> virDomainDefFree(def);
> - if (vm)
> - virDomainObjUnlock(vm);
> if (event)
> qemuDomainEventQueue(driver, event);
> - qemuDriverUnlock(driver);
> return dom;
> }
And all the usage of 'vm' in this method is now completely
unlocked and unsafe.
>
> @@ -1307,13 +1325,14 @@ static int qemudDomainSuspend(virDomainPtr dom) {
>
> qemuDriverLock(driver);
> vm = virDomainFindByUUID(&driver->domains, dom->uuid);
> + qemuDriverUnlock(driver);
>
> if (!vm) {
> char uuidstr[VIR_UUID_STRING_BUFLEN];
> virUUIDFormat(dom->uuid, uuidstr);
> qemuReportError(VIR_ERR_NO_DOMAIN,
> _("no domain with matching uuid '%s'"),
uuidstr);
> - goto cleanup;
> + return -1;
> }
> if (!virDomainObjIsActive(vm)) {
> qemuReportError(VIR_ERR_OPERATION_INVALID,
> @@ -1354,16 +1373,12 @@ static int qemudDomainSuspend(virDomainPtr dom) {
> }
>
> endjob:
> - if (qemuDomainObjEndJob(vm) == 0)
> - vm = NULL;
> + qemuDomainObjEndJob(vm);
>
> cleanup:
> - if (vm)
> - virDomainObjUnlock(vm);
> -
> + virDomainObjUnref(vm);
> if (event)
> qemuDomainEventQueue(driver, event);
> - qemuDriverUnlock(driver);
> return ret;
> }
Also now completely unsafe since 'vm' is never locked while
making changes to it.
Yes. Will improve qemudDomainSuspend.
[cut rest of patch]
I don't see how any of this patch is threadsafe now that virtually
no methods are acquiring the 'vm' lock.
qemuDomainObjBeginJob does acquire vm lock.