Hi Daniel,
I'm giving it some heavy testing and seems to be rock solid! Thanks a
lot for Your effort!
I'm going to add few tens of guests to torture libvirt with multiple simultaneous
accesses even more...
regards
nik
On Mon, Dec 07, 2009 at 02:18:44PM +0000, Daniel P. Berrange wrote:
On Thu, Dec 03, 2009 at 11:12:40AM +0000, Daniel P. Berrange wrote:
> On Thu, Dec 03, 2009 at 12:47:02AM +0100, Matthias Bolte wrote:
> > 2009/11/26 Daniel P. Berrange <berrange(a)redhat.com>:
> > > If QEMU shuts down while we're in the middle of processing a
> > > monitor command, the monitor will be freed, and upon cleaning
> > > up we attempt to do qemuMonitorUnlock(priv->mon) when priv->mon
> > > is NULL.
> > >
> > > To address this we introduce proper reference counting into
> > > the qemuMonitorPtr object, and hold an extra reference whenever
> > > executing a command.
> > >
> > > * src/qemu/qemu_driver.c: Hold a reference on the monitor while
> > > executing commands, and only NULL-ify the priv->mon field when
> > > the last reference is released
> > > * src/qemu/qemu_monitor.h, src/qemu/qemu_monitor.c: Add reference
> > > counting to handle safe deletion of monitor objects
> >
> > The locking pattern below results in destroying a locked mutex. It
> > this intended?
>
> No, that's a bug, the qemuMonitorUnref() call itself should have
> called unlock if the ref count hit 0, before destroying it.
>
> > qemuMonitorLock(mon);
> > [...]
> > if (qemuMonitorUnref(mon) > 0)
> > qemuMonitorUnlock(mon);
> >
> > Well, this patch makes the TCK deadlock for me, seems to be a lock
> > ordering issue combined with a race condition; it doesn't happen every
> > run. I don't understand all details of the locking and refcounting
> > scheme of the QEMU monitor yet, it's quite complex and gets even more
> > complex.
>
> Yes, that problem shown by valgrind will indeed deadlock. I'll fix
> this. The domain object lock must never be acquired if the thread
> holds the monitor lock already. We must have strict ordering from
> top to bottom (driver -> domain object -> qemu monitor)
This is the patch I'm proposing instead. It removes the ref count adjust
of virDomainObjPtr from qemuMonitor, since that requires that you have
the bad lock ordering. Instead that extra ref count is added/removed by
the qemu driver code. This means the qemuMonitor never has to touch any
locks on virDomainObjPtr which complies with our strict top->bottom
ordering requirement.
commit c2b32f964b0eed861dea11e5037993e3a3c3fb3d
Author: Daniel P. Berrange <berrange(a)redhat.com>
Date: Thu Nov 26 13:29:29 2009 +0000
Fix crash when deleting monitor while a command is in progress
If QEMU shuts down while we're in the middle of processing a
monitor command, the monitor will be freed, and upon cleaning
up we attempt to do qemuMonitorUnlock(priv->mon) when priv->mon
is NULL.
To address this we introduce proper reference counting into
the qemuMonitorPtr object, and hold an extra reference whenever
executing a command.
* src/qemu/qemu_driver.c: Hold a reference on the monitor while
executing commands, and only NULL-ify the priv->mon field when
the last reference is released
* src/qemu/qemu_monitor.h, src/qemu/qemu_monitor.c: Add reference
counting to handle safe deletion of monitor objects
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
index 43d20ea..fd47dc0 100644
--- a/src/qemu/qemu_driver.c
+++ b/src/qemu/qemu_driver.c
@@ -272,6 +272,7 @@ static void qemuDomainObjEnterMonitor(virDomainObjPtr obj)
qemuDomainObjPrivatePtr priv = obj->privateData;
qemuMonitorLock(priv->mon);
+ qemuMonitorRef(priv->mon);
virDomainObjUnlock(obj);
}
@@ -283,9 +284,16 @@ static void qemuDomainObjEnterMonitor(virDomainObjPtr obj)
static void qemuDomainObjExitMonitor(virDomainObjPtr obj)
{
qemuDomainObjPrivatePtr priv = obj->privateData;
+ int refs;
+ refs = qemuMonitorUnref(priv->mon);
qemuMonitorUnlock(priv->mon);
virDomainObjLock(obj);
+
+ if (refs == 0) {
+ virDomainObjUnref(obj);
+ priv->mon = NULL;
+ }
}
@@ -302,6 +310,7 @@ static void qemuDomainObjEnterMonitorWithDriver(struct qemud_driver
*driver, vir
qemuDomainObjPrivatePtr priv = obj->privateData;
qemuMonitorLock(priv->mon);
+ qemuMonitorRef(priv->mon);
virDomainObjUnlock(obj);
qemuDriverUnlock(driver);
}
@@ -315,10 +324,17 @@ static void qemuDomainObjEnterMonitorWithDriver(struct qemud_driver
*driver, vir
static void qemuDomainObjExitMonitorWithDriver(struct qemud_driver *driver,
virDomainObjPtr obj)
{
qemuDomainObjPrivatePtr priv = obj->privateData;
+ int refs;
+ refs = qemuMonitorUnref(priv->mon);
qemuMonitorUnlock(priv->mon);
qemuDriverLock(driver);
virDomainObjLock(obj);
+
+ if (refs == 0) {
+ virDomainObjUnref(obj);
+ priv->mon = NULL;
+ }
}
@@ -653,6 +669,10 @@ qemuConnectMonitor(virDomainObjPtr vm)
{
qemuDomainObjPrivatePtr priv = vm->privateData;
+ /* Hold an extra reference because we can't allow 'vm' to be
+ * deleted while the monitor is active */
+ virDomainObjRef(vm);
+
if ((priv->mon = qemuMonitorOpen(vm, qemuHandleMonitorEOF)) == NULL) {
VIR_ERROR(_("Failed to connect monitor for %s\n"),
vm->def->name);
return -1;
@@ -2400,8 +2420,9 @@ static void qemudShutdownVMDaemon(virConnectPtr conn,
_("Failed to send SIGTERM to %s (%d)"),
vm->def->name, vm->pid);
- if (priv->mon) {
- qemuMonitorClose(priv->mon);
+ if (priv->mon &&
+ qemuMonitorClose(priv->mon) == 0) {
+ virDomainObjUnref(vm);
priv->mon = NULL;
}
diff --git a/src/qemu/qemu_monitor.c b/src/qemu/qemu_monitor.c
index f0ef81b..a2e10bc 100644
--- a/src/qemu/qemu_monitor.c
+++ b/src/qemu/qemu_monitor.c
@@ -42,7 +42,7 @@ struct _qemuMonitor {
virMutex lock;
virCond notify;
- virDomainObjPtr dom;
+ int refs;
int fd;
int watch;
@@ -67,12 +67,14 @@ struct _qemuMonitor {
* the next monitor msg */
int lastErrno;
- /* If the monitor callback is currently active */
+ /* If the monitor EOF callback is currently active (stops more commands being run)
*/
unsigned eofcb: 1;
- /* If the monitor callback should free the closed monitor */
+ /* If the monitor is in process of shutting down */
unsigned closed: 1;
+
};
+
void qemuMonitorLock(qemuMonitorPtr mon)
{
virMutexLock(&mon->lock);
@@ -84,21 +86,33 @@ void qemuMonitorUnlock(qemuMonitorPtr mon)
}
-static void qemuMonitorFree(qemuMonitorPtr mon, int lockDomain)
+static void qemuMonitorFree(qemuMonitorPtr mon)
{
- VIR_DEBUG("mon=%p, lockDomain=%d", mon, lockDomain);
- if (mon->vm) {
- if (lockDomain)
- virDomainObjLock(mon->vm);
- if (!virDomainObjUnref(mon->vm) && lockDomain)
- virDomainObjUnlock(mon->vm);
- }
+ VIR_DEBUG("mon=%p", mon);
if (virCondDestroy(&mon->notify) < 0)
{}
virMutexDestroy(&mon->lock);
VIR_FREE(mon);
}
+int qemuMonitorRef(qemuMonitorPtr mon)
+{
+ mon->refs++;
+ return mon->refs;
+}
+
+int qemuMonitorUnref(qemuMonitorPtr mon)
+{
+ mon->refs--;
+
+ if (mon->refs == 0) {
+ qemuMonitorUnlock(mon);
+ qemuMonitorFree(mon);
+ }
+
+ return mon->refs;
+}
+
static int
qemuMonitorOpenUnix(const char *monitor)
@@ -348,6 +362,7 @@ qemuMonitorIO(int watch, int fd, int events, void *opaque) {
int quit = 0, failed = 0;
qemuMonitorLock(mon);
+ qemuMonitorRef(mon);
VIR_DEBUG("Monitor %p I/O on watch %d fd %d events %d", mon, watch, fd,
events);
if (mon->fd != fd || mon->watch != watch) {
@@ -407,23 +422,17 @@ qemuMonitorIO(int watch, int fd, int events, void *opaque) {
* but is this safe ? I think it is, because the callback
* will try to acquire the virDomainObjPtr mutex next */
if (failed || quit) {
+ qemuMonitorEOFNotify eofCB = mon->eofCB;
+ virDomainObjPtr vm = mon->vm;
/* Make sure anyone waiting wakes up now */
virCondSignal(&mon->notify);
- mon->eofcb = 1;
- qemuMonitorUnlock(mon);
- VIR_DEBUG("Triggering EOF callback error? %d", failed);
- mon->eofCB(mon, mon->vm, failed);
-
- qemuMonitorLock(mon);
- if (mon->closed) {
+ if (qemuMonitorUnref(mon) > 0)
qemuMonitorUnlock(mon);
- VIR_DEBUG("Delayed free of monitor %p", mon);
- qemuMonitorFree(mon, 1);
- } else {
- qemuMonitorUnlock(mon);
- }
+ VIR_DEBUG("Triggering EOF callback error? %d", failed);
+ (eofCB)(mon, vm, failed);
} else {
- qemuMonitorUnlock(mon);
+ if (qemuMonitorUnref(mon) > 0)
+ qemuMonitorUnlock(mon);
}
}
@@ -453,10 +462,10 @@ qemuMonitorOpen(virDomainObjPtr vm,
return NULL;
}
mon->fd = -1;
+ mon->refs = 1;
mon->vm = vm;
mon->eofCB = eofCB;
qemuMonitorLock(mon);
- virDomainObjRef(vm);
switch (vm->monitor_chr->type) {
case VIR_DOMAIN_CHR_TYPE_UNIX:
@@ -512,10 +521,14 @@ cleanup:
}
-void qemuMonitorClose(qemuMonitorPtr mon)
+int qemuMonitorClose(qemuMonitorPtr mon)
{
+ int refs;
+
if (!mon)
- return;
+ return 0;
+
+ VIR_DEBUG("mon=%p", mon);
qemuMonitorLock(mon);
if (!mon->closed) {
@@ -523,18 +536,17 @@ void qemuMonitorClose(qemuMonitorPtr mon)
virEventRemoveHandle(mon->watch);
if (mon->fd != -1)
close(mon->fd);
- /* NB: don't reset fd / watch fields, since active
- * callback may still want them */
+ /* NB: ordinarily one might immediately set mon->watch to -1
+ * and mon->fd to -1, but there may be a callback active
+ * that is still relying on these fields being valid. So
+ * we merely close them, but not clear their values and
+ * use this explicit 'closed' flag to track this state */
mon->closed = 1;
}
- if (mon->eofcb) {
- VIR_DEBUG("Mark monitor to be deleted %p", mon);
+ if ((refs = qemuMonitorUnref(mon)) > 0)
qemuMonitorUnlock(mon);
- } else {
- VIR_DEBUG("Delete monitor now %p", mon);
- qemuMonitorFree(mon, 0);
- }
+ return refs;
}
@@ -552,7 +564,6 @@ int qemuMonitorSend(qemuMonitorPtr mon,
if (mon->eofcb) {
msg->lastErrno = EIO;
- qemuMonitorUnlock(mon);
return -1;
}
diff --git a/src/qemu/qemu_monitor.h b/src/qemu/qemu_monitor.h
index 71688cb..27b43b7 100644
--- a/src/qemu/qemu_monitor.h
+++ b/src/qemu/qemu_monitor.h
@@ -78,11 +78,14 @@ typedef int (*qemuMonitorDiskSecretLookup)(qemuMonitorPtr mon,
qemuMonitorPtr qemuMonitorOpen(virDomainObjPtr vm,
qemuMonitorEOFNotify eofCB);
-void qemuMonitorClose(qemuMonitorPtr mon);
+int qemuMonitorClose(qemuMonitorPtr mon);
void qemuMonitorLock(qemuMonitorPtr mon);
void qemuMonitorUnlock(qemuMonitorPtr mon);
+int qemuMonitorRef(qemuMonitorPtr mon);
+int qemuMonitorUnref(qemuMonitorPtr mon);
+
void qemuMonitorRegisterDiskSecretLookup(qemuMonitorPtr mon,
qemuMonitorDiskSecretLookup secretCB);
--
|: Red Hat, Engineering, London -o-
http://people.redhat.com/berrange/ :|
|:
http://libvirt.org -o-
http://virt-manager.org -o-
http://ovirt.org :|
|:
http://autobuild.org -o-
http://search.cpan.org/~danberr/ :|
|: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|
--
Libvir-list mailing list
Libvir-list(a)redhat.com
https://www.redhat.com/mailman/listinfo/libvir-list
--
-------------------------------------
Nikola CIPRICH
LinuxBox.cz, s.r.o.
28. rijna 168, 709 01 Ostrava
tel.: +420 596 603 142
fax: +420 596 621 273
mobil: +420 777 093 799
mobil servis: +420 737 238 656
email servis: servis(a)linuxbox.cz
-------------------------------------