Hello,
here are call traces with two threads generated by python script
// ==============================
[root@dandreev-vz-6-0-0-243-master libvirt]# gdb -p 288985
(gdb) t a a bt
Thread 2 (Thread 0x7f2112862640 (LWP 288986) "python3"):
#0 0x00007f2121d4296f in poll () at /lib64/libc.so.6
#1 0x00007f211444650c in g_main_context_iterate.constprop () at /lib64/libglib-2.0.so.0
#2 0x00007f21143f1483 in g_main_loop_run () at /lib64/libglib-2.0.so.0
#3 0x00007f211406800b in virNetClientIOEventLoop () at /lib64/libvirt.so.0
#4 0x00007f2114068a0a in virNetClientIO () at /lib64/libvirt.so.0
#5 0x00007f21140692c1 in virNetClientSendInternal () at /lib64/libvirt.so.0
#6 0x00007f211406936d in virNetClientSendWithReply () at /lib64/libvirt.so.0
#7 0x00007f2114061a1d in virNetClientProgramCall () at /lib64/libvirt.so.0
#8 0x00007f21140f79ac in callFull () at /lib64/libvirt.so.0
#9 0x00007f21140f7a2f in call () at /lib64/libvirt.so.0
#10 0x00007f21140d8435 in remoteDomainCreate () at /lib64/libvirt.so.0
#11 0x00007f21141dd60c in virDomainCreate () at /lib64/libvirt.so.0
#12 0x00007f21145c8114 in libvirt_virDomainCreate () at
/usr/lib64/python3.9/site-packages/libvirtmod.cpython-39-x86_64-linux-gnu.so
#13 0x00007f21221268a8 in cfunction_call () at /lib64/libpython3.9.so.1.0
#14 0x00007f2122118814 in _PyObject_MakeTpCall () at /lib64/libpython3.9.so.1.0
#15 0x00007f212211566e in _PyEval_EvalFrameDefault () at /lib64/libpython3.9.so.1.0
#16 0x00007f212211d013 in function_code_fastcall () at /lib64/libpython3.9.so.1.0
#17 0x00007f2122110650 in _PyEval_EvalFrameDefault () at /lib64/libpython3.9.so.1.0
#18 0x00007f212211d013 in function_code_fastcall () at /lib64/libpython3.9.so.1.0
#19 0x00007f21221103e8 in _PyEval_EvalFrameDefault () at /lib64/libpython3.9.so.1.0
#20 0x00007f212211d013 in function_code_fastcall () at /lib64/libpython3.9.so.1.0
#21 0x00007f21221133d2 in _PyEval_EvalFrameDefault () at /lib64/libpython3.9.so.1.0
#22 0x00007f212211d013 in function_code_fastcall () at /lib64/libpython3.9.so.1.0
#23 0x00007f2122110650 in _PyEval_EvalFrameDefault () at /lib64/libpython3.9.so.1.0
#24 0x00007f212211d013 in function_code_fastcall () at /lib64/libpython3.9.so.1.0
#25 0x00007f2122110650 in _PyEval_EvalFrameDefault () at /lib64/libpython3.9.so.1.0
#26 0x00007f212211d013 in function_code_fastcall () at /lib64/libpython3.9.so.1.0
#27 0x00007f2122125382 in method_vectorcall () at /lib64/libpython3.9.so.1.0
#28 0x00007f21221d8c4a in t_bootstrap () at /lib64/libpython3.9.so.1.0
#29 0x00007f21221d8bf8 in pythread_wrapper () at /lib64/libpython3.9.so.1.0
#30 0x00007f2121c9f802 in start_thread () at /lib64/libc.so.6
#31 0x00007f2121c3f450 in clone3 () at /lib64/libc.so.6
Thread 1 (Thread 0x7f21223cf740 (LWP 288985) "python3"):
#0 0x00007f2121c9c39a in __futex_abstimed_wait_common () at /lib64/libc.so.6
#1 0x00007f2121c9eba0 in pthread_cond_wait@(a)GLIBC_2.3.2 () at /lib64/libc.so.6
#2 0x00007f2113f4982a in virCondWait () at /lib64/libvirt.so.0
#3 0x00007f2113f1fee3 in virObjectWait () at /lib64/libvirt.so.0
#4 0x00007f211406882a in virNetClientIO () at /lib64/libvirt.so.0
#5 0x00007f21140692c1 in virNetClientSendInternal () at /lib64/libvirt.so.0
#6 0x00007f211406936d in virNetClientSendWithReply () at /lib64/libvirt.so.0
#7 0x00007f2114061a1d in virNetClientProgramCall () at /lib64/libvirt.so.0
#8 0x00007f21140f79ac in callFull () at /lib64/libvirt.so.0
#9 0x00007f21140f7a2f in call () at /lib64/libvirt.so.0
#10 0x00007f21140f24eb in remoteNodeDeviceNumOfCaps () at /lib64/libvirt.so.0
#11 0x00007f2114207a00 in virNodeDeviceNumOfCaps () at /lib64/libvirt.so.0
#12 0x00007f21145d8edf in libvirt_virNodeDeviceListCaps.lto_priv.0 () at
/usr/lib64/python3.9/site-packages/libvirtmod.cpython-39-x86_64-linux-gnu.so
#13 0x00007f21221268a8 in cfunction_call () at /lib64/libpython3.9.so.1.0
#14 0x00007f2122118814 in _PyObject_MakeTpCall () at /lib64/libpython3.9.so.1.0
#15 0x00007f212211566e in _PyEval_EvalFrameDefault () at /lib64/libpython3.9.so.1.0
#16 0x00007f212211d013 in function_code_fastcall () at /lib64/libpython3.9.so.1.0
#17 0x00007f2122110650 in _PyEval_EvalFrameDefault () at /lib64/libpython3.9.so.1.0
#18 0x00007f212210f06d in _PyEval_EvalCode () at /lib64/libpython3.9.so.1.0
#19 0x00007f212218c495 in _PyEval_EvalCodeWithName () at /lib64/libpython3.9.so.1.0
--Type <RET> for more, q to quit, c to continue without paging--
#20 0x00007f212218c42d in PyEval_EvalCodeEx () at /lib64/libpython3.9.so.1.0
#21 0x00007f212218c3df in PyEval_EvalCode () at /lib64/libpython3.9.so.1.0
#22 0x00007f21221b7524 in run_eval_code_obj () at /lib64/libpython3.9.so.1.0
#23 0x00007f21221b5da6 in run_mod () at /lib64/libpython3.9.so.1.0
#24 0x00007f212208f0cb in pyrun_file.cold () at /lib64/libpython3.9.so.1.0
#25 0x00007f21221bb253 in PyRun_SimpleFileExFlags () at /lib64/libpython3.9.so.1.0
#26 0x00007f21221b7ee8 in Py_RunMain () at /lib64/libpython3.9.so.1.0
#27 0x00007f212217f02d in Py_BytesMain () at /lib64/libpython3.9.so.1.0
#28 0x00007f2121c3feb0 in __libc_start_call_main () at /lib64/libc.so.6
#29 0x00007f2121c3ff60 in __libc_start_main_impl () at /lib64/libc.so.6
#30 0x000056013181a095 in _start ()
// =========================================================================
just in case here is python script
[root@dandreev-vz-6-0-0-243-master ~]# cat a.py
import libvirt
import time
from threading import Thread
def startVM(connection, vm_name):
try:
# Find the virtual machine by name
print('starting VM')
connection.lookupByName(vm_name).create()
print('done starting VM')
except libvirt.libvirtError as e:
print(f'Libvirt Error: {e}')
# Replace 'qemu+tcp://10.34.66.13/system' with your actual connection URI
connection_uri = 'qemu+tcp://localhost/system'
connection = libvirt.open(connection_uri)
if connection is None:
print(f'Failed to open connection to {connection_uri}')
exit(1)
try:
# Replace 'your_vm_name' with the actual name of your virtual machine
# startVM(connection, 'instance-00000002')
thread = Thread(target=lambda: startVM(connection, 'instance-00000002'))
thread.start()
time.sleep(3)
devs = connection.listAllDevices()
for i in range(100000):
for dev in devs:
print('listing caps for dev %i %s' % (i, dev))
try:
dev.listCaps()
except Exception as e:
print('ERROR: %s' % e)
print('DONE listing caps')
connection.lookupByName('instance-00000002').destroy()
except:
connection.lookupByName('instance-00000002').destroy()
finally:
thread.join()
# Close the connection outside the function
connection.close()
// =================================================================
We use virtual machine startup as a loaded operation, but migration or something similar
could be used.
Fima Shevrin
________________________________
From: Daniel P. Berrangé <berrange(a)redhat.com>
Sent: Friday, December 15, 2023 19:09
To: Efim Shevrin <efim.shevrin(a)virtuozzo.com>
Cc: devel(a)lists.libvirt.org <devel(a)lists.libvirt.org>; den(a)openvz.org
<den(a)openvz.org>
Subject: Re: [PATCH 3/3] rpc: Rework rpc notifications in main and side thread
[You don't often get email from berrange(a)redhat.com. Learn why this is important at
https://aka.ms/LearnAboutSenderIdentification ]
On Fri, Dec 15, 2023 at 02:32:19AM +0800, Fima Shevrin wrote:
RPC client implementation uses the following paradigm. The critical
section is organized via virObjectLock(client)/virObjectUnlock(client)
braces. Though this is potentially problematic as
main thread: side thread:
virObjectUnlock(client);
virObjectLock(client);
g_main_loop_quit(client->eventLoop);
virObjectUnlock(client);
g_main_loop_run(client->eventLoop);
This means in particular that is the main thread is executing very long
request like VM migration, the wakeup from the side thread could be
stuck until the main request will be fully completed.
Can you explain this in more detail, with call traces illustration
for the two threads. You're not saying where the main thread is
doing work with the 'client' lock hold for a long time. Generally
the goal should be for the main thread to only hold the lock for
a short time. Also if the side thread is already holding a reference
on 'client', then potentially we should consider if it is possible
to terminate the event loop without acquiring the mutex, as GMainLoop
protects itself wrt concurrent usage already, provided all threads
hold a reference directly or indirectly.
Discrubed case is easily reproducible with the simple python scripts doing slow
and fast requests in parallel from two different threads.
Our idea is to release the lock at the prepare stage and avoid libvirt stuck
during the interaction between main and side threads.
Co-authored-by: Denis V. Lunev <den(a)openvz.org>
Co-authored-by: Nikolai Barybin <nikolai.barybin(a)virtuozzo.com>
Signed-off-by: Fima Shevrin <efim.shevrin(a)virtuozzo.com>
---
src/rpc/virnetclient.c | 17 ++++++++++++-----
src/util/vireventglibwatch.c | 28 ++++++++++++++++++++++++++--
2 files changed, 38 insertions(+), 7 deletions(-)
diff --git a/src/rpc/virnetclient.c b/src/rpc/virnetclient.c
index de8ebc2da9..63bd42ed3a 100644
--- a/src/rpc/virnetclient.c
+++ b/src/rpc/virnetclient.c
@@ -987,6 +987,9 @@ int virNetClientSetTLSSession(virNetClient *client,
* etc. If we make the grade, it will send us a '\1' byte.
*/
+ /* Here we are not passing the client to virEventGLibAddSocketWatch,
+ * since the entire virNetClientSetTLSSession function requires a lock.
+ */
source = virEventGLibAddSocketWatch(virNetSocketGetFD(client->sock),
G_IO_IN,
client->eventCtx,
@@ -1692,14 +1695,18 @@ static int virNetClientIOEventLoop(virNetClient *client,
if (client->nstreams)
ev |= G_IO_IN;
+ /*
+ * We don't need to call virObjectLock(client) here,
+ * since the .prepare function inside glib Main Loop
+ * will do this. virEventGLibAddSocketWatch is responsible
+ * for passing client var in glib .prepare
+ */
source = virEventGLibAddSocketWatch(virNetSocketGetFD(client->sock),
ev,
client->eventCtx,
- virNetClientIOEventFD, &data, NULL,
NULL);
-
- /* Release lock while poll'ing so other threads
- * can stuff themselves on the queue */
- virObjectUnlock(client);
+ virNetClientIOEventFD, &data,
+ (virObjectLockable *)client,
+ NULL);
#ifndef WIN32
/* Block SIGWINCH from interrupting poll in curses programs,
diff --git a/src/util/vireventglibwatch.c b/src/util/vireventglibwatch.c
index 7680656ba2..641b772995 100644
--- a/src/util/vireventglibwatch.c
+++ b/src/util/vireventglibwatch.c
@@ -34,11 +34,23 @@ struct virEventGLibFDSource {
static gboolean
-virEventGLibFDSourcePrepare(GSource *source G_GNUC_UNUSED,
+virEventGLibFDSourcePrepare(GSource *source,
gint *timeout)
{
+ virEventGLibFDSource *ssource = (virEventGLibFDSource *)source;
*timeout = -1;
+ if (ssource->client != NULL)
+ virObjectUnlock(ssource->client);
+
+ /*
+ * Prepare function may be called multiple times
+ * in glib Main Loop, thus we assign source->client
+ * a null pointer to avoid calling pthread_mutex_unlock
+ * on an already unlocked mutex.
+ * */
+ ssource->client = NULL;
+
return FALSE;
}
@@ -123,11 +135,23 @@ struct virEventGLibSocketSource {
static gboolean
-virEventGLibSocketSourcePrepare(GSource *source G_GNUC_UNUSED,
+virEventGLibSocketSourcePrepare(GSource *source,
gint *timeout)
{
+ virEventGLibSocketSource *ssource = (virEventGLibSocketSource *)source;
*timeout = -1;
+ if (ssource->client != NULL)
+ virObjectUnlock(ssource->client);
+
+ /*
+ * Prepare function may be called multiple times
+ * in glib Main Loop, thus we assign source->client
+ * a null pointer to avoid calling pthread_mutex_unlock
+ * on an already unlocked mutex.
+ * */
+ ssource->client = NULL;
+
return FALSE;
}
--
2.34.1
_______________________________________________
Devel mailing list -- devel(a)lists.libvirt.org
To unsubscribe send an email to devel-leave(a)lists.libvirt.org
With regards,
Daniel
--
|:
https://berrange.com -o-
https://www.flickr.com/photos/dberrange :|
|:
https://libvirt.org -o-
https://fstop138.berrange.com :|
|:
https://entangle-photo.org -o-
https://www.instagram.com/dberrange :|