[libvirt] [PATCH v4] qemu: Introduce state_lock_timeout to qemu.conf

When doing some job holding state lock for a long time, we may come
across error:
"Timed out during operation: cannot acquire state change lock"
Well, sometimes it's not a problem and users wanner continue to wait,
and this patch allow users decide how long time they can wait the
state lock.

Signed-off-by: Yi Wang <wang.yi59@zte.com.cn>
Reviewed-by: Xi Xu <xu.xi8@zte.com.cn>
---
changes in v4:
- fox syntax-check error

changes in v3:
- add user-friendly description and nb of state lock
- check validity of stateLockTimeout

changes in v2:
- change default value to 30 in qemu.conf
- set the default value in virQEMUDriverConfigNew()

v4

Signed-off-by: Yi Wang <wang.yi59@zte.com.cn>
---
 src/qemu/libvirtd_qemu.aug         |  1 +
 src/qemu/qemu.conf                 | 10 ++++++++++
 src/qemu/qemu_conf.c               | 14 ++++++++++++++
 src/qemu/qemu_conf.h               |  2 ++
 src/qemu/qemu_domain.c             |  5 +----
 src/qemu/test_libvirtd_qemu.aug.in |  1 +
 6 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/src/qemu/libvirtd_qemu.aug b/src/qemu/libvirtd_qemu.aug
index ddc4bbf..f7287ae 100644
--- a/src/qemu/libvirtd_qemu.aug
+++ b/src/qemu/libvirtd_qemu.aug
@@ -93,6 +93,7 @@ module Libvirtd_qemu =
                  | limits_entry "max_core"
                  | bool_entry "dump_guest_core"
                  | str_entry "stdio_handler"
+                 | int_entry "state_lock_timeout"
 
 let device_entry = bool_entry "mac_filter"
                  | bool_entry "relaxed_acs_check"
diff --git a/src/qemu/qemu.conf b/src/qemu/qemu.conf
index cd57b3c..8920a1a 100644
--- a/src/qemu/qemu.conf
+++ b/src/qemu/qemu.conf
@@ -667,6 +667,16 @@
 #
 #max_queued = 0
 
+
+# When two or more threads want to work with the same domain they use a
+# job lock to mutually exclude each other. However, waiting for the lock
+# is limited up to state_lock_timeout seconds.
+# NB, strong recommendation to set the timeout longer than 30 seconds.
+#
+# Default is 30
+#
+#state_lock_timeout = 60
+
 ###################################################################
 # Keepalive protocol:
 # This allows qemu driver to detect broken connections to remote
diff --git a/src/qemu/qemu_conf.c b/src/qemu/qemu_conf.c
index a4f545e..c761cae 100644
--- a/src/qemu/qemu_conf.c
+++ b/src/qemu/qemu_conf.c
@@ -129,6 +129,9 @@ void qemuDomainCmdlineDefFree(qemuDomainCmdlineDefPtr def)
 #endif
 
+/* Give up waiting for mutex after 30 seconds */
+#define QEMU_JOB_WAIT_TIME (1000ull * 30)
+
 virQEMUDriverConfigPtr virQEMUDriverConfigNew(bool privileged)
 {
     virQEMUDriverConfigPtr cfg;
@@ -346,6 +349,8 @@ virQEMUDriverConfigPtr virQEMUDriverConfigNew(bool privileged)
     cfg->glusterDebugLevel = 4;
     cfg->stdioLogD = true;
 
+    cfg->stateLockTimeout = QEMU_JOB_WAIT_TIME;
+
     if (!(cfg->namespaces = virBitmapNew(QEMU_DOMAIN_NS_LAST)))
         goto error;
 
@@ -863,6 +868,9 @@ int virQEMUDriverConfigLoadFile(virQEMUDriverConfigPtr cfg,
     if (virConfGetValueUInt(conf, "keepalive_count", &cfg->keepAliveCount) < 0)
         goto cleanup;
 
+    if (virConfGetValueInt(conf, "state_lock_timeout", &cfg->stateLockTimeout) < 0)
+        goto cleanup;
+
     if (virConfGetValueInt(conf, "seccomp_sandbox", &cfg->seccompSandbox) < 0)
         goto cleanup;
 
@@ -1055,6 +1063,12 @@ virQEMUDriverConfigValidate(virQEMUDriverConfigPtr cfg)
         return -1;
     }
 
+    if (cfg->stateLockTimeout <= 0) {
+        virReportError(VIR_ERR_CONF_SYNTAX, "%s",
+                       _("state_lock_timeout should larger than zero"));
+        return -1;
+    }
+
     return 0;
 }
diff --git a/src/qemu/qemu_conf.h b/src/qemu/qemu_conf.h
index a8d84ef..97cf2e1 100644
--- a/src/qemu/qemu_conf.h
+++ b/src/qemu/qemu_conf.h
@@ -190,6 +190,8 @@ struct _virQEMUDriverConfig {
     int keepAliveInterval;
     unsigned int keepAliveCount;
 
+    int stateLockTimeout;
+
     int seccompSandbox;
 
     char *migrateHost;
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index 886e3fb..5a2ca52 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -6652,9 +6652,6 @@ qemuDomainObjCanSetJob(qemuDomainObjPrivatePtr priv,
             priv->job.agentActive == QEMU_AGENT_JOB_NONE));
 }
 
-/* Give up waiting for mutex after 30 seconds */
-#define QEMU_JOB_WAIT_TIME (1000ull * 30)
-
 /**
  * qemuDomainObjBeginJobInternal:
  * @driver: qemu driver
@@ -6714,7 +6711,7 @@ qemuDomainObjBeginJobInternal(virQEMUDriverPtr driver,
     }
 
     priv->jobs_queued++;
-    then = now + QEMU_JOB_WAIT_TIME;
+    then = now + cfg->stateLockTimeout;
 
  retry:
     if ((!async && job != QEMU_JOB_DESTROY) &&
diff --git a/src/qemu/test_libvirtd_qemu.aug.in b/src/qemu/test_libvirtd_qemu.aug.in
index f1e8806..dc5de96 100644
--- a/src/qemu/test_libvirtd_qemu.aug.in
+++ b/src/qemu/test_libvirtd_qemu.aug.in
@@ -105,3 +105,4 @@ module Test_libvirtd_qemu =
 { "pr_helper" = "/usr/bin/qemu-pr-helper" }
 { "swtpm_user" = "tss" }
 { "swtpm_group" = "tss" }
+{ "state_lock_timeout" = "60" }
-- 
1.8.3.1
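For anyone who wants to try the patch: the new knob lives in /etc/libvirt/qemu.conf and, like the other options in that file, is only read when libvirtd (re)starts. An illustrative setting (the value 120 is just an example):

    # /etc/libvirt/qemu.conf
    # wait up to 120 seconds for the domain job (state change) lock
    state_lock_timeout = 120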

On Tue, Aug 28, 2018 at 04:40:16PM +0800, Yi Wang wrote:
When doing some job holding state lock for a long time, we may come across error: "Timed out during operation: cannot acquire state change lock" Well, sometimes it's not a problem and users wanner continue
s/wanner/want to/
to wait, and this patch allow users decide how long time they can wait the state lock.
Signed-off-by: Yi Wang <wang.yi59@zte.com.cn>
Reviewed-by: Xi Xu <xu.xi8@zte.com.cn>
---
changes in v4:
- fox syntax-check error

changes in v3:
- add user-friendly description and nb of state lock
- check validity of stateLockTimeout

changes in v2:
- change default value to 30 in qemu.conf
- set the default value in virQEMUDriverConfigNew()

v4

Signed-off-by: Yi Wang <wang.yi59@zte.com.cn>
---
 src/qemu/libvirtd_qemu.aug         |  1 +
 src/qemu/qemu.conf                 | 10 ++++++++++
 src/qemu/qemu_conf.c               | 14 ++++++++++++++
 src/qemu/qemu_conf.h               |  2 ++
 src/qemu/qemu_domain.c             |  5 +----
 src/qemu/test_libvirtd_qemu.aug.in |  1 +
 6 files changed, 29 insertions(+), 4 deletions(-)
diff --git a/src/qemu/libvirtd_qemu.aug b/src/qemu/libvirtd_qemu.aug
index ddc4bbf..f7287ae 100644
--- a/src/qemu/libvirtd_qemu.aug
+++ b/src/qemu/libvirtd_qemu.aug
@@ -93,6 +93,7 @@ module Libvirtd_qemu =
                  | limits_entry "max_core"
                  | bool_entry "dump_guest_core"
                  | str_entry "stdio_handler"
+                 | int_entry "state_lock_timeout"
here you add the option at the end of the 'process_entry' group
 let device_entry = bool_entry "mac_filter"
                  | bool_entry "relaxed_acs_check"
diff --git a/src/qemu/qemu.conf b/src/qemu/qemu.conf
index cd57b3c..8920a1a 100644
--- a/src/qemu/qemu.conf
+++ b/src/qemu/qemu.conf
@@ -667,6 +667,16 @@
 #
 #max_queued = 0
 
+
+# When two or more threads want to work with the same domain they use a
+# job lock to mutually exclude each other. However, waiting for the lock
+# is limited up to state_lock_timeout seconds.
+# NB, strong recommendation to set the timeout longer than 30 seconds.
+#
+# Default is 30
+#
+#state_lock_timeout = 60
But here in qemu.conf, you add it between the rpc entries. It seems we did not follow the structure with 'stdio_handler', but adding it either right after 'dump_guest_core' or at the end of file would be better than squeezing it between rpc entries.
+
 ###################################################################
 # Keepalive protocol:
 # This allows qemu driver to detect broken connections to remote
diff --git a/src/qemu/qemu_conf.c b/src/qemu/qemu_conf.c
index a4f545e..c761cae 100644
--- a/src/qemu/qemu_conf.c
+++ b/src/qemu/qemu_conf.c
@@ -129,6 +129,9 @@ void qemuDomainCmdlineDefFree(qemuDomainCmdlineDefPtr def)
 #endif
 
+/* Give up waiting for mutex after 30 seconds */
+#define QEMU_JOB_WAIT_TIME (1000ull * 30)
+
Here the constant is multiplied by 1000 to convert from seconds to milliseconds.
 virQEMUDriverConfigPtr virQEMUDriverConfigNew(bool privileged)
 {
     virQEMUDriverConfigPtr cfg;
@@ -346,6 +349,8 @@ virQEMUDriverConfigPtr virQEMUDriverConfigNew(bool privileged)
     cfg->glusterDebugLevel = 4;
     cfg->stdioLogD = true;
 
+    cfg->stateLockTimeout = QEMU_JOB_WAIT_TIME;
+
     if (!(cfg->namespaces = virBitmapNew(QEMU_DOMAIN_NS_LAST)))
         goto error;
 
@@ -863,6 +868,9 @@ int virQEMUDriverConfigLoadFile(virQEMUDriverConfigPtr cfg,
     if (virConfGetValueUInt(conf, "keepalive_count", &cfg->keepAliveCount) < 0)
         goto cleanup;
 
+    if (virConfGetValueInt(conf, "state_lock_timeout", &cfg->stateLockTimeout) < 0)
+        goto cleanup;
+
But the parsed value is passed directly here, so '60' in the config file would result in 60 ms.
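To make the unit mismatch concrete: `now` and `then` in qemuDomainObjBeginJobInternal() are millisecond timestamps, so a timeout that qemu.conf documents in seconds has to be scaled by 1000 exactly once. A minimal sketch of the use-site variant (the helper below is hypothetical, not part of the patch):

    /* Hypothetical helper, for illustration only: compute the job-lock
     * deadline from a timeout configured in seconds.  nowMs is a
     * millisecond timestamp, as used by the existing job code. */
    static unsigned long long
    stateLockDeadline(unsigned long long nowMs, int stateLockTimeoutSec)
    {
        return nowMs + 1000ull * stateLockTimeoutSec;
    }

    /* then = stateLockDeadline(now, cfg->stateLockTimeout); */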
     if (virConfGetValueInt(conf, "seccomp_sandbox", &cfg->seccompSandbox) < 0)
         goto cleanup;
 
@@ -1055,6 +1063,12 @@ virQEMUDriverConfigValidate(virQEMUDriverConfigPtr cfg)
         return -1;
     }
 
+    if (cfg->stateLockTimeout <= 0) {
+        virReportError(VIR_ERR_CONF_SYNTAX, "%s",
+                       _("state_lock_timeout should larger than zero"));
+        return -1;
+    }
+
     return 0;
 }
diff --git a/src/qemu/qemu_conf.h b/src/qemu/qemu_conf.h
index a8d84ef..97cf2e1 100644
--- a/src/qemu/qemu_conf.h
+++ b/src/qemu/qemu_conf.h
@@ -190,6 +190,8 @@ struct _virQEMUDriverConfig {
     int keepAliveInterval;
     unsigned int keepAliveCount;
 
+    int stateLockTimeout;
+
     int seccompSandbox;
 
     char *migrateHost;
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index 886e3fb..5a2ca52 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -6652,9 +6652,6 @@ qemuDomainObjCanSetJob(qemuDomainObjPrivatePtr priv,
             priv->job.agentActive == QEMU_AGENT_JOB_NONE));
 }
 
-/* Give up waiting for mutex after 30 seconds */
-#define QEMU_JOB_WAIT_TIME (1000ull * 30)
-
 /**
  * qemuDomainObjBeginJobInternal:
  * @driver: qemu driver
@@ -6714,7 +6711,7 @@ qemuDomainObjBeginJobInternal(virQEMUDriverPtr driver,
     }
 
     priv->jobs_queued++;
-    then = now + QEMU_JOB_WAIT_TIME;
+    then = now + cfg->stateLockTimeout;
 
  retry:
     if ((!async && job != QEMU_JOB_DESTROY) &&
diff --git a/src/qemu/test_libvirtd_qemu.aug.in b/src/qemu/test_libvirtd_qemu.aug.in
index f1e8806..dc5de96 100644
--- a/src/qemu/test_libvirtd_qemu.aug.in
+++ b/src/qemu/test_libvirtd_qemu.aug.in
@@ -105,3 +105,4 @@ module Test_libvirtd_qemu =
 { "pr_helper" = "/usr/bin/qemu-pr-helper" }
 { "swtpm_user" = "tss" }
 { "swtpm_group" = "tss" }
+{ "state_lock_timeout" = "60" }
This needs to be ordered properly - install the 'augeas' tool to see where 'make check' fails.

Jano

Hi Jano, thanks for your reply.
On Tue, Aug 28, 2018 at 04:40:16PM +0800, Yi Wang wrote:
When doing some job holding state lock for a long time, we may come across error: "Timed out during operation: cannot acquire state change lock" Well, sometimes it's not a problem and users wanner continue
s/wanner/want to/
Ok, I will pay attention to this later.
diff --git a/src/qemu/libvirtd_qemu.aug b/src/qemu/libvirtd_qemu.aug index ddc4bbf..f7287ae 100644 --- a/src/qemu/libvirtd_qemu.aug +++ b/src/qemu/libvirtd_qemu.aug @@ -93,6 +93,7 @@ module Libvirtd_qemu = | limits_entry "max_core" | bool_entry "dump_guest_core" | str_entry "stdio_handler" + | int_entry "state_lock_timeout"
here you add the option at the end of the 'process_entry' group
let device_entry = bool_entry "mac_filter" | bool_entry "relaxed_acs_check" diff --git a/src/qemu/qemu.conf b/src/qemu/qemu.conf index cd57b3c..8920a1a 100644 --- a/src/qemu/qemu.conf +++ b/src/qemu/qemu.conf @@ -667,6 +667,16 @@ # #max_queued = 0
+ +# When two or more threads want to work with the same domain they use a +# job lock to mutually exclude each other. However, waiting for the lock +# is limited up to state_lock_timeout seconds. +# NB, strong recommendation to set the timeout longer than 30 seconds. +# +# Default is 30 +# +#state_lock_timeout = 60
But here in qemu.conf, you add it between the rpc entries.
It seems we did not follow the structure with 'stdio_handler', but adding it either right after 'dump_guest_core' or at the end of file would be better than squeezing it between rpc entries.
As Michal suggested in: https://www.redhat.com/archives/libvir-list/2018-August/msg01693.html max_queued and state_lock_timeout both refer to the same area, so I put it here :)
@@ -863,6 +868,9 @@ int virQEMUDriverConfigLoadFile(virQEMUDriverConfigPtr cfg, if (virConfGetValueUInt(conf, "keepalive_count", &cfg->keepAliveCount) < 0) goto cleanup;
+ if (virConfGetValueInt(conf, "state_lock_timeout", &cfg->stateLockTimeout) < 0) + goto cleanup; +
But the parsed value is passed directly here, so '60' in the config file would result in 60 ms.
Oho, I will fix this.
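(One possible shape for that fix, just a sketch and not the actual follow-up patch, is to convert right after parsing so everything downstream keeps seeing milliseconds. This assumes virConfGetValueInt() returns 1 when the option was present in the file and 0 when it was absent:)

    int rc;

    if ((rc = virConfGetValueInt(conf, "state_lock_timeout",
                                 &cfg->stateLockTimeout)) < 0)
        goto cleanup;
    if (rc == 1)                        /* option set in qemu.conf */
        cfg->stateLockTimeout *= 1000;  /* seconds -> milliseconds */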
retry: if ((!async && job != QEMU_JOB_DESTROY) && diff --git a/src/qemu/test_libvirtd_qemu.aug.in b/src/qemu/test_libvirtd_qemu.aug.in index f1e8806..dc5de96 100644 --- a/src/qemu/test_libvirtd_qemu.aug.in +++ b/src/qemu/test_libvirtd_qemu.aug.in @@ -105,3 +105,4 @@ module Test_libvirtd_qemu = { "pr_helper" = "/usr/bin/qemu-pr-helper" } { "swtpm_user" = "tss" } { "swtpm_group" = "tss" } +{ "state_lock_timeout" = "60" }
This needs to be ordered properly - install the 'augeas' tool to see where 'make check' fails.
My fault, sorry for the mistake, and the next patch will fix this. Thanks again, Jano.

---
Best wishes
Yi Wang

On Wed, Sep 05, 2018 at 04:49:59PM +0800, wang.yi59@zte.com.cn wrote:
Hi Jano, thanks for your reply.
On Tue, Aug 28, 2018 at 04:40:16PM +0800, Yi Wang wrote:
[...]
diff --git a/src/qemu/libvirtd_qemu.aug b/src/qemu/libvirtd_qemu.aug index ddc4bbf..f7287ae 100644 --- a/src/qemu/libvirtd_qemu.aug +++ b/src/qemu/libvirtd_qemu.aug @@ -93,6 +93,7 @@ module Libvirtd_qemu = | limits_entry "max_core" | bool_entry "dump_guest_core" | str_entry "stdio_handler" + | int_entry "state_lock_timeout"
here you add the option at the end of the 'process_entry' group
let device_entry = bool_entry "mac_filter" | bool_entry "relaxed_acs_check" diff --git a/src/qemu/qemu.conf b/src/qemu/qemu.conf index cd57b3c..8920a1a 100644 --- a/src/qemu/qemu.conf +++ b/src/qemu/qemu.conf @@ -667,6 +667,16 @@ # #max_queued = 0
+ +# When two or more threads want to work with the same domain they use a +# job lock to mutually exclude each other. However, waiting for the lock +# is limited up to state_lock_timeout seconds. +# NB, strong recommendation to set the timeout longer than 30 seconds. +# +# Default is 30 +# +#state_lock_timeout = 60
But here in qemu.conf, you add it between the rpc entries.
It seems we did not follow the structure with 'stdio_handler', but adding it either right after 'dump_guest_core' or at the end of file would be better than squeezing it between rpc entries.
As Michal suggested in: https://www.redhat.com/archives/libvir-list/2018-August/msg01693.html max_queued and state_lock_timeout both refer to the same area, so I put it here :)
My point was that the position in qemu.conf does not match the grouping in libvirtd_qemu.aug. So if it's related to max_queued, it should also be in the rpc_entry group in the aug file.

Jano
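(Concretely, the lens change Jano describes would look roughly like this, assuming the rpc_entry group in libvirtd_qemu.aug currently lists the max_queued and keepalive options; a sketch, not the final patch:)

    let rpc_entry = int_entry "max_queued"
                  | int_entry "keepalive_interval"
                  | int_entry "keepalive_count"
                  | int_entry "state_lock_timeout"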

On Wed, Sep 05, 2018 at 04:49:59PM +0800, wang.yi59@zte.com.cn wrote:
Hi Jano, thanks for your reply.
On Tue, Aug 28, 2018 at 04:40:16PM +0800, Yi Wang wrote:
[...]
diff --git a/src/qemu/libvirtd_qemu.aug b/src/qemu/libvirtd_qemu.aug index ddc4bbf..f7287ae 100644 --- a/src/qemu/libvirtd_qemu.aug +++ b/src/qemu/libvirtd_qemu.aug @@ -93,6 +93,7 @@ module Libvirtd_qemu = | limits_entry "max_core" | bool_entry "dump_guest_core" | str_entry "stdio_handler" + | int_entry "state_lock_timeout"
here you add the option at the end of the 'process_entry' group
let device_entry = bool_entry "mac_filter" | bool_entry "relaxed_acs_check" diff --git a/src/qemu/qemu.conf b/src/qemu/qemu.conf index cd57b3c..8920a1a 100644 --- a/src/qemu/qemu.conf +++ b/src/qemu/qemu.conf @@ -667,6 +667,16 @@ # #max_queued = 0
+ +# When two or more threads want to work with the same domain they use a +# job lock to mutually exclude each other. However, waiting for the lock +# is limited up to state_lock_timeout seconds. +# NB, strong recommendation to set the timeout longer than 30 seconds. +# +# Default is 30 +# +#state_lock_timeout = 60
But here in qemu.conf, you add it between the rpc entries.
It seems we did not follow the structure with 'stdio_handler', but adding it either right after 'dump_guest_core' or at the end of file would be better than squeezing it between rpc entries.
As Michal suggested in: https://www.redhat.com/archives/libvir-list/2018-August/msg01693.html max_queued and state_lock_timeout both refer to the same area, so I put it here :)
My point was that the position in qemu.conf does not match the grouping in libvirtd_qemu.aug
So if it's related to max_queued, it should also be in the rpc_entry group in the aug file.
I got it, and I will adjust that in the next version. Thanks.

---
Best wishes
Yi Wang
participants (3)
- Ján Tomko
- wang.yi59@zte.com.cn
- Yi Wang