Problem:
libvirt has a 5 second timeout (generally) for hotplug/unplug
operations which can time out due to heavy load in guest.
vcpu hotunplug occurs one vcpu at a time.
But, if we perform hotplug-unplug repeatedly,
Case 1: qemu sends multiple timedout vcpu unplug notification before
libvirt processed first one.
Case 2: when attempting a hotplug, qemu finishes unplug of another cpu
libvirt waits for an async event notification from qemu regarding
successful vcpu delete.
qemu deletes its vcpu, vcpuinfo and sends notification to libvirt.
libvirt handles vcpu delete notification, and updates vcpuinfo
received from qemu in qemuDomainRefreshVcpuInfo().
qemu's vcpuinfo during refresh will not contain other deleted, sent vcpu
qemu's vcpuinfo will overwrite libvirt's vcpuinfo of the other pending
timedout vcpu(s) which qemu has deleted and notified libvirt.
The overwrite resets other timedout vcpu's alias to NULL and tid to 0.
The error is then seen when validating tid of vcpus.
Example error log:
"internal error: qemu didn't report thread id for vcpu 'XX'"
Solution:
Clear vcpu alias of only the vcpu that hotplug API calls.
Other vcpu-alias won't get reset when vcpuinfo refresh occurs.
Validation is also done for corresponding vcpus only, not all.
Co-authored-by: Partha Satapathy <partha.satapathy(a)oracle.com>
Signed-off-by: Shaleen Bathla <shaleen.bathla(a)oracle.com>
---
src/qemu/qemu_domain.c | 6 ++++--
src/qemu/qemu_hotplug.c | 39 +++++++++++++++++++++++++++++++--------
2 files changed, 35 insertions(+), 10 deletions(-)
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index 3435da5bdc4c..6ae842d0e32f 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -9773,8 +9773,10 @@ qemuDomainRefreshVcpuInfo(virDomainObj *vm,
vcpupriv->vcpus = info[i].vcpus;
VIR_FREE(vcpupriv->type);
vcpupriv->type = g_steal_pointer(&info[i].type);
- VIR_FREE(vcpupriv->alias);
- vcpupriv->alias = g_steal_pointer(&info[i].alias);
+ if (info[i].alias) {
+ VIR_FREE(vcpupriv->alias);
+ vcpupriv->alias = g_steal_pointer(&info[i].alias);
+ }
virJSONValueFree(vcpupriv->props);
vcpupriv->props = g_steal_pointer(&info[i].props);
vcpupriv->enable_id = info[i].id;
diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c
index da92ced2f444..f11b90a220ac 100644
--- a/src/qemu/qemu_hotplug.c
+++ b/src/qemu/qemu_hotplug.c
@@ -6090,6 +6090,8 @@ qemuDomainRemoveVcpu(virDomainObj *vm,
unsigned int nvcpus = vcpupriv->vcpus;
virErrorPtr save_error = NULL;
size_t i;
+ bool hasVcpuPids = qemuDomainHasVcpuPids(vm);
+ bool rollback = false;
if (qemuDomainRefreshVcpuInfo(vm, VIR_ASYNC_JOB_NONE, false) < 0)
return -1;
@@ -6097,14 +6099,26 @@ qemuDomainRemoveVcpu(virDomainObj *vm,
/* validation requires us to set the expected state prior to calling it */
for (i = vcpu; i < vcpu + nvcpus; i++) {
vcpuinfo = virDomainDefGetVcpu(vm->def, i);
+ vcpupriv = QEMU_DOMAIN_VCPU_PRIVATE(vcpuinfo);
+
vcpuinfo->online = false;
+ VIR_FREE(vcpupriv->alias); /* clear vcpu alias of only this vcpu */
+
+ if (vcpupriv->tid != 0) {
+ if (hasVcpuPids)
+ virReportError(VIR_ERR_INTERNAL_ERROR,
+ _("qemu reported thread id for inactive vcpu
'%zu'"), i);
+
+ rollback = true;
+ break;
+ }
}
- if (qemuDomainValidateVcpuInfo(vm) < 0) {
+ if (rollback) {
/* rollback vcpu count if the setting has failed */
virDomainAuditVcpu(vm, oldvcpus, oldvcpus - nvcpus, "update", false);
- for (i = vcpu; i < vcpu + nvcpus; i++) {
+ for (; i >= vcpu; i--) {
vcpuinfo = virDomainDefGetVcpu(vm->def, i);
vcpuinfo->online = true;
}
@@ -6141,6 +6155,9 @@ qemuDomainRemoveVcpuAlias(virDomainObj *vm,
return;
}
}
+
+ VIR_DEBUG("%s not found in vcpulist of domain %s ",
+ alias, vm->def->name);
}
@@ -6209,6 +6226,7 @@ qemuDomainHotplugAddVcpu(virQEMUDriver *driver,
int rc;
int oldvcpus = virDomainDefGetVcpus(vm->def);
size_t i;
+ bool hasVcpuPids = qemuDomainHasVcpuPids(vm);
if (!qemuDomainSupportsNewVcpuHotplug(vm)) {
virReportError(VIR_ERR_OPERATION_UNSUPPORTED, "%s",
@@ -6245,14 +6263,19 @@ qemuDomainHotplugAddVcpu(virQEMUDriver *driver,
vcpuinfo->online = true;
- if (vcpupriv->tid > 0 &&
- qemuProcessSetupVcpu(vm, i, true) < 0)
- return -1;
+ if (vcpupriv->tid > 0) {
+ if (qemuProcessSetupVcpu(vm, i, true) < 0) {
+ return -1;
+ }
+ } else {
+ if (hasVcpuPids) {
+ virReportError(VIR_ERR_INTERNAL_ERROR,
+ _("qemu didn't report thread id for vcpu
'%zu'"), i);
+ return -1;
+ }
+ }
}
- if (qemuDomainValidateVcpuInfo(vm) < 0)
- return -1;
-
qemuDomainVcpuPersistOrder(vm->def);
if (virDomainObjSave(vm, driver->xmlopt, cfg->stateDir) < 0)
--
2.31.1
V1 Patch :
https://listman.redhat.com/archives/libvir-list/2022-November/235470.html
V1 Patch Review comments :
https://listman.redhat.com/archives/libvir-list/2022-November/235534.html
Changes in V2 since V1:
- Patch addresses review comments from Peter Krempa.
- Validation is done for per cpu hotplug entity instead of all