[libvirt] [PATCH 0/4] qemu: Don't lose domain on failed restore

https://bugzilla.redhat.com/show_bug.cgi?id=1718707 When restoring a domain we might leave a qemu process behind. The problem is described in 4/4 in more detail, but the digest is: we start qemu process successfully but then fail to restore its vCPUs and leave qemu process running and forget about. I thought of two possible ways to fix this: 1) Kill qemu process and claim error. 2) Don't kill qemu process and don't remove it from our internal list and claim error. I lean towards 1) because I find it clearer. If I were a mgmt app and call 'virsh restore /path/to/domain.save' then I'd much rather see either success or an error without any need for cleanup (which would be implied if we went with number 2). Michal Prívozník (4): qemuDomainSaveImageStartVM: Use VIR_AUTOCLOSE for @intermediatefd qemuDomainSaveImageStartVM: Use g_autoptr() for virCommand qemu: Use g_autoptr() for qemuDomainSaveCookie qemu: Stop domain on failed restore src/qemu/qemu_domain.c | 28 ++++++++++------------------ src/qemu/qemu_domain.h | 1 + src/qemu/qemu_driver.c | 37 +++++++++++++++++-------------------- 3 files changed, 28 insertions(+), 38 deletions(-) -- 2.24.1

Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/qemu/qemu_driver.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index d6b1e9f00c..9de5bc6a32 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -6803,7 +6803,7 @@ qemuDomainSaveImageStartVM(virConnectPtr conn, int ret = -1; bool restored = false; virObjectEventPtr event; - int intermediatefd = -1; + VIR_AUTOCLOSE intermediatefd = -1; virCommandPtr cmd = NULL; g_autofree char *errbuf = NULL; g_autoptr(virQEMUDriverConfig) cfg = virQEMUDriverGetConfig(driver); @@ -6829,6 +6829,7 @@ qemuDomainSaveImageStartVM(virConnectPtr conn, if (virCommandRunAsync(cmd, NULL) < 0) { *fd = intermediatefd; + intermediatefd = -1; goto cleanup; } } @@ -6872,8 +6873,6 @@ qemuDomainSaveImageStartVM(virConnectPtr conn, virErrorRestore(&orig_err); } - VIR_FORCE_CLOSE(intermediatefd); - if (VIR_CLOSE(*fd) < 0) { virReportSystemError(errno, _("cannot close file: %s"), path); restored = false; -- 2.24.1

Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/qemu/qemu_driver.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 9de5bc6a32..e1c0550b9a 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -6804,7 +6804,7 @@ qemuDomainSaveImageStartVM(virConnectPtr conn, bool restored = false; virObjectEventPtr event; VIR_AUTOCLOSE intermediatefd = -1; - virCommandPtr cmd = NULL; + g_autoptr(virCommand) cmd = NULL; g_autofree char *errbuf = NULL; g_autoptr(virQEMUDriverConfig) cfg = virQEMUDriverGetConfig(driver); virQEMUSaveHeaderPtr header = &data->header; @@ -6920,7 +6920,6 @@ qemuDomainSaveImageStartVM(virConnectPtr conn, cleanup: virObjectUnref(cookie); - virCommandFree(cmd); if (qemuSecurityRestoreSavedStateLabel(driver, vm, path) < 0) VIR_WARN("failed to restore save state label on %s", path); return ret; -- 2.24.1

Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/qemu/qemu_domain.c | 28 ++++++++++------------------ src/qemu/qemu_domain.h | 1 + src/qemu/qemu_driver.c | 6 ++---- 3 files changed, 13 insertions(+), 22 deletions(-) diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index 1f358544ab..3620739120 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -15997,27 +15997,23 @@ qemuDomainSaveCookiePtr qemuDomainSaveCookieNew(virDomainObjPtr vm) { qemuDomainObjPrivatePtr priv = vm->privateData; - qemuDomainSaveCookiePtr cookie = NULL; + g_autoptr(qemuDomainSaveCookie) cookie = NULL; if (qemuDomainInitialize() < 0) - goto error; + return NULL; if (!(cookie = virObjectNew(qemuDomainSaveCookieClass))) - goto error; + return NULL; if (priv->origCPU && !(cookie->cpu = virCPUDefCopy(vm->def->cpu))) - goto error; + return NULL; cookie->slirpHelper = qemuDomainGetSlirpHelperOk(vm); VIR_DEBUG("Save cookie %p, cpu=%p, slirpHelper=%d", cookie, cookie->cpu, cookie->slirpHelper); - return cookie; - - error: - virObjectUnref(cookie); - return NULL; + return g_steal_pointer(&cookie); } @@ -16025,26 +16021,22 @@ static int qemuDomainSaveCookieParse(xmlXPathContextPtr ctxt G_GNUC_UNUSED, virObjectPtr *obj) { - qemuDomainSaveCookiePtr cookie = NULL; + g_autoptr(qemuDomainSaveCookie) cookie = NULL; if (qemuDomainInitialize() < 0) - goto error; + return -1; if (!(cookie = virObjectNew(qemuDomainSaveCookieClass))) - goto error; + return -1; if (virCPUDefParseXML(ctxt, "./cpu[1]", VIR_CPU_TYPE_GUEST, &cookie->cpu) < 0) - goto error; + return -1; cookie->slirpHelper = virXPathBoolean("boolean(./slirpHelper)", ctxt) > 0; - *obj = (virObjectPtr) cookie; + *obj = (virObjectPtr) g_steal_pointer(&cookie); return 0; - - error: - virObjectUnref(cookie); - return -1; } diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index c6afc484f6..60b80297fa 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -610,6 +610,7 @@ struct _qemuDomainSaveCookie { bool slirpHelper; }; +G_DEFINE_AUTOPTR_CLEANUP_FUNC(qemuDomainSaveCookie, virObjectUnref); typedef struct _qemuDomainXmlNsDef qemuDomainXmlNsDef; typedef qemuDomainXmlNsDef *qemuDomainXmlNsDefPtr; diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index e1c0550b9a..ce9b1772c1 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -3293,7 +3293,7 @@ qemuDomainSaveInternal(virQEMUDriverPtr driver, virObjectEventPtr event = NULL; qemuDomainObjPrivatePtr priv = vm->privateData; virQEMUSaveDataPtr data = NULL; - qemuDomainSaveCookiePtr cookie = NULL; + g_autoptr(qemuDomainSaveCookie) cookie = NULL; if (!qemuMigrationSrcIsAllowed(driver, vm, false, 0)) goto cleanup; @@ -3399,7 +3399,6 @@ qemuDomainSaveInternal(virQEMUDriverPtr driver, qemuDomainRemoveInactiveJob(driver, vm); cleanup: - virObjectUnref(cookie); virQEMUSaveDataFree(data); virObjectEventStateQueue(driver->domainEventState, event); return ret; @@ -6808,7 +6807,7 @@ qemuDomainSaveImageStartVM(virConnectPtr conn, g_autofree char *errbuf = NULL; g_autoptr(virQEMUDriverConfig) cfg = virQEMUDriverGetConfig(driver); virQEMUSaveHeaderPtr header = &data->header; - qemuDomainSaveCookiePtr cookie = NULL; + g_autoptr(qemuDomainSaveCookie) cookie = NULL; if (virSaveCookieParseString(data->cookie, (virObjectPtr *)&cookie, virDomainXMLOptionGetSaveCookie(driver->xmlopt)) < 0) @@ -6919,7 +6918,6 @@ qemuDomainSaveImageStartVM(virConnectPtr conn, ret = 0; cleanup: - virObjectUnref(cookie); if (qemuSecurityRestoreSavedStateLabel(driver, vm, path) < 0) VIR_WARN("failed to restore save state label on %s", path); return ret; -- 2.24.1

When resuming a domain from a save file, we read the domain XML from the file, add it onto our internal list of domains, start the qemu process, let it load the incoming migration stream and resume its vCPUs afterwards. If anything goes wrong, the domain object is removed from the list of domains and error is returned to the caller. However, the qemu process might be left behind - if resuming vCPUs fails (e.g. because qemu is unable to acquire write lock on a disk) then due to a bug the qemu process is not killed but the domain object is removed from the list. Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1718707 Signed-off-by: Michal Privoznik <mprivozn@redhat.com> --- src/qemu/qemu_driver.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index ce9b1772c1..217d873671 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -6800,7 +6800,7 @@ qemuDomainSaveImageStartVM(virConnectPtr conn, { qemuDomainObjPrivatePtr priv = vm->privateData; int ret = -1; - bool restored = false; + bool started = false; virObjectEventPtr event; VIR_AUTOCLOSE intermediatefd = -1; g_autoptr(virCommand) cmd = NULL; @@ -6808,6 +6808,7 @@ qemuDomainSaveImageStartVM(virConnectPtr conn, g_autoptr(virQEMUDriverConfig) cfg = virQEMUDriverGetConfig(driver); virQEMUSaveHeaderPtr header = &data->header; g_autoptr(qemuDomainSaveCookie) cookie = NULL; + int rc = 0; if (virSaveCookieParseString(data->cookie, (virObjectPtr *)&cookie, virDomainXMLOptionGetSaveCookie(driver->xmlopt)) < 0) @@ -6848,12 +6849,12 @@ qemuDomainSaveImageStartVM(virConnectPtr conn, VIR_NETDEV_VPORT_PROFILE_OP_RESTORE, VIR_QEMU_PROCESS_START_PAUSED | VIR_QEMU_PROCESS_START_GEN_VMID) == 0) - restored = true; + started = true; if (intermediatefd != -1) { virErrorPtr orig_err = NULL; - if (!restored) { + if (!started) { /* if there was an error setting up qemu, the intermediate * process will wait forever to write to stdout, so we * must manually kill it and ignore any error related to @@ -6864,21 +6865,17 @@ qemuDomainSaveImageStartVM(virConnectPtr conn, VIR_FORCE_CLOSE(*fd); } - if (virCommandWait(cmd, NULL) < 0) { - qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED, asyncJob, 0); - restored = false; - } + rc = virCommandWait(cmd, NULL); VIR_DEBUG("Decompression binary stderr: %s", NULLSTR(errbuf)); - virErrorRestore(&orig_err); } if (VIR_CLOSE(*fd) < 0) { virReportSystemError(errno, _("cannot close file: %s"), path); - restored = false; + rc = -1; } - virDomainAuditStart(vm, "restored", restored); - if (!restored) + virDomainAuditStart(vm, "restored", started); + if (!started || rc < 0) goto cleanup; /* qemuProcessStart doesn't unset the qemu error reporting infrastructure @@ -6918,6 +6915,10 @@ qemuDomainSaveImageStartVM(virConnectPtr conn, ret = 0; cleanup: + if (ret < 0 && started) { + qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED, + asyncJob, VIR_QEMU_PROCESS_STOP_MIGRATED); + } if (qemuSecurityRestoreSavedStateLabel(driver, vm, path) < 0) VIR_WARN("failed to restore save state label on %s", path); return ret; -- 2.24.1

On 1/13/20 7:58 AM, Michal Privoznik wrote:
https://bugzilla.redhat.com/show_bug.cgi?id=1718707
When restoring a domain we might leave a qemu process behind.
The problem is described in 4/4 in more detail, but the digest is: we start qemu process successfully but then fail to restore its vCPUs and leave qemu process running and forget about. I thought of two possible ways to fix this:
1) Kill qemu process and claim error.
2) Don't kill qemu process and don't remove it from our internal list and claim error.
I lean towards 1) because I find it clearer. If I were a mgmt app and call 'virsh restore /path/to/domain.save' then I'd much rather see either success or an error without any need for cleanup (which would be implied if we went with number 2).
Michal Prívozník (4): qemuDomainSaveImageStartVM: Use VIR_AUTOCLOSE for @intermediatefd qemuDomainSaveImageStartVM: Use g_autoptr() for virCommand qemu: Use g_autoptr() for qemuDomainSaveCookie qemu: Stop domain on failed restore
All patches: Reviewed-by: Daniel Henrique Barboza <danielhb413@gmail.com>
src/qemu/qemu_domain.c | 28 ++++++++++------------------ src/qemu/qemu_domain.h | 1 + src/qemu/qemu_driver.c | 37 +++++++++++++++++-------------------- 3 files changed, 28 insertions(+), 38 deletions(-)
participants (2)
-
Daniel Henrique Barboza
-
Michal Privoznik