[libvirt] [PATCH 0/3] qemu: Ignore temporary job errors when checking migration status

https://bugzilla.redhat.com/show_bug.cgi?id=1083238

Jiri Denemark (3):
  qemu: Make qemuDomainObjBeginNestedJob static
  qemuDomainObjBeginNestedJob: Return -2 for temporary failures
  qemu: Ignore temporary job errors when checking migration status

 src/qemu/qemu_domain.c    | 41 +++++++++++++++++++++++++++--------------
 src/qemu/qemu_domain.h    |  4 ----
 src/qemu/qemu_migration.c |  7 ++++---
 3 files changed, 31 insertions(+), 21 deletions(-)

--
1.9.3

It's only used within qemu_domain.c.

Signed-off-by: Jiri Denemark <jdenemar@redhat.com>
---
 src/qemu/qemu_domain.c | 2 +-
 src/qemu/qemu_domain.h | 4 ----
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index 34cc736..d4fb569 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -1176,7 +1176,7 @@ int qemuDomainObjBeginAsyncJob(virQEMUDriverPtr driver,
                                          asyncJob);
 }
 
-int
+static int ATTRIBUTE_RETURN_CHECK
 qemuDomainObjBeginNestedJob(virQEMUDriverPtr driver,
                             virDomainObjPtr obj,
                             enum qemuDomainAsyncJob asyncJob)
diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h
index ab27f15..9ae6995 100644
--- a/src/qemu/qemu_domain.h
+++ b/src/qemu/qemu_domain.h
@@ -211,10 +211,6 @@ int qemuDomainObjBeginAsyncJob(virQEMUDriverPtr driver,
                                virDomainObjPtr obj,
                                enum qemuDomainAsyncJob asyncJob)
     ATTRIBUTE_RETURN_CHECK;
-int qemuDomainObjBeginNestedJob(virQEMUDriverPtr driver,
-                                virDomainObjPtr obj,
-                                enum qemuDomainAsyncJob asyncJob)
-    ATTRIBUTE_RETURN_CHECK;
 bool qemuDomainObjEndJob(virQEMUDriverPtr driver,
                          virDomainObjPtr obj)
--
1.9.3
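For context, ATTRIBUTE_RETURN_CHECK is libvirt's wrapper around GCC's warn_unused_result function attribute, so moving it onto the static definition keeps the compiler warning about ignored return values even after the prototype leaves the header. A minimal standalone sketch of the same pattern; the macro definition and the function below are illustrative stand-ins, not libvirt code:

#include <stdio.h>

/* Illustrative equivalent of libvirt's ATTRIBUTE_RETURN_CHECK on
 * GCC/clang: the compiler warns when a caller discards the result. */
#define ATTRIBUTE_RETURN_CHECK __attribute__((__warn_unused_result__))

/* 'static' keeps the helper out of any public header; the attribute
 * still forces callers within this file to check the return value. */
static int ATTRIBUTE_RETURN_CHECK
beginNestedJob(void)
{
    return -1; /* pretend the job could not be started */
}

int main(void)
{
    if (beginNestedJob() < 0)  /* value is checked: no warning */
        fprintf(stderr, "failed to begin nested job\n");
    /* beginNestedJob();          would trigger -Wunused-result */
    return 0;
}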

On 05/13/14 15:44, Jiri Denemark wrote:
It's only used within qemu_domain.c.
Signed-off-by: Jiri Denemark <jdenemar@redhat.com>
---
 src/qemu/qemu_domain.c | 2 +-
 src/qemu/qemu_domain.h | 4 ----
 2 files changed, 1 insertion(+), 5 deletions(-)
ACK,

Peter

If the job queue is full or waiting for a job times out, the function returns -2 so that callers can handle it differently.

The change is safe since all existing callers of qemuDomainObjBeginNestedJob check whether the return value is less than zero.

Signed-off-by: Jiri Denemark <jdenemar@redhat.com>
---
 src/qemu/qemu_domain.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index d4fb569..3df454f 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -1058,6 +1058,7 @@ qemuDomainObjBeginJobInternal(virQEMUDriverPtr driver,
     unsigned long long then;
     bool nested = job == QEMU_JOB_ASYNC_NESTED;
     virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);
+    int ret;
 
     VIR_DEBUG("Starting %s: %s (async=%s vm=%p name=%s)",
               job == QEMU_JOB_ASYNC ? "async job" : "job",
@@ -1134,21 +1135,25 @@ qemuDomainObjBeginJobInternal(virQEMUDriverPtr driver,
                   qemuDomainAsyncJobTypeToString(priv->job.asyncJob),
                   priv->job.owner, priv->job.asyncOwner);
 
-    if (errno == ETIMEDOUT)
+    ret = -1;
+    if (errno == ETIMEDOUT) {
         virReportError(VIR_ERR_OPERATION_TIMEOUT,
                        "%s", _("cannot acquire state change lock"));
-    else if (cfg->maxQueuedJobs &&
-             priv->jobs_queued > cfg->maxQueuedJobs)
+        ret = -2;
+    } else if (cfg->maxQueuedJobs &&
+               priv->jobs_queued > cfg->maxQueuedJobs) {
         virReportError(VIR_ERR_OPERATION_FAILED,
                        "%s", _("cannot acquire state change lock "
                                "due to max_queued limit"));
-    else
+        ret = -2;
+    } else {
         virReportSystemError(errno,
                              "%s", _("cannot acquire job mutex"));
+    }
     priv->jobs_queued--;
     virObjectUnref(obj);
     virObjectUnref(cfg);
-    return -1;
+    return ret;
 }
 
 /*
@@ -1164,16 +1169,22 @@ int qemuDomainObjBeginJob(virQEMUDriverPtr driver,
                           virDomainObjPtr obj,
                           enum qemuDomainJob job)
 {
-    return qemuDomainObjBeginJobInternal(driver, obj, job,
-                                         QEMU_ASYNC_JOB_NONE);
+    if (qemuDomainObjBeginJobInternal(driver, obj, job,
+                                      QEMU_ASYNC_JOB_NONE) < 0)
+        return -1;
+    else
+        return 0;
 }
 
 int qemuDomainObjBeginAsyncJob(virQEMUDriverPtr driver,
                                virDomainObjPtr obj,
                                enum qemuDomainAsyncJob asyncJob)
 {
-    return qemuDomainObjBeginJobInternal(driver, obj, QEMU_JOB_ASYNC,
-                                         asyncJob);
+    if (qemuDomainObjBeginJobInternal(driver, obj, QEMU_JOB_ASYNC,
+                                      asyncJob) < 0)
+        return -1;
+    else
+        return 0;
 }
 
 static int ATTRIBUTE_RETURN_CHECK
--
1.9.3
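The contract after this patch is: 0 when the job was started, -1 on hard failures, and -2 when the failure is temporary (lock timeout or the max_queued limit) and a later retry may succeed. A minimal sketch of a caller honoring that contract; beginNestedJob below is a hypothetical stub standing in for qemuDomainObjBeginNestedJob, not the real function:

#include <stdio.h>
#include <unistd.h>

/* Hypothetical stub for qemuDomainObjBeginNestedJob: returns 0 on
 * success, -1 on fatal errors, -2 on temporary failures. */
static int
beginNestedJob(void)
{
    static int calls;
    return (++calls < 3) ? -2 : 0; /* fail temporarily twice, then succeed */
}

int main(void)
{
    for (;;) {
        int rc = beginNestedJob();
        if (rc == 0) {
            printf("job started\n");
            return 0;
        }
        if (rc == -1) {
            fprintf(stderr, "fatal error, giving up\n");
            return 1;
        }
        /* rc == -2: temporary failure, back off and retry */
        usleep(50 * 1000);
    }
}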

On 05/13/14 15:44, Jiri Denemark wrote:
If the job queue is full or waiting for a job times out, the function returns -2 so that callers can handle it differently.
The change is safe since all existing callers of
There's just one :)
qemuDomainObjBeginNestedJob check whether the return value is less than zero.
Signed-off-by: Jiri Denemark <jdenemar@redhat.com>
---
 src/qemu/qemu_domain.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)
ACK,

Peter

When the qemu driver is polling for migration to finish (in qemuMigrationWaitForCompletion), it may happen that another job allowed during migration is running, and if that job does not finish within 30 seconds, the migration would be cancelled. However, we can just ignore the timeout and let the waiting loop try again later.

If an event fired at the end of migration is ever implemented in QEMU, we can simply wait for the event instead of polling for migration status, and libvirt will behave consistently, i.e., migration won't be cancelled in case another job started during migration takes a long time to finish.

For bug https://bugzilla.redhat.com/show_bug.cgi?id=1083238

Signed-off-by: Jiri Denemark <jdenemar@redhat.com>
---
 src/qemu/qemu_domain.c    | 10 ++++++----
 src/qemu/qemu_migration.c |  7 ++++---
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index 3df454f..2d8afd6 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -1287,8 +1287,9 @@ qemuDomainObjEnterMonitorInternal(virQEMUDriverPtr driver,
     qemuDomainObjPrivatePtr priv = obj->privateData;
 
     if (asyncJob != QEMU_ASYNC_JOB_NONE) {
-        if (qemuDomainObjBeginNestedJob(driver, obj, asyncJob) < 0)
-            return -1;
+        int ret;
+        if ((ret = qemuDomainObjBeginNestedJob(driver, obj, asyncJob)) < 0)
+            return ret;
         if (!virDomainObjIsActive(obj)) {
             virReportError(VIR_ERR_OPERATION_FAILED, "%s",
                            _("domain is no longer running"));
@@ -1367,8 +1368,9 @@ void qemuDomainObjExitMonitor(virQEMUDriverPtr driver,
  * with the same asyncJob.
 *
 * Returns 0 if job was started, in which case this must be followed with
- * qemuDomainObjExitMonitor(); or -1 if the job could not be
- * started (probably because the vm exited in the meantime).
+ * qemuDomainObjExitMonitor(); -2 if waiting for the nested job times out;
+ * or -1 if the job could not be started (probably because the vm exited
+ * in the meantime).
 */
 int
 qemuDomainObjEnterMonitorAsync(virQEMUDriverPtr driver,
diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c
index a9f7fea..f0df1a6 100644
--- a/src/qemu/qemu_migration.c
+++ b/src/qemu/qemu_migration.c
@@ -1714,8 +1714,9 @@ qemuMigrationUpdateJobStatus(virQEMUDriverPtr driver,
 
     ret = qemuDomainObjEnterMonitorAsync(driver, vm, asyncJob);
     if (ret < 0) {
-        /* Guest already exited; nothing further to update. */
-        return -1;
+        /* Guest already exited or waiting for the job timed out; nothing
+         * further to update. */
+        return ret;
     }
 
     ret = qemuMonitorGetMigrationStatus(priv->mon, &status);
@@ -1812,7 +1813,7 @@ qemuMigrationWaitForCompletion(virQEMUDriverPtr driver, virDomainObjPtr vm,
         /* Poll every 50ms for progress & to allow cancellation */
         struct timespec ts = { .tv_sec = 0, .tv_nsec = 50 * 1000 * 1000ull };
 
-        if (qemuMigrationUpdateJobStatus(driver, vm, job, asyncJob) < 0)
+        if (qemuMigrationUpdateJobStatus(driver, vm, job, asyncJob) == -1)
             goto cleanup;
 
         /* cancel migration if disk I/O error is emitted while migrating */
--
1.9.3
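The decisive change is the last hunk: the waiting loop now bails out only on a hard -1 from qemuMigrationUpdateJobStatus, while -2 (the nested job timed out because another job holds the lock) simply falls through to the next 50ms iteration. A condensed sketch of that control flow, with hypothetical stubs in place of the real libvirt calls:

#include <time.h>

/* Hypothetical stubs: updateJobStatus mirrors the return contract of
 * qemuMigrationUpdateJobStatus (0 = updated, -1 = guest gone,
 * -2 = nested job timed out); migrationFinished fakes completion. */
static int
updateJobStatus(void)
{
    return 0;
}

static int
migrationFinished(void)
{
    static int polls;
    return ++polls > 5; /* pretend migration completes after a few polls */
}

static int
waitForCompletion(void)
{
    /* Poll every 50ms, as qemuMigrationWaitForCompletion does */
    struct timespec ts = { .tv_sec = 0, .tv_nsec = 50 * 1000 * 1000 };

    while (!migrationFinished()) {
        if (updateJobStatus() == -1)
            return -1; /* only a hard failure cancels the migration */
        /* -2 is deliberately ignored: try again on the next iteration */
        nanosleep(&ts, NULL);
    }
    return 0;
}

int main(void)
{
    return waitForCompletion() == 0 ? 0 : 1;
}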

On 05/13/14 15:44, Jiri Denemark wrote:
When the qemu driver is polling for migration to finish (in qemuMigrationWaitForCompletion), it may happen that another job allowed during migration is running, and if that job does not finish within 30 seconds, the migration would be cancelled. However, we can just ignore the timeout and let the waiting loop try again later.
If an event fired at the end of migration is ever implemented in QEMU, we can simply wait for the event instead of polling for migration status, and libvirt will behave consistently, i.e., migration won't be cancelled in case another job started during migration takes a long time to finish.
For bug https://bugzilla.redhat.com/show_bug.cgi?id=1083238
Signed-off-by: Jiri Denemark <jdenemar@redhat.com>
---
 src/qemu/qemu_domain.c    | 10 ++++++----
 src/qemu/qemu_migration.c |  7 ++++---
 2 files changed, 10 insertions(+), 7 deletions(-)
The code in this patch looks good, but while inspecting the callers of qemuDomainObjEnterMonitorAsync I found two other places that store the value of the call and then possibly return it. I think that qemuProcessStartCPUs() and qemuProcessStopCPUs() should convert the value back to -1 so that we don't have to make sure that everything deals okay with -2.

Peter

On Wed, May 14, 2014 at 11:08:28 +0200, Peter Krempa wrote:
On 05/13/14 15:44, Jiri Denemark wrote:
When the qemu driver is polling for migration to finish (in qemuMigrationWaitForCompletion), it may happen that another job allowed during migration is running, and if that job does not finish within 30 seconds, the migration would be cancelled. However, we can just ignore the timeout and let the waiting loop try again later.
If an event fired at the end of migration is ever implemented in QEMU, we can simply wait for the event instead of polling for migration status, and libvirt will behave consistently, i.e., migration won't be cancelled in case another job started during migration takes a long time to finish.
For bug https://bugzilla.redhat.com/show_bug.cgi?id=1083238
Signed-off-by: Jiri Denemark <jdenemar@redhat.com>
---
 src/qemu/qemu_domain.c    | 10 ++++++----
 src/qemu/qemu_migration.c |  7 ++++---
 2 files changed, 10 insertions(+), 7 deletions(-)
The code in this patch looks good, but while inspecting the callers of qemuDomainObjEnterMonitorAsync I found two other places that store the value of the call and then possibly return it. I think that qemuProcessStartCPUs() and qemuProcessStopCPUs() should convert the value back to -1 so that we don't have to make sure that everything deals okay with -2.
Oops, I somehow overlooked these two instances. New patch 2.5/3 should fix them.

Jirka

On 05/14/14 13:30, Jiri Denemark wrote:
On Wed, May 14, 2014 at 11:08:28 +0200, Peter Krempa wrote:
On 05/13/14 15:44, Jiri Denemark wrote:
When the qemu driver is polling for migration to finish (in qemuMigrationWaitForCompletion), it may happen that another job allowed during migration is running, and if that job does not finish within 30 seconds, the migration would be cancelled. However, we can just ignore the timeout and let the waiting loop try again later.
If an event fired at the end of migration is ever implemented in QEMU, we can simply wait for the event instead of polling for migration status, and libvirt will behave consistently, i.e., migration won't be cancelled in case another job started during migration takes a long time to finish.
For bug https://bugzilla.redhat.com/show_bug.cgi?id=1083238
Signed-off-by: Jiri Denemark <jdenemar@redhat.com>
---
 src/qemu/qemu_domain.c    | 10 ++++++----
 src/qemu/qemu_migration.c |  7 ++++---
 2 files changed, 10 insertions(+), 7 deletions(-)
The code in this patch looks good, but while inspecting the callers of qemuDomainObjEnterMonitorAsync I found two other places that store the value of the call and then possibly return it. I think that qemuProcessStartCPUs() and qemuProcessStopCPUs() should convert the value back to -1 so that we don't have to make sure that everything deals okay with -2.
Oops, I somehow overlooked these two instances. New patch 2.5/3 should fix them.
Yep, ACK to this patch as 2.5 fixes the stuff.
Jirka
Peter

As a side effect, the return value of qemuDomainObjEnterMonitorAsync is not directly used as the return value of qemuProcess{Start,Stop}CPUs.

Signed-off-by: Jiri Denemark <jdenemar@redhat.com>
---
 src/qemu/qemu_process.c | 53 +++++++++++++++++++++++++++----------------------
 1 file changed, 29 insertions(+), 24 deletions(-)

diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index 606478e..a83780f 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -2747,23 +2747,26 @@ qemuProcessStartCPUs(virQEMUDriverPtr driver, virDomainObjPtr vm,
     }
     VIR_FREE(priv->lockState);
 
-    ret = qemuDomainObjEnterMonitorAsync(driver, vm, asyncJob);
-    if (ret == 0) {
-        ret = qemuMonitorStartCPUs(priv->mon, conn);
-        qemuDomainObjExitMonitor(driver, vm);
-    }
+    if (qemuDomainObjEnterMonitorAsync(driver, vm, asyncJob) < 0)
+        goto release;
 
-    if (ret == 0) {
-        virDomainObjSetState(vm, VIR_DOMAIN_RUNNING, reason);
-    } else {
-        if (virDomainLockProcessPause(driver->lockManager, vm, &priv->lockState) < 0)
-            VIR_WARN("Unable to release lease on %s", vm->def->name);
-        VIR_DEBUG("Preserving lock state '%s'", NULLSTR(priv->lockState));
-    }
+    ret = qemuMonitorStartCPUs(priv->mon, conn);
+    qemuDomainObjExitMonitor(driver, vm);
+
+    if (ret < 0)
+        goto release;
+
+    virDomainObjSetState(vm, VIR_DOMAIN_RUNNING, reason);
 
  cleanup:
     virObjectUnref(cfg);
     return ret;
+
+ release:
+    if (virDomainLockProcessPause(driver->lockManager, vm, &priv->lockState) < 0)
+        VIR_WARN("Unable to release lease on %s", vm->def->name);
+    VIR_DEBUG("Preserving lock state '%s'", NULLSTR(priv->lockState));
+    goto cleanup;
 }
 
 
@@ -2771,24 +2774,26 @@ int qemuProcessStopCPUs(virQEMUDriverPtr driver, virDomainObjPtr vm,
                         virDomainPausedReason reason,
                         enum qemuDomainAsyncJob asyncJob)
 {
-    int ret;
+    int ret = -1;
     qemuDomainObjPrivatePtr priv = vm->privateData;
 
     VIR_FREE(priv->lockState);
-    ret = qemuDomainObjEnterMonitorAsync(driver, vm, asyncJob);
-    if (ret == 0) {
-        ret = qemuMonitorStopCPUs(priv->mon);
-        qemuDomainObjExitMonitor(driver, vm);
-    }
+    if (qemuDomainObjEnterMonitorAsync(driver, vm, asyncJob) < 0)
+        goto cleanup;
 
-    if (ret == 0) {
-        virDomainObjSetState(vm, VIR_DOMAIN_PAUSED, reason);
-        if (virDomainLockProcessPause(driver->lockManager, vm, &priv->lockState) < 0)
-            VIR_WARN("Unable to release lease on %s", vm->def->name);
-        VIR_DEBUG("Preserving lock state '%s'", NULLSTR(priv->lockState));
-    }
+    ret = qemuMonitorStopCPUs(priv->mon);
+    qemuDomainObjExitMonitor(driver, vm);
+    if (ret < 0)
+        goto cleanup;
+
+    virDomainObjSetState(vm, VIR_DOMAIN_PAUSED, reason);
+    if (virDomainLockProcessPause(driver->lockManager, vm, &priv->lockState) < 0)
+        VIR_WARN("Unable to release lease on %s", vm->def->name);
+    VIR_DEBUG("Preserving lock state '%s'", NULLSTR(priv->lockState));
+
+ cleanup:
    return ret;
 }
--
1.9.3
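The pattern here is to absorb the new -2 at the API boundary: both functions jump to an error path and return a plain -1, so the temporary-failure code never leaks to callers that only know the 0/-1 convention. A tiny sketch of that normalization, with hypothetical names rather than the real libvirt functions:

/* Hypothetical stub standing in for qemuDomainObjEnterMonitorAsync:
 * may return 0, -1, or -2. */
static int
enterMonitorAsync(void)
{
    return -2; /* simulate a temporary failure */
}

/* The public contract of this wrapper is 0 or -1 only: both -1 and -2
 * from the inner call take the error path and leave ret at -1. */
static int
startCPUs(void)
{
    int ret = -1;

    if (enterMonitorAsync() < 0)
        goto cleanup;

    /* ... the actual work would happen here ... */
    ret = 0;

 cleanup:
    return ret;
}

int main(void)
{
    return startCPUs() == -1 ? 0 : 1; /* expect the normalized -1 */
}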

On 05/14/14 13:29, Jiri Denemark wrote:
As a side effect, the return value of qemuDomainObjEnterMonitorAsync is not directly used as the return value of qemuProcess{Start,Stop}CPUs.
Signed-off-by: Jiri Denemark <jdenemar@redhat.com>
---
 src/qemu/qemu_process.c | 53 +++++++++++++++++++++++++++----------------------
 1 file changed, 29 insertions(+), 24 deletions(-)
ACK,

Peter