[libvirt] [PATCH 0/2] Don't always fail migrations on I/O errors

Recently I've added code that aborts migration in case of I/O error. This may not be desirable as qemu does actually support such migration. This series adds a flag that enables this option that will be now disabled by default. Peter Krempa (2): migration: Make erroring out on I/O error controllable by flag migration: Don't propagate VIR_MIGRATE_ABORT_ON_ERROR include/libvirt/libvirt.h.in | 1 + src/libvirt.c | 17 +++++++++++------ src/qemu/qemu_driver.c | 4 ++-- src/qemu/qemu_migration.c | 39 ++++++++++++++++++++++++--------------- src/qemu/qemu_migration.h | 6 ++++-- tools/virsh-domain.c | 7 +++++++ tools/virsh.pod | 6 ++++-- 7 files changed, 53 insertions(+), 27 deletions(-) -- 1.8.2.1

Paolo Bonzini pointed out that it's actually possible to migrate a qemu instance that was paused due to I/O error and it will be able to work on the destination if the storage is accessible. This patch introduces flag VIR_MIGRATE_ABORT_ON_ERROR that cancels the migration in case an I/O error happens while it's being performed and allows migration without this flag. This flag can be possibly used for other error reasons that may be introduced in the future. --- include/libvirt/libvirt.h.in | 1 + src/qemu/qemu_driver.c | 4 ++-- src/qemu/qemu_migration.c | 23 ++++++++++++++--------- src/qemu/qemu_migration.h | 6 ++++-- tools/virsh-domain.c | 7 +++++++ tools/virsh.pod | 6 ++++-- 6 files changed, 32 insertions(+), 15 deletions(-) diff --git a/include/libvirt/libvirt.h.in b/include/libvirt/libvirt.h.in index 574b970..bd75e1d 100644 --- a/include/libvirt/libvirt.h.in +++ b/include/libvirt/libvirt.h.in @@ -1188,6 +1188,7 @@ typedef enum { VIR_MIGRATE_UNSAFE = (1 << 9), /* force migration even if it is considered unsafe */ VIR_MIGRATE_OFFLINE = (1 << 10), /* offline migrate */ VIR_MIGRATE_COMPRESSED = (1 << 11), /* compress data during migration */ + VIR_MIGRATE_ABORT_ON_ERROR = (1 << 12), /* abort migration on I/O errors happened during migration */ } virDomainMigrateFlags; /* Domain migration. */ diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index c886378..4b23bc3 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -2819,7 +2819,7 @@ qemuDomainSaveInternal(virQEMUDriverPtr driver, virDomainPtr dom, if (!(caps = virQEMUDriverGetCapabilities(driver, false))) goto cleanup; - if (!qemuMigrationIsAllowed(driver, vm, vm->def, false)) + if (!qemuMigrationIsAllowed(driver, vm, vm->def, false, false)) goto cleanup; if (qemuDomainObjBeginAsyncJob(driver, vm, @@ -11663,7 +11663,7 @@ qemuDomainSnapshotCreateActiveExternal(virConnectPtr conn, /* do the memory snapshot if necessary */ if (memory) { /* check if migration is possible */ - if (!qemuMigrationIsAllowed(driver, vm, vm->def, false)) + if (!qemuMigrationIsAllowed(driver, vm, vm->def, false, false)) goto endjob; /* allow the migration job to be cancelled or the domain to be paused */ diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 48e0d44..281467b 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -1420,7 +1420,7 @@ cleanup: * the fact that older servers did not do checks on the source. */ bool qemuMigrationIsAllowed(virQEMUDriverPtr driver, virDomainObjPtr vm, - virDomainDefPtr def, bool remote) + virDomainDefPtr def, bool remote, bool abort_on_error) { int nsnapshots; int pauseReason; @@ -1448,7 +1448,8 @@ qemuMigrationIsAllowed(virQEMUDriverPtr driver, virDomainObjPtr vm, } /* cancel migration if disk I/O error is emitted while migrating */ - if (virDomainObjGetState(vm, &pauseReason) == VIR_DOMAIN_PAUSED && + if (abort_on_error && + virDomainObjGetState(vm, &pauseReason) == VIR_DOMAIN_PAUSED && pauseReason == VIR_DOMAIN_PAUSED_IOERROR) { virReportError(VIR_ERR_OPERATION_INVALID, "%s", _("cannot migrate domain with I/O error")); @@ -1692,7 +1693,7 @@ qemuMigrationUpdateJobStatus(virQEMUDriverPtr driver, static int qemuMigrationWaitForCompletion(virQEMUDriverPtr driver, virDomainObjPtr vm, enum qemuDomainAsyncJob asyncJob, - virConnectPtr dconn) + virConnectPtr dconn, bool abort_on_error) { qemuDomainObjPrivatePtr priv = vm->privateData; const char *job; @@ -1719,7 +1720,7 @@ qemuMigrationWaitForCompletion(virQEMUDriverPtr driver, virDomainObjPtr vm, struct timespec ts = { .tv_sec = 0, .tv_nsec = 50 * 1000 * 1000ull }; /* cancel migration if disk I/O error is emitted while migrating */ - if (priv->job.asyncJob == QEMU_ASYNC_JOB_MIGRATION_OUT && + if (abort_on_error && virDomainObjGetState(vm, &pauseReason) == VIR_DOMAIN_PAUSED && pauseReason == VIR_DOMAIN_PAUSED_IOERROR) goto cancel; @@ -1920,6 +1921,7 @@ char *qemuMigrationBegin(virQEMUDriverPtr driver, qemuDomainObjPrivatePtr priv = vm->privateData; virCapsPtr caps = NULL; unsigned int cookieFlags = QEMU_MIGRATION_COOKIE_LOCKSTATE; + bool abort_on_error = !!(flags & VIR_MIGRATE_ABORT_ON_ERROR); VIR_DEBUG("driver=%p, vm=%p, xmlin=%s, dname=%s," " cookieout=%p, cookieoutlen=%p, flags=%lx", @@ -1936,7 +1938,7 @@ char *qemuMigrationBegin(virQEMUDriverPtr driver, if (priv->job.asyncJob == QEMU_ASYNC_JOB_MIGRATION_OUT) qemuMigrationJobSetPhase(driver, vm, QEMU_MIGRATION_PHASE_BEGIN3); - if (!qemuMigrationIsAllowed(driver, vm, NULL, true)) + if (!qemuMigrationIsAllowed(driver, vm, NULL, true, abort_on_error)) goto cleanup; if (!(flags & VIR_MIGRATE_UNSAFE) && !qemuMigrationIsSafe(vm->def)) @@ -2052,6 +2054,7 @@ qemuMigrationPrepareAny(virQEMUDriverPtr driver, virCapsPtr caps = NULL; const char *listenAddr = NULL; char *migrateFrom = NULL; + bool abort_on_error = !!(flags & VIR_MIGRATE_ABORT_ON_ERROR); if (virTimeMillisNow(&now) < 0) return -1; @@ -2081,7 +2084,7 @@ qemuMigrationPrepareAny(virQEMUDriverPtr driver, if (!(caps = virQEMUDriverGetCapabilities(driver, false))) goto cleanup; - if (!qemuMigrationIsAllowed(driver, NULL, *def, true)) + if (!qemuMigrationIsAllowed(driver, NULL, *def, true, abort_on_error)) goto cleanup; /* Let migration hook filter domain XML */ @@ -2777,6 +2780,7 @@ qemuMigrationRun(virQEMUDriverPtr driver, unsigned long migrate_speed = resource ? resource : priv->migMaxBandwidth; virErrorPtr orig_err = NULL; unsigned int cookieFlags = 0; + bool abort_on_error = !!(flags & VIR_MIGRATE_ABORT_ON_ERROR); VIR_DEBUG("driver=%p, vm=%p, cookiein=%s, cookieinlen=%d, " "cookieout=%p, cookieoutlen=%p, flags=%lx, resource=%lu, " @@ -2929,7 +2933,7 @@ qemuMigrationRun(virQEMUDriverPtr driver, if (qemuMigrationWaitForCompletion(driver, vm, QEMU_ASYNC_JOB_MIGRATION_OUT, - dconn) < 0) + dconn, abort_on_error) < 0) goto cleanup; /* When migration completed, QEMU will have paused the @@ -3610,6 +3614,7 @@ qemuMigrationPerformJob(virQEMUDriverPtr driver, int resume = 0; virErrorPtr orig_err = NULL; virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver); + bool abort_on_error = !!(flags & VIR_MIGRATE_ABORT_ON_ERROR); if (qemuMigrationJobStart(driver, vm, QEMU_ASYNC_JOB_MIGRATION_OUT) < 0) goto cleanup; @@ -3620,7 +3625,7 @@ qemuMigrationPerformJob(virQEMUDriverPtr driver, goto endjob; } - if (!qemuMigrationIsAllowed(driver, vm, NULL, true)) + if (!qemuMigrationIsAllowed(driver, vm, NULL, true, abort_on_error)) goto endjob; if (!(flags & VIR_MIGRATE_UNSAFE) && !qemuMigrationIsSafe(vm->def)) @@ -4316,7 +4321,7 @@ qemuMigrationToFile(virQEMUDriverPtr driver, virDomainObjPtr vm, if (rc < 0) goto cleanup; - rc = qemuMigrationWaitForCompletion(driver, vm, asyncJob, NULL); + rc = qemuMigrationWaitForCompletion(driver, vm, asyncJob, NULL, false); if (rc < 0) goto cleanup; diff --git a/src/qemu/qemu_migration.h b/src/qemu/qemu_migration.h index 22b04b4..5b21ca2 100644 --- a/src/qemu/qemu_migration.h +++ b/src/qemu/qemu_migration.h @@ -38,7 +38,8 @@ VIR_MIGRATE_CHANGE_PROTECTION | \ VIR_MIGRATE_UNSAFE | \ VIR_MIGRATE_OFFLINE | \ - VIR_MIGRATE_COMPRESSED) + VIR_MIGRATE_COMPRESSED | \ + VIR_MIGRATE_ABORT_ON_ERROR) enum qemuMigrationJobPhase { QEMU_MIGRATION_PHASE_NONE = 0, @@ -147,7 +148,8 @@ int qemuMigrationConfirm(virQEMUDriverPtr driver, int retcode); bool qemuMigrationIsAllowed(virQEMUDriverPtr driver, virDomainObjPtr vm, - virDomainDefPtr def, bool remote); + virDomainDefPtr def, bool remote, + bool abort_on_error); int qemuMigrationToFile(virQEMUDriverPtr driver, virDomainObjPtr vm, int fd, off_t offset, const char *path, diff --git a/tools/virsh-domain.c b/tools/virsh-domain.c index 955e882..7f9b9e3 100644 --- a/tools/virsh-domain.c +++ b/tools/virsh-domain.c @@ -8306,6 +8306,10 @@ static const vshCmdOptDef opts_migrate[] = { .type = VSH_OT_BOOL, .help = N_("compress repeated pages during live migration") }, + {.name = "abort-on-error", + .type = VSH_OT_BOOL, + .help = N_("abort on soft errors during migration") + }, {.name = "domain", .type = VSH_OT_DATA, .flags = VSH_OFLAG_REQ, @@ -8399,6 +8403,9 @@ doMigrate(void *opaque) flags |= VIR_MIGRATE_OFFLINE; } + if (vshCommandOptBool(cmd, "abort-on-error")) + flags |= VIR_MIGRATE_ABORT_ON_ERROR; + if (xmlfile && virFileReadAll(xmlfile, 8192, &xml) < 0) { vshError(ctl, _("file '%s' doesn't exist"), xmlfile); diff --git a/tools/virsh.pod b/tools/virsh.pod index 405b4d2..21367d4 100644 --- a/tools/virsh.pod +++ b/tools/virsh.pod @@ -1043,7 +1043,8 @@ stats. =item B<migrate> [I<--live>] [I<--offline>] [I<--direct>] [I<--p2p> [I<--tunnelled>]] [I<--persistent>] [I<--undefinesource>] [I<--suspend>] [I<--copy-storage-all>] [I<--copy-storage-inc>] [I<--change-protection>] [I<--unsafe>] [I<--verbose>] -[I<--compressed>] I<domain> I<desturi> [I<migrateuri>] [I<dname>] +[I<--compressed>] [I<--abort-on-error>] +I<domain> I<desturi> [I<migrateuri>] [I<dname>] [I<--timeout> B<seconds>] [I<--xml> B<file>] Migrate domain to another host. Add I<--live> for live migration; <--p2p> @@ -1066,7 +1067,8 @@ is implicitly enabled when supported by the hypervisor, but can be explicitly used to reject the migration if the hypervisor lacks change protection support. I<--verbose> displays the progress of migration. I<--compressed> activates compression of memory pages that have to be transferred repeatedly -during live migration. +during live migration. I<--abort-on-error> cancels the migration if a soft +error (for example I/O error) happens during the migration. B<Note>: Individual hypervisors usually do not support all possible types of migration. For example, QEMU does not support direct migration. -- 1.8.2.1

This flag is meant for errors happening on the source of the migration and isn't used on the destination. To allow better migration compatibility, don't propagate it to the destination. --- src/libvirt.c | 17 +++++++++++------ src/qemu/qemu_migration.c | 16 ++++++++++------ 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/src/libvirt.c b/src/libvirt.c index 620dbdd..57e1027 100644 --- a/src/libvirt.c +++ b/src/libvirt.c @@ -4554,6 +4554,8 @@ virDomainMigrateVersion1(virDomainPtr domain, char *cookie = NULL; int cookielen = 0, ret; virDomainInfo info; + unsigned int destflags = flags & ~VIR_MIGRATE_ABORT_ON_ERROR; + VIR_DOMAIN_DEBUG(domain, "dconn=%p, flags=%lx, dname=%s, uri=%s, bandwidth=%lu", dconn, flags, NULLSTR(dname), NULLSTR(uri), bandwidth); @@ -4575,7 +4577,7 @@ virDomainMigrateVersion1(virDomainPtr domain, * the URI, it should leave uri_out as NULL. */ if (dconn->driver->domainMigratePrepare - (dconn, &cookie, &cookielen, uri, &uri_out, flags, dname, + (dconn, &cookie, &cookielen, uri, &uri_out, destflags, dname, bandwidth) == -1) goto done; @@ -4602,7 +4604,7 @@ virDomainMigrateVersion1(virDomainPtr domain, dname = dname ? dname : domain->name; if (dconn->driver->domainMigrateFinish) ddomain = dconn->driver->domainMigrateFinish - (dconn, dname, cookie, cookielen, uri, flags); + (dconn, dname, cookie, cookielen, uri, destflags); else ddomain = virDomainLookupByName(dconn, dname); @@ -4648,6 +4650,8 @@ virDomainMigrateVersion2(virDomainPtr domain, virErrorPtr orig_err = NULL; unsigned int getxml_flags = 0; int cancelled; + unsigned int destflags = flags & ~VIR_MIGRATE_ABORT_ON_ERROR; + VIR_DOMAIN_DEBUG(domain, "dconn=%p, flags=%lx, dname=%s, uri=%s, bandwidth=%lu", dconn, flags, NULLSTR(dname), NULLSTR(uri), bandwidth); @@ -4692,7 +4696,7 @@ virDomainMigrateVersion2(virDomainPtr domain, VIR_DEBUG("Prepare2 %p flags=%lx", dconn, flags); ret = dconn->driver->domainMigratePrepare2 - (dconn, &cookie, &cookielen, uri, &uri_out, flags, dname, + (dconn, &cookie, &cookielen, uri, &uri_out, destflags, dname, bandwidth, dom_xml); VIR_FREE(dom_xml); if (ret == -1) @@ -4732,7 +4736,7 @@ finish: dname = dname ? dname : domain->name; VIR_DEBUG("Finish2 %p ret=%d", dconn, ret); ddomain = dconn->driver->domainMigrateFinish2 - (dconn, dname, cookie, cookielen, uri, flags, cancelled); + (dconn, dname, cookie, cookielen, uri, destflags, cancelled); done: if (orig_err) { @@ -4791,6 +4795,7 @@ virDomainMigrateVersion3(virDomainPtr domain, int cancelled = 1; unsigned long protection = 0; bool notify_source = true; + unsigned int destflags = flags & ~VIR_MIGRATE_ABORT_ON_ERROR; VIR_DOMAIN_DEBUG(domain, "dconn=%p xmlin=%s, flags=%lx, " "dname=%s, uri=%s, bandwidth=%lu", @@ -4830,7 +4835,7 @@ virDomainMigrateVersion3(virDomainPtr domain, cookieoutlen = 0; ret = dconn->driver->domainMigratePrepare3 (dconn, cookiein, cookieinlen, &cookieout, &cookieoutlen, - uri, &uri_out, flags, dname, bandwidth, dom_xml); + uri, &uri_out, destflags, dname, bandwidth, dom_xml); VIR_FREE(dom_xml); if (ret == -1) { if (protection) { @@ -4908,7 +4913,7 @@ finish: dname = dname ? dname : domain->name; ddomain = dconn->driver->domainMigrateFinish3 (dconn, dname, cookiein, cookieinlen, &cookieout, &cookieoutlen, - NULL, uri, flags, cancelled); + NULL, uri, destflags, cancelled); /* If ddomain is NULL, then we were unable to start * the guest on the target, and must restart on the diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 281467b..16c8aac 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -3165,6 +3165,8 @@ static int doPeer2PeerMigrate2(virQEMUDriverPtr driver, virErrorPtr orig_err = NULL; bool cancelled; virStreamPtr st = NULL; + unsigned int destflags = flags & ~VIR_MIGRATE_ABORT_ON_ERROR; + VIR_DEBUG("driver=%p, sconn=%p, dconn=%p, vm=%p, dconnuri=%s, " "flags=%lx, dname=%s, resource=%lu", driver, sconn, dconn, vm, NULLSTR(dconnuri), @@ -3194,13 +3196,13 @@ static int doPeer2PeerMigrate2(virQEMUDriverPtr driver, qemuDomainObjEnterRemote(vm); ret = dconn->driver->domainMigratePrepareTunnel - (dconn, st, flags, dname, resource, dom_xml); + (dconn, st, destflags, dname, resource, dom_xml); qemuDomainObjExitRemote(vm); } else { qemuDomainObjEnterRemote(vm); ret = dconn->driver->domainMigratePrepare2 (dconn, &cookie, &cookielen, NULL, &uri_out, - flags, dname, resource, dom_xml); + destflags, dname, resource, dom_xml); qemuDomainObjExitRemote(vm); } VIR_FREE(dom_xml); @@ -3258,7 +3260,7 @@ finish: qemuDomainObjEnterRemote(vm); ddomain = dconn->driver->domainMigrateFinish2 (dconn, dname, cookie, cookielen, - uri_out ? uri_out : dconnuri, flags, cancelled); + uri_out ? uri_out : dconnuri, destflags, cancelled); qemuDomainObjExitRemote(vm); cleanup: @@ -3308,6 +3310,8 @@ static int doPeer2PeerMigrate3(virQEMUDriverPtr driver, virErrorPtr orig_err = NULL; bool cancelled; virStreamPtr st = NULL; + unsigned int destflags = flags & ~VIR_MIGRATE_ABORT_ON_ERROR; + VIR_DEBUG("driver=%p, sconn=%p, dconn=%p, vm=%p, xmlin=%s, " "dconnuri=%s, uri=%s, flags=%lx, dname=%s, resource=%lu", driver, sconn, dconn, vm, NULLSTR(xmlin), @@ -3340,13 +3344,13 @@ static int doPeer2PeerMigrate3(virQEMUDriverPtr driver, ret = dconn->driver->domainMigratePrepareTunnel3 (dconn, st, cookiein, cookieinlen, &cookieout, &cookieoutlen, - flags, dname, resource, dom_xml); + destflags, dname, resource, dom_xml); qemuDomainObjExitRemote(vm); } else { qemuDomainObjEnterRemote(vm); ret = dconn->driver->domainMigratePrepare3 (dconn, cookiein, cookieinlen, &cookieout, &cookieoutlen, - uri, &uri_out, flags, dname, resource, dom_xml); + uri, &uri_out, destflags, dname, resource, dom_xml); qemuDomainObjExitRemote(vm); } VIR_FREE(dom_xml); @@ -3422,7 +3426,7 @@ finish: qemuDomainObjEnterRemote(vm); ddomain = dconn->driver->domainMigrateFinish3 (dconn, dname, cookiein, cookieinlen, &cookieout, &cookieoutlen, - dconnuri, uri_out ? uri_out : uri, flags, cancelled); + dconnuri, uri_out ? uri_out : uri, destflags, cancelled); qemuDomainObjExitRemote(vm); /* If ddomain is NULL, then we were unable to start -- 1.8.2.1

On 12.06.2013 16:11, Peter Krempa wrote:
Recently I've added code that aborts migration in case of I/O error. This may not be desirable as qemu does actually support such migration. This series adds a flag that enables this option that will be now disabled by default.
Peter Krempa (2): migration: Make erroring out on I/O error controllable by flag migration: Don't propagate VIR_MIGRATE_ABORT_ON_ERROR
include/libvirt/libvirt.h.in | 1 + src/libvirt.c | 17 +++++++++++------ src/qemu/qemu_driver.c | 4 ++-- src/qemu/qemu_migration.c | 39 ++++++++++++++++++++++++--------------- src/qemu/qemu_migration.h | 6 ++++-- tools/virsh-domain.c | 7 +++++++ tools/virsh.pod | 6 ++++-- 7 files changed, 53 insertions(+), 27 deletions(-)
ACK series. Michal

On Fri, Jun 14, 2013 at 12:41:40 +0200, Michal Privoznik wrote:
On 12.06.2013 16:11, Peter Krempa wrote:
Recently I've added code that aborts migration in case of I/O error. This may not be desirable as qemu does actually support such migration. This series adds a flag that enables this option that will be now disabled by default.
Peter Krempa (2): migration: Make erroring out on I/O error controllable by flag migration: Don't propagate VIR_MIGRATE_ABORT_ON_ERROR
include/libvirt/libvirt.h.in | 1 + src/libvirt.c | 17 +++++++++++------ src/qemu/qemu_driver.c | 4 ++-- src/qemu/qemu_migration.c | 39 ++++++++++++++++++++++++--------------- src/qemu/qemu_migration.h | 6 ++++-- tools/virsh-domain.c | 7 +++++++ tools/virsh.pod | 6 ++++-- 7 files changed, 53 insertions(+), 27 deletions(-)
ACK series.
I pushed this series. Jirka
participants (3)
-
Jiri Denemark
-
Michal Privoznik
-
Peter Krempa