[libvirt] [PATCH 00/19] Rollback migration when libvirtd restarts

This series is also available at
https://gitorious.org/~jirka/libvirt/jirka-staging/commits/migration-recover...

The series does several things:
- persists current job and its phase in status xml
- allows safe monitor commands to be run during migration/save/dump jobs
- implements recovery when libvirtd is restarted while a job is active
- consolidates some code and fixes bugs I found when working in the area

The series is not perfect and still needs some corner cases to be fixed but
I think it's better to send the series for review now and add small
additional fixes in the next version(s) instead of waiting for it to be
perfect.

Jiri Denemark (19):
  qemu: Separate job related data into a new object
  qemu: Consolidate BeginJob{,WithDriver} into a single method
  qemu: Consolidate {Enter,Exit}Monitor{,WithDriver}
  qemu: Allow all query commands to be run during long jobs
  qemu: Save job type in domain status XML
  qemu: Recover from interrupted jobs
  qemu: Add support for job phase
  qemu: Consolidate qemuMigrationPrepare{Direct,Tunnel}
  qemu: Implement migration job phases
  qemu: Migration job on destination daemon
  qemu: Migration job on source daemon
  qemu: Recover from interrupted migration
  qemu: Fix monitor unlocking in some error paths
  qemu: Remove special case for virDomainGetBlockInfo
  qemu: Remove special case for virDomainBlockStats
  qemu: Remove special case for virDomainMigrateSetMaxSpeed
  qemu: Remove special case for virDomainMigrateSetMaxDowntime
  qemu: Remove special case for virDomainSuspend
  qemu: Remove special case for virDomainAbortJob

 src/libvirt.c             |   11 +-
 src/qemu/MIGRATION.txt    |   55 +++
 src/qemu/qemu_domain.c    |  575 ++++++++++++++++++++++++-------
 src/qemu/qemu_domain.h    |  130 +++++--
 src/qemu/qemu_driver.c    |  467 +++++++++++++------------
 src/qemu/qemu_hotplug.c   |   50 ++--
 src/qemu/qemu_migration.c |  849 +++++++++++++++++++++++----------------------
 src/qemu/qemu_migration.h |   36 ++
 src/qemu/qemu_process.c   |  244 ++++++++++++-
 9 files changed, 1569 insertions(+), 848 deletions(-)
 create mode 100644 src/qemu/MIGRATION.txt
-- 
1.7.6
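As a quick orientation for the "safe monitor commands during long jobs" item:
patch 4/19 below introduces a per-async-job mask of synchronous job types that
are still allowed to run while migration/save/dump is in progress. A minimal
sketch of that check, lifted from that patch (JOB_MASK and
qemuDomainJobAllowed are introduced there):

    #define JOB_MASK(job) (1 << (job - 1))

    /* A synchronous job may run either when no async job is active, or
     * when the active async job explicitly allows it via its job mask. */
    static bool
    qemuDomainJobAllowed(qemuDomainObjPrivatePtr priv, enum qemuDomainJob job)
    {
        return !priv->job.asyncJob || (priv->job.mask & JOB_MASK(job)) != 0;
    }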

--- src/qemu/qemu_domain.c | 104 +++++++++++++++++++++++++------------ src/qemu/qemu_domain.h | 25 +++++++--- src/qemu/qemu_driver.c | 82 +++++++++++++++--------------- src/qemu/qemu_migration.c | 123 ++++++++++++++++++++++----------------------- src/qemu/qemu_process.c | 5 +- 5 files changed, 192 insertions(+), 147 deletions(-) diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index 4b65d87..bbdfdc4 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -79,6 +79,42 @@ void qemuDomainEventQueue(struct qemud_driver *driver, } +static int +qemuDomainObjInitJob(qemuDomainObjPrivatePtr priv) +{ + memset(&priv->job, 0, sizeof(priv->job)); + + if (virCondInit(&priv->job.cond) < 0) + return -1; + + if (virCondInit(&priv->job.signalCond) < 0) { + ignore_value(virCondDestroy(&priv->job.cond)); + return -1; + } + + return 0; +} + +static void +qemuDomainObjResetJob(qemuDomainObjPrivatePtr priv) +{ + struct qemuDomainJobObj *job = &priv->job; + + job->active = QEMU_JOB_NONE; + job->start = 0; + memset(&job->info, 0, sizeof(job->info)); + job->signals = 0; + memset(&job->signalsData, 0, sizeof(job->signalsData)); +} + +static void +qemuDomainObjFreeJob(qemuDomainObjPrivatePtr priv) +{ + ignore_value(virCondDestroy(&priv->job.cond)); + ignore_value(virCondDestroy(&priv->job.signalCond)); +} + + static void *qemuDomainObjPrivateAlloc(void) { qemuDomainObjPrivatePtr priv; @@ -86,19 +122,10 @@ static void *qemuDomainObjPrivateAlloc(void) if (VIR_ALLOC(priv) < 0) return NULL; - if (virCondInit(&priv->jobCond) < 0) - goto initfail; - - if (virCondInit(&priv->signalCond) < 0) { - ignore_value(virCondDestroy(&priv->jobCond)); - goto initfail; - } + if (qemuDomainObjInitJob(priv) < 0) + VIR_FREE(priv); return priv; - -initfail: - VIR_FREE(priv); - return NULL; } static void qemuDomainObjPrivateFree(void *data) @@ -109,9 +136,8 @@ static void qemuDomainObjPrivateFree(void *data) qemuDomainPCIAddressSetFree(priv->pciaddrs); virDomainChrSourceDefFree(priv->monConfig); + qemuDomainObjFreeJob(priv); VIR_FREE(priv->vcpupids); - ignore_value(virCondDestroy(&priv->jobCond)); - ignore_value(virCondDestroy(&priv->signalCond)); VIR_FREE(priv->lockState); /* This should never be non-NULL if we get here, but just in case... 
*/ @@ -473,6 +499,24 @@ void qemuDomainSetNamespaceHooks(virCapsPtr caps) caps->ns.href = qemuDomainDefNamespaceHref; } +void +qemuDomainObjSetJob(virDomainObjPtr obj, + enum qemuDomainJob job) +{ + qemuDomainObjPrivatePtr priv = obj->privateData; + + priv->job.active = job; +} + +void +qemuDomainObjDiscardJob(virDomainObjPtr obj) +{ + qemuDomainObjPrivatePtr priv = obj->privateData; + + qemuDomainObjResetJob(priv); + qemuDomainObjSetJob(obj, QEMU_JOB_NONE); +} + /* * obj must be locked before calling, qemud_driver must NOT be locked * @@ -498,8 +542,8 @@ int qemuDomainObjBeginJob(virDomainObjPtr obj) virDomainObjRef(obj); - while (priv->jobActive) { - if (virCondWaitUntil(&priv->jobCond, &obj->lock, then) < 0) { + while (priv->job.active) { + if (virCondWaitUntil(&priv->job.cond, &obj->lock, then) < 0) { /* Safe to ignore value since ref count was incremented above */ ignore_value(virDomainObjUnref(obj)); if (errno == ETIMEDOUT) @@ -511,11 +555,9 @@ int qemuDomainObjBeginJob(virDomainObjPtr obj) return -1; } } - priv->jobActive = QEMU_JOB_UNSPECIFIED; - priv->jobSignals = 0; - memset(&priv->jobSignalsData, 0, sizeof(priv->jobSignalsData)); - priv->jobStart = now; - memset(&priv->jobInfo, 0, sizeof(priv->jobInfo)); + qemuDomainObjResetJob(priv); + qemuDomainObjSetJob(obj, QEMU_JOB_UNSPECIFIED); + priv->job.start = now; return 0; } @@ -540,8 +582,8 @@ int qemuDomainObjBeginJobWithDriver(struct qemud_driver *driver, virDomainObjRef(obj); qemuDriverUnlock(driver); - while (priv->jobActive) { - if (virCondWaitUntil(&priv->jobCond, &obj->lock, then) < 0) { + while (priv->job.active) { + if (virCondWaitUntil(&priv->job.cond, &obj->lock, then) < 0) { if (errno == ETIMEDOUT) qemuReportError(VIR_ERR_OPERATION_TIMEOUT, "%s", _("cannot acquire state change lock")); @@ -556,11 +598,9 @@ int qemuDomainObjBeginJobWithDriver(struct qemud_driver *driver, return -1; } } - priv->jobActive = QEMU_JOB_UNSPECIFIED; - priv->jobSignals = 0; - memset(&priv->jobSignalsData, 0, sizeof(priv->jobSignalsData)); - priv->jobStart = now; - memset(&priv->jobInfo, 0, sizeof(priv->jobInfo)); + qemuDomainObjResetJob(priv); + qemuDomainObjSetJob(obj, QEMU_JOB_UNSPECIFIED); + priv->job.start = now; virDomainObjUnlock(obj); qemuDriverLock(driver); @@ -582,17 +622,13 @@ int qemuDomainObjEndJob(virDomainObjPtr obj) { qemuDomainObjPrivatePtr priv = obj->privateData; - priv->jobActive = QEMU_JOB_NONE; - priv->jobSignals = 0; - memset(&priv->jobSignalsData, 0, sizeof(priv->jobSignalsData)); - priv->jobStart = 0; - memset(&priv->jobInfo, 0, sizeof(priv->jobInfo)); - virCondSignal(&priv->jobCond); + qemuDomainObjResetJob(priv); + qemuDomainObjSetJob(obj, QEMU_JOB_NONE); + virCondSignal(&priv->job.cond); return virDomainObjUnref(obj); } - /* * obj must be locked before calling, qemud_driver must be unlocked * diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index f282df2..214e578 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -62,19 +62,26 @@ struct qemuDomainJobSignalsData { int *infoRetCode; /* Return code for the blkinfo calls */ }; +struct qemuDomainJobObj { + virCond cond; /* Use in conjunction with main virDomainObjPtr lock */ + virCond signalCond; /* Use to coordinate the safe queries during migration */ + + enum qemuDomainJob active; /* Currently running job */ + + unsigned long long start; /* When the job started */ + virDomainJobInfo info; /* Progress data */ + + unsigned int signals; /* Signals for running job */ + struct qemuDomainJobSignalsData signalsData; /* Signal specific data */ +}; + 
typedef struct _qemuDomainPCIAddressSet qemuDomainPCIAddressSet; typedef qemuDomainPCIAddressSet *qemuDomainPCIAddressSetPtr; typedef struct _qemuDomainObjPrivate qemuDomainObjPrivate; typedef qemuDomainObjPrivate *qemuDomainObjPrivatePtr; struct _qemuDomainObjPrivate { - virCond jobCond; /* Use in conjunction with main virDomainObjPtr lock */ - virCond signalCond; /* Use to coordinate the safe queries during migration */ - enum qemuDomainJob jobActive; /* Currently running job */ - unsigned int jobSignals; /* Signals for running job */ - struct qemuDomainJobSignalsData jobSignalsData; /* Signal specific data */ - virDomainJobInfo jobInfo; - unsigned long long jobStart; + struct qemuDomainJobObj job; qemuMonitorPtr mon; virDomainChrSourceDefPtr monConfig; @@ -114,6 +121,10 @@ int qemuDomainObjBeginJob(virDomainObjPtr obj) ATTRIBUTE_RETURN_CHECK; int qemuDomainObjBeginJobWithDriver(struct qemud_driver *driver, virDomainObjPtr obj) ATTRIBUTE_RETURN_CHECK; int qemuDomainObjEndJob(virDomainObjPtr obj) ATTRIBUTE_RETURN_CHECK; + +void qemuDomainObjSetJob(virDomainObjPtr obj, enum qemuDomainJob job); +void qemuDomainObjDiscardJob(virDomainObjPtr obj); + void qemuDomainObjEnterMonitor(virDomainObjPtr obj); void qemuDomainObjExitMonitor(virDomainObjPtr obj); void qemuDomainObjEnterMonitorWithDriver(struct qemud_driver *driver, diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 52b7dfd..e9cdcbe 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -1343,11 +1343,11 @@ static int qemudDomainSuspend(virDomainPtr dom) { priv = vm->privateData; - if (priv->jobActive == QEMU_JOB_MIGRATION_OUT) { + if (priv->job.active == QEMU_JOB_MIGRATION_OUT) { if (virDomainObjGetState(vm, NULL) != VIR_DOMAIN_PAUSED) { VIR_DEBUG("Requesting domain pause on %s", vm->def->name); - priv->jobSignals |= QEMU_JOB_SIGNAL_SUSPEND; + priv->job.signals |= QEMU_JOB_SIGNAL_SUSPEND; } ret = 0; goto cleanup; @@ -1878,7 +1878,7 @@ static int qemudDomainGetInfo(virDomainPtr dom, if ((vm->def->memballoon != NULL) && (vm->def->memballoon->model == VIR_DOMAIN_MEMBALLOON_MODEL_NONE)) { info->memory = vm->def->mem.max_balloon; - } else if (!priv->jobActive) { + } else if (!priv->job.active) { if (qemuDomainObjBeginJob(vm) < 0) goto cleanup; if (!virDomainObjIsActive(vm)) @@ -1985,12 +1985,12 @@ qemuDomainGetControlInfo(virDomainPtr dom, if (priv->monError) { info->state = VIR_DOMAIN_CONTROL_ERROR; - } else if (priv->jobActive) { + } else if (priv->job.active) { if (!priv->monStart) { info->state = VIR_DOMAIN_CONTROL_JOB; if (virTimeMs(&info->stateTime) < 0) goto cleanup; - info->stateTime -= priv->jobStart; + info->stateTime -= priv->job.start; } else { info->state = VIR_DOMAIN_CONTROL_OCCUPIED; if (virTimeMs(&info->stateTime) < 0) @@ -2125,10 +2125,10 @@ static int qemudDomainSaveFlag(struct qemud_driver *driver, virDomainPtr dom, if (qemuDomainObjBeginJobWithDriver(driver, vm) < 0) goto cleanup; - priv->jobActive = QEMU_JOB_SAVE; + qemuDomainObjSetJob(vm, QEMU_JOB_SAVE); - memset(&priv->jobInfo, 0, sizeof(priv->jobInfo)); - priv->jobInfo.type = VIR_DOMAIN_JOB_UNBOUNDED; + memset(&priv->job.info, 0, sizeof(priv->job.info)); + priv->job.info.type = VIR_DOMAIN_JOB_UNBOUNDED; /* Pause */ if (virDomainObjGetState(vm, NULL) == VIR_DOMAIN_RUNNING) { @@ -2617,7 +2617,7 @@ static int qemudDomainCoreDump(virDomainPtr dom, goto endjob; } - priv->jobActive = QEMU_JOB_DUMP; + qemuDomainObjSetJob(vm, QEMU_JOB_DUMP); /* Migrate will always stop the VM, so the resume condition is independent of whether the stop command is 
issued. */ @@ -3847,7 +3847,7 @@ static char *qemuDomainGetXMLDesc(virDomainPtr dom, qemuDomainObjPrivatePtr priv = vm->privateData; /* Don't delay if someone's using the monitor, just use * existing most recent data instead */ - if (!priv->jobActive) { + if (!priv->job.active) { if (qemuDomainObjBeginJobWithDriver(driver, vm) < 0) goto cleanup; @@ -6006,19 +6006,19 @@ qemudDomainBlockStats (virDomainPtr dom, } priv = vm->privateData; - if ((priv->jobActive == QEMU_JOB_MIGRATION_OUT) - || (priv->jobActive == QEMU_JOB_SAVE)) { + if ((priv->job.active == QEMU_JOB_MIGRATION_OUT) + || (priv->job.active == QEMU_JOB_SAVE)) { virDomainObjRef(vm); - while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKSTAT) - ignore_value(virCondWait(&priv->signalCond, &vm->lock)); + while (priv->job.signals & QEMU_JOB_SIGNAL_BLKSTAT) + ignore_value(virCondWait(&priv->job.signalCond, &vm->lock)); - priv->jobSignalsData.statDevName = disk->info.alias; - priv->jobSignalsData.blockStat = stats; - priv->jobSignalsData.statRetCode = &ret; - priv->jobSignals |= QEMU_JOB_SIGNAL_BLKSTAT; + priv->job.signalsData.statDevName = disk->info.alias; + priv->job.signalsData.blockStat = stats; + priv->job.signalsData.statRetCode = &ret; + priv->job.signals |= QEMU_JOB_SIGNAL_BLKSTAT; - while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKSTAT) - ignore_value(virCondWait(&priv->signalCond, &vm->lock)); + while (priv->job.signals & QEMU_JOB_SIGNAL_BLKSTAT) + ignore_value(virCondWait(&priv->job.signalCond, &vm->lock)); if (virDomainObjUnref(vm) == 0) vm = NULL; @@ -6453,19 +6453,19 @@ static int qemuDomainGetBlockInfo(virDomainPtr dom, virDomainObjIsActive(vm)) { qemuDomainObjPrivatePtr priv = vm->privateData; - if ((priv->jobActive == QEMU_JOB_MIGRATION_OUT) - || (priv->jobActive == QEMU_JOB_SAVE)) { + if ((priv->job.active == QEMU_JOB_MIGRATION_OUT) + || (priv->job.active == QEMU_JOB_SAVE)) { virDomainObjRef(vm); - while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKINFO) - ignore_value(virCondWait(&priv->signalCond, &vm->lock)); + while (priv->job.signals & QEMU_JOB_SIGNAL_BLKINFO) + ignore_value(virCondWait(&priv->job.signalCond, &vm->lock)); - priv->jobSignalsData.infoDevName = disk->info.alias; - priv->jobSignalsData.blockInfo = info; - priv->jobSignalsData.infoRetCode = &ret; - priv->jobSignals |= QEMU_JOB_SIGNAL_BLKINFO; + priv->job.signalsData.infoDevName = disk->info.alias; + priv->job.signalsData.blockInfo = info; + priv->job.signalsData.infoRetCode = &ret; + priv->job.signals |= QEMU_JOB_SIGNAL_BLKINFO; - while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKINFO) - ignore_value(virCondWait(&priv->signalCond, &vm->lock)); + while (priv->job.signals & QEMU_JOB_SIGNAL_BLKINFO) + ignore_value(virCondWait(&priv->job.signalCond, &vm->lock)); if (virDomainObjUnref(vm) == 0) vm = NULL; @@ -7293,8 +7293,8 @@ static int qemuDomainGetJobInfo(virDomainPtr dom, priv = vm->privateData; if (virDomainObjIsActive(vm)) { - if (priv->jobActive) { - memcpy(info, &priv->jobInfo, sizeof(*info)); + if (priv->job.active) { + memcpy(info, &priv->job.info, sizeof(*info)); /* Refresh elapsed time again just to ensure it * is fully updated. 
This is primarily for benefit @@ -7303,7 +7303,7 @@ static int qemuDomainGetJobInfo(virDomainPtr dom, */ if (virTimeMs(&info->timeElapsed) < 0) goto cleanup; - info->timeElapsed -= priv->jobStart; + info->timeElapsed -= priv->job.start; } else { memset(info, 0, sizeof(*info)); info->type = VIR_DOMAIN_JOB_NONE; @@ -7343,9 +7343,9 @@ static int qemuDomainAbortJob(virDomainPtr dom) { priv = vm->privateData; if (virDomainObjIsActive(vm)) { - if (priv->jobActive) { + if (priv->job.active) { VIR_DEBUG("Requesting cancellation of job on vm %s", vm->def->name); - priv->jobSignals |= QEMU_JOB_SIGNAL_CANCEL; + priv->job.signals |= QEMU_JOB_SIGNAL_CANCEL; } else { qemuReportError(VIR_ERR_OPERATION_INVALID, "%s", _("no job is active on the domain")); @@ -7397,15 +7397,15 @@ qemuDomainMigrateSetMaxDowntime(virDomainPtr dom, priv = vm->privateData; - if (priv->jobActive != QEMU_JOB_MIGRATION_OUT) { + if (priv->job.active != QEMU_JOB_MIGRATION_OUT) { qemuReportError(VIR_ERR_OPERATION_INVALID, "%s", _("domain is not being migrated")); goto cleanup; } VIR_DEBUG("Requesting migration downtime change to %llums", downtime); - priv->jobSignalsData.migrateDowntime = downtime; - priv->jobSignals |= QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME; + priv->job.signalsData.migrateDowntime = downtime; + priv->job.signals |= QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME; ret = 0; cleanup: @@ -7446,15 +7446,15 @@ qemuDomainMigrateSetMaxSpeed(virDomainPtr dom, priv = vm->privateData; - if (priv->jobActive != QEMU_JOB_MIGRATION_OUT) { + if (priv->job.active != QEMU_JOB_MIGRATION_OUT) { qemuReportError(VIR_ERR_OPERATION_INVALID, "%s", _("domain is not being migrated")); goto cleanup; } VIR_DEBUG("Requesting migration speed change to %luMbs", bandwidth); - priv->jobSignalsData.migrateBandwidth = bandwidth; - priv->jobSignals |= QEMU_JOB_SIGNAL_MIGRATE_SPEED; + priv->job.signalsData.migrateBandwidth = bandwidth; + priv->job.signals |= QEMU_JOB_SIGNAL_MIGRATE_SPEED; ret = 0; cleanup: diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index d7b27a0..3634966 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -742,12 +742,12 @@ qemuMigrationProcessJobSignals(struct qemud_driver *driver, qemuReportError(VIR_ERR_INTERNAL_ERROR, _("%s: %s"), job, _("guest unexpectedly quit")); if (cleanup) - priv->jobSignals = 0; + priv->job.signals = 0; return -1; } - if (priv->jobSignals & QEMU_JOB_SIGNAL_CANCEL) { - priv->jobSignals ^= QEMU_JOB_SIGNAL_CANCEL; + if (priv->job.signals & QEMU_JOB_SIGNAL_CANCEL) { + priv->job.signals ^= QEMU_JOB_SIGNAL_CANCEL; VIR_DEBUG("Cancelling job at client request"); qemuDomainObjEnterMonitorWithDriver(driver, vm); ret = qemuMonitorMigrateCancel(priv->mon); @@ -755,58 +755,58 @@ qemuMigrationProcessJobSignals(struct qemud_driver *driver, if (ret < 0) { VIR_WARN("Unable to cancel job"); } - } else if (priv->jobSignals & QEMU_JOB_SIGNAL_SUSPEND) { - priv->jobSignals ^= QEMU_JOB_SIGNAL_SUSPEND; + } else if (priv->job.signals & QEMU_JOB_SIGNAL_SUSPEND) { + priv->job.signals ^= QEMU_JOB_SIGNAL_SUSPEND; VIR_DEBUG("Pausing domain for non-live migration"); if (qemuMigrationSetOffline(driver, vm) < 0) VIR_WARN("Unable to pause domain"); - } else if (priv->jobSignals & QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME) { - unsigned long long ms = priv->jobSignalsData.migrateDowntime; + } else if (priv->job.signals & QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME) { + unsigned long long ms = priv->job.signalsData.migrateDowntime; - priv->jobSignals ^= QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME; - priv->jobSignalsData.migrateDowntime = 0; + 
priv->job.signals ^= QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME; + priv->job.signalsData.migrateDowntime = 0; VIR_DEBUG("Setting migration downtime to %llums", ms); qemuDomainObjEnterMonitorWithDriver(driver, vm); ret = qemuMonitorSetMigrationDowntime(priv->mon, ms); qemuDomainObjExitMonitorWithDriver(driver, vm); if (ret < 0) VIR_WARN("Unable to set migration downtime"); - } else if (priv->jobSignals & QEMU_JOB_SIGNAL_MIGRATE_SPEED) { - unsigned long bandwidth = priv->jobSignalsData.migrateBandwidth; + } else if (priv->job.signals & QEMU_JOB_SIGNAL_MIGRATE_SPEED) { + unsigned long bandwidth = priv->job.signalsData.migrateBandwidth; - priv->jobSignals ^= QEMU_JOB_SIGNAL_MIGRATE_SPEED; - priv->jobSignalsData.migrateBandwidth = 0; + priv->job.signals ^= QEMU_JOB_SIGNAL_MIGRATE_SPEED; + priv->job.signalsData.migrateBandwidth = 0; VIR_DEBUG("Setting migration bandwidth to %luMbs", bandwidth); qemuDomainObjEnterMonitorWithDriver(driver, vm); ret = qemuMonitorSetMigrationSpeed(priv->mon, bandwidth); qemuDomainObjExitMonitorWithDriver(driver, vm); if (ret < 0) VIR_WARN("Unable to set migration speed"); - } else if (priv->jobSignals & QEMU_JOB_SIGNAL_BLKSTAT) { + } else if (priv->job.signals & QEMU_JOB_SIGNAL_BLKSTAT) { qemuDomainObjEnterMonitorWithDriver(driver, vm); ret = qemuMonitorGetBlockStatsInfo(priv->mon, - priv->jobSignalsData.statDevName, - &priv->jobSignalsData.blockStat->rd_req, - &priv->jobSignalsData.blockStat->rd_bytes, - &priv->jobSignalsData.blockStat->wr_req, - &priv->jobSignalsData.blockStat->wr_bytes, - &priv->jobSignalsData.blockStat->errs); + priv->job.signalsData.statDevName, + &priv->job.signalsData.blockStat->rd_req, + &priv->job.signalsData.blockStat->rd_bytes, + &priv->job.signalsData.blockStat->wr_req, + &priv->job.signalsData.blockStat->wr_bytes, + &priv->job.signalsData.blockStat->errs); qemuDomainObjExitMonitorWithDriver(driver, vm); - *priv->jobSignalsData.statRetCode = ret; - priv->jobSignals ^= QEMU_JOB_SIGNAL_BLKSTAT; + *priv->job.signalsData.statRetCode = ret; + priv->job.signals ^= QEMU_JOB_SIGNAL_BLKSTAT; if (ret < 0) VIR_WARN("Unable to get block statistics"); - } else if (priv->jobSignals & QEMU_JOB_SIGNAL_BLKINFO) { + } else if (priv->job.signals & QEMU_JOB_SIGNAL_BLKINFO) { qemuDomainObjEnterMonitorWithDriver(driver, vm); ret = qemuMonitorGetBlockExtent(priv->mon, - priv->jobSignalsData.infoDevName, - &priv->jobSignalsData.blockInfo->allocation); + priv->job.signalsData.infoDevName, + &priv->job.signalsData.blockInfo->allocation); qemuDomainObjExitMonitorWithDriver(driver, vm); - *priv->jobSignalsData.infoRetCode = ret; - priv->jobSignals ^= QEMU_JOB_SIGNAL_BLKINFO; + *priv->job.signalsData.infoRetCode = ret; + priv->job.signals ^= QEMU_JOB_SIGNAL_BLKINFO; if (ret < 0) VIR_WARN("Unable to get block information"); @@ -844,44 +844,44 @@ qemuMigrationUpdateJobStatus(struct qemud_driver *driver, &memTotal); qemuDomainObjExitMonitorWithDriver(driver, vm); - if (ret < 0 || virTimeMs(&priv->jobInfo.timeElapsed) < 0) { - priv->jobInfo.type = VIR_DOMAIN_JOB_FAILED; + if (ret < 0 || virTimeMs(&priv->job.info.timeElapsed) < 0) { + priv->job.info.type = VIR_DOMAIN_JOB_FAILED; return -1; } - priv->jobInfo.timeElapsed -= priv->jobStart; + priv->job.info.timeElapsed -= priv->job.start; switch (status) { case QEMU_MONITOR_MIGRATION_STATUS_INACTIVE: - priv->jobInfo.type = VIR_DOMAIN_JOB_NONE; + priv->job.info.type = VIR_DOMAIN_JOB_NONE; qemuReportError(VIR_ERR_OPERATION_FAILED, _("%s: %s"), job, _("is not active")); break; case QEMU_MONITOR_MIGRATION_STATUS_ACTIVE: - 
priv->jobInfo.dataTotal = memTotal; - priv->jobInfo.dataRemaining = memRemaining; - priv->jobInfo.dataProcessed = memProcessed; + priv->job.info.dataTotal = memTotal; + priv->job.info.dataRemaining = memRemaining; + priv->job.info.dataProcessed = memProcessed; - priv->jobInfo.memTotal = memTotal; - priv->jobInfo.memRemaining = memRemaining; - priv->jobInfo.memProcessed = memProcessed; + priv->job.info.memTotal = memTotal; + priv->job.info.memRemaining = memRemaining; + priv->job.info.memProcessed = memProcessed; ret = 0; break; case QEMU_MONITOR_MIGRATION_STATUS_COMPLETED: - priv->jobInfo.type = VIR_DOMAIN_JOB_COMPLETED; + priv->job.info.type = VIR_DOMAIN_JOB_COMPLETED; ret = 0; break; case QEMU_MONITOR_MIGRATION_STATUS_ERROR: - priv->jobInfo.type = VIR_DOMAIN_JOB_FAILED; + priv->job.info.type = VIR_DOMAIN_JOB_FAILED; qemuReportError(VIR_ERR_OPERATION_FAILED, _("%s: %s"), job, _("unexpectedly failed")); break; case QEMU_MONITOR_MIGRATION_STATUS_CANCELLED: - priv->jobInfo.type = VIR_DOMAIN_JOB_CANCELLED; + priv->job.info.type = VIR_DOMAIN_JOB_CANCELLED; qemuReportError(VIR_ERR_OPERATION_FAILED, _("%s: %s"), job, _("canceled by client")); break; @@ -897,7 +897,7 @@ qemuMigrationWaitForCompletion(struct qemud_driver *driver, virDomainObjPtr vm) qemuDomainObjPrivatePtr priv = vm->privateData; const char *job; - switch (priv->jobActive) { + switch (priv->job.active) { case QEMU_JOB_MIGRATION_OUT: job = _("migration job"); break; @@ -911,17 +911,17 @@ qemuMigrationWaitForCompletion(struct qemud_driver *driver, virDomainObjPtr vm) job = _("job"); } - priv->jobInfo.type = VIR_DOMAIN_JOB_UNBOUNDED; + priv->job.info.type = VIR_DOMAIN_JOB_UNBOUNDED; - while (priv->jobInfo.type == VIR_DOMAIN_JOB_UNBOUNDED) { + while (priv->job.info.type == VIR_DOMAIN_JOB_UNBOUNDED) { /* Poll every 50ms for progress & to allow cancellation */ struct timespec ts = { .tv_sec = 0, .tv_nsec = 50 * 1000 * 1000ull }; - while (priv->jobSignals) { + while (priv->job.signals) { if (qemuMigrationProcessJobSignals(driver, vm, job, false) < 0) goto cleanup; } - virCondSignal(&priv->signalCond); + virCondSignal(&priv->job.signalCond); if (qemuMigrationUpdateJobStatus(driver, vm, job) < 0) goto cleanup; @@ -937,12 +937,12 @@ qemuMigrationWaitForCompletion(struct qemud_driver *driver, virDomainObjPtr vm) } cleanup: - while (priv->jobSignals) { + while (priv->job.signals) { qemuMigrationProcessJobSignals(driver, vm, job, true); } - virCondBroadcast(&priv->signalCond); + virCondBroadcast(&priv->job.signalCond); - if (priv->jobInfo.type == VIR_DOMAIN_JOB_COMPLETED) + if (priv->job.info.type == VIR_DOMAIN_JOB_COMPLETED) return 0; else return -1; @@ -1110,7 +1110,7 @@ qemuMigrationPrepareTunnel(struct qemud_driver *driver, if (qemuDomainObjBeginJobWithDriver(driver, vm) < 0) goto cleanup; - priv->jobActive = QEMU_JOB_MIGRATION_IN; + qemuDomainObjSetJob(vm, QEMU_JOB_MIGRATION_IN); /* Domain starts inactive, even if the domain XML had an id field. 
*/ vm->def->id = -1; @@ -1183,9 +1183,9 @@ endjob: */ if (vm && virDomainObjIsActive(vm)) { - priv->jobActive = QEMU_JOB_MIGRATION_IN; - priv->jobInfo.type = VIR_DOMAIN_JOB_UNBOUNDED; - priv->jobStart = now; + qemuDomainObjSetJob(vm, QEMU_JOB_MIGRATION_IN); + priv->job.info.type = VIR_DOMAIN_JOB_UNBOUNDED; + priv->job.start = now; } cleanup: @@ -1345,7 +1345,7 @@ qemuMigrationPrepareDirect(struct qemud_driver *driver, if (qemuDomainObjBeginJobWithDriver(driver, vm) < 0) goto cleanup; - priv->jobActive = QEMU_JOB_MIGRATION_IN; + qemuDomainObjSetJob(vm, QEMU_JOB_MIGRATION_IN); /* Domain starts inactive, even if the domain XML had an id field. */ vm->def->id = -1; @@ -1404,9 +1404,9 @@ endjob: */ if (vm && virDomainObjIsActive(vm)) { - priv->jobActive = QEMU_JOB_MIGRATION_IN; - priv->jobInfo.type = VIR_DOMAIN_JOB_UNBOUNDED; - priv->jobStart = now; + qemuDomainObjSetJob(vm, QEMU_JOB_MIGRATION_IN); + priv->job.info.type = VIR_DOMAIN_JOB_UNBOUNDED; + priv->job.start = now; } cleanup: @@ -2286,7 +2286,7 @@ int qemuMigrationPerform(struct qemud_driver *driver, if (qemuDomainObjBeginJobWithDriver(driver, vm) < 0) goto cleanup; - priv->jobActive = QEMU_JOB_MIGRATION_OUT; + qemuDomainObjSetJob(vm, QEMU_JOB_MIGRATION_OUT); if (!virDomainObjIsActive(vm)) { qemuReportError(VIR_ERR_OPERATION_INVALID, @@ -2300,8 +2300,8 @@ int qemuMigrationPerform(struct qemud_driver *driver, goto endjob; } - memset(&priv->jobInfo, 0, sizeof(priv->jobInfo)); - priv->jobInfo.type = VIR_DOMAIN_JOB_UNBOUNDED; + memset(&priv->job.info, 0, sizeof(priv->job.info)); + priv->job.info.type = VIR_DOMAIN_JOB_UNBOUNDED; resume = virDomainObjGetState(vm, NULL) == VIR_DOMAIN_RUNNING; @@ -2450,13 +2450,12 @@ qemuMigrationFinish(struct qemud_driver *driver, virErrorPtr orig_err = NULL; priv = vm->privateData; - if (priv->jobActive != QEMU_JOB_MIGRATION_IN) { + if (priv->job.active != QEMU_JOB_MIGRATION_IN) { qemuReportError(VIR_ERR_NO_DOMAIN, _("domain '%s' is not processing incoming migration"), vm->def->name); goto cleanup; } - priv->jobActive = QEMU_JOB_NONE; - memset(&priv->jobInfo, 0, sizeof(priv->jobInfo)); + qemuDomainObjDiscardJob(vm); if (!(mig = qemuMigrationEatCookie(driver, vm, cookiein, cookieinlen, 0))) goto cleanup; diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index c9145cb..56593c2 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3013,11 +3013,10 @@ static void qemuProcessAutoDestroyDom(void *payload, } priv = dom->privateData; - if (priv->jobActive == QEMU_JOB_MIGRATION_IN) { + if (priv->job.active == QEMU_JOB_MIGRATION_IN) { VIR_DEBUG("vm=%s has incoming migration active, cancelling", dom->def->name); - priv->jobActive = QEMU_JOB_NONE; - memset(&priv->jobInfo, 0, sizeof(priv->jobInfo)); + qemuDomainObjDiscardJob(dom); } if (qemuDomainObjBeginJobWithDriver(data->driver, dom) < 0) -- 1.7.6
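To make the new calling convention concrete, here is a minimal sketch of how
callers drive the consolidated job object after this patch; the function name
is hypothetical and the body mirrors the qemudDomainSaveFlag hunk above. All
job state lives in priv->job and is set through the helpers instead of poking
jobActive/jobInfo/jobStart directly:

    static int
    exampleSaveJob(struct qemud_driver *driver, virDomainObjPtr vm)
    {
        qemuDomainObjPrivatePtr priv = vm->privateData;

        if (qemuDomainObjBeginJobWithDriver(driver, vm) < 0)
            return -1;

        /* Record what kind of job is running and initialize progress data. */
        qemuDomainObjSetJob(vm, QEMU_JOB_SAVE);
        memset(&priv->job.info, 0, sizeof(priv->job.info));
        priv->job.info.type = VIR_DOMAIN_JOB_UNBOUNDED;

        /* ... do the actual work ... */

        /* EndJob resets priv->job and wakes up anyone waiting on the lock. */
        if (qemuDomainObjEndJob(vm) == 0)
            vm = NULL;          /* last reference was dropped */

        return 0;
    }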

On Fri, Jul 08, 2011 at 01:34:06AM +0200, Jiri Denemark wrote:
--- src/qemu/qemu_domain.c | 104 +++++++++++++++++++++++++------------ src/qemu/qemu_domain.h | 25 +++++++--- src/qemu/qemu_driver.c | 82 +++++++++++++++--------------- src/qemu/qemu_migration.c | 123 ++++++++++++++++++++++----------------------- src/qemu/qemu_process.c | 5 +- 5 files changed, 192 insertions(+), 147 deletions(-)
ACK

Daniel

-- 
|: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org -o- http://virt-manager.org :|
|: http://autobuild.org -o- http://search.cpan.org/~danberr/ :|
|: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

This avoids code duplication and also avoids relying on good luck that ignore_value(virDomainObjUnref(obj)) doesn't set errno. --- src/qemu/qemu_domain.c | 93 ++++++++++++++++++++++-------------------------- 1 files changed, 43 insertions(+), 50 deletions(-) diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index bbdfdc4..8f3eaa7 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -517,20 +517,17 @@ qemuDomainObjDiscardJob(virDomainObjPtr obj) qemuDomainObjSetJob(obj, QEMU_JOB_NONE); } -/* - * obj must be locked before calling, qemud_driver must NOT be locked - * - * This must be called by anything that will change the VM state - * in any way, or anything that will use the QEMU monitor. - * - * Upon successful return, the object will have its ref count increased, - * successful calls must be followed by EndJob eventually - */ - /* Give up waiting for mutex after 30 seconds */ #define QEMU_JOB_WAIT_TIME (1000ull * 30) -int qemuDomainObjBeginJob(virDomainObjPtr obj) +/* + * obj must be locked before calling; driver_locked says if qemu_driver is + * locked or not. + */ +static int +qemuDomainObjBeginJobInternal(struct qemud_driver *driver, + bool driver_locked, + virDomainObjPtr obj) { qemuDomainObjPrivatePtr priv = obj->privateData; unsigned long long now; @@ -541,17 +538,24 @@ int qemuDomainObjBeginJob(virDomainObjPtr obj) then = now + QEMU_JOB_WAIT_TIME; virDomainObjRef(obj); + if (driver_locked) + qemuDriverUnlock(driver); while (priv->job.active) { if (virCondWaitUntil(&priv->job.cond, &obj->lock, then) < 0) { - /* Safe to ignore value since ref count was incremented above */ - ignore_value(virDomainObjUnref(obj)); if (errno == ETIMEDOUT) qemuReportError(VIR_ERR_OPERATION_TIMEOUT, "%s", _("cannot acquire state change lock")); else virReportSystemError(errno, "%s", _("cannot acquire job mutex")); + if (driver_locked) { + virDomainObjUnlock(obj); + qemuDriverLock(driver); + virDomainObjLock(obj); + } + /* Safe to ignore value since ref count was incremented above */ + ignore_value(virDomainObjUnref(obj)); return -1; } } @@ -559,54 +563,43 @@ int qemuDomainObjBeginJob(virDomainObjPtr obj) qemuDomainObjSetJob(obj, QEMU_JOB_UNSPECIFIED); priv->job.start = now; + if (driver_locked) { + virDomainObjUnlock(obj); + qemuDriverLock(driver); + virDomainObjLock(obj); + } + return 0; } /* - * obj must be locked before calling, qemud_driver must be locked + * obj must be locked before calling, qemud_driver must NOT be locked * * This must be called by anything that will change the VM state * in any way, or anything that will use the QEMU monitor. + * + * Upon successful return, the object will have its ref count increased, + * successful calls must be followed by EndJob eventually + */ +int qemuDomainObjBeginJob(virDomainObjPtr obj) +{ + return qemuDomainObjBeginJobInternal(NULL, false, obj); +} + +/* + * obj must be locked before calling. If qemud_driver is passed, it MUST be + * locked; otherwise it MUST NOT be locked. + * + * This must be called by anything that will change the VM state + * in any way, or anything that will use the QEMU monitor. 
+ * + * Upon successful return, the object will have its ref count increased, + * successful calls must be followed by EndJob eventually */ int qemuDomainObjBeginJobWithDriver(struct qemud_driver *driver, virDomainObjPtr obj) { - qemuDomainObjPrivatePtr priv = obj->privateData; - unsigned long long now; - unsigned long long then; - - if (virTimeMs(&now) < 0) - return -1; - then = now + QEMU_JOB_WAIT_TIME; - - virDomainObjRef(obj); - qemuDriverUnlock(driver); - - while (priv->job.active) { - if (virCondWaitUntil(&priv->job.cond, &obj->lock, then) < 0) { - if (errno == ETIMEDOUT) - qemuReportError(VIR_ERR_OPERATION_TIMEOUT, - "%s", _("cannot acquire state change lock")); - else - virReportSystemError(errno, - "%s", _("cannot acquire job mutex")); - virDomainObjUnlock(obj); - qemuDriverLock(driver); - virDomainObjLock(obj); - /* Safe to ignore value since ref count was incremented above */ - ignore_value(virDomainObjUnref(obj)); - return -1; - } - } - qemuDomainObjResetJob(priv); - qemuDomainObjSetJob(obj, QEMU_JOB_UNSPECIFIED); - priv->job.start = now; - - virDomainObjUnlock(obj); - qemuDriverLock(driver); - virDomainObjLock(obj); - - return 0; + return qemuDomainObjBeginJobInternal(driver, true, obj); } /* -- 1.7.6
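The errno remark in the commit message is about ordering in the consolidated
error path: the virCondWaitUntil() failure is turned into a reported error
before anything else can clobber errno. A condensed sketch of that path (the
driver re-locking dance from the real helper is omitted here):

    if (virCondWaitUntil(&priv->job.cond, &obj->lock, then) < 0) {
        if (errno == ETIMEDOUT)
            qemuReportError(VIR_ERR_OPERATION_TIMEOUT, "%s",
                            _("cannot acquire state change lock"));
        else
            virReportSystemError(errno, "%s",
                                 _("cannot acquire job mutex"));
        /* Only now is it safe to drop the reference taken earlier;
         * whatever this call does to errno no longer matters. */
        ignore_value(virDomainObjUnref(obj));
        return -1;
    }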

On 07/07/2011 05:34 PM, Jiri Denemark wrote:
This avoids code duplication and also avoids relying on good luck that ignore_value(virDomainObjUnref(obj)) doesn't set errno. --- src/qemu/qemu_domain.c | 93 ++++++++++++++++++++++-------------------------- 1 files changed, 43 insertions(+), 50 deletions(-)
ACK.

-- 
Eric Blake   eblake@redhat.com   +1-801-349-2682
Libvirt virtualization library http://libvirt.org

EnterMonitor and ExitMonitor methods are very similar to their *WithDriver variants; consolidate them into EnterMonitorInternal and ExitMonitorInternal to avoid (mainly future) code duplication. --- src/qemu/qemu_domain.c | 74 ++++++++++++++++++++++------------------------- 1 files changed, 35 insertions(+), 39 deletions(-) diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index 8f3eaa7..a2e77b6 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -622,16 +622,10 @@ int qemuDomainObjEndJob(virDomainObjPtr obj) return virDomainObjUnref(obj); } -/* - * obj must be locked before calling, qemud_driver must be unlocked - * - * To be called immediately before any QEMU monitor API call - * Must have already called qemuDomainObjBeginJob(), and checked - * that the VM is still active. - * - * To be followed with qemuDomainObjExitMonitor() once complete - */ -void qemuDomainObjEnterMonitor(virDomainObjPtr obj) + +static void +qemuDomainObjEnterMonitorInternal(struct qemud_driver *driver, + virDomainObjPtr obj) { qemuDomainObjPrivatePtr priv = obj->privateData; @@ -639,14 +633,13 @@ void qemuDomainObjEnterMonitor(virDomainObjPtr obj) qemuMonitorRef(priv->mon); ignore_value(virTimeMs(&priv->monStart)); virDomainObjUnlock(obj); + if (driver) + qemuDriverUnlock(driver); } - -/* obj must NOT be locked before calling, qemud_driver must be unlocked - * - * Should be paired with an earlier qemuDomainObjEnterMonitor() call - */ -void qemuDomainObjExitMonitor(virDomainObjPtr obj) +static void +qemuDomainObjExitMonitorInternal(struct qemud_driver *driver, + virDomainObjPtr obj) { qemuDomainObjPrivatePtr priv = obj->privateData; int refs; @@ -656,6 +649,8 @@ void qemuDomainObjExitMonitor(virDomainObjPtr obj) if (refs > 0) qemuMonitorUnlock(priv->mon); + if (driver) + qemuDriverLock(driver); virDomainObjLock(obj); priv->monStart = 0; @@ -664,6 +659,28 @@ void qemuDomainObjExitMonitor(virDomainObjPtr obj) } } +/* + * obj must be locked before calling, qemud_driver must be unlocked + * + * To be called immediately before any QEMU monitor API call + * Must have already called qemuDomainObjBeginJob(), and checked + * that the VM is still active. 
+ * + * To be followed with qemuDomainObjExitMonitor() once complete + */ +void qemuDomainObjEnterMonitor(virDomainObjPtr obj) +{ + qemuDomainObjEnterMonitorInternal(NULL, obj); +} + +/* obj must NOT be locked before calling, qemud_driver must be unlocked + * + * Should be paired with an earlier qemuDomainObjEnterMonitor() call + */ +void qemuDomainObjExitMonitor(virDomainObjPtr obj) +{ + qemuDomainObjExitMonitorInternal(NULL, obj); +} /* * obj must be locked before calling, qemud_driver must be locked @@ -676,16 +693,9 @@ void qemuDomainObjExitMonitor(virDomainObjPtr obj) void qemuDomainObjEnterMonitorWithDriver(struct qemud_driver *driver, virDomainObjPtr obj) { - qemuDomainObjPrivatePtr priv = obj->privateData; - - qemuMonitorLock(priv->mon); - qemuMonitorRef(priv->mon); - ignore_value(virTimeMs(&priv->monStart)); - virDomainObjUnlock(obj); - qemuDriverUnlock(driver); + qemuDomainObjEnterMonitorInternal(driver, obj); } - /* obj must NOT be locked before calling, qemud_driver must be unlocked, * and will be locked after returning * @@ -694,21 +704,7 @@ void qemuDomainObjEnterMonitorWithDriver(struct qemud_driver *driver, void qemuDomainObjExitMonitorWithDriver(struct qemud_driver *driver, virDomainObjPtr obj) { - qemuDomainObjPrivatePtr priv = obj->privateData; - int refs; - - refs = qemuMonitorUnref(priv->mon); - - if (refs > 0) - qemuMonitorUnlock(priv->mon); - - qemuDriverLock(driver); - virDomainObjLock(obj); - - priv->monStart = 0; - if (refs == 0) { - priv->mon = NULL; - } + qemuDomainObjExitMonitorInternal(driver, obj); } void qemuDomainObjEnterRemoteWithDriver(struct qemud_driver *driver, -- 1.7.6
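For reference, the calling pattern the consolidated helpers expect is
unchanged. A minimal sketch of a monitor section inside a job, as it appears
in existing driver code such as qemuDomainShutdown:

    if (qemuDomainObjBeginJob(vm) < 0)
        goto cleanup;

    qemuDomainObjEnterMonitor(vm);               /* drops the domain lock */
    ret = qemuMonitorSystemPowerdown(priv->mon);
    qemuDomainObjExitMonitor(vm);                /* re-acquires the lock  */

    if (qemuDomainObjEndJob(vm) == 0)
        vm = NULL;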

On Fri, Jul 08, 2011 at 01:34:08AM +0200, Jiri Denemark wrote:
EnterMonitor and ExitMonitor methods are very similar to their *WithDriver variants; consolidate them into EnterMonitorInternal and ExitMonitorInternal to avoid (mainly future) code duplication. --- src/qemu/qemu_domain.c | 74 ++++++++++++++++++++++------------------------- 1 files changed, 35 insertions(+), 39 deletions(-)
ACK

Daniel

-- 
|: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org -o- http://virt-manager.org :|
|: http://autobuild.org -o- http://search.cpan.org/~danberr/ :|
|: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

Query commands are safe to be called during long running jobs (such as migration). This patch makes them all work without the need to special-case every single one of them. The patch introduces new job.asyncCond condition and associated job.asyncJob which are dedicated to asynchronous (from qemu monitor point of view) jobs that can take arbitrarily long time to finish while qemu monitor is still usable for other commands. The existing job.active (and job.cond condition) is used all other synchronous jobs (including the commands run during async job). Locking schema is changed to use these two conditions. While asyncJob is active, only allowed set of synchronous jobs is allowed (the set can be different according to a particular asyncJob) so any method that communicates to qemu monitor needs to check if it is allowed to be executed during current asyncJob (if any). Once the check passes, the method needs to normally acquire job.cond to ensure no other command is running. Since domain object lock is released during that time, asyncJob could have been started in the meantime so the method needs to recheck the first condition. Then, normal jobs set job.active and asynchronous jobs set job.asyncJob and optionally change the list of allowed job groups. Since asynchronous jobs only set job.asyncJob, other allowed commands can still be run when domain object is unlocked (when communicating to remote libvirtd or sleeping). To protect its own internal synchronous commands, the asynchronous job needs to start a special nested job before entering qemu monitor. The nested job doesn't check asyncJob, it only acquires job.cond and sets job.active to block other jobs. --- src/qemu/qemu_domain.c | 219 +++++++++++++++++++++++++++++++++++++-------- src/qemu/qemu_domain.h | 82 +++++++++++++---- src/qemu/qemu_driver.c | 122 +++++++++++++------------- src/qemu/qemu_hotplug.c | 38 ++++---- src/qemu/qemu_migration.c | 152 ++++++++++++++++++------------- src/qemu/qemu_process.c | 42 +++++---- 6 files changed, 439 insertions(+), 216 deletions(-) diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index a2e77b6..1ed5efd 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -87,8 +87,14 @@ qemuDomainObjInitJob(qemuDomainObjPrivatePtr priv) if (virCondInit(&priv->job.cond) < 0) return -1; + if (virCondInit(&priv->job.asyncCond) < 0) { + ignore_value(virCondDestroy(&priv->job.cond)); + return -1; + } + if (virCondInit(&priv->job.signalCond) < 0) { ignore_value(virCondDestroy(&priv->job.cond)); + ignore_value(virCondDestroy(&priv->job.asyncCond)); return -1; } @@ -101,6 +107,15 @@ qemuDomainObjResetJob(qemuDomainObjPrivatePtr priv) struct qemuDomainJobObj *job = &priv->job; job->active = QEMU_JOB_NONE; +} + +static void +qemuDomainObjResetAsyncJob(qemuDomainObjPrivatePtr priv) +{ + struct qemuDomainJobObj *job = &priv->job; + + job->asyncJob = QEMU_ASYNC_JOB_NONE; + job->mask = DEFAULT_JOB_MASK; job->start = 0; memset(&job->info, 0, sizeof(job->info)); job->signals = 0; @@ -111,6 +126,7 @@ static void qemuDomainObjFreeJob(qemuDomainObjPrivatePtr priv) { ignore_value(virCondDestroy(&priv->job.cond)); + ignore_value(virCondDestroy(&priv->job.asyncCond)); ignore_value(virCondDestroy(&priv->job.signalCond)); } @@ -509,12 +525,31 @@ qemuDomainObjSetJob(virDomainObjPtr obj, } void -qemuDomainObjDiscardJob(virDomainObjPtr obj) +qemuDomainObjSetAsyncJobMask(virDomainObjPtr obj, + unsigned long long allowedJobs) { qemuDomainObjPrivatePtr priv = obj->privateData; - qemuDomainObjResetJob(priv); - 
qemuDomainObjSetJob(obj, QEMU_JOB_NONE); + if (!priv->job.asyncJob) + return; + + priv->job.mask = allowedJobs | JOB_MASK(QEMU_JOB_DESTROY); +} + +void +qemuDomainObjDiscardAsyncJob(virDomainObjPtr obj) +{ + qemuDomainObjPrivatePtr priv = obj->privateData; + + if (priv->job.active == QEMU_JOB_ASYNC_NESTED) + qemuDomainObjResetJob(priv); + qemuDomainObjResetAsyncJob(priv); +} + +static bool +qemuDomainJobAllowed(qemuDomainObjPrivatePtr priv, enum qemuDomainJob job) +{ + return !priv->job.asyncJob || (priv->job.mask & JOB_MASK(job)) != 0; } /* Give up waiting for mutex after 30 seconds */ @@ -527,11 +562,14 @@ qemuDomainObjDiscardJob(virDomainObjPtr obj) static int qemuDomainObjBeginJobInternal(struct qemud_driver *driver, bool driver_locked, - virDomainObjPtr obj) + virDomainObjPtr obj, + enum qemuDomainJob job, + enum qemuDomainAsyncJob asyncJob) { qemuDomainObjPrivatePtr priv = obj->privateData; unsigned long long now; unsigned long long then; + bool nested = job == QEMU_JOB_ASYNC_NESTED; if (virTimeMs(&now) < 0) return -1; @@ -541,27 +579,31 @@ qemuDomainObjBeginJobInternal(struct qemud_driver *driver, if (driver_locked) qemuDriverUnlock(driver); +retry: + while (!nested && !qemuDomainJobAllowed(priv, job)) { + if (virCondWaitUntil(&priv->job.asyncCond, &obj->lock, then) < 0) + goto error; + } + while (priv->job.active) { - if (virCondWaitUntil(&priv->job.cond, &obj->lock, then) < 0) { - if (errno == ETIMEDOUT) - qemuReportError(VIR_ERR_OPERATION_TIMEOUT, - "%s", _("cannot acquire state change lock")); - else - virReportSystemError(errno, - "%s", _("cannot acquire job mutex")); - if (driver_locked) { - virDomainObjUnlock(obj); - qemuDriverLock(driver); - virDomainObjLock(obj); - } - /* Safe to ignore value since ref count was incremented above */ - ignore_value(virDomainObjUnref(obj)); - return -1; - } + if (virCondWaitUntil(&priv->job.cond, &obj->lock, then) < 0) + goto error; } + + /* No job is active but a new async job could have been started while obj + * was unlocked, so we need to recheck it. 
*/ + if (!nested && !qemuDomainJobAllowed(priv, job)) + goto retry; + qemuDomainObjResetJob(priv); - qemuDomainObjSetJob(obj, QEMU_JOB_UNSPECIFIED); - priv->job.start = now; + + if (job != QEMU_JOB_ASYNC) { + priv->job.active = job; + } else { + qemuDomainObjResetAsyncJob(priv); + priv->job.asyncJob = asyncJob; + priv->job.start = now; + } if (driver_locked) { virDomainObjUnlock(obj); @@ -570,6 +612,22 @@ qemuDomainObjBeginJobInternal(struct qemud_driver *driver, } return 0; + +error: + if (errno == ETIMEDOUT) + qemuReportError(VIR_ERR_OPERATION_TIMEOUT, + "%s", _("cannot acquire state change lock")); + else + virReportSystemError(errno, + "%s", _("cannot acquire job mutex")); + if (driver_locked) { + virDomainObjUnlock(obj); + qemuDriverLock(driver); + virDomainObjLock(obj); + } + /* Safe to ignore value since ref count was incremented above */ + ignore_value(virDomainObjUnref(obj)); + return -1; } /* @@ -581,9 +639,17 @@ qemuDomainObjBeginJobInternal(struct qemud_driver *driver, * Upon successful return, the object will have its ref count increased, * successful calls must be followed by EndJob eventually */ -int qemuDomainObjBeginJob(virDomainObjPtr obj) +int qemuDomainObjBeginJob(virDomainObjPtr obj, enum qemuDomainJob job) +{ + return qemuDomainObjBeginJobInternal(NULL, false, obj, job, + QEMU_ASYNC_JOB_NONE); +} + +int qemuDomainObjBeginAsyncJob(virDomainObjPtr obj, + enum qemuDomainAsyncJob asyncJob) { - return qemuDomainObjBeginJobInternal(NULL, false, obj); + return qemuDomainObjBeginJobInternal(NULL, false, obj, QEMU_JOB_ASYNC, + asyncJob); } /* @@ -597,9 +663,49 @@ int qemuDomainObjBeginJob(virDomainObjPtr obj) * successful calls must be followed by EndJob eventually */ int qemuDomainObjBeginJobWithDriver(struct qemud_driver *driver, - virDomainObjPtr obj) + virDomainObjPtr obj, + enum qemuDomainJob job) { - return qemuDomainObjBeginJobInternal(driver, true, obj); + if (job <= QEMU_JOB_NONE || job >= QEMU_JOB_ASYNC) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("Attempt to start invalid job")); + return -1; + } + + return qemuDomainObjBeginJobInternal(driver, true, obj, job, + QEMU_ASYNC_JOB_NONE); +} + +int qemuDomainObjBeginAsyncJobWithDriver(struct qemud_driver *driver, + virDomainObjPtr obj, + enum qemuDomainAsyncJob asyncJob) +{ + return qemuDomainObjBeginJobInternal(driver, true, obj, QEMU_JOB_ASYNC, + asyncJob); +} + +/* + * Use this to protect monitor sections within active async job. + * + * The caller must call qemuDomainObjBeginAsyncJob{,WithDriver} before it can + * use this method. Never use this method if you only own non-async job, use + * qemuDomainObjBeginJob{,WithDriver} instead. 
+ */ +int +qemuDomainObjBeginNestedJob(virDomainObjPtr obj) +{ + return qemuDomainObjBeginJobInternal(NULL, false, obj, + QEMU_JOB_ASYNC_NESTED, + QEMU_ASYNC_JOB_NONE); +} + +int +qemuDomainObjBeginNestedJobWithDriver(struct qemud_driver *driver, + virDomainObjPtr obj) +{ + return qemuDomainObjBeginJobInternal(driver, true, obj, + QEMU_JOB_ASYNC_NESTED, + QEMU_ASYNC_JOB_NONE); } /* @@ -616,25 +722,60 @@ int qemuDomainObjEndJob(virDomainObjPtr obj) qemuDomainObjPrivatePtr priv = obj->privateData; qemuDomainObjResetJob(priv); - qemuDomainObjSetJob(obj, QEMU_JOB_NONE); virCondSignal(&priv->job.cond); return virDomainObjUnref(obj); } +int +qemuDomainObjEndAsyncJob(virDomainObjPtr obj) +{ + qemuDomainObjPrivatePtr priv = obj->privateData; -static void + qemuDomainObjResetAsyncJob(priv); + virCondBroadcast(&priv->job.asyncCond); + + return virDomainObjUnref(obj); +} + +void +qemuDomainObjEndNestedJob(virDomainObjPtr obj) +{ + qemuDomainObjPrivatePtr priv = obj->privateData; + + qemuDomainObjResetJob(priv); + virCondSignal(&priv->job.cond); + + /* safe to ignore since the surrounding async job increased the reference + * counter as well */ + ignore_value(virDomainObjUnref(obj)); +} + + +static int qemuDomainObjEnterMonitorInternal(struct qemud_driver *driver, virDomainObjPtr obj) { qemuDomainObjPrivatePtr priv = obj->privateData; + if (priv->job.active == QEMU_JOB_NONE && priv->job.asyncJob) { + if (qemuDomainObjBeginNestedJob(obj) < 0) + return -1; + if (!virDomainObjIsActive(obj)) { + qemuReportError(VIR_ERR_OPERATION_FAILED, "%s", + _("domain is no longer running")); + return -1; + } + } + qemuMonitorLock(priv->mon); qemuMonitorRef(priv->mon); ignore_value(virTimeMs(&priv->monStart)); virDomainObjUnlock(obj); if (driver) qemuDriverUnlock(driver); + + return 0; } static void @@ -657,20 +798,24 @@ qemuDomainObjExitMonitorInternal(struct qemud_driver *driver, if (refs == 0) { priv->mon = NULL; } + + if (priv->job.active == QEMU_JOB_ASYNC_NESTED) + qemuDomainObjEndNestedJob(obj); } /* * obj must be locked before calling, qemud_driver must be unlocked * * To be called immediately before any QEMU monitor API call - * Must have already called qemuDomainObjBeginJob(), and checked - * that the VM is still active. + * Must have already either called qemuDomainObjBeginJob() and checked + * that the VM is still active or called qemuDomainObjBeginAsyncJob, in which + * case this will call qemuDomainObjBeginNestedJob. * * To be followed with qemuDomainObjExitMonitor() once complete */ -void qemuDomainObjEnterMonitor(virDomainObjPtr obj) +int qemuDomainObjEnterMonitor(virDomainObjPtr obj) { - qemuDomainObjEnterMonitorInternal(NULL, obj); + return qemuDomainObjEnterMonitorInternal(NULL, obj); } /* obj must NOT be locked before calling, qemud_driver must be unlocked @@ -686,14 +831,16 @@ void qemuDomainObjExitMonitor(virDomainObjPtr obj) * obj must be locked before calling, qemud_driver must be locked * * To be called immediately before any QEMU monitor API call - * Must have already called qemuDomainObjBeginJob(). + * Must have already either called qemuDomainObjBeginJobWithDriver() and + * checked that the VM is still active or called qemuDomainObjBeginAsyncJob, + * in which case this will call qemuDomainObjBeginNestedJobWithDriver. 
* * To be followed with qemuDomainObjExitMonitorWithDriver() once complete */ -void qemuDomainObjEnterMonitorWithDriver(struct qemud_driver *driver, - virDomainObjPtr obj) +int qemuDomainObjEnterMonitorWithDriver(struct qemud_driver *driver, + virDomainObjPtr obj) { - qemuDomainObjEnterMonitorInternal(driver, obj); + return qemuDomainObjEnterMonitorInternal(driver, obj); } /* obj must NOT be locked before calling, qemud_driver must be unlocked, diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index 214e578..85a3c03 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -30,16 +30,35 @@ # include "qemu_conf.h" # include "bitmap.h" +#define JOB_MASK(job) (1 << (job - 1)) +#define DEFAULT_JOB_MASK \ + (JOB_MASK(QEMU_JOB_QUERY) | JOB_MASK(QEMU_JOB_DESTROY)) + /* Only 1 job is allowed at any time * A job includes *all* monitor commands, even those just querying * information, not merely actions */ enum qemuDomainJob { QEMU_JOB_NONE = 0, /* Always set to 0 for easy if (jobActive) conditions */ - QEMU_JOB_UNSPECIFIED, - QEMU_JOB_MIGRATION_OUT, - QEMU_JOB_MIGRATION_IN, - QEMU_JOB_SAVE, - QEMU_JOB_DUMP, + QEMU_JOB_QUERY, /* Doesn't change any state */ + QEMU_JOB_DESTROY, /* Destroys the domain (cannot be masked out) */ + QEMU_JOB_SUSPEND, /* Suspends (stops vCPUs) the domain */ + QEMU_JOB_MODIFY, /* May change state */ + + /* The following two items must always be the last items */ + QEMU_JOB_ASYNC, /* Asynchronous job */ + QEMU_JOB_ASYNC_NESTED, /* Normal job within an async job */ +}; + +/* Async job consists of a series of jobs that may change state. Independent + * jobs that do not change state (and possibly others if explicitly allowed by + * current async job) are allowed to be run even if async job is active. + */ +enum qemuDomainAsyncJob { + QEMU_ASYNC_JOB_NONE = 0, + QEMU_ASYNC_JOB_MIGRATION_OUT, + QEMU_ASYNC_JOB_MIGRATION_IN, + QEMU_ASYNC_JOB_SAVE, + QEMU_ASYNC_JOB_DUMP, }; enum qemuDomainJobSignals { @@ -63,14 +82,16 @@ struct qemuDomainJobSignalsData { }; struct qemuDomainJobObj { - virCond cond; /* Use in conjunction with main virDomainObjPtr lock */ - virCond signalCond; /* Use to coordinate the safe queries during migration */ - - enum qemuDomainJob active; /* Currently running job */ + virCond cond; /* Use to coordinate jobs */ + enum qemuDomainJob active; /* Currently running job */ - unsigned long long start; /* When the job started */ - virDomainJobInfo info; /* Progress data */ + virCond asyncCond; /* Use to coordinate with async jobs */ + enum qemuDomainAsyncJob asyncJob; /* Currently active async job */ + unsigned long long mask; /* Jobs allowed during async job */ + unsigned long long start; /* When the async job started */ + virDomainJobInfo info; /* Async job progress data */ + virCond signalCond; /* Use to coordinate the safe queries during migration */ unsigned int signals; /* Signals for running job */ struct qemuDomainJobSignalsData signalsData; /* Signal specific data */ }; @@ -117,18 +138,43 @@ void qemuDomainEventQueue(struct qemud_driver *driver, void qemuDomainSetPrivateDataHooks(virCapsPtr caps); void qemuDomainSetNamespaceHooks(virCapsPtr caps); -int qemuDomainObjBeginJob(virDomainObjPtr obj) ATTRIBUTE_RETURN_CHECK; +int qemuDomainObjBeginJob(virDomainObjPtr obj, + enum qemuDomainJob job) + ATTRIBUTE_RETURN_CHECK; +int qemuDomainObjBeginAsyncJob(virDomainObjPtr obj, + enum qemuDomainAsyncJob asyncJob) + ATTRIBUTE_RETURN_CHECK; +int qemuDomainObjBeginNestedJob(virDomainObjPtr obj) + ATTRIBUTE_RETURN_CHECK; int 
qemuDomainObjBeginJobWithDriver(struct qemud_driver *driver, - virDomainObjPtr obj) ATTRIBUTE_RETURN_CHECK; -int qemuDomainObjEndJob(virDomainObjPtr obj) ATTRIBUTE_RETURN_CHECK; + virDomainObjPtr obj, + enum qemuDomainJob job) + ATTRIBUTE_RETURN_CHECK; +int qemuDomainObjBeginAsyncJobWithDriver(struct qemud_driver *driver, + virDomainObjPtr obj, + enum qemuDomainAsyncJob asyncJob) + ATTRIBUTE_RETURN_CHECK; +int qemuDomainObjBeginNestedJobWithDriver(struct qemud_driver *driver, + virDomainObjPtr obj) + ATTRIBUTE_RETURN_CHECK; + +int qemuDomainObjEndJob(virDomainObjPtr obj) + ATTRIBUTE_RETURN_CHECK; +int qemuDomainObjEndAsyncJob(virDomainObjPtr obj) + ATTRIBUTE_RETURN_CHECK; +void qemuDomainObjEndNestedJob(virDomainObjPtr obj); void qemuDomainObjSetJob(virDomainObjPtr obj, enum qemuDomainJob job); -void qemuDomainObjDiscardJob(virDomainObjPtr obj); +void qemuDomainObjSetAsyncJobMask(virDomainObjPtr obj, + unsigned long long allowedJobs); +void qemuDomainObjDiscardAsyncJob(virDomainObjPtr obj); -void qemuDomainObjEnterMonitor(virDomainObjPtr obj); +int qemuDomainObjEnterMonitor(virDomainObjPtr obj) + ATTRIBUTE_RETURN_CHECK; void qemuDomainObjExitMonitor(virDomainObjPtr obj); -void qemuDomainObjEnterMonitorWithDriver(struct qemud_driver *driver, - virDomainObjPtr obj); +int qemuDomainObjEnterMonitorWithDriver(struct qemud_driver *driver, + virDomainObjPtr obj) + ATTRIBUTE_RETURN_CHECK; void qemuDomainObjExitMonitorWithDriver(struct qemud_driver *driver, virDomainObjPtr obj); void qemuDomainObjEnterRemoteWithDriver(struct qemud_driver *driver, diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index e9cdcbe..96b3737 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -141,7 +141,8 @@ qemuAutostartDomain(void *payload, const void *name ATTRIBUTE_UNUSED, void *opaq virDomainObjLock(vm); virResetLastError(); - if (qemuDomainObjBeginJobWithDriver(data->driver, vm) < 0) { + if (qemuDomainObjBeginJobWithDriver(data->driver, vm, + QEMU_JOB_MODIFY) < 0) { err = virGetLastError(); VIR_ERROR(_("Failed to start job on VM '%s': %s"), vm->def->name, @@ -1274,7 +1275,7 @@ static virDomainPtr qemudDomainCreate(virConnectPtr conn, const char *xml, def = NULL; - if (qemuDomainObjBeginJobWithDriver(driver, vm) < 0) + if (qemuDomainObjBeginJobWithDriver(driver, vm, QEMU_JOB_MODIFY) < 0) goto cleanup; /* XXXX free the 'vm' we created ? 
*/ if (qemuProcessStart(conn, driver, vm, NULL, @@ -1343,7 +1344,7 @@ static int qemudDomainSuspend(virDomainPtr dom) { priv = vm->privateData; - if (priv->job.active == QEMU_JOB_MIGRATION_OUT) { + if (priv->job.asyncJob == QEMU_ASYNC_JOB_MIGRATION_OUT) { if (virDomainObjGetState(vm, NULL) != VIR_DOMAIN_PAUSED) { VIR_DEBUG("Requesting domain pause on %s", vm->def->name); @@ -1352,7 +1353,7 @@ static int qemudDomainSuspend(virDomainPtr dom) { ret = 0; goto cleanup; } else { - if (qemuDomainObjBeginJobWithDriver(driver, vm) < 0) + if (qemuDomainObjBeginJobWithDriver(driver, vm, QEMU_JOB_SUSPEND) < 0) goto cleanup; if (!virDomainObjIsActive(vm)) { @@ -1405,7 +1406,7 @@ static int qemudDomainResume(virDomainPtr dom) { goto cleanup; } - if (qemuDomainObjBeginJobWithDriver(driver, vm) < 0) + if (qemuDomainObjBeginJobWithDriver(driver, vm, QEMU_JOB_MODIFY) < 0) goto cleanup; if (!virDomainObjIsActive(vm)) { @@ -1461,7 +1462,7 @@ static int qemuDomainShutdown(virDomainPtr dom) { goto cleanup; } - if (qemuDomainObjBeginJob(vm) < 0) + if (qemuDomainObjBeginJob(vm, QEMU_JOB_MODIFY) < 0) goto cleanup; if (!virDomainObjIsActive(vm)) { @@ -1471,7 +1472,7 @@ static int qemuDomainShutdown(virDomainPtr dom) { } priv = vm->privateData; - qemuDomainObjEnterMonitor(vm); + ignore_value(qemuDomainObjEnterMonitor(vm)); ret = qemuMonitorSystemPowerdown(priv->mon); qemuDomainObjExitMonitor(vm); @@ -1511,7 +1512,7 @@ static int qemuDomainReboot(virDomainPtr dom, unsigned int flags) { #if HAVE_YAJL if (qemuCapsGet(priv->qemuCaps, QEMU_CAPS_MONITOR_JSON)) { - if (qemuDomainObjBeginJob(vm) < 0) + if (qemuDomainObjBeginJob(vm, QEMU_JOB_MODIFY) < 0) goto cleanup; if (!virDomainObjIsActive(vm)) { @@ -1520,7 +1521,7 @@ static int qemuDomainReboot(virDomainPtr dom, unsigned int flags) { goto endjob; } - qemuDomainObjEnterMonitor(vm); + ignore_value(qemuDomainObjEnterMonitor(vm)); ret = qemuMonitorSystemPowerdown(priv->mon); qemuDomainObjExitMonitor(vm); @@ -1571,7 +1572,7 @@ static int qemudDomainDestroy(virDomainPtr dom) { */ qemuProcessKill(vm); - if (qemuDomainObjBeginJobWithDriver(driver, vm) < 0) + if (qemuDomainObjBeginJobWithDriver(driver, vm, QEMU_JOB_DESTROY) < 0) goto cleanup; if (!virDomainObjIsActive(vm)) { @@ -1684,7 +1685,7 @@ static int qemudDomainSetMemoryFlags(virDomainPtr dom, unsigned long newmem, goto cleanup; } - if (qemuDomainObjBeginJob(vm) < 0) + if (qemuDomainObjBeginJob(vm, QEMU_JOB_MODIFY) < 0) goto cleanup; isActive = virDomainObjIsActive(vm); @@ -1749,7 +1750,7 @@ static int qemudDomainSetMemoryFlags(virDomainPtr dom, unsigned long newmem, if (flags & VIR_DOMAIN_AFFECT_LIVE) { priv = vm->privateData; - qemuDomainObjEnterMonitor(vm); + ignore_value(qemuDomainObjEnterMonitor(vm)); r = qemuMonitorSetBalloon(priv->mon, newmem); qemuDomainObjExitMonitor(vm); qemuAuditMemory(vm, vm->def->mem.cur_balloon, newmem, "update", @@ -1821,9 +1822,9 @@ static int qemuDomainInjectNMI(virDomainPtr domain, unsigned int flags) priv = vm->privateData; - if (qemuDomainObjBeginJobWithDriver(driver, vm) < 0) + if (qemuDomainObjBeginJobWithDriver(driver, vm, QEMU_JOB_MODIFY) < 0) goto cleanup; - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); ret = qemuMonitorInjectNMI(priv->mon); qemuDomainObjExitMonitorWithDriver(driver, vm); if (qemuDomainObjEndJob(vm) == 0) { @@ -1879,12 +1880,12 @@ static int qemudDomainGetInfo(virDomainPtr dom, (vm->def->memballoon->model == VIR_DOMAIN_MEMBALLOON_MODEL_NONE)) { info->memory = vm->def->mem.max_balloon; } else if 
(!priv->job.active) { - if (qemuDomainObjBeginJob(vm) < 0) + if (qemuDomainObjBeginJob(vm, QEMU_JOB_QUERY) < 0) goto cleanup; if (!virDomainObjIsActive(vm)) err = 0; else { - qemuDomainObjEnterMonitor(vm); + ignore_value(qemuDomainObjEnterMonitor(vm)); err = qemuMonitorGetBalloonInfo(priv->mon, &balloon); qemuDomainObjExitMonitor(vm); } @@ -2122,11 +2123,10 @@ static int qemudDomainSaveFlag(struct qemud_driver *driver, virDomainPtr dom, priv = vm->privateData; - if (qemuDomainObjBeginJobWithDriver(driver, vm) < 0) + if (qemuDomainObjBeginAsyncJobWithDriver(driver, vm, + QEMU_ASYNC_JOB_SAVE) < 0) goto cleanup; - qemuDomainObjSetJob(vm, QEMU_JOB_SAVE); - memset(&priv->job.info, 0, sizeof(priv->job.info)); priv->job.info.type = VIR_DOMAIN_JOB_UNBOUNDED; @@ -2294,7 +2294,7 @@ static int qemudDomainSaveFlag(struct qemud_driver *driver, virDomainPtr dom, VIR_DOMAIN_EVENT_STOPPED, VIR_DOMAIN_EVENT_STOPPED_SAVED); if (!vm->persistent) { - if (qemuDomainObjEndJob(vm) > 0) + if (qemuDomainObjEndAsyncJob(vm) > 0) virDomainRemoveInactive(&driver->domains, vm); vm = NULL; @@ -2310,7 +2310,7 @@ endjob: VIR_WARN("Unable to resume guest CPUs after save failure"); } } - if (qemuDomainObjEndJob(vm) == 0) + if (qemuDomainObjEndAsyncJob(vm) == 0) vm = NULL; } @@ -2608,7 +2608,8 @@ static int qemudDomainCoreDump(virDomainPtr dom, } priv = vm->privateData; - if (qemuDomainObjBeginJobWithDriver(driver, vm) < 0) + if (qemuDomainObjBeginAsyncJobWithDriver(driver, vm, + QEMU_ASYNC_JOB_DUMP) < 0) goto cleanup; if (!virDomainObjIsActive(vm)) { @@ -2617,8 +2618,6 @@ static int qemudDomainCoreDump(virDomainPtr dom, goto endjob; } - qemuDomainObjSetJob(vm, QEMU_JOB_DUMP); - /* Migrate will always stop the VM, so the resume condition is independent of whether the stop command is issued. */ resume = virDomainObjGetState(vm, NULL) == VIR_DOMAIN_RUNNING; @@ -2664,7 +2663,7 @@ endjob: } } - if (qemuDomainObjEndJob(vm) == 0) + if (qemuDomainObjEndAsyncJob(vm) == 0) vm = NULL; else if ((ret == 0) && (flags & VIR_DUMP_CRASH) && !vm->persistent) { virDomainRemoveInactive(&driver->domains, @@ -2708,7 +2707,7 @@ qemuDomainScreenshot(virDomainPtr dom, priv = vm->privateData; - if (qemuDomainObjBeginJob(vm) < 0) + if (qemuDomainObjBeginJob(vm, QEMU_JOB_QUERY) < 0) goto cleanup; if (!virDomainObjIsActive(vm)) { @@ -2738,7 +2737,7 @@ qemuDomainScreenshot(virDomainPtr dom, virSecurityManagerSetSavedStateLabel(qemu_driver->securityManager, vm, tmp); - qemuDomainObjEnterMonitor(vm); + ignore_value(qemuDomainObjEnterMonitor(vm)); if (qemuMonitorScreendump(priv->mon, tmp) < 0) { qemuDomainObjExitMonitor(vm); goto endjob; @@ -2793,7 +2792,8 @@ static void processWatchdogEvent(void *data, void *opaque) goto unlock; } - if (qemuDomainObjBeginJobWithDriver(driver, wdEvent->vm) < 0) { + if (qemuDomainObjBeginAsyncJobWithDriver(driver, wdEvent->vm, + QEMU_ASYNC_JOB_DUMP) < 0) { VIR_FREE(dumpfile); goto unlock; } @@ -2831,7 +2831,7 @@ endjob: /* Safe to ignore value since ref count was incremented in * qemuProcessHandleWatchdog(). 
*/ - ignore_value(qemuDomainObjEndJob(wdEvent->vm)); + ignore_value(qemuDomainObjEndAsyncJob(wdEvent->vm)); unlock: if (virDomainObjUnref(wdEvent->vm) > 0) @@ -2848,7 +2848,7 @@ static int qemudDomainHotplugVcpus(virDomainObjPtr vm, unsigned int nvcpus) int oldvcpus = vm->def->vcpus; int vcpus = oldvcpus; - qemuDomainObjEnterMonitor(vm); + ignore_value(qemuDomainObjEnterMonitor(vm)); /* We need different branches here, because we want to offline * in reverse order to onlining, so any partial fail leaves us in a @@ -2934,7 +2934,7 @@ qemudDomainSetVcpusFlags(virDomainPtr dom, unsigned int nvcpus, goto cleanup; } - if (qemuDomainObjBeginJob(vm) < 0) + if (qemuDomainObjBeginJob(vm, QEMU_JOB_MODIFY) < 0) goto cleanup; if (!virDomainObjIsActive(vm) && (flags & VIR_DOMAIN_AFFECT_LIVE)) { @@ -3755,7 +3755,7 @@ qemuDomainRestore(virConnectPtr conn, } def = NULL; - if (qemuDomainObjBeginJobWithDriver(driver, vm) < 0) + if (qemuDomainObjBeginJobWithDriver(driver, vm, QEMU_JOB_MODIFY) < 0) goto cleanup; ret = qemuDomainSaveImageStartVM(conn, driver, vm, &fd, &header, path); @@ -3848,10 +3848,10 @@ static char *qemuDomainGetXMLDesc(virDomainPtr dom, /* Don't delay if someone's using the monitor, just use * existing most recent data instead */ if (!priv->job.active) { - if (qemuDomainObjBeginJobWithDriver(driver, vm) < 0) + if (qemuDomainObjBeginJobWithDriver(driver, vm, QEMU_JOB_QUERY) < 0) goto cleanup; - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); err = qemuMonitorGetBalloonInfo(priv->mon, &balloon); qemuDomainObjExitMonitorWithDriver(driver, vm); if (qemuDomainObjEndJob(vm) == 0) { @@ -4078,7 +4078,7 @@ qemudDomainStartWithFlags(virDomainPtr dom, unsigned int flags) goto cleanup; } - if (qemuDomainObjBeginJobWithDriver(driver, vm) < 0) + if (qemuDomainObjBeginJobWithDriver(driver, vm, QEMU_JOB_MODIFY) < 0) goto cleanup; if (virDomainObjIsActive(vm)) { @@ -4823,7 +4823,7 @@ qemuDomainModifyDeviceFlags(virDomainPtr dom, const char *xml, goto cleanup; } - if (qemuDomainObjBeginJobWithDriver(driver, vm) < 0) + if (qemuDomainObjBeginJobWithDriver(driver, vm, QEMU_JOB_MODIFY) < 0) goto cleanup; if (virDomainObjIsActive(vm)) { @@ -6006,8 +6006,8 @@ qemudDomainBlockStats (virDomainPtr dom, } priv = vm->privateData; - if ((priv->job.active == QEMU_JOB_MIGRATION_OUT) - || (priv->job.active == QEMU_JOB_SAVE)) { + if ((priv->job.asyncJob == QEMU_ASYNC_JOB_MIGRATION_OUT) + || (priv->job.asyncJob == QEMU_ASYNC_JOB_SAVE)) { virDomainObjRef(vm); while (priv->job.signals & QEMU_JOB_SIGNAL_BLKSTAT) ignore_value(virCondWait(&priv->job.signalCond, &vm->lock)); @@ -6023,7 +6023,7 @@ qemudDomainBlockStats (virDomainPtr dom, if (virDomainObjUnref(vm) == 0) vm = NULL; } else { - if (qemuDomainObjBeginJob(vm) < 0) + if (qemuDomainObjBeginJob(vm, QEMU_JOB_QUERY) < 0) goto cleanup; if (!virDomainObjIsActive(vm)) { @@ -6032,7 +6032,7 @@ qemudDomainBlockStats (virDomainPtr dom, goto endjob; } - qemuDomainObjEnterMonitor(vm); + ignore_value(qemuDomainObjEnterMonitor(vm)); ret = qemuMonitorGetBlockStatsInfo(priv->mon, disk->info.alias, &stats->rd_req, @@ -6135,12 +6135,12 @@ qemudDomainMemoryStats (virDomainPtr dom, goto cleanup; } - if (qemuDomainObjBeginJob(vm) < 0) + if (qemuDomainObjBeginJob(vm, QEMU_JOB_QUERY) < 0) goto cleanup; if (virDomainObjIsActive(vm)) { qemuDomainObjPrivatePtr priv = vm->privateData; - qemuDomainObjEnterMonitor(vm); + ignore_value(qemuDomainObjEnterMonitor(vm)); ret = qemuMonitorGetMemoryStats(priv->mon, stats, nr_stats); 
qemuDomainObjExitMonitor(vm); } else { @@ -6259,7 +6259,7 @@ qemudDomainMemoryPeek (virDomainPtr dom, goto cleanup; } - if (qemuDomainObjBeginJob(vm) < 0) + if (qemuDomainObjBeginJob(vm, QEMU_JOB_QUERY) < 0) goto cleanup; if (!virDomainObjIsActive(vm)) { @@ -6283,7 +6283,7 @@ qemudDomainMemoryPeek (virDomainPtr dom, virSecurityManagerSetSavedStateLabel(qemu_driver->securityManager, vm, tmp); priv = vm->privateData; - qemuDomainObjEnterMonitor(vm); + ignore_value(qemuDomainObjEnterMonitor(vm)); if (flags == VIR_MEMORY_VIRTUAL) { if (qemuMonitorSaveVirtualMemory(priv->mon, offset, size, tmp) < 0) { qemuDomainObjExitMonitor(vm); @@ -6453,8 +6453,8 @@ static int qemuDomainGetBlockInfo(virDomainPtr dom, virDomainObjIsActive(vm)) { qemuDomainObjPrivatePtr priv = vm->privateData; - if ((priv->job.active == QEMU_JOB_MIGRATION_OUT) - || (priv->job.active == QEMU_JOB_SAVE)) { + if ((priv->job.asyncJob == QEMU_ASYNC_JOB_MIGRATION_OUT) + || (priv->job.asyncJob == QEMU_ASYNC_JOB_SAVE)) { virDomainObjRef(vm); while (priv->job.signals & QEMU_JOB_SIGNAL_BLKINFO) ignore_value(virCondWait(&priv->job.signalCond, &vm->lock)); @@ -6470,11 +6470,11 @@ static int qemuDomainGetBlockInfo(virDomainPtr dom, if (virDomainObjUnref(vm) == 0) vm = NULL; } else { - if (qemuDomainObjBeginJob(vm) < 0) + if (qemuDomainObjBeginJob(vm, QEMU_JOB_QUERY) < 0) goto cleanup; if (virDomainObjIsActive(vm)) { - qemuDomainObjEnterMonitor(vm); + ignore_value(qemuDomainObjEnterMonitor(vm)); ret = qemuMonitorGetBlockExtent(priv->mon, disk->info.alias, &info->allocation); @@ -7083,7 +7083,7 @@ qemuDomainMigrateConfirm3(virDomainPtr domain, goto cleanup; } - if (qemuDomainObjBeginJobWithDriver(driver, vm) < 0) + if (qemuDomainObjBeginJobWithDriver(driver, vm, QEMU_JOB_MODIFY) < 0) goto cleanup; ret = qemuMigrationConfirm(driver, domain->conn, vm, @@ -7293,7 +7293,7 @@ static int qemuDomainGetJobInfo(virDomainPtr dom, priv = vm->privateData; if (virDomainObjIsActive(vm)) { - if (priv->job.active) { + if (priv->job.asyncJob) { memcpy(info, &priv->job.info, sizeof(*info)); /* Refresh elapsed time again just to ensure it @@ -7343,7 +7343,7 @@ static int qemuDomainAbortJob(virDomainPtr dom) { priv = vm->privateData; if (virDomainObjIsActive(vm)) { - if (priv->job.active) { + if (priv->job.asyncJob) { VIR_DEBUG("Requesting cancellation of job on vm %s", vm->def->name); priv->job.signals |= QEMU_JOB_SIGNAL_CANCEL; } else { @@ -7397,7 +7397,7 @@ qemuDomainMigrateSetMaxDowntime(virDomainPtr dom, priv = vm->privateData; - if (priv->job.active != QEMU_JOB_MIGRATION_OUT) { + if (priv->job.asyncJob != QEMU_ASYNC_JOB_MIGRATION_OUT) { qemuReportError(VIR_ERR_OPERATION_INVALID, "%s", _("domain is not being migrated")); goto cleanup; @@ -7446,7 +7446,7 @@ qemuDomainMigrateSetMaxSpeed(virDomainPtr dom, priv = vm->privateData; - if (priv->job.active != QEMU_JOB_MIGRATION_OUT) { + if (priv->job.asyncJob != QEMU_ASYNC_JOB_MIGRATION_OUT) { qemuReportError(VIR_ERR_OPERATION_INVALID, "%s", _("domain is not being migrated")); goto cleanup; @@ -7639,7 +7639,7 @@ qemuDomainSnapshotCreateActive(virConnectPtr conn, bool resume = false; int ret = -1; - if (qemuDomainObjBeginJobWithDriver(driver, vm) < 0) + if (qemuDomainObjBeginJobWithDriver(driver, vm, QEMU_JOB_MODIFY) < 0) return -1; if (virDomainObjGetState(vm, NULL) == VIR_DOMAIN_RUNNING) { @@ -7658,7 +7658,7 @@ qemuDomainSnapshotCreateActive(virConnectPtr conn, } } - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); ret = 
qemuMonitorCreateSnapshot(priv->mon, snap->def->name); qemuDomainObjExitMonitorWithDriver(driver, vm); @@ -7984,7 +7984,7 @@ static int qemuDomainRevertToSnapshot(virDomainSnapshotPtr snapshot, vm->current_snapshot = snap; - if (qemuDomainObjBeginJobWithDriver(driver, vm) < 0) + if (qemuDomainObjBeginJobWithDriver(driver, vm, QEMU_JOB_MODIFY) < 0) goto cleanup; if (snap->def->state == VIR_DOMAIN_RUNNING @@ -7992,7 +7992,7 @@ static int qemuDomainRevertToSnapshot(virDomainSnapshotPtr snapshot, if (virDomainObjIsActive(vm)) { priv = vm->privateData; - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); rc = qemuMonitorLoadSnapshot(priv->mon, snap->def->name); qemuDomainObjExitMonitorWithDriver(driver, vm); if (rc < 0) @@ -8116,7 +8116,7 @@ static int qemuDomainSnapshotDiscard(struct qemud_driver *driver, } else { priv = vm->privateData; - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); /* we continue on even in the face of error */ qemuMonitorDeleteSnapshot(priv->mon, snap->def->name); qemuDomainObjExitMonitorWithDriver(driver, vm); @@ -8255,7 +8255,7 @@ static int qemuDomainSnapshotDelete(virDomainSnapshotPtr snapshot, goto cleanup; } - if (qemuDomainObjBeginJobWithDriver(driver, vm) < 0) + if (qemuDomainObjBeginJobWithDriver(driver, vm, QEMU_JOB_MODIFY) < 0) goto cleanup; if (flags & VIR_DOMAIN_SNAPSHOT_DELETE_CHILDREN) { @@ -8324,9 +8324,9 @@ static int qemuDomainMonitorCommand(virDomainPtr domain, const char *cmd, hmp = !!(flags & VIR_DOMAIN_QEMU_MONITOR_COMMAND_HMP); - if (qemuDomainObjBeginJobWithDriver(driver, vm) < 0) + if (qemuDomainObjBeginJobWithDriver(driver, vm, QEMU_JOB_MODIFY) < 0) goto cleanup; - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); ret = qemuMonitorArbitraryCommand(priv->mon, cmd, result, hmp); qemuDomainObjExitMonitorWithDriver(driver, vm); if (qemuDomainObjEndJob(vm) == 0) { diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c index a7f11ab..a7571cd 100644 --- a/src/qemu/qemu_hotplug.c +++ b/src/qemu/qemu_hotplug.c @@ -96,7 +96,7 @@ int qemuDomainChangeEjectableMedia(struct qemud_driver *driver, if (!(driveAlias = qemuDeviceDriveHostAlias(origdisk, priv->qemuCaps))) goto error; - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); if (disk->src) { const char *format = NULL; if (disk->type != VIR_DOMAIN_DISK_TYPE_DIR) { @@ -198,7 +198,7 @@ int qemuDomainAttachPciDiskDevice(struct qemud_driver *driver, goto error; } - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); if (qemuCapsGet(priv->qemuCaps, QEMU_CAPS_DEVICE)) { ret = qemuMonitorAddDrive(priv->mon, drivestr); if (ret == 0) { @@ -295,7 +295,7 @@ int qemuDomainAttachPciControllerDevice(struct qemud_driver *driver, goto cleanup; } - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); if (qemuCapsGet(priv->qemuCaps, QEMU_CAPS_DEVICE)) { ret = qemuMonitorAddDevice(priv->mon, devstr); } else { @@ -440,7 +440,7 @@ int qemuDomainAttachSCSIDisk(struct qemud_driver *driver, goto error; } - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); if (qemuCapsGet(priv->qemuCaps, QEMU_CAPS_DEVICE)) { ret = qemuMonitorAddDrive(priv->mon, drivestr); if (ret == 0) { @@ -542,7 +542,7 
@@ int qemuDomainAttachUsbMassstorageDevice(struct qemud_driver *driver, goto error; } - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); if (qemuCapsGet(priv->qemuCaps, QEMU_CAPS_DEVICE)) { ret = qemuMonitorAddDrive(priv->mon, drivestr); if (ret == 0) { @@ -675,7 +675,7 @@ int qemuDomainAttachNetDevice(virConnectPtr conn, goto cleanup; } - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); if (qemuCapsGet(priv->qemuCaps, QEMU_CAPS_NETDEV) && qemuCapsGet(priv->qemuCaps, QEMU_CAPS_DEVICE)) { if (qemuMonitorAddNetdev(priv->mon, netstr, tapfd, tapfd_name, @@ -711,7 +711,7 @@ int qemuDomainAttachNetDevice(virConnectPtr conn, goto try_remove; } - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); if (qemuCapsGet(priv->qemuCaps, QEMU_CAPS_DEVICE)) { if (qemuMonitorAddDevice(priv->mon, nicstr) < 0) { qemuDomainObjExitMonitorWithDriver(driver, vm); @@ -767,7 +767,7 @@ try_remove: char *netdev_name; if (virAsprintf(&netdev_name, "host%s", net->info.alias) < 0) goto no_memory; - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); if (qemuMonitorRemoveNetdev(priv->mon, netdev_name) < 0) VIR_WARN("Failed to remove network backend for netdev %s", netdev_name); @@ -780,7 +780,7 @@ try_remove: char *hostnet_name; if (virAsprintf(&hostnet_name, "host%s", net->info.alias) < 0) goto no_memory; - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); if (qemuMonitorRemoveHostNetwork(priv->mon, vlan, hostnet_name) < 0) VIR_WARN("Failed to remove network backend for vlan %d, net %s", vlan, hostnet_name); @@ -841,14 +841,14 @@ int qemuDomainAttachHostPciDevice(struct qemud_driver *driver, priv->qemuCaps))) goto error; - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); ret = qemuMonitorAddDeviceWithFd(priv->mon, devstr, configfd, configfd_name); qemuDomainObjExitMonitorWithDriver(driver, vm); } else { virDomainDevicePCIAddress guestAddr; - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); ret = qemuMonitorAddPCIHostDevice(priv->mon, &hostdev->source.subsys.u.pci, &guestAddr); @@ -929,7 +929,7 @@ int qemuDomainAttachHostUsbDevice(struct qemud_driver *driver, goto error; } - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); if (qemuCapsGet(priv->qemuCaps, QEMU_CAPS_DEVICE)) ret = qemuMonitorAddDevice(priv->mon, devstr); else @@ -1237,7 +1237,7 @@ int qemuDomainDetachPciDiskDevice(struct qemud_driver *driver, goto cleanup; } - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); if (qemuCapsGet(priv->qemuCaps, QEMU_CAPS_DEVICE)) { if (qemuMonitorDelDevice(priv->mon, detach->info.alias) < 0) { qemuDomainObjExitMonitor(vm); @@ -1333,7 +1333,7 @@ int qemuDomainDetachDiskDevice(struct qemud_driver *driver, goto cleanup; } - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); if (qemuMonitorDelDevice(priv->mon, detach->info.alias) < 0) { qemuDomainObjExitMonitor(vm); qemuAuditDisk(vm, detach, NULL, "detach", false); @@ -1471,7 +1471,7 @@ int qemuDomainDetachPciControllerDevice(struct qemud_driver 
*driver, goto cleanup; } - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); if (qemuCapsGet(priv->qemuCaps, QEMU_CAPS_DEVICE)) { if (qemuMonitorDelDevice(priv->mon, detach->info.alias)) { qemuDomainObjExitMonitor(vm); @@ -1566,7 +1566,7 @@ int qemuDomainDetachNetDevice(struct qemud_driver *driver, goto cleanup; } - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); if (qemuCapsGet(priv->qemuCaps, QEMU_CAPS_DEVICE)) { if (qemuMonitorDelDevice(priv->mon, detach->info.alias) < 0) { qemuDomainObjExitMonitor(vm); @@ -1701,7 +1701,7 @@ int qemuDomainDetachHostPciDevice(struct qemud_driver *driver, return -1; } - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); if (qemuCapsGet(priv->qemuCaps, QEMU_CAPS_DEVICE)) { ret = qemuMonitorDelDevice(priv->mon, detach->info.alias); } else { @@ -1804,7 +1804,7 @@ int qemuDomainDetachHostUsbDevice(struct qemud_driver *driver, return -1; } - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); ret = qemuMonitorDelDevice(priv->mon, detach->info.alias); qemuDomainObjExitMonitorWithDriver(driver, vm); qemuAuditHostdev(vm, detach, "detach", ret == 0); @@ -1879,7 +1879,7 @@ qemuDomainChangeGraphicsPasswords(struct qemud_driver *driver, if (!auth->passwd && !driver->vncPassword) return 0; - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); ret = qemuMonitorSetPassword(priv->mon, type, auth->passwd ? auth->passwd : defaultPasswd, diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 3634966..0712d73 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -749,9 +749,11 @@ qemuMigrationProcessJobSignals(struct qemud_driver *driver, if (priv->job.signals & QEMU_JOB_SIGNAL_CANCEL) { priv->job.signals ^= QEMU_JOB_SIGNAL_CANCEL; VIR_DEBUG("Cancelling job at client request"); - qemuDomainObjEnterMonitorWithDriver(driver, vm); - ret = qemuMonitorMigrateCancel(priv->mon); - qemuDomainObjExitMonitorWithDriver(driver, vm); + ret = qemuDomainObjEnterMonitorWithDriver(driver, vm); + if (ret == 0) { + ret = qemuMonitorMigrateCancel(priv->mon); + qemuDomainObjExitMonitorWithDriver(driver, vm); + } if (ret < 0) { VIR_WARN("Unable to cancel job"); } @@ -766,9 +768,11 @@ qemuMigrationProcessJobSignals(struct qemud_driver *driver, priv->job.signals ^= QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME; priv->job.signalsData.migrateDowntime = 0; VIR_DEBUG("Setting migration downtime to %llums", ms); - qemuDomainObjEnterMonitorWithDriver(driver, vm); - ret = qemuMonitorSetMigrationDowntime(priv->mon, ms); - qemuDomainObjExitMonitorWithDriver(driver, vm); + ret = qemuDomainObjEnterMonitorWithDriver(driver, vm); + if (ret == 0) { + ret = qemuMonitorSetMigrationDowntime(priv->mon, ms); + qemuDomainObjExitMonitorWithDriver(driver, vm); + } if (ret < 0) VIR_WARN("Unable to set migration downtime"); } else if (priv->job.signals & QEMU_JOB_SIGNAL_MIGRATE_SPEED) { @@ -777,21 +781,25 @@ qemuMigrationProcessJobSignals(struct qemud_driver *driver, priv->job.signals ^= QEMU_JOB_SIGNAL_MIGRATE_SPEED; priv->job.signalsData.migrateBandwidth = 0; VIR_DEBUG("Setting migration bandwidth to %luMbs", bandwidth); - qemuDomainObjEnterMonitorWithDriver(driver, vm); - ret = qemuMonitorSetMigrationSpeed(priv->mon, bandwidth); - qemuDomainObjExitMonitorWithDriver(driver, vm); + ret 
= qemuDomainObjEnterMonitorWithDriver(driver, vm); + if (ret == 0) { + ret = qemuMonitorSetMigrationSpeed(priv->mon, bandwidth); + qemuDomainObjExitMonitorWithDriver(driver, vm); + } if (ret < 0) VIR_WARN("Unable to set migration speed"); } else if (priv->job.signals & QEMU_JOB_SIGNAL_BLKSTAT) { - qemuDomainObjEnterMonitorWithDriver(driver, vm); - ret = qemuMonitorGetBlockStatsInfo(priv->mon, - priv->job.signalsData.statDevName, - &priv->job.signalsData.blockStat->rd_req, - &priv->job.signalsData.blockStat->rd_bytes, - &priv->job.signalsData.blockStat->wr_req, - &priv->job.signalsData.blockStat->wr_bytes, - &priv->job.signalsData.blockStat->errs); - qemuDomainObjExitMonitorWithDriver(driver, vm); + ret = qemuDomainObjEnterMonitorWithDriver(driver, vm); + if (ret == 0) { + ret = qemuMonitorGetBlockStatsInfo(priv->mon, + priv->job.signalsData.statDevName, + &priv->job.signalsData.blockStat->rd_req, + &priv->job.signalsData.blockStat->rd_bytes, + &priv->job.signalsData.blockStat->wr_req, + &priv->job.signalsData.blockStat->wr_bytes, + &priv->job.signalsData.blockStat->errs); + qemuDomainObjExitMonitorWithDriver(driver, vm); + } *priv->job.signalsData.statRetCode = ret; priv->job.signals ^= QEMU_JOB_SIGNAL_BLKSTAT; @@ -799,11 +807,13 @@ qemuMigrationProcessJobSignals(struct qemud_driver *driver, if (ret < 0) VIR_WARN("Unable to get block statistics"); } else if (priv->job.signals & QEMU_JOB_SIGNAL_BLKINFO) { - qemuDomainObjEnterMonitorWithDriver(driver, vm); - ret = qemuMonitorGetBlockExtent(priv->mon, - priv->job.signalsData.infoDevName, - &priv->job.signalsData.blockInfo->allocation); - qemuDomainObjExitMonitorWithDriver(driver, vm); + ret = qemuDomainObjEnterMonitorWithDriver(driver, vm); + if (ret == 0) { + ret = qemuMonitorGetBlockExtent(priv->mon, + priv->job.signalsData.infoDevName, + &priv->job.signalsData.blockInfo->allocation); + qemuDomainObjExitMonitorWithDriver(driver, vm); + } *priv->job.signalsData.infoRetCode = ret; priv->job.signals ^= QEMU_JOB_SIGNAL_BLKINFO; @@ -836,13 +846,15 @@ qemuMigrationUpdateJobStatus(struct qemud_driver *driver, return -1; } - qemuDomainObjEnterMonitorWithDriver(driver, vm); - ret = qemuMonitorGetMigrationStatus(priv->mon, - &status, - &memProcessed, - &memRemaining, - &memTotal); - qemuDomainObjExitMonitorWithDriver(driver, vm); + ret = qemuDomainObjEnterMonitorWithDriver(driver, vm); + if (ret == 0) { + ret = qemuMonitorGetMigrationStatus(priv->mon, + &status, + &memProcessed, + &memRemaining, + &memTotal); + qemuDomainObjExitMonitorWithDriver(driver, vm); + } if (ret < 0 || virTimeMs(&priv->job.info.timeElapsed) < 0) { priv->job.info.type = VIR_DOMAIN_JOB_FAILED; @@ -897,14 +909,14 @@ qemuMigrationWaitForCompletion(struct qemud_driver *driver, virDomainObjPtr vm) qemuDomainObjPrivatePtr priv = vm->privateData; const char *job; - switch (priv->job.active) { - case QEMU_JOB_MIGRATION_OUT: + switch (priv->job.asyncJob) { + case QEMU_ASYNC_JOB_MIGRATION_OUT: job = _("migration job"); break; - case QEMU_JOB_SAVE: + case QEMU_ASYNC_JOB_SAVE: job = _("domain save job"); break; - case QEMU_JOB_DUMP: + case QEMU_ASYNC_JOB_DUMP: job = _("domain core dump job"); break; default: @@ -969,14 +981,16 @@ qemuDomainMigrateGraphicsRelocate(struct qemud_driver *driver, if (cookie->graphics->type != VIR_DOMAIN_GRAPHICS_TYPE_SPICE) return 0; - qemuDomainObjEnterMonitorWithDriver(driver, vm); - ret = qemuMonitorGraphicsRelocate(priv->mon, - cookie->graphics->type, - cookie->remoteHostname, - cookie->graphics->port, - cookie->graphics->tlsPort, - 
cookie->graphics->tlsSubject); - qemuDomainObjExitMonitorWithDriver(driver, vm); + ret = qemuDomainObjEnterMonitorWithDriver(driver, vm); + if (ret == 0) { + ret = qemuMonitorGraphicsRelocate(priv->mon, + cookie->graphics->type, + cookie->remoteHostname, + cookie->graphics->port, + cookie->graphics->tlsPort, + cookie->graphics->tlsSubject); + qemuDomainObjExitMonitorWithDriver(driver, vm); + } return ret; } @@ -1108,9 +1122,9 @@ qemuMigrationPrepareTunnel(struct qemud_driver *driver, QEMU_MIGRATION_COOKIE_LOCKSTATE))) goto cleanup; - if (qemuDomainObjBeginJobWithDriver(driver, vm) < 0) + if (qemuDomainObjBeginAsyncJobWithDriver(driver, vm, + QEMU_ASYNC_JOB_MIGRATION_IN) < 0) goto cleanup; - qemuDomainObjSetJob(vm, QEMU_JOB_MIGRATION_IN); /* Domain starts inactive, even if the domain XML had an id field. */ vm->def->id = -1; @@ -1144,7 +1158,7 @@ qemuMigrationPrepareTunnel(struct qemud_driver *driver, qemuAuditDomainStart(vm, "migrated", false); qemuProcessStop(driver, vm, 0, VIR_DOMAIN_SHUTOFF_FAILED); if (!vm->persistent) { - if (qemuDomainObjEndJob(vm) > 0) + if (qemuDomainObjEndAsyncJob(vm) > 0) virDomainRemoveInactive(&driver->domains, vm); vm = NULL; } @@ -1173,7 +1187,7 @@ qemuMigrationPrepareTunnel(struct qemud_driver *driver, endjob: if (vm && - qemuDomainObjEndJob(vm) == 0) + qemuDomainObjEndAsyncJob(vm) == 0) vm = NULL; /* We set a fake job active which is held across @@ -1183,7 +1197,7 @@ endjob: */ if (vm && virDomainObjIsActive(vm)) { - qemuDomainObjSetJob(vm, QEMU_JOB_MIGRATION_IN); + priv->job.asyncJob = QEMU_ASYNC_JOB_MIGRATION_IN; priv->job.info.type = VIR_DOMAIN_JOB_UNBOUNDED; priv->job.start = now; } @@ -1343,9 +1357,9 @@ qemuMigrationPrepareDirect(struct qemud_driver *driver, QEMU_MIGRATION_COOKIE_LOCKSTATE))) goto cleanup; - if (qemuDomainObjBeginJobWithDriver(driver, vm) < 0) + if (qemuDomainObjBeginAsyncJobWithDriver(driver, vm, + QEMU_ASYNC_JOB_MIGRATION_IN) < 0) goto cleanup; - qemuDomainObjSetJob(vm, QEMU_JOB_MIGRATION_IN); /* Domain starts inactive, even if the domain XML had an id field. */ vm->def->id = -1; @@ -1361,7 +1375,7 @@ qemuMigrationPrepareDirect(struct qemud_driver *driver, * should have already done that. 
*/ if (!vm->persistent) { - if (qemuDomainObjEndJob(vm) > 0) + if (qemuDomainObjEndAsyncJob(vm) > 0) virDomainRemoveInactive(&driver->domains, vm); vm = NULL; } @@ -1394,7 +1408,7 @@ qemuMigrationPrepareDirect(struct qemud_driver *driver, endjob: if (vm && - qemuDomainObjEndJob(vm) == 0) + qemuDomainObjEndAsyncJob(vm) == 0) vm = NULL; /* We set a fake job active which is held across @@ -1404,7 +1418,7 @@ endjob: */ if (vm && virDomainObjIsActive(vm)) { - qemuDomainObjSetJob(vm, QEMU_JOB_MIGRATION_IN); + priv->job.asyncJob = QEMU_ASYNC_JOB_MIGRATION_IN; priv->job.info.type = VIR_DOMAIN_JOB_UNBOUNDED; priv->job.start = now; } @@ -1488,7 +1502,9 @@ static int doNativeMigrate(struct qemud_driver *driver, goto cleanup; } - qemuDomainObjEnterMonitorWithDriver(driver, vm); + if (qemuDomainObjEnterMonitorWithDriver(driver, vm) < 0) + goto cleanup; + if (resource > 0 && qemuMonitorSetMigrationSpeed(priv->mon, resource) < 0) { qemuDomainObjExitMonitorWithDriver(driver, vm); @@ -1747,7 +1763,9 @@ static int doTunnelMigrate(struct qemud_driver *driver, goto cleanup; } - qemuDomainObjEnterMonitorWithDriver(driver, vm); + if (qemuDomainObjEnterMonitorWithDriver(driver, vm) < 0) + goto cleanup; + if (resource > 0 && qemuMonitorSetMigrationSpeed(priv->mon, resource) < 0) { qemuDomainObjExitMonitorWithDriver(driver, vm); @@ -1788,7 +1806,8 @@ static int doTunnelMigrate(struct qemud_driver *driver, /* it is also possible that the migrate didn't fail initially, but * rather failed later on. Check the output of "info migrate" */ - qemuDomainObjEnterMonitorWithDriver(driver, vm); + if (qemuDomainObjEnterMonitorWithDriver(driver, vm) < 0) + goto cancel; if (qemuMonitorGetMigrationStatus(priv->mon, &status, &transferred, @@ -1846,9 +1865,10 @@ cancel: if (ret != 0 && virDomainObjIsActive(vm)) { VIR_FORCE_CLOSE(client_sock); VIR_FORCE_CLOSE(qemu_sock); - qemuDomainObjEnterMonitorWithDriver(driver, vm); - qemuMonitorMigrateCancel(priv->mon); - qemuDomainObjExitMonitorWithDriver(driver, vm); + if (qemuDomainObjEnterMonitorWithDriver(driver, vm) == 0) { + qemuMonitorMigrateCancel(priv->mon); + qemuDomainObjExitMonitorWithDriver(driver, vm); + } } cleanup: @@ -2284,9 +2304,9 @@ int qemuMigrationPerform(struct qemud_driver *driver, cookieout, cookieoutlen, flags, NULLSTR(dname), resource, v3proto); - if (qemuDomainObjBeginJobWithDriver(driver, vm) < 0) + if (qemuDomainObjBeginAsyncJobWithDriver(driver, vm, + QEMU_ASYNC_JOB_MIGRATION_OUT) < 0) goto cleanup; - qemuDomainObjSetJob(vm, QEMU_JOB_MIGRATION_OUT); if (!virDomainObjIsActive(vm)) { qemuReportError(VIR_ERR_OPERATION_INVALID, @@ -2365,7 +2385,7 @@ endjob: VIR_DOMAIN_EVENT_RESUMED_MIGRATED); } if (vm) { - if (qemuDomainObjEndJob(vm) == 0) { + if (qemuDomainObjEndAsyncJob(vm) == 0) { vm = NULL; } else if (!virDomainObjIsActive(vm) && (!vm->persistent || (flags & VIR_MIGRATE_UNDEFINE_SOURCE))) { @@ -2450,17 +2470,17 @@ qemuMigrationFinish(struct qemud_driver *driver, virErrorPtr orig_err = NULL; priv = vm->privateData; - if (priv->job.active != QEMU_JOB_MIGRATION_IN) { + if (priv->job.asyncJob != QEMU_ASYNC_JOB_MIGRATION_IN) { qemuReportError(VIR_ERR_NO_DOMAIN, _("domain '%s' is not processing incoming migration"), vm->def->name); goto cleanup; } - qemuDomainObjDiscardJob(vm); + qemuDomainObjDiscardAsyncJob(vm); if (!(mig = qemuMigrationEatCookie(driver, vm, cookiein, cookieinlen, 0))) goto cleanup; - if (qemuDomainObjBeginJobWithDriver(driver, vm) < 0) + if (qemuDomainObjBeginJobWithDriver(driver, vm, QEMU_JOB_MODIFY) < 0) goto cleanup; /* Did the migration go as 
planned? If yes, return the domain @@ -2724,7 +2744,9 @@ qemuMigrationToFile(struct qemud_driver *driver, virDomainObjPtr vm, restoreLabel = true; } - qemuDomainObjEnterMonitorWithDriver(driver, vm); + if (qemuDomainObjEnterMonitorWithDriver(driver, vm) < 0) + goto cleanup; + if (!compressor) { const char *args[] = { "cat", NULL }; diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 56593c2..534a8b1 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -374,7 +374,7 @@ qemuProcessFakeReboot(void *opaque) VIR_DEBUG("vm=%p", vm); qemuDriverLock(driver); virDomainObjLock(vm); - if (qemuDomainObjBeginJob(vm) < 0) + if (qemuDomainObjBeginJob(vm, QEMU_JOB_MODIFY) < 0) goto cleanup; if (!virDomainObjIsActive(vm)) { @@ -383,7 +383,7 @@ qemuProcessFakeReboot(void *opaque) goto endjob; } - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); if (qemuMonitorSystemReset(priv->mon) < 0) { qemuDomainObjExitMonitorWithDriver(driver, vm); goto endjob; @@ -814,7 +814,7 @@ qemuConnectMonitor(struct qemud_driver *driver, virDomainObjPtr vm) } - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); ret = qemuMonitorSetCapabilities(priv->mon); qemuDomainObjExitMonitorWithDriver(driver, vm); @@ -1163,7 +1163,7 @@ qemuProcessWaitForMonitor(struct qemud_driver* driver, goto cleanup; priv = vm->privateData; - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); ret = qemuMonitorGetPtyPaths(priv->mon, paths); qemuDomainObjExitMonitorWithDriver(driver, vm); @@ -1216,7 +1216,7 @@ qemuProcessDetectVcpuPIDs(struct qemud_driver *driver, /* What follows is now all KVM specific */ - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); if ((ncpupids = qemuMonitorGetCPUInfo(priv->mon, &cpupids)) < 0) { qemuDomainObjExitMonitorWithDriver(driver, vm); return -1; @@ -1510,7 +1510,7 @@ qemuProcessInitPasswords(virConnectPtr conn, goto cleanup; alias = vm->def->disks[i]->info.alias; - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); ret = qemuMonitorSetDrivePassphrase(priv->mon, alias, secret); VIR_FREE(secret); qemuDomainObjExitMonitorWithDriver(driver, vm); @@ -1901,7 +1901,7 @@ qemuProcessInitPCIAddresses(struct qemud_driver *driver, int ret; qemuMonitorPCIAddress *addrs = NULL; - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); naddrs = qemuMonitorGetAllPCIAddresses(priv->mon, &addrs); qemuDomainObjExitMonitorWithDriver(driver, vm); @@ -2122,7 +2122,7 @@ qemuProcessStartCPUs(struct qemud_driver *driver, virDomainObjPtr vm, } VIR_FREE(priv->lockState); - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); ret = qemuMonitorStartCPUs(priv->mon, conn); qemuDomainObjExitMonitorWithDriver(driver, vm); @@ -2150,9 +2150,11 @@ int qemuProcessStopCPUs(struct qemud_driver *driver, virDomainObjPtr vm, oldState = virDomainObjGetState(vm, &oldReason); virDomainObjSetState(vm, VIR_DOMAIN_PAUSED, reason); - qemuDomainObjEnterMonitorWithDriver(driver, vm); - ret = qemuMonitorStopCPUs(priv->mon); - qemuDomainObjExitMonitorWithDriver(driver, vm); + ret = qemuDomainObjEnterMonitorWithDriver(driver, vm); + if (ret == 0) { + ret = qemuMonitorStopCPUs(priv->mon); + 
qemuDomainObjExitMonitorWithDriver(driver, vm); + } if (ret == 0) { if (virDomainLockProcessPause(driver->lockManager, vm, &priv->lockState) < 0) @@ -2198,7 +2200,7 @@ qemuProcessUpdateState(struct qemud_driver *driver, virDomainObjPtr vm) bool running; int ret; - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); ret = qemuMonitorGetStatus(priv->mon, &running); qemuDomainObjExitMonitorWithDriver(driver, vm); @@ -2244,6 +2246,9 @@ qemuProcessReconnect(void *payload, const void *name ATTRIBUTE_UNUSED, void *opa priv = obj->privateData; + /* Set fake job so that EnterMonitor* doesn't want to start a new one */ + priv->job.active = QEMU_JOB_MODIFY; + /* Hold an extra reference because we can't allow 'vm' to be * deleted if qemuConnectMonitor() failed */ virDomainObjRef(obj); @@ -2282,6 +2287,8 @@ qemuProcessReconnect(void *payload, const void *name ATTRIBUTE_UNUSED, void *opa if (qemuProcessFiltersInstantiate(conn, obj->def)) goto error; + priv->job.active = QEMU_JOB_NONE; + /* update domain state XML with possibly updated state in virDomainObj */ if (virDomainSaveStatus(driver->caps, driver->stateDir, obj) < 0) goto error; @@ -2695,7 +2702,7 @@ int qemuProcessStart(virConnectPtr conn, VIR_DEBUG("Setting initial memory amount"); cur_balloon = vm->def->mem.cur_balloon; - qemuDomainObjEnterMonitorWithDriver(driver, vm); + ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); if (qemuMonitorSetBalloon(priv->mon, cur_balloon) < 0) { qemuDomainObjExitMonitorWithDriver(driver, vm); goto cleanup; @@ -3013,13 +3020,14 @@ static void qemuProcessAutoDestroyDom(void *payload, } priv = dom->privateData; - if (priv->job.active == QEMU_JOB_MIGRATION_IN) { - VIR_DEBUG("vm=%s has incoming migration active, cancelling", + if (priv->job.asyncJob) { + VIR_DEBUG("vm=%s has long-term job active, cancelling", dom->def->name); - qemuDomainObjDiscardJob(dom); + qemuDomainObjDiscardAsyncJob(dom); } - if (qemuDomainObjBeginJobWithDriver(data->driver, dom) < 0) + if (qemuDomainObjBeginJobWithDriver(data->driver, dom, + QEMU_JOB_DESTROY) < 0) goto cleanup; VIR_DEBUG("Killing domain"); -- 1.7.6

On Fri, Jul 08, 2011 at 01:34:09AM +0200, Jiri Denemark wrote:
Query commands are safe to be called during long-running jobs (such as migration). This patch makes them all work without the need to special-case every single one of them.
The patch introduces a new job.asyncCond condition and an associated job.asyncJob field, which are dedicated to asynchronous (from the qemu monitor's point of view) jobs that can take an arbitrarily long time to finish while the qemu monitor remains usable for other commands.
The existing job.active (and the job.cond condition) is used for all other synchronous jobs, including the commands run during an async job.
The locking schema is changed to use these two conditions. While an asyncJob is active, only a restricted set of synchronous jobs is allowed (the set can differ per asyncJob), so any method that talks to the qemu monitor needs to check whether it is allowed to run during the current asyncJob (if any). Once that check passes, the method acquires job.cond in the normal way to ensure no other command is running. Since the domain object lock is released while waiting, an asyncJob could have started in the meantime, so the method needs to recheck the first condition. Then, normal jobs set job.active, while asynchronous jobs set job.asyncJob and optionally change the list of allowed job groups.
Since asynchronous jobs only set job.asyncJob, other allowed commands can still be run while the domain object is unlocked (when communicating with a remote libvirtd or sleeping). To protect its own internal synchronous commands, the asynchronous job needs to start a special nested job before entering the qemu monitor. The nested job doesn't check asyncJob; it only acquires job.cond and sets job.active to block other jobs. --- src/qemu/qemu_domain.c | 219 +++++++++++++++++++++++++++++++++++++-------- src/qemu/qemu_domain.h | 82 +++++++++++++---- src/qemu/qemu_driver.c | 122 +++++++++++++------------- src/qemu/qemu_hotplug.c | 38 ++++---- src/qemu/qemu_migration.c | 152 ++++++++++++++++++------------- src/qemu/qemu_process.c | 42 +++++---- 6 files changed, 439 insertions(+), 216 deletions(-)
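The quoted description compresses a fairly subtle protocol, so a minimal sketch in C may help. It reuses names that appear in the series itself (virCondWait, job.cond, job.asyncCond, the QEMU_JOB_* constants, qemuMonitorLock, virDomainObjUnlock); the jobAllowed() helper, the JOB_MASK macro and the job.mask field are placeholders invented for the sketch, and reference counting, driver locking and error reporting are omitted. It illustrates the scheme, not the patched code.

    #define JOB_MASK(job) (1 << ((job) - 1))   /* placeholder bit mapping */

    static bool
    jobAllowed(qemuDomainObjPrivatePtr priv, enum qemuDomainJob job)
    {
        /* with no async job running, every synchronous job is allowed */
        return !priv->job.asyncJob || (priv->job.mask & JOB_MASK(job));
    }

    /* called with the domain object locked */
    static int
    beginJob(virDomainObjPtr obj, enum qemuDomainJob job)
    {
        qemuDomainObjPrivatePtr priv = obj->privateData;
        bool nested = (job == QEMU_JOB_ASYNC_NESTED);

    retry:
        /* 1. wait until the job is compatible with any running async job;
         *    a nested job is started by the async job itself and skips
         *    this check */
        while (!nested && !jobAllowed(priv, job)) {
            if (virCondWait(&priv->job.asyncCond, &obj->lock) < 0)
                return -1;
        }

        /* 2. wait until no other synchronous job is running */
        while (priv->job.active != QEMU_JOB_NONE) {
            if (virCondWait(&priv->job.cond, &obj->lock) < 0)
                return -1;
        }

        /* virCondWait() released the domain object lock, so an async job
         * may have started in the meantime; recheck the first condition */
        if (!nested && !jobAllowed(priv, job))
            goto retry;

        priv->job.active = job;   /* an async job would set job.asyncJob and
                                   * optionally the allowed-job mask instead */
        return 0;
    }

    /* an async job protects its own monitor commands with a nested job */
    static int
    enterMonitor(virDomainObjPtr obj)
    {
        qemuDomainObjPrivatePtr priv = obj->privateData;

        if (priv->job.active == QEMU_JOB_NONE && priv->job.asyncJob &&
            beginJob(obj, QEMU_JOB_ASYNC_NESTED) < 0)
            return -1;

        qemuMonitorLock(priv->mon);
        virDomainObjUnlock(obj);
        return 0;
    }

The key property is that a query-class job only waits on job.cond, so it can interleave with a running migration or save, while the async job's own monitor commands still serialize through the same job.active slot by taking the nested job.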
ACK Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 07/07/2011 05:34 PM, Jiri Denemark wrote:
Query commands are safe to be called during long-running jobs (such as migration). This patch makes them all work without the need to special-case every single one of them.
Git bisect says that this was the culprit commit that broke 'virsh managedsave'.
+static int
+qemuDomainObjEnterMonitorInternal(struct qemud_driver *driver,
+                                  virDomainObjPtr obj)
+{
+    qemuDomainObjPrivatePtr priv = obj->privateData;
+
+    if (priv->job.active == QEMU_JOB_NONE && priv->job.asyncJob) {
+        if (qemuDomainObjBeginNestedJob(obj) < 0)
+            return -1;
+        if (!virDomainObjIsActive(obj)) {
+            qemuReportError(VIR_ERR_OPERATION_FAILED, "%s",
+                            _("domain is no longer running"));
+            return -1;
+        }
+    }
I think this is the problem. Doing a managed save will eventually make the qemu process go away, so we reach a point where we cannot issue a query monitor command to see how the save is progressing. But this function only checks that vm is still active for QEMU_JOB_NONE, not for QEMU_ASYNC_JOB_SAVE. I'm trying out a patch now... -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org

If libvirtd is restarted when a job is running, the new libvirtd process needs to know about that to be able to recover and rollback the operation. --- src/qemu/qemu_domain.c | 124 +++++++++++++++++++++++++++++++--------- src/qemu/qemu_domain.h | 35 ++++++++---- src/qemu/qemu_driver.c | 138 +++++++++++++++++++++++---------------------- src/qemu/qemu_hotplug.c | 12 ++-- src/qemu/qemu_migration.c | 20 ++++--- src/qemu/qemu_process.c | 8 +- 6 files changed, 212 insertions(+), 125 deletions(-) diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index 1ed5efd..062ecc7 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -44,6 +44,26 @@ #define QEMU_NAMESPACE_HREF "http://libvirt.org/schemas/domain/qemu/1.0" +VIR_ENUM_DECL(qemuDomainJob) +VIR_ENUM_IMPL(qemuDomainJob, QEMU_JOB_LAST, + "none", + "query", + "destroy", + "suspend", + "modify", + "none", /* async job is never stored in job.active */ + "async nested", +); + +VIR_ENUM_DECL(qemuDomainAsyncJob) +VIR_ENUM_IMPL(qemuDomainAsyncJob, QEMU_ASYNC_JOB_LAST, + "none", + "migration out", + "migration in", + "save", + "dump", +); + static void qemuDomainEventDispatchFunc(virConnectPtr conn, virDomainEventPtr event, @@ -214,6 +234,12 @@ static int qemuDomainObjPrivateXMLFormat(virBufferPtr buf, void *data) if (priv->lockState) virBufferAsprintf(buf, " <lockstate>%s</lockstate>\n", priv->lockState); + if (priv->job.active || priv->job.asyncJob) { + virBufferAsprintf(buf, " <job type='%s' async='%s'/>\n", + qemuDomainJobTypeToString(priv->job.active), + qemuDomainAsyncJobTypeToString(priv->job.asyncJob)); + } + return 0; } @@ -320,6 +346,32 @@ static int qemuDomainObjPrivateXMLParse(xmlXPathContextPtr ctxt, void *data) priv->lockState = virXPathString("string(./lockstate)", ctxt); + if ((tmp = virXPathString("string(./job[1]/@type)", ctxt))) { + int type; + + if ((type = qemuDomainJobTypeFromString(tmp)) < 0) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + _("Unknown job type %s"), tmp); + VIR_FREE(tmp); + goto error; + } + VIR_FREE(tmp); + priv->job.active = type; + } + + if ((tmp = virXPathString("string(./job[1]/@async)", ctxt))) { + int async; + + if ((async = qemuDomainAsyncJobTypeFromString(tmp)) < 0) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + _("Unknown async job type %s"), tmp); + VIR_FREE(tmp); + goto error; + } + VIR_FREE(tmp); + priv->job.asyncJob = async; + } + return 0; error: @@ -516,12 +568,16 @@ void qemuDomainSetNamespaceHooks(virCapsPtr caps) } void -qemuDomainObjSetJob(virDomainObjPtr obj, - enum qemuDomainJob job) +qemuDomainObjSaveJob(struct qemud_driver *driver, virDomainObjPtr obj) { - qemuDomainObjPrivatePtr priv = obj->privateData; + if (!virDomainObjIsActive(obj)) { + /* don't write the state file yet, it will be written once the domain + * gets activated */ + return; + } - priv->job.active = job; + if (virDomainSaveStatus(driver->caps, driver->stateDir, obj) < 0) + VIR_WARN("Failed to save status on vm %s", obj->def->name); } void @@ -537,13 +593,14 @@ qemuDomainObjSetAsyncJobMask(virDomainObjPtr obj, } void -qemuDomainObjDiscardAsyncJob(virDomainObjPtr obj) +qemuDomainObjDiscardAsyncJob(struct qemud_driver *driver, virDomainObjPtr obj) { qemuDomainObjPrivatePtr priv = obj->privateData; if (priv->job.active == QEMU_JOB_ASYNC_NESTED) qemuDomainObjResetJob(priv); qemuDomainObjResetAsyncJob(priv); + qemuDomainObjSaveJob(driver, obj); } static bool @@ -559,7 +616,7 @@ qemuDomainJobAllowed(qemuDomainObjPrivatePtr priv, enum qemuDomainJob job) * obj must be locked before calling; driver_locked says if 
qemu_driver is * locked or not. */ -static int +static int ATTRIBUTE_NONNULL(1) qemuDomainObjBeginJobInternal(struct qemud_driver *driver, bool driver_locked, virDomainObjPtr obj, @@ -611,6 +668,8 @@ retry: virDomainObjLock(obj); } + qemuDomainObjSaveJob(driver, obj); + return 0; error: @@ -639,16 +698,19 @@ error: * Upon successful return, the object will have its ref count increased, * successful calls must be followed by EndJob eventually */ -int qemuDomainObjBeginJob(virDomainObjPtr obj, enum qemuDomainJob job) +int qemuDomainObjBeginJob(struct qemud_driver *driver, + virDomainObjPtr obj, + enum qemuDomainJob job) { - return qemuDomainObjBeginJobInternal(NULL, false, obj, job, + return qemuDomainObjBeginJobInternal(driver, false, obj, job, QEMU_ASYNC_JOB_NONE); } -int qemuDomainObjBeginAsyncJob(virDomainObjPtr obj, +int qemuDomainObjBeginAsyncJob(struct qemud_driver *driver, + virDomainObjPtr obj, enum qemuDomainAsyncJob asyncJob) { - return qemuDomainObjBeginJobInternal(NULL, false, obj, QEMU_JOB_ASYNC, + return qemuDomainObjBeginJobInternal(driver, false, obj, QEMU_JOB_ASYNC, asyncJob); } @@ -692,9 +754,10 @@ int qemuDomainObjBeginAsyncJobWithDriver(struct qemud_driver *driver, * qemuDomainObjBeginJob{,WithDriver} instead. */ int -qemuDomainObjBeginNestedJob(virDomainObjPtr obj) +qemuDomainObjBeginNestedJob(struct qemud_driver *driver, + virDomainObjPtr obj) { - return qemuDomainObjBeginJobInternal(NULL, false, obj, + return qemuDomainObjBeginJobInternal(driver, false, obj, QEMU_JOB_ASYNC_NESTED, QEMU_ASYNC_JOB_NONE); } @@ -717,33 +780,36 @@ qemuDomainObjBeginNestedJobWithDriver(struct qemud_driver *driver, * Returns remaining refcount on 'obj', maybe 0 to indicated it * was deleted */ -int qemuDomainObjEndJob(virDomainObjPtr obj) +int qemuDomainObjEndJob(struct qemud_driver *driver, virDomainObjPtr obj) { qemuDomainObjPrivatePtr priv = obj->privateData; qemuDomainObjResetJob(priv); + qemuDomainObjSaveJob(driver, obj); virCondSignal(&priv->job.cond); return virDomainObjUnref(obj); } int -qemuDomainObjEndAsyncJob(virDomainObjPtr obj) +qemuDomainObjEndAsyncJob(struct qemud_driver *driver, virDomainObjPtr obj) { qemuDomainObjPrivatePtr priv = obj->privateData; qemuDomainObjResetAsyncJob(priv); + qemuDomainObjSaveJob(driver, obj); virCondBroadcast(&priv->job.asyncCond); return virDomainObjUnref(obj); } void -qemuDomainObjEndNestedJob(virDomainObjPtr obj) +qemuDomainObjEndNestedJob(struct qemud_driver *driver, virDomainObjPtr obj) { qemuDomainObjPrivatePtr priv = obj->privateData; qemuDomainObjResetJob(priv); + qemuDomainObjSaveJob(driver, obj); virCondSignal(&priv->job.cond); /* safe to ignore since the surrounding async job increased the reference @@ -752,14 +818,15 @@ qemuDomainObjEndNestedJob(virDomainObjPtr obj) } -static int +static int ATTRIBUTE_NONNULL(1) qemuDomainObjEnterMonitorInternal(struct qemud_driver *driver, + bool driver_locked, virDomainObjPtr obj) { qemuDomainObjPrivatePtr priv = obj->privateData; if (priv->job.active == QEMU_JOB_NONE && priv->job.asyncJob) { - if (qemuDomainObjBeginNestedJob(obj) < 0) + if (qemuDomainObjBeginNestedJob(driver, obj) < 0) return -1; if (!virDomainObjIsActive(obj)) { qemuReportError(VIR_ERR_OPERATION_FAILED, "%s", @@ -772,14 +839,15 @@ qemuDomainObjEnterMonitorInternal(struct qemud_driver *driver, qemuMonitorRef(priv->mon); ignore_value(virTimeMs(&priv->monStart)); virDomainObjUnlock(obj); - if (driver) + if (driver_locked) qemuDriverUnlock(driver); return 0; } -static void +static void ATTRIBUTE_NONNULL(1) 
qemuDomainObjExitMonitorInternal(struct qemud_driver *driver, + bool driver_locked, virDomainObjPtr obj) { qemuDomainObjPrivatePtr priv = obj->privateData; @@ -790,7 +858,7 @@ qemuDomainObjExitMonitorInternal(struct qemud_driver *driver, if (refs > 0) qemuMonitorUnlock(priv->mon); - if (driver) + if (driver_locked) qemuDriverLock(driver); virDomainObjLock(obj); @@ -800,7 +868,7 @@ qemuDomainObjExitMonitorInternal(struct qemud_driver *driver, } if (priv->job.active == QEMU_JOB_ASYNC_NESTED) - qemuDomainObjEndNestedJob(obj); + qemuDomainObjEndNestedJob(driver, obj); } /* @@ -813,18 +881,20 @@ qemuDomainObjExitMonitorInternal(struct qemud_driver *driver, * * To be followed with qemuDomainObjExitMonitor() once complete */ -int qemuDomainObjEnterMonitor(virDomainObjPtr obj) +int qemuDomainObjEnterMonitor(struct qemud_driver *driver, + virDomainObjPtr obj) { - return qemuDomainObjEnterMonitorInternal(NULL, obj); + return qemuDomainObjEnterMonitorInternal(driver, false, obj); } /* obj must NOT be locked before calling, qemud_driver must be unlocked * * Should be paired with an earlier qemuDomainObjEnterMonitor() call */ -void qemuDomainObjExitMonitor(virDomainObjPtr obj) +void qemuDomainObjExitMonitor(struct qemud_driver *driver, + virDomainObjPtr obj) { - qemuDomainObjExitMonitorInternal(NULL, obj); + qemuDomainObjExitMonitorInternal(driver, false, obj); } /* @@ -840,7 +910,7 @@ void qemuDomainObjExitMonitor(virDomainObjPtr obj) int qemuDomainObjEnterMonitorWithDriver(struct qemud_driver *driver, virDomainObjPtr obj) { - return qemuDomainObjEnterMonitorInternal(driver, obj); + return qemuDomainObjEnterMonitorInternal(driver, true, obj); } /* obj must NOT be locked before calling, qemud_driver must be unlocked, @@ -851,7 +921,7 @@ int qemuDomainObjEnterMonitorWithDriver(struct qemud_driver *driver, void qemuDomainObjExitMonitorWithDriver(struct qemud_driver *driver, virDomainObjPtr obj) { - qemuDomainObjExitMonitorInternal(driver, obj); + qemuDomainObjExitMonitorInternal(driver, true, obj); } void qemuDomainObjEnterRemoteWithDriver(struct qemud_driver *driver, diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index 85a3c03..17d1356 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -44,9 +44,11 @@ enum qemuDomainJob { QEMU_JOB_SUSPEND, /* Suspends (stops vCPUs) the domain */ QEMU_JOB_MODIFY, /* May change state */ - /* The following two items must always be the last items */ + /* The following two items must always be the last items before JOB_LAST */ QEMU_JOB_ASYNC, /* Asynchronous job */ QEMU_JOB_ASYNC_NESTED, /* Normal job within an async job */ + + QEMU_JOB_LAST }; /* Async job consists of a series of jobs that may change state. 
Independent @@ -59,6 +61,8 @@ enum qemuDomainAsyncJob { QEMU_ASYNC_JOB_MIGRATION_IN, QEMU_ASYNC_JOB_SAVE, QEMU_ASYNC_JOB_DUMP, + + QEMU_ASYNC_JOB_LAST }; enum qemuDomainJobSignals { @@ -138,13 +142,16 @@ void qemuDomainEventQueue(struct qemud_driver *driver, void qemuDomainSetPrivateDataHooks(virCapsPtr caps); void qemuDomainSetNamespaceHooks(virCapsPtr caps); -int qemuDomainObjBeginJob(virDomainObjPtr obj, +int qemuDomainObjBeginJob(struct qemud_driver *driver, + virDomainObjPtr obj, enum qemuDomainJob job) ATTRIBUTE_RETURN_CHECK; -int qemuDomainObjBeginAsyncJob(virDomainObjPtr obj, +int qemuDomainObjBeginAsyncJob(struct qemud_driver *driver, + virDomainObjPtr obj, enum qemuDomainAsyncJob asyncJob) ATTRIBUTE_RETURN_CHECK; -int qemuDomainObjBeginNestedJob(virDomainObjPtr obj) +int qemuDomainObjBeginNestedJob(struct qemud_driver *driver, + virDomainObjPtr obj) ATTRIBUTE_RETURN_CHECK; int qemuDomainObjBeginJobWithDriver(struct qemud_driver *driver, virDomainObjPtr obj, @@ -158,20 +165,26 @@ int qemuDomainObjBeginNestedJobWithDriver(struct qemud_driver *driver, virDomainObjPtr obj) ATTRIBUTE_RETURN_CHECK; -int qemuDomainObjEndJob(virDomainObjPtr obj) +int qemuDomainObjEndJob(struct qemud_driver *driver, + virDomainObjPtr obj) ATTRIBUTE_RETURN_CHECK; -int qemuDomainObjEndAsyncJob(virDomainObjPtr obj) +int qemuDomainObjEndAsyncJob(struct qemud_driver *driver, + virDomainObjPtr obj) ATTRIBUTE_RETURN_CHECK; -void qemuDomainObjEndNestedJob(virDomainObjPtr obj); +void qemuDomainObjEndNestedJob(struct qemud_driver *driver, + virDomainObjPtr obj); -void qemuDomainObjSetJob(virDomainObjPtr obj, enum qemuDomainJob job); +void qemuDomainObjSaveJob(struct qemud_driver *driver, virDomainObjPtr obj); void qemuDomainObjSetAsyncJobMask(virDomainObjPtr obj, unsigned long long allowedJobs); -void qemuDomainObjDiscardAsyncJob(virDomainObjPtr obj); +void qemuDomainObjDiscardAsyncJob(struct qemud_driver *driver, + virDomainObjPtr obj); -int qemuDomainObjEnterMonitor(virDomainObjPtr obj) +int qemuDomainObjEnterMonitor(struct qemud_driver *driver, + virDomainObjPtr obj) ATTRIBUTE_RETURN_CHECK; -void qemuDomainObjExitMonitor(virDomainObjPtr obj); +void qemuDomainObjExitMonitor(struct qemud_driver *driver, + virDomainObjPtr obj); int qemuDomainObjEnterMonitorWithDriver(struct qemud_driver *driver, virDomainObjPtr obj) ATTRIBUTE_RETURN_CHECK; diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 96b3737..9dcb248 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -157,7 +157,7 @@ qemuAutostartDomain(void *payload, const void *name ATTRIBUTE_UNUSED, void *opaq err ? 
err->message : _("unknown error")); } - if (qemuDomainObjEndJob(vm) == 0) + if (qemuDomainObjEndJob(data->driver, vm) == 0) vm = NULL; } @@ -1283,7 +1283,7 @@ static virDomainPtr qemudDomainCreate(virConnectPtr conn, const char *xml, (flags & VIR_DOMAIN_START_AUTODESTROY) != 0, -1, NULL, VIR_VM_OP_CREATE) < 0) { qemuAuditDomainStart(vm, "booted", false); - if (qemuDomainObjEndJob(vm) > 0) + if (qemuDomainObjEndJob(driver, vm) > 0) virDomainRemoveInactive(&driver->domains, vm); vm = NULL; @@ -1299,7 +1299,7 @@ static virDomainPtr qemudDomainCreate(virConnectPtr conn, const char *xml, if (dom) dom->id = vm->def->id; if (vm && - qemuDomainObjEndJob(vm) == 0) + qemuDomainObjEndJob(driver, vm) == 0) vm = NULL; cleanup: @@ -1375,7 +1375,7 @@ static int qemudDomainSuspend(virDomainPtr dom) { } endjob: - if (qemuDomainObjEndJob(vm) == 0) + if (qemuDomainObjEndJob(driver, vm) == 0) vm = NULL; cleanup: @@ -1431,7 +1431,7 @@ static int qemudDomainResume(virDomainPtr dom) { ret = 0; endjob: - if (qemuDomainObjEndJob(vm) == 0) + if (qemuDomainObjEndJob(driver, vm) == 0) vm = NULL; cleanup: @@ -1462,7 +1462,7 @@ static int qemuDomainShutdown(virDomainPtr dom) { goto cleanup; } - if (qemuDomainObjBeginJob(vm, QEMU_JOB_MODIFY) < 0) + if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_MODIFY) < 0) goto cleanup; if (!virDomainObjIsActive(vm)) { @@ -1472,14 +1472,14 @@ static int qemuDomainShutdown(virDomainPtr dom) { } priv = vm->privateData; - ignore_value(qemuDomainObjEnterMonitor(vm)); + ignore_value(qemuDomainObjEnterMonitor(driver, vm)); ret = qemuMonitorSystemPowerdown(priv->mon); - qemuDomainObjExitMonitor(vm); + qemuDomainObjExitMonitor(driver, vm); priv->fakeReboot = false; endjob: - if (qemuDomainObjEndJob(vm) == 0) + if (qemuDomainObjEndJob(driver, vm) == 0) vm = NULL; cleanup: @@ -1512,7 +1512,7 @@ static int qemuDomainReboot(virDomainPtr dom, unsigned int flags) { #if HAVE_YAJL if (qemuCapsGet(priv->qemuCaps, QEMU_CAPS_MONITOR_JSON)) { - if (qemuDomainObjBeginJob(vm, QEMU_JOB_MODIFY) < 0) + if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_MODIFY) < 0) goto cleanup; if (!virDomainObjIsActive(vm)) { @@ -1521,14 +1521,14 @@ static int qemuDomainReboot(virDomainPtr dom, unsigned int flags) { goto endjob; } - ignore_value(qemuDomainObjEnterMonitor(vm)); + ignore_value(qemuDomainObjEnterMonitor(driver, vm)); ret = qemuMonitorSystemPowerdown(priv->mon); - qemuDomainObjExitMonitor(vm); + qemuDomainObjExitMonitor(driver, vm); priv->fakeReboot = true; endjob: - if (qemuDomainObjEndJob(vm) == 0) + if (qemuDomainObjEndJob(driver, vm) == 0) vm = NULL; } else { #endif @@ -1588,7 +1588,7 @@ static int qemudDomainDestroy(virDomainPtr dom) { qemuAuditDomainStop(vm, "destroyed"); if (!vm->persistent) { - if (qemuDomainObjEndJob(vm) > 0) + if (qemuDomainObjEndJob(driver, vm) > 0) virDomainRemoveInactive(&driver->domains, vm); vm = NULL; @@ -1597,7 +1597,7 @@ static int qemudDomainDestroy(virDomainPtr dom) { endjob: if (vm && - qemuDomainObjEndJob(vm) == 0) + qemuDomainObjEndJob(driver, vm) == 0) vm = NULL; cleanup: @@ -1685,7 +1685,7 @@ static int qemudDomainSetMemoryFlags(virDomainPtr dom, unsigned long newmem, goto cleanup; } - if (qemuDomainObjBeginJob(vm, QEMU_JOB_MODIFY) < 0) + if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_MODIFY) < 0) goto cleanup; isActive = virDomainObjIsActive(vm); @@ -1750,9 +1750,9 @@ static int qemudDomainSetMemoryFlags(virDomainPtr dom, unsigned long newmem, if (flags & VIR_DOMAIN_AFFECT_LIVE) { priv = vm->privateData; - ignore_value(qemuDomainObjEnterMonitor(vm)); + 
ignore_value(qemuDomainObjEnterMonitor(driver, vm)); r = qemuMonitorSetBalloon(priv->mon, newmem); - qemuDomainObjExitMonitor(vm); + qemuDomainObjExitMonitor(driver, vm); qemuAuditMemory(vm, vm->def->mem.cur_balloon, newmem, "update", r == 1); if (r < 0) @@ -1776,7 +1776,7 @@ static int qemudDomainSetMemoryFlags(virDomainPtr dom, unsigned long newmem, ret = 0; endjob: - if (qemuDomainObjEndJob(vm) == 0) + if (qemuDomainObjEndJob(driver, vm) == 0) vm = NULL; cleanup: @@ -1827,7 +1827,7 @@ static int qemuDomainInjectNMI(virDomainPtr domain, unsigned int flags) ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); ret = qemuMonitorInjectNMI(priv->mon); qemuDomainObjExitMonitorWithDriver(driver, vm); - if (qemuDomainObjEndJob(vm) == 0) { + if (qemuDomainObjEndJob(driver, vm) == 0) { vm = NULL; goto cleanup; } @@ -1880,16 +1880,16 @@ static int qemudDomainGetInfo(virDomainPtr dom, (vm->def->memballoon->model == VIR_DOMAIN_MEMBALLOON_MODEL_NONE)) { info->memory = vm->def->mem.max_balloon; } else if (!priv->job.active) { - if (qemuDomainObjBeginJob(vm, QEMU_JOB_QUERY) < 0) + if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_QUERY) < 0) goto cleanup; if (!virDomainObjIsActive(vm)) err = 0; else { - ignore_value(qemuDomainObjEnterMonitor(vm)); + ignore_value(qemuDomainObjEnterMonitor(driver, vm)); err = qemuMonitorGetBalloonInfo(priv->mon, &balloon); - qemuDomainObjExitMonitor(vm); + qemuDomainObjExitMonitor(driver, vm); } - if (qemuDomainObjEndJob(vm) == 0) { + if (qemuDomainObjEndJob(driver, vm) == 0) { vm = NULL; goto cleanup; } @@ -2294,7 +2294,7 @@ static int qemudDomainSaveFlag(struct qemud_driver *driver, virDomainPtr dom, VIR_DOMAIN_EVENT_STOPPED, VIR_DOMAIN_EVENT_STOPPED_SAVED); if (!vm->persistent) { - if (qemuDomainObjEndAsyncJob(vm) > 0) + if (qemuDomainObjEndAsyncJob(driver, vm) > 0) virDomainRemoveInactive(&driver->domains, vm); vm = NULL; @@ -2310,7 +2310,7 @@ endjob: VIR_WARN("Unable to resume guest CPUs after save failure"); } } - if (qemuDomainObjEndAsyncJob(vm) == 0) + if (qemuDomainObjEndAsyncJob(driver, vm) == 0) vm = NULL; } @@ -2663,7 +2663,7 @@ endjob: } } - if (qemuDomainObjEndAsyncJob(vm) == 0) + if (qemuDomainObjEndAsyncJob(driver, vm) == 0) vm = NULL; else if ((ret == 0) && (flags & VIR_DUMP_CRASH) && !vm->persistent) { virDomainRemoveInactive(&driver->domains, @@ -2707,7 +2707,7 @@ qemuDomainScreenshot(virDomainPtr dom, priv = vm->privateData; - if (qemuDomainObjBeginJob(vm, QEMU_JOB_QUERY) < 0) + if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_QUERY) < 0) goto cleanup; if (!virDomainObjIsActive(vm)) { @@ -2737,12 +2737,12 @@ qemuDomainScreenshot(virDomainPtr dom, virSecurityManagerSetSavedStateLabel(qemu_driver->securityManager, vm, tmp); - ignore_value(qemuDomainObjEnterMonitor(vm)); + ignore_value(qemuDomainObjEnterMonitor(driver, vm)); if (qemuMonitorScreendump(priv->mon, tmp) < 0) { - qemuDomainObjExitMonitor(vm); + qemuDomainObjExitMonitor(driver, vm); goto endjob; } - qemuDomainObjExitMonitor(vm); + qemuDomainObjExitMonitor(driver, vm); if (VIR_CLOSE(tmp_fd) < 0) { virReportSystemError(errno, _("unable to close %s"), tmp); @@ -2761,7 +2761,7 @@ endjob: VIR_FORCE_CLOSE(tmp_fd); VIR_FREE(tmp); - if (qemuDomainObjEndJob(vm) == 0) + if (qemuDomainObjEndJob(driver, vm) == 0) vm = NULL; cleanup: @@ -2831,7 +2831,7 @@ endjob: /* Safe to ignore value since ref count was incremented in * qemuProcessHandleWatchdog(). 
*/ - ignore_value(qemuDomainObjEndAsyncJob(wdEvent->vm)); + ignore_value(qemuDomainObjEndAsyncJob(driver, wdEvent->vm)); unlock: if (virDomainObjUnref(wdEvent->vm) > 0) @@ -2840,7 +2840,9 @@ unlock: VIR_FREE(wdEvent); } -static int qemudDomainHotplugVcpus(virDomainObjPtr vm, unsigned int nvcpus) +static int qemudDomainHotplugVcpus(struct qemud_driver *driver, + virDomainObjPtr vm, + unsigned int nvcpus) { qemuDomainObjPrivatePtr priv = vm->privateData; int i, rc = 1; @@ -2848,7 +2850,7 @@ static int qemudDomainHotplugVcpus(virDomainObjPtr vm, unsigned int nvcpus) int oldvcpus = vm->def->vcpus; int vcpus = oldvcpus; - ignore_value(qemuDomainObjEnterMonitor(vm)); + ignore_value(qemuDomainObjEnterMonitor(driver, vm)); /* We need different branches here, because we want to offline * in reverse order to onlining, so any partial fail leaves us in a @@ -2880,7 +2882,7 @@ static int qemudDomainHotplugVcpus(virDomainObjPtr vm, unsigned int nvcpus) ret = 0; cleanup: - qemuDomainObjExitMonitor(vm); + qemuDomainObjExitMonitor(driver, vm); vm->def->vcpus = vcpus; qemuAuditVcpu(vm, oldvcpus, nvcpus, "update", rc == 1); return ret; @@ -2934,7 +2936,7 @@ qemudDomainSetVcpusFlags(virDomainPtr dom, unsigned int nvcpus, goto cleanup; } - if (qemuDomainObjBeginJob(vm, QEMU_JOB_MODIFY) < 0) + if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_MODIFY) < 0) goto cleanup; if (!virDomainObjIsActive(vm) && (flags & VIR_DOMAIN_AFFECT_LIVE)) { @@ -2990,11 +2992,11 @@ qemudDomainSetVcpusFlags(virDomainPtr dom, unsigned int nvcpus, break; case VIR_DOMAIN_AFFECT_LIVE: - ret = qemudDomainHotplugVcpus(vm, nvcpus); + ret = qemudDomainHotplugVcpus(driver, vm, nvcpus); break; case VIR_DOMAIN_AFFECT_LIVE | VIR_DOMAIN_AFFECT_CONFIG: - ret = qemudDomainHotplugVcpus(vm, nvcpus); + ret = qemudDomainHotplugVcpus(driver, vm, nvcpus); if (ret == 0) { persistentDef->vcpus = nvcpus; } @@ -3006,7 +3008,7 @@ qemudDomainSetVcpusFlags(virDomainPtr dom, unsigned int nvcpus, ret = virDomainSaveConfig(driver->configDir, persistentDef); endjob: - if (qemuDomainObjEndJob(vm) == 0) + if (qemuDomainObjEndJob(driver, vm) == 0) vm = NULL; cleanup: @@ -3760,7 +3762,7 @@ qemuDomainRestore(virConnectPtr conn, ret = qemuDomainSaveImageStartVM(conn, driver, vm, &fd, &header, path); - if (qemuDomainObjEndJob(vm) == 0) + if (qemuDomainObjEndJob(driver, vm) == 0) vm = NULL; else if (ret < 0 && !vm->persistent) { virDomainRemoveInactive(&driver->domains, vm); @@ -3854,7 +3856,7 @@ static char *qemuDomainGetXMLDesc(virDomainPtr dom, ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); err = qemuMonitorGetBalloonInfo(priv->mon, &balloon); qemuDomainObjExitMonitorWithDriver(driver, vm); - if (qemuDomainObjEndJob(vm) == 0) { + if (qemuDomainObjEndJob(driver, vm) == 0) { vm = NULL; goto cleanup; } @@ -4095,7 +4097,7 @@ qemudDomainStartWithFlags(virDomainPtr dom, unsigned int flags) ret = 0; endjob: - if (qemuDomainObjEndJob(vm) == 0) + if (qemuDomainObjEndJob(driver, vm) == 0) vm = NULL; cleanup: @@ -4917,7 +4919,7 @@ qemuDomainModifyDeviceFlags(virDomainPtr dom, const char *xml, } endjob: - if (qemuDomainObjEndJob(vm) == 0) + if (qemuDomainObjEndJob(driver, vm) == 0) vm = NULL; cleanup: @@ -6023,7 +6025,7 @@ qemudDomainBlockStats (virDomainPtr dom, if (virDomainObjUnref(vm) == 0) vm = NULL; } else { - if (qemuDomainObjBeginJob(vm, QEMU_JOB_QUERY) < 0) + if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_QUERY) < 0) goto cleanup; if (!virDomainObjIsActive(vm)) { @@ -6032,7 +6034,7 @@ qemudDomainBlockStats (virDomainPtr dom, goto endjob; } - 
ignore_value(qemuDomainObjEnterMonitor(vm)); + ignore_value(qemuDomainObjEnterMonitor(driver, vm)); ret = qemuMonitorGetBlockStatsInfo(priv->mon, disk->info.alias, &stats->rd_req, @@ -6040,10 +6042,10 @@ qemudDomainBlockStats (virDomainPtr dom, &stats->wr_req, &stats->wr_bytes, &stats->errs); - qemuDomainObjExitMonitor(vm); + qemuDomainObjExitMonitor(driver, vm); endjob: - if (qemuDomainObjEndJob(vm) == 0) + if (qemuDomainObjEndJob(driver, vm) == 0) vm = NULL; } @@ -6135,20 +6137,20 @@ qemudDomainMemoryStats (virDomainPtr dom, goto cleanup; } - if (qemuDomainObjBeginJob(vm, QEMU_JOB_QUERY) < 0) + if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_QUERY) < 0) goto cleanup; if (virDomainObjIsActive(vm)) { qemuDomainObjPrivatePtr priv = vm->privateData; - ignore_value(qemuDomainObjEnterMonitor(vm)); + ignore_value(qemuDomainObjEnterMonitor(driver, vm)); ret = qemuMonitorGetMemoryStats(priv->mon, stats, nr_stats); - qemuDomainObjExitMonitor(vm); + qemuDomainObjExitMonitor(driver, vm); } else { qemuReportError(VIR_ERR_OPERATION_INVALID, "%s", _("domain is not running")); } - if (qemuDomainObjEndJob(vm) == 0) + if (qemuDomainObjEndJob(driver, vm) == 0) vm = NULL; cleanup: @@ -6259,7 +6261,7 @@ qemudDomainMemoryPeek (virDomainPtr dom, goto cleanup; } - if (qemuDomainObjBeginJob(vm, QEMU_JOB_QUERY) < 0) + if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_QUERY) < 0) goto cleanup; if (!virDomainObjIsActive(vm)) { @@ -6283,19 +6285,19 @@ qemudDomainMemoryPeek (virDomainPtr dom, virSecurityManagerSetSavedStateLabel(qemu_driver->securityManager, vm, tmp); priv = vm->privateData; - ignore_value(qemuDomainObjEnterMonitor(vm)); + ignore_value(qemuDomainObjEnterMonitor(driver, vm)); if (flags == VIR_MEMORY_VIRTUAL) { if (qemuMonitorSaveVirtualMemory(priv->mon, offset, size, tmp) < 0) { - qemuDomainObjExitMonitor(vm); + qemuDomainObjExitMonitor(driver, vm); goto endjob; } } else { if (qemuMonitorSavePhysicalMemory(priv->mon, offset, size, tmp) < 0) { - qemuDomainObjExitMonitor(vm); + qemuDomainObjExitMonitor(driver, vm); goto endjob; } } - qemuDomainObjExitMonitor(vm); + qemuDomainObjExitMonitor(driver, vm); /* Read the memory file into buffer. 
*/ if (saferead (fd, buffer, size) == (ssize_t) -1) { @@ -6308,7 +6310,7 @@ qemudDomainMemoryPeek (virDomainPtr dom, ret = 0; endjob: - if (qemuDomainObjEndJob(vm) == 0) + if (qemuDomainObjEndJob(driver, vm) == 0) vm = NULL; cleanup: @@ -6470,20 +6472,20 @@ static int qemuDomainGetBlockInfo(virDomainPtr dom, if (virDomainObjUnref(vm) == 0) vm = NULL; } else { - if (qemuDomainObjBeginJob(vm, QEMU_JOB_QUERY) < 0) + if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_QUERY) < 0) goto cleanup; if (virDomainObjIsActive(vm)) { - ignore_value(qemuDomainObjEnterMonitor(vm)); + ignore_value(qemuDomainObjEnterMonitor(driver, vm)); ret = qemuMonitorGetBlockExtent(priv->mon, disk->info.alias, &info->allocation); - qemuDomainObjExitMonitor(vm); + qemuDomainObjExitMonitor(driver, vm); } else { ret = 0; } - if (qemuDomainObjEndJob(vm) == 0) + if (qemuDomainObjEndJob(driver, vm) == 0) vm = NULL; } } else { @@ -7090,7 +7092,7 @@ qemuDomainMigrateConfirm3(virDomainPtr domain, cookiein, cookieinlen, flags, cancelled); - if (qemuDomainObjEndJob(vm) == 0) { + if (qemuDomainObjEndJob(driver, vm) == 0) { vm = NULL; } else if (!virDomainObjIsActive(vm) && (!vm->persistent || (flags & VIR_MIGRATE_UNDEFINE_SOURCE))) { @@ -7671,7 +7673,7 @@ cleanup: _("resuming after snapshot failed")); } - if (qemuDomainObjEndJob(vm) == 0) + if (qemuDomainObjEndJob(driver, vm) == 0) *vmptr = NULL; return ret; @@ -8046,7 +8048,7 @@ static int qemuDomainRevertToSnapshot(virDomainSnapshotPtr snapshot, VIR_DOMAIN_EVENT_STOPPED, VIR_DOMAIN_EVENT_STOPPED_FROM_SNAPSHOT); if (!vm->persistent) { - if (qemuDomainObjEndJob(vm) > 0) + if (qemuDomainObjEndJob(driver, vm) > 0) virDomainRemoveInactive(&driver->domains, vm); vm = NULL; goto cleanup; @@ -8060,7 +8062,7 @@ static int qemuDomainRevertToSnapshot(virDomainSnapshotPtr snapshot, ret = 0; endjob: - if (vm && qemuDomainObjEndJob(vm) == 0) + if (vm && qemuDomainObjEndJob(driver, vm) == 0) vm = NULL; cleanup: @@ -8281,7 +8283,7 @@ static int qemuDomainSnapshotDelete(virDomainSnapshotPtr snapshot, ret = qemuDomainSnapshotDiscard(driver, vm, snap); endjob: - if (qemuDomainObjEndJob(vm) == 0) + if (qemuDomainObjEndJob(driver, vm) == 0) vm = NULL; cleanup: @@ -8329,7 +8331,7 @@ static int qemuDomainMonitorCommand(virDomainPtr domain, const char *cmd, ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); ret = qemuMonitorArbitraryCommand(priv->mon, cmd, result, hmp); qemuDomainObjExitMonitorWithDriver(driver, vm); - if (qemuDomainObjEndJob(vm) == 0) { + if (qemuDomainObjEndJob(driver, vm) == 0) { vm = NULL; goto cleanup; } diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c index a7571cd..06e2c84 100644 --- a/src/qemu/qemu_hotplug.c +++ b/src/qemu/qemu_hotplug.c @@ -1240,14 +1240,14 @@ int qemuDomainDetachPciDiskDevice(struct qemud_driver *driver, ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); if (qemuCapsGet(priv->qemuCaps, QEMU_CAPS_DEVICE)) { if (qemuMonitorDelDevice(priv->mon, detach->info.alias) < 0) { - qemuDomainObjExitMonitor(vm); + qemuDomainObjExitMonitor(driver, vm); qemuAuditDisk(vm, detach, NULL, "detach", false); goto cleanup; } } else { if (qemuMonitorRemovePCIDevice(priv->mon, &detach->info.addr.pci) < 0) { - qemuDomainObjExitMonitor(vm); + qemuDomainObjExitMonitor(driver, vm); qemuAuditDisk(vm, detach, NULL, "detach", false); goto cleanup; } @@ -1335,7 +1335,7 @@ int qemuDomainDetachDiskDevice(struct qemud_driver *driver, ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); if (qemuMonitorDelDevice(priv->mon, detach->info.alias) < 0) { - 
qemuDomainObjExitMonitor(vm); + qemuDomainObjExitMonitor(driver, vm); qemuAuditDisk(vm, detach, NULL, "detach", false); goto cleanup; } @@ -1474,13 +1474,13 @@ int qemuDomainDetachPciControllerDevice(struct qemud_driver *driver, ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); if (qemuCapsGet(priv->qemuCaps, QEMU_CAPS_DEVICE)) { if (qemuMonitorDelDevice(priv->mon, detach->info.alias)) { - qemuDomainObjExitMonitor(vm); + qemuDomainObjExitMonitor(driver, vm); goto cleanup; } } else { if (qemuMonitorRemovePCIDevice(priv->mon, &detach->info.addr.pci) < 0) { - qemuDomainObjExitMonitor(vm); + qemuDomainObjExitMonitor(driver, vm); goto cleanup; } } @@ -1569,7 +1569,7 @@ int qemuDomainDetachNetDevice(struct qemud_driver *driver, ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); if (qemuCapsGet(priv->qemuCaps, QEMU_CAPS_DEVICE)) { if (qemuMonitorDelDevice(priv->mon, detach->info.alias) < 0) { - qemuDomainObjExitMonitor(vm); + qemuDomainObjExitMonitor(driver, vm); qemuAuditNet(vm, detach, NULL, "detach", false); goto cleanup; } diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 0712d73..4c516b0 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -1158,7 +1158,7 @@ qemuMigrationPrepareTunnel(struct qemud_driver *driver, qemuAuditDomainStart(vm, "migrated", false); qemuProcessStop(driver, vm, 0, VIR_DOMAIN_SHUTOFF_FAILED); if (!vm->persistent) { - if (qemuDomainObjEndAsyncJob(vm) > 0) + if (qemuDomainObjEndAsyncJob(driver, vm) > 0) virDomainRemoveInactive(&driver->domains, vm); vm = NULL; } @@ -1187,7 +1187,7 @@ qemuMigrationPrepareTunnel(struct qemud_driver *driver, endjob: if (vm && - qemuDomainObjEndAsyncJob(vm) == 0) + qemuDomainObjEndAsyncJob(driver, vm) == 0) vm = NULL; /* We set a fake job active which is held across @@ -1198,6 +1198,7 @@ endjob: if (vm && virDomainObjIsActive(vm)) { priv->job.asyncJob = QEMU_ASYNC_JOB_MIGRATION_IN; + qemuDomainObjSaveJob(driver, vm); priv->job.info.type = VIR_DOMAIN_JOB_UNBOUNDED; priv->job.start = now; } @@ -1375,7 +1376,7 @@ qemuMigrationPrepareDirect(struct qemud_driver *driver, * should have already done that. 
*/ if (!vm->persistent) { - if (qemuDomainObjEndAsyncJob(vm) > 0) + if (qemuDomainObjEndAsyncJob(driver, vm) > 0) virDomainRemoveInactive(&driver->domains, vm); vm = NULL; } @@ -1408,7 +1409,7 @@ qemuMigrationPrepareDirect(struct qemud_driver *driver, endjob: if (vm && - qemuDomainObjEndAsyncJob(vm) == 0) + qemuDomainObjEndAsyncJob(driver, vm) == 0) vm = NULL; /* We set a fake job active which is held across @@ -1419,6 +1420,7 @@ endjob: if (vm && virDomainObjIsActive(vm)) { priv->job.asyncJob = QEMU_ASYNC_JOB_MIGRATION_IN; + qemuDomainObjSaveJob(driver, vm); priv->job.info.type = VIR_DOMAIN_JOB_UNBOUNDED; priv->job.start = now; } @@ -2385,7 +2387,7 @@ endjob: VIR_DOMAIN_EVENT_RESUMED_MIGRATED); } if (vm) { - if (qemuDomainObjEndAsyncJob(vm) == 0) { + if (qemuDomainObjEndAsyncJob(driver, vm) == 0) { vm = NULL; } else if (!virDomainObjIsActive(vm) && (!vm->persistent || (flags & VIR_MIGRATE_UNDEFINE_SOURCE))) { @@ -2475,7 +2477,7 @@ qemuMigrationFinish(struct qemud_driver *driver, _("domain '%s' is not processing incoming migration"), vm->def->name); goto cleanup; } - qemuDomainObjDiscardAsyncJob(vm); + qemuDomainObjDiscardAsyncJob(driver, vm); if (!(mig = qemuMigrationEatCookie(driver, vm, cookiein, cookieinlen, 0))) goto cleanup; @@ -2555,7 +2557,7 @@ qemuMigrationFinish(struct qemud_driver *driver, VIR_DOMAIN_EVENT_STOPPED, VIR_DOMAIN_EVENT_STOPPED_FAILED); if (!vm->persistent) { - if (qemuDomainObjEndJob(vm) > 0) + if (qemuDomainObjEndJob(driver, vm) > 0) virDomainRemoveInactive(&driver->domains, vm); vm = NULL; } @@ -2591,7 +2593,7 @@ qemuMigrationFinish(struct qemud_driver *driver, VIR_DOMAIN_EVENT_STOPPED, VIR_DOMAIN_EVENT_STOPPED_FAILED); if (!vm->persistent) { - if (qemuDomainObjEndJob(vm) > 0) + if (qemuDomainObjEndJob(driver, vm) > 0) virDomainRemoveInactive(&driver->domains, vm); vm = NULL; } @@ -2602,7 +2604,7 @@ qemuMigrationFinish(struct qemud_driver *driver, endjob: if (vm && - qemuDomainObjEndJob(vm) == 0) + qemuDomainObjEndJob(driver, vm) == 0) vm = NULL; cleanup: diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 534a8b1..3ffde51 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -374,7 +374,7 @@ qemuProcessFakeReboot(void *opaque) VIR_DEBUG("vm=%p", vm); qemuDriverLock(driver); virDomainObjLock(vm); - if (qemuDomainObjBeginJob(vm, QEMU_JOB_MODIFY) < 0) + if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_MODIFY) < 0) goto cleanup; if (!virDomainObjIsActive(vm)) { @@ -410,7 +410,7 @@ qemuProcessFakeReboot(void *opaque) ret = 0; endjob: - if (qemuDomainObjEndJob(vm) == 0) + if (qemuDomainObjEndJob(driver, vm) == 0) vm = NULL; cleanup: @@ -3023,7 +3023,7 @@ static void qemuProcessAutoDestroyDom(void *payload, if (priv->job.asyncJob) { VIR_DEBUG("vm=%s has long-term job active, cancelling", dom->def->name); - qemuDomainObjDiscardAsyncJob(dom); + qemuDomainObjDiscardAsyncJob(data->driver, dom); } if (qemuDomainObjBeginJobWithDriver(data->driver, dom, @@ -3036,7 +3036,7 @@ static void qemuProcessAutoDestroyDom(void *payload, event = virDomainEventNewFromObj(dom, VIR_DOMAIN_EVENT_STOPPED, VIR_DOMAIN_EVENT_STOPPED_DESTROYED); - if (qemuDomainObjEndJob(dom) == 0) + if (qemuDomainObjEndJob(data->driver, dom) == 0) dom = NULL; if (dom && !dom->persistent) virDomainRemoveInactive(&data->driver->domains, dom); -- 1.7.6

On Fri, Jul 08, 2011 at 01:34:10AM +0200, Jiri Denemark wrote:
If libvirtd is restarted when a job is running, the new libvirtd process needs to know about that to be able to recover and rollback the operation. --- src/qemu/qemu_domain.c | 124 +++++++++++++++++++++++++++++++--------- src/qemu/qemu_domain.h | 35 ++++++++---- src/qemu/qemu_driver.c | 138 +++++++++++++++++++++++---------------------- src/qemu/qemu_hotplug.c | 12 ++-- src/qemu/qemu_migration.c | 20 ++++--- src/qemu/qemu_process.c | 8 +- 6 files changed, 212 insertions(+), 125 deletions(-)
ACK Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

Detect and react on situations when libvirtd was restarted or killed when a job was active. --- src/qemu/qemu_domain.c | 14 ++++++++ src/qemu/qemu_domain.h | 2 + src/qemu/qemu_process.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+), 0 deletions(-) diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index 062ecc7..b26308e 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -142,6 +142,20 @@ qemuDomainObjResetAsyncJob(qemuDomainObjPrivatePtr priv) memset(&job->signalsData, 0, sizeof(job->signalsData)); } +void +qemuDomainObjRestoreJob(virDomainObjPtr obj, + struct qemuDomainJobObj *job) +{ + qemuDomainObjPrivatePtr priv = obj->privateData; + + memset(job, 0, sizeof(*job)); + job->active = priv->job.active; + job->asyncJob = priv->job.asyncJob; + + qemuDomainObjResetJob(priv); + qemuDomainObjResetAsyncJob(priv); +} + static void qemuDomainObjFreeJob(qemuDomainObjPrivatePtr priv) { diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index 17d1356..49be3d2 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -177,6 +177,8 @@ void qemuDomainObjEndNestedJob(struct qemud_driver *driver, void qemuDomainObjSaveJob(struct qemud_driver *driver, virDomainObjPtr obj); void qemuDomainObjSetAsyncJobMask(virDomainObjPtr obj, unsigned long long allowedJobs); +void qemuDomainObjRestoreJob(virDomainObjPtr obj, + struct qemuDomainJobObj *job); void qemuDomainObjDiscardAsyncJob(struct qemud_driver *driver, virDomainObjPtr obj); diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 3ffde51..49625b5 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -2223,6 +2223,80 @@ qemuProcessUpdateState(struct qemud_driver *driver, virDomainObjPtr vm) return 0; } +static int +qemuProcessRecoverJob(struct qemud_driver *driver, + virDomainObjPtr vm, + virConnectPtr conn, + const struct qemuDomainJobObj *job) +{ + virDomainState state; + int reason; + + state = virDomainObjGetState(vm, &reason); + + switch (job->asyncJob) { + case QEMU_ASYNC_JOB_MIGRATION_OUT: + case QEMU_ASYNC_JOB_MIGRATION_IN: + /* we don't know what to do yet */ + break; + + case QEMU_ASYNC_JOB_SAVE: + case QEMU_ASYNC_JOB_DUMP: + /* TODO cancel possibly running migrate operation */ + /* resume the domain but only if it was paused as a result of + * running save/dump operation */ + if (state == VIR_DOMAIN_PAUSED && + ((job->asyncJob == QEMU_ASYNC_JOB_DUMP && + reason == VIR_DOMAIN_PAUSED_DUMP) || + (job->asyncJob == QEMU_ASYNC_JOB_SAVE && + reason == VIR_DOMAIN_PAUSED_SAVE) || + reason == VIR_DOMAIN_PAUSED_UNKNOWN)) { + if (qemuProcessStartCPUs(driver, vm, conn, + VIR_DOMAIN_RUNNING_UNPAUSED) < 0) { + VIR_WARN("Could not resume domain %s after", vm->def->name); + } + } + break; + + case QEMU_ASYNC_JOB_NONE: + case QEMU_ASYNC_JOB_LAST: + break; + } + + if (!virDomainObjIsActive(vm)) + return -1; + + switch (job->active) { + case QEMU_JOB_QUERY: + /* harmless */ + break; + + case QEMU_JOB_DESTROY: + VIR_DEBUG("Domain %s should have already been destroyed", + vm->def->name); + return -1; + + case QEMU_JOB_SUSPEND: + /* mostly harmless */ + break; + + case QEMU_JOB_MODIFY: + /* XXX depending on the command we may be in an inconsistent state and + * we should probably fall back to "monitor error" state and refuse to + */ + break; + + case QEMU_JOB_ASYNC: + case QEMU_JOB_ASYNC_NESTED: + /* async job was already handled above */ + case QEMU_JOB_NONE: + case QEMU_JOB_LAST: + break; + } + + return 0; +} + struct qemuProcessReconnectData { 
virConnectPtr conn; struct qemud_driver *driver; @@ -2239,9 +2313,12 @@ qemuProcessReconnect(void *payload, const void *name ATTRIBUTE_UNUSED, void *opa struct qemud_driver *driver = data->driver; qemuDomainObjPrivatePtr priv; virConnectPtr conn = data->conn; + struct qemuDomainJobObj oldjob; virDomainObjLock(obj); + qemuDomainObjRestoreJob(obj, &oldjob); + VIR_DEBUG("Reconnect monitor to %p '%s'", obj, obj->def->name); priv = obj->privateData; @@ -2287,6 +2364,9 @@ qemuProcessReconnect(void *payload, const void *name ATTRIBUTE_UNUSED, void *opa if (qemuProcessFiltersInstantiate(conn, obj->def)) goto error; + if (qemuProcessRecoverJob(driver, obj, conn, &oldjob) < 0) + goto error; + priv->job.active = QEMU_JOB_NONE; /* update domain state XML with possibly updated state in virDomainObj */ -- 1.7.6

On Fri, Jul 08, 2011 at 01:34:11AM +0200, Jiri Denemark wrote:
Detect and react on situations when libvirtd was restarted or killed when a job was active. --- src/qemu/qemu_domain.c | 14 ++++++++ src/qemu/qemu_domain.h | 2 + src/qemu/qemu_process.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+), 0 deletions(-)
ACK Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

Asynchronous jobs may take long time to finish and may consist of several phases which we need to now about to help with recovery/rollback after libvirtd restarts. --- src/qemu/qemu_domain.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++- src/qemu/qemu_domain.h | 9 ++++++ 2 files changed, 83 insertions(+), 1 deletions(-) diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index b26308e..d0dd764 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -65,6 +65,46 @@ VIR_ENUM_IMPL(qemuDomainAsyncJob, QEMU_ASYNC_JOB_LAST, ); +const char * +qemuDomainAsyncJobPhaseToString(enum qemuDomainAsyncJob job, + int phase ATTRIBUTE_UNUSED) +{ + switch (job) { + case QEMU_ASYNC_JOB_MIGRATION_OUT: + case QEMU_ASYNC_JOB_MIGRATION_IN: + case QEMU_ASYNC_JOB_SAVE: + case QEMU_ASYNC_JOB_DUMP: + case QEMU_ASYNC_JOB_NONE: + case QEMU_ASYNC_JOB_LAST: + ; /* fall through */ + } + + return "none"; +} + +int +qemuDomainAsyncJobPhaseFromString(enum qemuDomainAsyncJob job, + const char *phase) +{ + if (!phase) + return 0; + + switch (job) { + case QEMU_ASYNC_JOB_MIGRATION_OUT: + case QEMU_ASYNC_JOB_MIGRATION_IN: + case QEMU_ASYNC_JOB_SAVE: + case QEMU_ASYNC_JOB_DUMP: + case QEMU_ASYNC_JOB_NONE: + case QEMU_ASYNC_JOB_LAST: + ; /* fall through */ + } + + if (STREQ(phase, "none")) + return 0; + else + return -1; +} + static void qemuDomainEventDispatchFunc(virConnectPtr conn, virDomainEventPtr event, virConnectDomainEventGenericCallback cb, @@ -135,6 +175,7 @@ qemuDomainObjResetAsyncJob(qemuDomainObjPrivatePtr priv) struct qemuDomainJobObj *job = &priv->job; job->asyncJob = QEMU_ASYNC_JOB_NONE; + job->phase = 0; job->mask = DEFAULT_JOB_MASK; job->start = 0; memset(&job->info, 0, sizeof(job->info)); @@ -151,6 +192,7 @@ qemuDomainObjRestoreJob(virDomainObjPtr obj, memset(job, 0, sizeof(*job)); job->active = priv->job.active; job->asyncJob = priv->job.asyncJob; + job->phase = priv->job.phase; qemuDomainObjResetJob(priv); qemuDomainObjResetAsyncJob(priv); @@ -249,9 +291,15 @@ static int qemuDomainObjPrivateXMLFormat(virBufferPtr buf, void *data) virBufferAsprintf(buf, " <lockstate>%s</lockstate>\n", priv->lockState); if (priv->job.active || priv->job.asyncJob) { - virBufferAsprintf(buf, " <job type='%s' async='%s'/>\n", + virBufferAsprintf(buf, " <job type='%s' async='%s'", qemuDomainJobTypeToString(priv->job.active), qemuDomainAsyncJobTypeToString(priv->job.asyncJob)); + if (priv->job.phase) { + virBufferAsprintf(buf, " phase='%s'", + qemuDomainAsyncJobPhaseToString( + priv->job.asyncJob, priv->job.phase)); + } + virBufferAddLit(buf, "/>\n"); } return 0; @@ -384,6 +432,17 @@ static int qemuDomainObjPrivateXMLParse(xmlXPathContextPtr ctxt, void *data) } VIR_FREE(tmp); priv->job.asyncJob = async; + + if ((tmp = virXPathString("string(./job[1]/@phase)", ctxt))) { + priv->job.phase = qemuDomainAsyncJobPhaseFromString(async, tmp); + if (priv->job.phase < 0) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + _("Unknown job phase %s"), tmp); + VIR_FREE(tmp); + goto error; + } + VIR_FREE(tmp); + } } return 0; @@ -595,6 +654,20 @@ qemuDomainObjSaveJob(struct qemud_driver *driver, virDomainObjPtr obj) } void +qemuDomainObjSetJobPhase(struct qemud_driver *driver, + virDomainObjPtr obj, + int phase) +{ + qemuDomainObjPrivatePtr priv = obj->privateData; + + if (!priv->job.asyncJob) + return; + + priv->job.phase = phase; + qemuDomainObjSaveJob(driver, obj); +} + +void qemuDomainObjSetAsyncJobMask(virDomainObjPtr obj, unsigned long long allowedJobs) { diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index 
49be3d2..7245e67 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -91,6 +91,7 @@ struct qemuDomainJobObj { virCond asyncCond; /* Use to coordinate with async jobs */ enum qemuDomainAsyncJob asyncJob; /* Currently active async job */ + int phase; /* Job phase (mainly for migrations) */ unsigned long long mask; /* Jobs allowed during async job */ unsigned long long start; /* When the async job started */ virDomainJobInfo info; /* Async job progress data */ @@ -133,6 +134,11 @@ struct qemuDomainWatchdogEvent int action; }; +const char *qemuDomainAsyncJobPhaseToString(enum qemuDomainAsyncJob job, + int phase); +int qemuDomainAsyncJobPhaseFromString(enum qemuDomainAsyncJob job, + const char *phase); + void qemuDomainEventFlush(int timer ATTRIBUTE_UNUSED, void *opaque); /* driver must be locked before calling */ @@ -175,6 +181,9 @@ void qemuDomainObjEndNestedJob(struct qemud_driver *driver, virDomainObjPtr obj); void qemuDomainObjSaveJob(struct qemud_driver *driver, virDomainObjPtr obj); +void qemuDomainObjSetJobPhase(struct qemud_driver *driver, + virDomainObjPtr obj, + int phase); void qemuDomainObjSetAsyncJobMask(virDomainObjPtr obj, unsigned long long allowedJobs); void qemuDomainObjRestoreJob(virDomainObjPtr obj, -- 1.7.6
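For illustration, with the type, async and phase attributes combined, an interrupted incoming migration would leave a <job> element along these lines in the per-domain status XML (the attribute values here are only a hedged example: the exact strings come from the qemuDomainJob/qemuDomainAsyncJob enum-to-string tables, and the concrete migration phase names are only introduced by a later patch in this series):

  <job type='none' async='migration in' phase='prepare'/>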

On Fri, Jul 08, 2011 at 01:34:12AM +0200, Jiri Denemark wrote:
Asynchronous jobs may take a long time to finish and may consist of several phases which we need to know about to help with recovery/rollback after libvirtd restarts. --- src/qemu/qemu_domain.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++- src/qemu/qemu_domain.h | 9 ++++++ 2 files changed, 83 insertions(+), 1 deletions(-)
ACK Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

Most of the code in these two functions is supposed to be identical but currently it isn't (which is natural since the code is duplicated). Let's move common parts of these functions into qemuMigrationPrepareAny. --- src/qemu/qemu_migration.c | 255 +++++++++++++++----------------------------- 1 files changed, 87 insertions(+), 168 deletions(-) diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 4c516b0..e595596 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -1057,40 +1057,33 @@ cleanup: /* Prepare is the first step, and it runs on the destination host. - * - * This version starts an empty VM listening on a localhost TCP port, and - * sets up the corresponding virStream to handle the incoming data. */ -int -qemuMigrationPrepareTunnel(struct qemud_driver *driver, - virConnectPtr dconn, - const char *cookiein, - int cookieinlen, - char **cookieout, - int *cookieoutlen, - virStreamPtr st, - const char *dname, - const char *dom_xml) + +static int +qemuMigrationPrepareAny(struct qemud_driver *driver, + virConnectPtr dconn, + const char *cookiein, + int cookieinlen, + char **cookieout, + int *cookieoutlen, + const char *dname, + const char *dom_xml, + const char *migrateFrom, + virStreamPtr st) { virDomainDefPtr def = NULL; virDomainObjPtr vm = NULL; virDomainEventPtr event = NULL; int ret = -1; - int internalret; int dataFD[2] = { -1, -1 }; qemuDomainObjPrivatePtr priv = NULL; unsigned long long now; qemuMigrationCookiePtr mig = NULL; - - VIR_DEBUG("driver=%p, dconn=%p, cookiein=%s, cookieinlen=%d, " - "cookieout=%p, cookieoutlen=%p, st=%p, dname=%s, dom_xml=%s", - driver, dconn, NULLSTR(cookiein), cookieinlen, - cookieout, cookieoutlen, st, NULLSTR(dname), dom_xml); + bool tunnel = !!st; if (virTimeMs(&now) < 0) return -1; - /* Parse the domain XML. */ if (!(def = virDomainDefParseString(driver->caps, dom_xml, VIR_DOMAIN_XML_INACTIVE))) goto cleanup; @@ -1129,50 +1122,45 @@ qemuMigrationPrepareTunnel(struct qemud_driver *driver, /* Domain starts inactive, even if the domain XML had an id field. */ vm->def->id = -1; - if (pipe(dataFD) < 0 || - virSetCloseExec(dataFD[1]) < 0) { + if (tunnel && + (pipe(dataFD) < 0 || virSetCloseExec(dataFD[1]) < 0)) { virReportSystemError(errno, "%s", _("cannot create pipe for tunnelled migration")); goto endjob; } /* Start the QEMU daemon, with the same command-line arguments plus - * -incoming stdio (which qemu_command might convert to exec:cat or fd:n) + * -incoming $migrateFrom */ - internalret = qemuProcessStart(dconn, driver, vm, "stdio", true, - true, dataFD[0], NULL, - VIR_VM_OP_MIGRATE_IN_START); - if (internalret < 0) { + if (qemuProcessStart(dconn, driver, vm, migrateFrom, true, + true, dataFD[0], NULL, + VIR_VM_OP_MIGRATE_IN_START) < 0) { qemuAuditDomainStart(vm, "migrated", false); /* Note that we don't set an error here because qemuProcessStart * should have already done that. 
*/ - if (!vm->persistent) { - virDomainRemoveInactive(&driver->domains, vm); - vm = NULL; - } goto endjob; } - if (virFDStreamOpen(st, dataFD[1]) < 0) { - qemuAuditDomainStart(vm, "migrated", false); - qemuProcessStop(driver, vm, 0, VIR_DOMAIN_SHUTOFF_FAILED); - if (!vm->persistent) { - if (qemuDomainObjEndAsyncJob(driver, vm) > 0) - virDomainRemoveInactive(&driver->domains, vm); - vm = NULL; + if (tunnel) { + if (virFDStreamOpen(st, dataFD[1]) < 0) { + virReportSystemError(errno, "%s", + _("cannot pass pipe for tunnelled migration")); + qemuAuditDomainStart(vm, "migrated", false); + qemuProcessStop(driver, vm, 0, VIR_DOMAIN_SHUTOFF_FAILED); + goto endjob; } - virReportSystemError(errno, "%s", - _("cannot pass pipe for tunnelled migration")); - goto endjob; + dataFD[1] = -1; /* 'st' owns the FD now & will close it */ } - dataFD[1] = -1; /* 'st' owns the FD now & will close it */ - - qemuAuditDomainStart(vm, "migrated", true); - event = virDomainEventNewFromObj(vm, - VIR_DOMAIN_EVENT_STARTED, - VIR_DOMAIN_EVENT_STARTED_MIGRATED); + if (mig->lockState) { + VIR_DEBUG("Received lockstate %s", mig->lockState); + VIR_FREE(priv->lockState); + priv->lockState = mig->lockState; + mig->lockState = NULL; + } else { + VIR_DEBUG("Received no lockstate"); + } if (qemuMigrationBakeCookie(mig, driver, vm, cookieout, cookieoutlen, QEMU_MIGRATION_COOKIE_GRAPHICS) < 0) { @@ -1183,12 +1171,19 @@ qemuMigrationPrepareTunnel(struct qemud_driver *driver, VIR_WARN("Unable to encode migration cookie"); } + qemuAuditDomainStart(vm, "migrated", true); + event = virDomainEventNewFromObj(vm, + VIR_DOMAIN_EVENT_STARTED, + VIR_DOMAIN_EVENT_STARTED_MIGRATED); ret = 0; endjob: - if (vm && - qemuDomainObjEndAsyncJob(driver, vm) == 0) + if (qemuDomainObjEndAsyncJob(driver, vm) == 0) { + vm = NULL; + } else if (!vm->persistent && !virDomainObjIsActive(vm)) { + virDomainRemoveInactive(&driver->domains, vm); vm = NULL; + } /* We set a fake job active which is held across * API calls until the finish() call. This prevents @@ -1216,6 +1211,38 @@ cleanup: } +/* + * This version starts an empty VM listening on a localhost TCP port, and + * sets up the corresponding virStream to handle the incoming data. 
+ */ +int +qemuMigrationPrepareTunnel(struct qemud_driver *driver, + virConnectPtr dconn, + const char *cookiein, + int cookieinlen, + char **cookieout, + int *cookieoutlen, + virStreamPtr st, + const char *dname, + const char *dom_xml) +{ + int ret; + + VIR_DEBUG("driver=%p, dconn=%p, cookiein=%s, cookieinlen=%d, " + "cookieout=%p, cookieoutlen=%p, st=%p, dname=%s, dom_xml=%s", + driver, dconn, NULLSTR(cookiein), cookieinlen, + cookieout, cookieoutlen, st, NULLSTR(dname), dom_xml); + + /* QEMU will be started with -incoming stdio (which qemu_command might + * convert to exec:cat or fd:n) + */ + ret = qemuMigrationPrepareAny(driver, dconn, cookiein, cookieinlen, + cookieout, cookieoutlen, dname, dom_xml, + "stdio", st); + return ret; +} + + int qemuMigrationPrepareDirect(struct qemud_driver *driver, virConnectPtr dconn, @@ -1229,18 +1256,11 @@ qemuMigrationPrepareDirect(struct qemud_driver *driver, const char *dom_xml) { static int port = 0; - virDomainDefPtr def = NULL; - virDomainObjPtr vm = NULL; int this_port; char *hostname = NULL; char migrateFrom [64]; const char *p; - virDomainEventPtr event = NULL; int ret = -1; - int internalret; - qemuDomainObjPrivatePtr priv = NULL; - unsigned long long now; - qemuMigrationCookiePtr mig = NULL; VIR_DEBUG("driver=%p, dconn=%p, cookiein=%s, cookieinlen=%d, " "cookieout=%p, cookieoutlen=%p, uri_in=%s, uri_out=%p, " @@ -1249,9 +1269,6 @@ qemuMigrationPrepareDirect(struct qemud_driver *driver, cookieout, cookieoutlen, NULLSTR(uri_in), uri_out, NULLSTR(dname), dom_xml); - if (virTimeMs(&now) < 0) - return -1; - /* The URI passed in may be NULL or a string "tcp://somehostname:port". * * If the URI passed in is NULL then we allocate a port number @@ -1272,7 +1289,8 @@ qemuMigrationPrepareDirect(struct qemud_driver *driver, if (STRPREFIX(hostname, "localhost")) { qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("hostname on destination resolved to localhost, but migration requires an FQDN")); + _("hostname on destination resolved to localhost," + " but migration requires an FQDN")); goto cleanup; } @@ -1282,8 +1300,7 @@ qemuMigrationPrepareDirect(struct qemud_driver *driver, * new targets accept both syntaxes though. */ /* Caller frees */ - internalret = virAsprintf(uri_out, "tcp:%s:%d", hostname, this_port); - if (internalret < 0) { + if (virAsprintf(uri_out, "tcp:%s:%d", hostname, this_port) < 0) { virReportOOMError(); goto cleanup; } @@ -1293,8 +1310,9 @@ qemuMigrationPrepareDirect(struct qemud_driver *driver, * characters in hostname part don't matter. */ if (!STRPREFIX (uri_in, "tcp:")) { - qemuReportError (VIR_ERR_INVALID_ARG, - "%s", _("only tcp URIs are supported for KVM/QEMU migrations")); + qemuReportError(VIR_ERR_INVALID_ARG, "%s", + _("only tcp URIs are supported for KVM/QEMU" + " migrations")); goto cleanup; } @@ -1326,115 +1344,16 @@ qemuMigrationPrepareDirect(struct qemud_driver *driver, if (*uri_out) VIR_DEBUG("Generated uri_out=%s", *uri_out); - /* Parse the domain XML. */ - if (!(def = virDomainDefParseString(driver->caps, dom_xml, - VIR_DOMAIN_XML_INACTIVE))) - goto cleanup; - - if (!qemuMigrationIsAllowed(def)) - goto cleanup; - - /* Target domain name, maybe renamed. 
*/ - if (dname) { - VIR_FREE(def->name); - def->name = strdup(dname); - if (def->name == NULL) - goto cleanup; - } - - if (virDomainObjIsDuplicate(&driver->domains, def, 1) < 0) - goto cleanup; - - if (!(vm = virDomainAssignDef(driver->caps, - &driver->domains, - def, true))) { - /* virDomainAssignDef already set the error */ - goto cleanup; - } - def = NULL; - priv = vm->privateData; - - if (!(mig = qemuMigrationEatCookie(driver, vm, cookiein, cookieinlen, - QEMU_MIGRATION_COOKIE_LOCKSTATE))) - goto cleanup; - - if (qemuDomainObjBeginAsyncJobWithDriver(driver, vm, - QEMU_ASYNC_JOB_MIGRATION_IN) < 0) - goto cleanup; - - /* Domain starts inactive, even if the domain XML had an id field. */ - vm->def->id = -1; - - /* Start the QEMU daemon, with the same command-line arguments plus - * -incoming tcp:0.0.0.0:port - */ - snprintf (migrateFrom, sizeof (migrateFrom), "tcp:0.0.0.0:%d", this_port); - if (qemuProcessStart(dconn, driver, vm, migrateFrom, true, true, - -1, NULL, VIR_VM_OP_MIGRATE_IN_START) < 0) { - qemuAuditDomainStart(vm, "migrated", false); - /* Note that we don't set an error here because qemuProcessStart - * should have already done that. - */ - if (!vm->persistent) { - if (qemuDomainObjEndAsyncJob(driver, vm) > 0) - virDomainRemoveInactive(&driver->domains, vm); - vm = NULL; - } - goto endjob; - } - - if (mig->lockState) { - VIR_DEBUG("Received lockstate %s", mig->lockState); - VIR_FREE(priv->lockState); - priv->lockState = mig->lockState; - mig->lockState = NULL; - } else { - VIR_DEBUG("Received no lockstate"); - } - - if (qemuMigrationBakeCookie(mig, driver, vm, cookieout, cookieoutlen, - QEMU_MIGRATION_COOKIE_GRAPHICS) < 0) { - /* We could tear down the whole guest here, but - * cookie data is (so far) non-critical, so that - * seems a little harsh. We'll just warn for now. - */ - VIR_WARN("Unable to encode migration cookie"); - } - - qemuAuditDomainStart(vm, "migrated", true); - event = virDomainEventNewFromObj(vm, - VIR_DOMAIN_EVENT_STARTED, - VIR_DOMAIN_EVENT_STARTED_MIGRATED); - ret = 0; - -endjob: - if (vm && - qemuDomainObjEndAsyncJob(driver, vm) == 0) - vm = NULL; - - /* We set a fake job active which is held across - * API calls until the finish() call. This prevents - * any other APIs being invoked while incoming - * migration is taking place - */ - if (vm && - virDomainObjIsActive(vm)) { - priv->job.asyncJob = QEMU_ASYNC_JOB_MIGRATION_IN; - qemuDomainObjSaveJob(driver, vm); - priv->job.info.type = VIR_DOMAIN_JOB_UNBOUNDED; - priv->job.start = now; - } + /* QEMU will be started with -incoming tcp:0.0.0.0:port */ + snprintf(migrateFrom, sizeof (migrateFrom), "tcp:0.0.0.0:%d", this_port); + ret = qemuMigrationPrepareAny(driver, dconn, cookiein, cookieinlen, + cookieout, cookieoutlen, dname, dom_xml, + migrateFrom, NULL); cleanup: VIR_FREE(hostname); - virDomainDefFree(def); if (ret != 0) VIR_FREE(*uri_out); - if (vm) - virDomainObjUnlock(vm); - if (event) - qemuDomainEventQueue(driver, event); - qemuMigrationCookieFree(mig); return ret; } -- 1.7.6

On Fri, Jul 08, 2011 at 01:34:13AM +0200, Jiri Denemark wrote:
Most of the code in these two functions is supposed to be identical but currently it isn't (which is natural since the code is duplicated). Let's move common parts of these functions into qemuMigrationPrepareAny.
Were there any actual bugs fixed in this consolidation, or were the differences in the code just cosmetic? If so, it'd be good to mention the bugs fixed in the commit message.
--- src/qemu/qemu_migration.c | 255 +++++++++++++++----------------------------- 1 files changed, 87 insertions(+), 168 deletions(-)
ACK Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On Mon, Jul 11, 2011 at 21:28:30 +0100, Daniel P. Berrange wrote:
On Fri, Jul 08, 2011 at 01:34:13AM +0200, Jiri Denemark wrote:
Most of the code in these two functions is supposed to be identical but currently it isn't (which is natural since the code is duplicated). Let's move common parts of these functions into qemuMigrationPrepareAny.
Were there any actual bugs fixed in this consolidation, or were the differences in the code just cosmetic? If so, it'd be good to mention the bugs fixed in the commit message.
Yes, I added the following to the commit message: This also fixes qemuMigrationPrepareTunnel which didn't store received lockState in the domain object. Jirka
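For readers following the review, the fix Jirka refers to is the lockstate handling that the consolidated qemuMigrationPrepareAny now performs for both the direct and the tunnelled path; quoting the relevant hunk from the patch above:

    if (mig->lockState) {
        VIR_DEBUG("Received lockstate %s", mig->lockState);
        VIR_FREE(priv->lockState);
        priv->lockState = mig->lockState;
        mig->lockState = NULL;
    } else {
        VIR_DEBUG("Received no lockstate");
    }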

This patch introduces several helper methods to deal with jobs and phases during migration in a simpler manner. --- src/qemu/MIGRATION.txt | 55 +++++++++++++++++++++++++++ src/qemu/qemu_domain.c | 5 ++ src/qemu/qemu_migration.c | 91 +++++++++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_migration.h | 36 ++++++++++++++++++ 4 files changed, 187 insertions(+), 0 deletions(-) create mode 100644 src/qemu/MIGRATION.txt diff --git a/src/qemu/MIGRATION.txt b/src/qemu/MIGRATION.txt new file mode 100644 index 0000000..6c32998 --- /dev/null +++ b/src/qemu/MIGRATION.txt @@ -0,0 +1,55 @@ + QEMU Migration Locking Rules + ============================ + +Migration is a complicated beast which may span across several APIs on both +source and destination side and we need to keep the domain we are migrating in +a consistent state during the whole process. + +To avoid anyone from changing the domain in the middle of migration we need to +keep MIGRATION_OUT job active during migration from Begin to Confirm on the +source side and MIGRATION_IN job has to be active from Prepare to Finish on +the destination side. + +For this purpose we introduce several helper methods to deal with locking +primitives (described in THREADS.txt) in the right way: + +* qemuMigrationJobStart + +* qemuMigrationJobContinue + +* qemuMigrationJobStartPhase + +* qemuMigrationJobSetPhase + +* qemuMigrationJobFinish + +The sequence of calling qemuMigrationJob* helper methods is as follows: + +- The first API of a migration protocol (Prepare or Perform/Begin depending on + migration type and version) has to start migration job and keep it active: + + qemuMigrationJobStart(driver, vm, QEMU_JOB_MIGRATION_{IN,OUT}); + qemuMigrationJobSetPhase(driver, vm, QEMU_MIGRATION_PHASE_*); + ...do work... + qemuMigrationJobContinue(vm); + +- All consequent phases except for the last one have to keep the job active: + + if (!qemuMigrationJobIsActive(vm, QEMU_JOB_MIGRATION_{IN,OUT})) + return; + qemuMigrationJobStartPhase(driver, vm, QEMU_MIGRATION_PHASE_*); + ...do work... + qemuMigrationJobContinue(vm); + +- The last migration phase finally finishes the migration job: + + if (!qemuMigrationJobIsActive(vm, QEMU_JOB_MIGRATION_{IN,OUT})) + return; + qemuMigrationJobStartPhase(driver, vm, QEMU_MIGRATION_PHASE_*); + ...do work... 
+ qemuMigrationJobFinish(driver, vm); + +While migration job is running (i.e., after qemuMigrationJobStart* but before +qemuMigrationJob{Continue,Finish}), migration phase can be advanced using + + qemuMigrationJobSetPhase(driver, vm, QEMU_MIGRATION_PHASE_*); diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index d0dd764..39cbf0e 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -26,6 +26,7 @@ #include "qemu_domain.h" #include "qemu_command.h" #include "qemu_capabilities.h" +#include "qemu_migration.h" #include "memory.h" #include "logging.h" #include "virterror_internal.h" @@ -72,6 +73,8 @@ qemuDomainAsyncJobPhaseToString(enum qemuDomainAsyncJob job, switch (job) { case QEMU_ASYNC_JOB_MIGRATION_OUT: case QEMU_ASYNC_JOB_MIGRATION_IN: + return qemuMigrationJobPhaseTypeToString(phase); + case QEMU_ASYNC_JOB_SAVE: case QEMU_ASYNC_JOB_DUMP: case QEMU_ASYNC_JOB_NONE: @@ -92,6 +95,8 @@ qemuDomainAsyncJobPhaseFromString(enum qemuDomainAsyncJob job, switch (job) { case QEMU_ASYNC_JOB_MIGRATION_OUT: case QEMU_ASYNC_JOB_MIGRATION_IN: + return qemuMigrationJobPhaseTypeFromString(phase); + case QEMU_ASYNC_JOB_SAVE: case QEMU_ASYNC_JOB_DUMP: case QEMU_ASYNC_JOB_NONE: diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index e595596..33aa89b 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -46,6 +46,19 @@ #define VIR_FROM_THIS VIR_FROM_QEMU +VIR_ENUM_IMPL(qemuMigrationJobPhase, QEMU_MIGRATION_PHASE_LAST, + "none", + "preform2", + "begin3", + "perform3", + "perform3_done", + "confirm3_cancelled", + "confirm3", + "prepare", + "finish2", + "finish3", +); + enum qemuMigrationCookieFlags { QEMU_MIGRATION_COOKIE_FLAG_GRAPHICS, QEMU_MIGRATION_COOKIE_FLAG_LOCKSTATE, @@ -2749,3 +2762,81 @@ cleanup: } return ret; } + +int +qemuMigrationJobStart(struct qemud_driver *driver, + virDomainObjPtr vm, + enum qemuDomainAsyncJob job) +{ + qemuDomainObjPrivatePtr priv = vm->privateData; + + if (qemuDomainObjBeginAsyncJobWithDriver(driver, vm, job) < 0) + return -1; + + if (job == QEMU_ASYNC_JOB_MIGRATION_IN) + qemuDomainObjSetAsyncJobMask(vm, QEMU_JOB_NONE); + else + qemuDomainObjSetAsyncJobMask(vm, DEFAULT_JOB_MASK); + + priv->job.info.type = VIR_DOMAIN_JOB_UNBOUNDED; + + return 0; +} + +void +qemuMigrationJobSetPhase(struct qemud_driver *driver, + virDomainObjPtr vm, + enum qemuMigrationJobPhase phase) +{ + qemuDomainObjPrivatePtr priv = vm->privateData; + + if (phase < priv->job.phase) { + VIR_ERROR(_("migration protocol going backwards %s => %s"), + qemuMigrationJobPhaseTypeToString(priv->job.phase), + qemuMigrationJobPhaseTypeToString(phase)); + return; + } + + qemuDomainObjSetJobPhase(driver, vm, phase); +} + +void +qemuMigrationJobStartPhase(struct qemud_driver *driver, + virDomainObjPtr vm, + enum qemuMigrationJobPhase phase) +{ + virDomainObjRef(vm); + qemuMigrationJobSetPhase(driver, vm, phase); +} + +int +qemuMigrationJobContinue(virDomainObjPtr vm) +{ + return virDomainObjUnref(vm); +} + +bool +qemuMigrationJobIsActive(virDomainObjPtr vm, + enum qemuDomainAsyncJob job) +{ + qemuDomainObjPrivatePtr priv = vm->privateData; + + if (priv->job.asyncJob != job) { + const char *msg; + + if (job == QEMU_ASYNC_JOB_MIGRATION_IN) + msg = _("domain '%s' is not processing incoming migration"); + else + msg = _("domain '%s' is not being migrated"); + + qemuReportError(VIR_ERR_OPERATION_INVALID, msg, vm->def->name); + return false; + } + return true; +} + +int +qemuMigrationJobFinish(struct qemud_driver *driver, virDomainObjPtr vm) +{ + return 
qemuDomainObjEndAsyncJob(driver, vm); +} diff --git a/src/qemu/qemu_migration.h b/src/qemu/qemu_migration.h index d3a3743..4342173 100644 --- a/src/qemu/qemu_migration.h +++ b/src/qemu/qemu_migration.h @@ -23,7 +23,43 @@ # define __QEMU_MIGRATION_H__ # include "qemu_conf.h" +# include "qemu_domain.h" +enum qemuMigrationJobPhase { + QEMU_MIGRATION_PHASE_NONE = 0, + QEMU_MIGRATION_PHASE_PERFORM2, + QEMU_MIGRATION_PHASE_BEGIN3, + QEMU_MIGRATION_PHASE_PERFORM3, + QEMU_MIGRATION_PHASE_PERFORM3_DONE, + QEMU_MIGRATION_PHASE_CONFIRM3_CANCELLED, + QEMU_MIGRATION_PHASE_CONFIRM3, + QEMU_MIGRATION_PHASE_PREPARE, + QEMU_MIGRATION_PHASE_FINISH2, + QEMU_MIGRATION_PHASE_FINISH3, + + QEMU_MIGRATION_PHASE_LAST +}; +VIR_ENUM_DECL(qemuMigrationJobPhase) + +int qemuMigrationJobStart(struct qemud_driver *driver, + virDomainObjPtr vm, + enum qemuDomainAsyncJob job) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) ATTRIBUTE_RETURN_CHECK; +void qemuMigrationJobSetPhase(struct qemud_driver *driver, + virDomainObjPtr vm, + enum qemuMigrationJobPhase phase) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2); +void qemuMigrationJobStartPhase(struct qemud_driver *driver, + virDomainObjPtr vm, + enum qemuMigrationJobPhase phase) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2); +int qemuMigrationJobContinue(virDomainObjPtr obj) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_RETURN_CHECK; +bool qemuMigrationJobIsActive(virDomainObjPtr vm, + enum qemuDomainAsyncJob job) + ATTRIBUTE_NONNULL(1); +int qemuMigrationJobFinish(struct qemud_driver *driver, virDomainObjPtr obj) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) ATTRIBUTE_RETURN_CHECK; bool qemuMigrationIsAllowed(virDomainDefPtr def) ATTRIBUTE_NONNULL(1); -- 1.7.6
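To make the rules above concrete, a source-side v3 sequence would move the job through its phases roughly as in the following sketch. This is only a condensed illustration of MIGRATION.txt (driver, vm, cancelled and the goto labels are placeholders); the real Begin/Perform/Confirm implementations follow in the next patches of the series:

    /* Begin3: start the migration job and keep it active across the API */
    if (qemuMigrationJobStart(driver, vm, QEMU_ASYNC_JOB_MIGRATION_OUT) < 0)
        goto cleanup;
    qemuMigrationJobSetPhase(driver, vm, QEMU_MIGRATION_PHASE_BEGIN3);
    /* ... generate the domain XML and migration cookie ... */
    if (qemuMigrationJobContinue(vm) == 0)
        vm = NULL;                      /* domain object disappeared */

    /* Perform3: re-enter the job started in Begin3 */
    if (!qemuMigrationJobIsActive(vm, QEMU_ASYNC_JOB_MIGRATION_OUT))
        goto cleanup;
    qemuMigrationJobStartPhase(driver, vm, QEMU_MIGRATION_PHASE_PERFORM3);
    /* ... run the actual migration ... */
    if (qemuMigrationJobContinue(vm) == 0)
        vm = NULL;

    /* Confirm3: the last phase finally ends the job */
    if (!qemuMigrationJobIsActive(vm, QEMU_ASYNC_JOB_MIGRATION_OUT))
        goto cleanup;
    qemuMigrationJobStartPhase(driver, vm,
                               cancelled ? QEMU_MIGRATION_PHASE_CONFIRM3_CANCELLED
                                         : QEMU_MIGRATION_PHASE_CONFIRM3);
    /* ... kill the source domain, or resume it if migration was cancelled ... */
    if (qemuMigrationJobFinish(driver, vm) == 0)
        vm = NULL;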

Make MIGRATION_IN use the new helper methods. --- src/qemu/qemu_domain.c | 2 +- src/qemu/qemu_domain.h | 1 - src/qemu/qemu_migration.c | 80 +++++++++++++++++++------------------------- 3 files changed, 36 insertions(+), 47 deletions(-) diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index 39cbf0e..2e4228a 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -645,7 +645,7 @@ void qemuDomainSetNamespaceHooks(virCapsPtr caps) caps->ns.href = qemuDomainDefNamespaceHref; } -void +static void qemuDomainObjSaveJob(struct qemud_driver *driver, virDomainObjPtr obj) { if (!virDomainObjIsActive(obj)) { diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index 7245e67..387c64c 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -180,7 +180,6 @@ int qemuDomainObjEndAsyncJob(struct qemud_driver *driver, void qemuDomainObjEndNestedJob(struct qemud_driver *driver, virDomainObjPtr obj); -void qemuDomainObjSaveJob(struct qemud_driver *driver, virDomainObjPtr obj); void qemuDomainObjSetJobPhase(struct qemud_driver *driver, virDomainObjPtr obj, int phase); diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 33aa89b..5e31b7d 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -1128,9 +1128,9 @@ qemuMigrationPrepareAny(struct qemud_driver *driver, QEMU_MIGRATION_COOKIE_LOCKSTATE))) goto cleanup; - if (qemuDomainObjBeginAsyncJobWithDriver(driver, vm, - QEMU_ASYNC_JOB_MIGRATION_IN) < 0) + if (qemuMigrationJobStart(driver, vm, QEMU_ASYNC_JOB_MIGRATION_IN) < 0) goto cleanup; + qemuMigrationJobSetPhase(driver, vm, QEMU_MIGRATION_PHASE_PREPARE); /* Domain starts inactive, even if the domain XML had an id field. */ vm->def->id = -1; @@ -1188,28 +1188,19 @@ qemuMigrationPrepareAny(struct qemud_driver *driver, event = virDomainEventNewFromObj(vm, VIR_DOMAIN_EVENT_STARTED, VIR_DOMAIN_EVENT_STARTED_MIGRATED); - ret = 0; -endjob: - if (qemuDomainObjEndAsyncJob(driver, vm) == 0) { - vm = NULL; - } else if (!vm->persistent && !virDomainObjIsActive(vm)) { - virDomainRemoveInactive(&driver->domains, vm); + /* We keep the job active across API calls until the finish() call. + * This prevents any other APIs being invoked while incoming + * migration is taking place. + */ + if (qemuMigrationJobContinue(vm) == 0) { vm = NULL; + qemuReportError(VIR_ERR_OPERATION_FAILED, + "%s", _("domain disappeared")); + goto cleanup; } - /* We set a fake job active which is held across - * API calls until the finish() call. 
This prevents - * any other APIs being invoked while incoming - * migration is taking place - */ - if (vm && - virDomainObjIsActive(vm)) { - priv->job.asyncJob = QEMU_ASYNC_JOB_MIGRATION_IN; - qemuDomainObjSaveJob(driver, vm); - priv->job.info.type = VIR_DOMAIN_JOB_UNBOUNDED; - priv->job.start = now; - } + ret = 0; cleanup: virDomainDefFree(def); @@ -1221,6 +1212,15 @@ cleanup: qemuDomainEventQueue(driver, event); qemuMigrationCookieFree(mig); return ret; + +endjob: + if (qemuMigrationJobFinish(driver, vm) == 0) { + vm = NULL; + } else if (!vm->persistent) { + virDomainRemoveInactive(&driver->domains, vm); + vm = NULL; + } + goto cleanup; } @@ -2395,26 +2395,22 @@ qemuMigrationFinish(struct qemud_driver *driver, virDomainPtr dom = NULL; virDomainEventPtr event = NULL; int newVM = 1; - qemuDomainObjPrivatePtr priv = NULL; qemuMigrationCookiePtr mig = NULL; + virErrorPtr orig_err = NULL; + VIR_DEBUG("driver=%p, dconn=%p, vm=%p, cookiein=%s, cookieinlen=%d, " "cookieout=%p, cookieoutlen=%p, flags=%lu, retcode=%d", driver, dconn, vm, NULLSTR(cookiein), cookieinlen, cookieout, cookieoutlen, flags, retcode); - virErrorPtr orig_err = NULL; - priv = vm->privateData; - if (priv->job.asyncJob != QEMU_ASYNC_JOB_MIGRATION_IN) { - qemuReportError(VIR_ERR_NO_DOMAIN, - _("domain '%s' is not processing incoming migration"), vm->def->name); + if (!qemuMigrationJobIsActive(vm, QEMU_ASYNC_JOB_MIGRATION_IN)) goto cleanup; - } - qemuDomainObjDiscardAsyncJob(driver, vm); - if (!(mig = qemuMigrationEatCookie(driver, vm, cookiein, cookieinlen, 0))) - goto cleanup; + qemuMigrationJobStartPhase(driver, vm, + v3proto ? QEMU_MIGRATION_PHASE_FINISH3 + : QEMU_MIGRATION_PHASE_FINISH2); - if (qemuDomainObjBeginJobWithDriver(driver, vm, QEMU_JOB_MODIFY) < 0) + if (!(mig = qemuMigrationEatCookie(driver, vm, cookiein, cookieinlen, 0))) goto cleanup; /* Did the migration go as planned? If yes, return the domain @@ -2456,7 +2452,6 @@ qemuMigrationFinish(struct qemud_driver *driver, if (event) qemuDomainEventQueue(driver, event); event = NULL; - } if (!(flags & VIR_MIGRATE_PAUSED)) { @@ -2488,11 +2483,6 @@ qemuMigrationFinish(struct qemud_driver *driver, event = virDomainEventNewFromObj(vm, VIR_DOMAIN_EVENT_STOPPED, VIR_DOMAIN_EVENT_STOPPED_FAILED); - if (!vm->persistent) { - if (qemuDomainObjEndJob(driver, vm) > 0) - virDomainRemoveInactive(&driver->domains, vm); - vm = NULL; - } } goto endjob; } @@ -2524,20 +2514,20 @@ qemuMigrationFinish(struct qemud_driver *driver, event = virDomainEventNewFromObj(vm, VIR_DOMAIN_EVENT_STOPPED, VIR_DOMAIN_EVENT_STOPPED_FAILED); - if (!vm->persistent) { - if (qemuDomainObjEndJob(driver, vm) > 0) - virDomainRemoveInactive(&driver->domains, vm); - vm = NULL; - } } if (qemuMigrationBakeCookie(mig, driver, vm, cookieout, cookieoutlen, 0) < 0) VIR_WARN("Unable to encode migration cookie"); endjob: - if (vm && - qemuDomainObjEndJob(driver, vm) == 0) - vm = NULL; + if (vm) { + if (qemuMigrationJobFinish(driver, vm) == 0) { + vm = NULL; + } else if (!vm->persistent && !virDomainObjIsActive(vm)) { + virDomainRemoveInactive(&driver->domains, vm); + vm = NULL; + } + } cleanup: if (vm) -- 1.7.6

Make MIGRATION_OUT use the new helper methods. This also introduces new protection to migration v3 process: the migration job is held from Begin to Confirm to avoid changes to a domain during migration (esp. between Begin and Perform phases). This change, however brings a small issue when Prepare step fails during normal (not p2p or tunneled) migration when old client talks to new libvirtd. The old client doesn't call Confirm (with cancelled == 1) to end the migration job. I'm open to suggestions how to solve this in the best way. My idea is to use flags for Begin3 and Perform3 that would enable this extra protection by moving migration job start from Perform to Begin phase while maintaining the old behavior if the flags are not set. Note that virDomainAbortJob will be able to cancel migration in any phase (i.e., not only in Perform phase). --- src/libvirt.c | 11 ++- src/qemu/qemu_driver.c | 47 ++++++++- src/qemu/qemu_migration.c | 232 +++++++++++++++++++++++++++++++-------------- 3 files changed, 211 insertions(+), 79 deletions(-) diff --git a/src/libvirt.c b/src/libvirt.c index e00c64f..fa32d83 100644 --- a/src/libvirt.c +++ b/src/libvirt.c @@ -3761,7 +3761,7 @@ virDomainMigrateVersion3(virDomainPtr domain, int ret; virDomainInfo info; virErrorPtr orig_err = NULL; - int cancelled; + int cancelled = 1; VIR_DOMAIN_DEBUG(domain, "dconn=%p xmlin=%s, flags=%lu, " "dname=%s, uri=%s, bandwidth=%lu", dconn, NULLSTR(xmlin), flags, @@ -3798,14 +3798,16 @@ virDomainMigrateVersion3(virDomainPtr domain, (dconn, cookiein, cookieinlen, &cookieout, &cookieoutlen, uri, &uri_out, flags, dname, bandwidth, dom_xml); VIR_FREE (dom_xml); - if (ret == -1) - goto done; + if (ret == -1) { + /* Make sure Confirm doesn't overwrite the error */ + orig_err = virSaveLastError(); + goto confirm; + } if (uri == NULL && uri_out == NULL) { virLibConnError(VIR_ERR_INTERNAL_ERROR, _("domainMigratePrepare3 did not set uri")); virDispatchError(domain->conn); - cancelled = 1; goto finish; } if (uri_out) @@ -3871,6 +3873,7 @@ finish: if (!orig_err) orig_err = virSaveLastError(); +confirm: /* * If cancelled, then src VM will be restarted, else * it will be killed diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 9dcb248..e317c5b 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -6850,12 +6850,42 @@ qemuDomainMigrateBegin3(virDomainPtr domain, goto cleanup; } - xml = qemuMigrationBegin(driver, vm, xmlin, - cookieout, cookieoutlen); + if (qemuMigrationJobStart(driver, vm, QEMU_ASYNC_JOB_MIGRATION_OUT) < 0) + goto cleanup; + + if (!virDomainObjIsActive(vm)) { + qemuReportError(VIR_ERR_OPERATION_INVALID, + "%s", _("domain is not running")); + goto endjob; + } + + if (!(xml = qemuMigrationBegin(driver, vm, xmlin, + cookieout, cookieoutlen))) + goto endjob; + + /* We keep the job active across API calls until the confirm() call. + * This prevents any other APIs being invoked while migration is taking + * place. 
+ */ + if (qemuMigrationJobContinue(vm) == 0) { + vm = NULL; + qemuReportError(VIR_ERR_OPERATION_FAILED, + "%s", _("domain disappeared")); + VIR_FREE(xml); + if (cookieout) + VIR_FREE(*cookieout); + } cleanup: + if (vm) + virDomainObjUnlock(vm); qemuDriverUnlock(driver); return xml; + +endjob: + if (qemuMigrationJobFinish(driver, vm) == 0) + vm = NULL; + goto cleanup; } static int @@ -7005,7 +7035,6 @@ qemuDomainMigratePerform3(virDomainPtr dom, dconnuri, uri, cookiein, cookieinlen, cookieout, cookieoutlen, flags, dname, resource, true); - cleanup: qemuDriverUnlock(driver); return ret; @@ -7065,6 +7094,7 @@ qemuDomainMigrateConfirm3(virDomainPtr domain, struct qemud_driver *driver = domain->conn->privateData; virDomainObjPtr vm; int ret = -1; + enum qemuMigrationJobPhase phase; virCheckFlags(VIR_MIGRATE_LIVE | VIR_MIGRATE_PEER2PEER | @@ -7085,14 +7115,21 @@ qemuDomainMigrateConfirm3(virDomainPtr domain, goto cleanup; } - if (qemuDomainObjBeginJobWithDriver(driver, vm, QEMU_JOB_MODIFY) < 0) + if (!qemuMigrationJobIsActive(vm, QEMU_ASYNC_JOB_MIGRATION_OUT)) goto cleanup; + if (cancelled) + phase = QEMU_MIGRATION_PHASE_CONFIRM3_CANCELLED; + else + phase = QEMU_MIGRATION_PHASE_CONFIRM3; + + qemuMigrationJobStartPhase(driver, vm, phase); + ret = qemuMigrationConfirm(driver, domain->conn, vm, cookiein, cookieinlen, flags, cancelled); - if (qemuDomainObjEndJob(driver, vm) == 0) { + if (qemuMigrationJobFinish(driver, vm) == 0) { vm = NULL; } else if (!virDomainObjIsActive(vm) && (!vm->persistent || (flags & VIR_MIGRATE_UNDEFINE_SOURCE))) { diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 5e31b7d..bcca454 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -1009,6 +1009,7 @@ qemuDomainMigrateGraphicsRelocate(struct qemud_driver *driver, } +/* The caller is supposed to lock the vm and start a migration job. */ char *qemuMigrationBegin(struct qemud_driver *driver, virDomainObjPtr vm, const char *xmlin, @@ -1021,11 +1022,7 @@ char *qemuMigrationBegin(struct qemud_driver *driver, VIR_DEBUG("driver=%p, vm=%p, xmlin=%s, cookieout=%p, cookieoutlen=%p", driver, vm, NULLSTR(xmlin), cookieout, cookieoutlen); - if (!virDomainObjIsActive(vm)) { - qemuReportError(VIR_ERR_OPERATION_INVALID, - "%s", _("domain is not running")); - goto cleanup; - } + qemuMigrationJobSetPhase(driver, vm, QEMU_MIGRATION_PHASE_BEGIN3); if (qemuProcessAutoDestroyActive(driver, vm)) { qemuReportError(VIR_ERR_OPERATION_INVALID, @@ -1062,7 +1059,6 @@ char *qemuMigrationBegin(struct qemud_driver *driver, } cleanup: - virDomainObjUnlock(vm); qemuMigrationCookieFree(mig); virDomainDefFree(def); return rv; @@ -1902,6 +1898,7 @@ static int doPeer2PeerMigrate2(struct qemud_driver *driver, * until the migration is complete. */ VIR_DEBUG("Perform %p", sconn); + qemuMigrationJobSetPhase(driver, vm, QEMU_MIGRATION_PHASE_PERFORM2); if (flags & VIR_MIGRATE_TUNNELLED) ret = doTunnelMigrate(driver, vm, st, NULL, 0, NULL, NULL, @@ -2036,6 +2033,7 @@ static int doPeer2PeerMigrate3(struct qemud_driver *driver, * confirm migration completion. */ VIR_DEBUG("Perform3 %p uri=%s uri_out=%s", sconn, uri, uri_out); + qemuMigrationJobSetPhase(driver, vm, QEMU_MIGRATION_PHASE_PERFORM3); VIR_FREE(cookiein); cookiein = cookieout; cookieinlen = cookieoutlen; @@ -2053,8 +2051,12 @@ static int doPeer2PeerMigrate3(struct qemud_driver *driver, flags, dname, resource); /* Perform failed. 
Make sure Finish doesn't overwrite the error */ - if (ret < 0) + if (ret < 0) { orig_err = virSaveLastError(); + } else { + qemuMigrationJobSetPhase(driver, vm, + QEMU_MIGRATION_PHASE_PERFORM3_DONE); + } /* If Perform returns < 0, then we need to cancel the VM * startup on the destination @@ -2211,35 +2213,32 @@ cleanup: } -int qemuMigrationPerform(struct qemud_driver *driver, - virConnectPtr conn, - virDomainObjPtr vm, - const char *xmlin, - const char *dconnuri, - const char *uri, - const char *cookiein, - int cookieinlen, - char **cookieout, - int *cookieoutlen, - unsigned long flags, - const char *dname, - unsigned long resource, - bool v3proto) +/* + * This implements perform part of the migration protocol when migration job + * does not need to be active across several APIs, i.e., peer2peer migration or + * perform phase of v2 non-peer2peer migration. + */ +static int +qemuMigrationPerformJob(struct qemud_driver *driver, + virConnectPtr conn, + virDomainObjPtr vm, + const char *xmlin, + const char *dconnuri, + const char *uri, + const char *cookiein, + int cookieinlen, + char **cookieout, + int *cookieoutlen, + unsigned long flags, + const char *dname, + unsigned long resource, + bool v3proto) { virDomainEventPtr event = NULL; int ret = -1; int resume = 0; - qemuDomainObjPrivatePtr priv = vm->privateData; - VIR_DEBUG("driver=%p, conn=%p, vm=%p, xmlin=%s, dconnuri=%s, " - "uri=%s, cookiein=%s, cookieinlen=%d, cookieout=%p, " - "cookieoutlen=%p, flags=%lu, dname=%s, resource=%lu, v3proto=%d", - driver, conn, vm, NULLSTR(xmlin), NULLSTR(dconnuri), - NULLSTR(uri), NULLSTR(cookiein), cookieinlen, - cookieout, cookieoutlen, flags, NULLSTR(dname), - resource, v3proto); - - if (qemuDomainObjBeginAsyncJobWithDriver(driver, vm, - QEMU_ASYNC_JOB_MIGRATION_OUT) < 0) + + if (qemuMigrationJobStart(driver, vm, QEMU_ASYNC_JOB_MIGRATION_OUT) < 0) goto cleanup; if (!virDomainObjIsActive(vm)) { @@ -2254,52 +2253,33 @@ int qemuMigrationPerform(struct qemud_driver *driver, goto endjob; } - memset(&priv->job.info, 0, sizeof(priv->job.info)); - priv->job.info.type = VIR_DOMAIN_JOB_UNBOUNDED; - resume = virDomainObjGetState(vm, NULL) == VIR_DOMAIN_RUNNING; if ((flags & (VIR_MIGRATE_TUNNELLED | VIR_MIGRATE_PEER2PEER))) { - if (cookieinlen) { - qemuReportError(VIR_ERR_OPERATION_INVALID, - "%s", _("received unexpected cookie with P2P migration")); - goto endjob; - } - - if (doPeer2PeerMigrate(driver, conn, vm, xmlin, - dconnuri, uri, flags, dname, - resource, &v3proto) < 0) - /* doPeer2PeerMigrate already set the error, so just get out */ - goto endjob; + ret = doPeer2PeerMigrate(driver, conn, vm, xmlin, + dconnuri, uri, flags, dname, + resource, &v3proto); } else { - if (dconnuri) { - qemuReportError(VIR_ERR_INTERNAL_ERROR, - "%s", _("Unexpected dconnuri parameter with non-peer2peer migration")); - goto endjob; - } - if (doNativeMigrate(driver, vm, uri, cookiein, cookieinlen, - cookieout, cookieoutlen, - flags, dname, resource) < 0) - goto endjob; + qemuMigrationJobSetPhase(driver, vm, QEMU_MIGRATION_PHASE_PERFORM2); + ret = doNativeMigrate(driver, vm, uri, cookiein, cookieinlen, + cookieout, cookieoutlen, + flags, dname, resource); } + if (ret < 0) + goto endjob; /* * In v3 protocol, the source VM is not killed off until the * confirm step. 
*/ - if (v3proto) { - resume = 0; - } else { + if (!v3proto) { qemuProcessStop(driver, vm, 1, VIR_DOMAIN_SHUTOFF_MIGRATED); qemuAuditDomainStop(vm, "migrated"); - resume = 0; - event = virDomainEventNewFromObj(vm, VIR_DOMAIN_EVENT_STOPPED, VIR_DOMAIN_EVENT_STOPPED_MIGRATED); } - - ret = 0; + resume = 0; endjob: if (resume && virDomainObjGetState(vm, NULL) == VIR_DOMAIN_PAUSED) { @@ -2319,10 +2299,11 @@ endjob: VIR_DOMAIN_EVENT_RESUMED_MIGRATED); } if (vm) { - if (qemuDomainObjEndAsyncJob(driver, vm) == 0) { + if (qemuMigrationJobFinish(driver, vm) == 0) { vm = NULL; } else if (!virDomainObjIsActive(vm) && - (!vm->persistent || (flags & VIR_MIGRATE_UNDEFINE_SOURCE))) { + (!vm->persistent || + (ret == 0 && (flags & VIR_MIGRATE_UNDEFINE_SOURCE)))) { if (flags & VIR_MIGRATE_UNDEFINE_SOURCE) virDomainDeleteConfig(driver->configDir, driver->autostartDir, vm); virDomainRemoveInactive(&driver->domains, vm); @@ -2338,6 +2319,118 @@ cleanup: return ret; } +/* + * This implements perform phase of v3 migration protocol. + */ +static int +qemuMigrationPerformPhase(struct qemud_driver *driver, + virConnectPtr conn, + virDomainObjPtr vm, + const char *uri, + const char *cookiein, + int cookieinlen, + char **cookieout, + int *cookieoutlen, + unsigned long flags, + const char *dname, + unsigned long resource) +{ + virDomainEventPtr event = NULL; + int ret = -1; + bool resume; + + qemuMigrationJobStartPhase(driver, vm, QEMU_MIGRATION_PHASE_PERFORM3); + + resume = virDomainObjGetState(vm, NULL) == VIR_DOMAIN_RUNNING; + ret = doNativeMigrate(driver, vm, uri, cookiein, cookieinlen, + cookieout, cookieoutlen, + flags, dname, resource); + + if (ret < 0 && resume && + virDomainObjGetState(vm, NULL) == VIR_DOMAIN_PAUSED) { + /* we got here through some sort of failure; start the domain again */ + if (qemuProcessStartCPUs(driver, vm, conn, + VIR_DOMAIN_RUNNING_MIGRATION_CANCELED) < 0) { + /* Hm, we already know we are in error here. 
We don't want to + * overwrite the previous error, though, so we just throw something + * to the logs and hope for the best + */ + VIR_ERROR(_("Failed to resume guest %s after failure"), + vm->def->name); + } + + event = virDomainEventNewFromObj(vm, + VIR_DOMAIN_EVENT_RESUMED, + VIR_DOMAIN_EVENT_RESUMED_MIGRATED); + } + if (ret == 0) { + qemuMigrationJobSetPhase(driver, vm, + QEMU_MIGRATION_PHASE_PERFORM3_DONE); + } + + if (vm) { + if (qemuMigrationJobContinue(vm) == 0) { + vm = NULL; + } else if (!virDomainObjIsActive(vm) && !vm->persistent) { + virDomainRemoveInactive(&driver->domains, vm); + vm = NULL; + } + } + + if (vm) + virDomainObjUnlock(vm); + if (event) + qemuDomainEventQueue(driver, event); + return ret; +} + +int +qemuMigrationPerform(struct qemud_driver *driver, + virConnectPtr conn, + virDomainObjPtr vm, + const char *xmlin, + const char *dconnuri, + const char *uri, + const char *cookiein, + int cookieinlen, + char **cookieout, + int *cookieoutlen, + unsigned long flags, + const char *dname, + unsigned long resource, + bool v3proto) +{ + if ((flags & (VIR_MIGRATE_TUNNELLED | VIR_MIGRATE_PEER2PEER))) { + if (cookieinlen) { + qemuReportError(VIR_ERR_OPERATION_INVALID, + "%s", _("received unexpected cookie with P2P migration")); + return -1; + } + + return qemuMigrationPerformJob(driver, conn, vm, xmlin, dconnuri, uri, + cookiein, cookieinlen, cookieout, + cookieoutlen, flags, dname, resource, + v3proto); + } else { + if (dconnuri) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + "%s", _("Unexpected dconnuri parameter with non-peer2peer migration")); + return -1; + } + + if (v3proto) { + return qemuMigrationPerformPhase(driver, conn, vm, uri, + cookiein, cookieinlen, + cookieout, cookieoutlen, + flags, dname, resource); + } else { + return qemuMigrationPerformJob(driver, conn, vm, xmlin, dconnuri, + uri, cookiein, cookieinlen, + cookieout, cookieoutlen, flags, + dname, resource, v3proto); + } + } +} #if WITH_MACVTAP static void @@ -2559,15 +2652,14 @@ int qemuMigrationConfirm(struct qemud_driver *driver, driver, conn, vm, NULLSTR(cookiein), cookieinlen, flags, retcode); + qemuMigrationJobSetPhase(driver, vm, + retcode == 0 + ? QEMU_MIGRATION_PHASE_CONFIRM3 + : QEMU_MIGRATION_PHASE_CONFIRM3_CANCELLED); + if (!(mig = qemuMigrationEatCookie(driver, vm, cookiein, cookieinlen, 0))) return -1; - if (!virDomainObjIsActive(vm)) { - qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("guest unexpectedly quit")); - goto cleanup; - } - /* Did the migration go as planned? If yes, kill off the * domain object, but if no, resume CPUs */ -- 1.7.6
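For context, the phases touched by this patch are driven by an ordinary v3 managed migration call on the client side. Below is a minimal sketch; the connection URIs, the domain name "guest" and the error handling are illustrative assumptions, not part of the patch:

#include <stdio.h>
#include <libvirt/libvirt.h>

int main(void)
{
    virConnectPtr src = virConnectOpen("qemu:///system");
    virConnectPtr dst = virConnectOpen("qemu+ssh://dest.example.com/system");
    virDomainPtr dom = NULL, ddom = NULL;
    int rc = 1;

    if (!src || !dst)
        goto out;
    if (!(dom = virDomainLookupByName(src, "guest")))
        goto out;

    /* Plain (non-p2p, non-tunnelled) managed migration: libvirt runs
     * Begin3 on the source, Prepare3 on the destination, Perform3 on the
     * source, Finish3 on the destination and Confirm3 back on the source.
     * With this patch the source keeps the migration job for the whole
     * Begin-to-Confirm sequence. */
    ddom = virDomainMigrate(dom, dst, VIR_MIGRATE_LIVE, NULL, NULL, 0);
    if (ddom) {
        rc = 0;
        virDomainFree(ddom);
    } else {
        fprintf(stderr, "migration failed\n");
    }

out:
    if (dom)
        virDomainFree(dom);
    if (dst)
        virConnectClose(dst);
    if (src)
        virConnectClose(src);
    return rc;
}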

On Fri, Jul 08, 2011 at 01:34:16AM +0200, Jiri Denemark wrote:
Make MIGRATION_OUT use the new helper methods.
This also introduces new protection to the migration v3 process: the migration job is held from Begin to Confirm to avoid changes to a domain during migration (esp. between the Begin and Perform phases). This change, however, brings a small issue when the Prepare step fails during normal (not p2p or tunneled) migration when an old client talks to a new libvirtd. The old client doesn't call Confirm (with cancelled == 1) to end the migration job. I'm open to suggestions on how best to solve this. My idea is to use flags for Begin3 and Perform3 that would enable this extra protection by moving the start of the migration job from the Perform to the Begin phase while maintaining the old behavior if the flags are not set. Note that virDomainAbortJob will be able to cancel migration in any phase (i.e., not only in the Perform phase).
What do you mean by 'old client' in this context? A v3 migration client from the 0.9.3 release, or a v2 migration client from a < 0.9.3 release? Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On Tue, Jul 12, 2011 at 12:18:41 +0100, Daniel P. Berrange wrote:
On Fri, Jul 08, 2011 at 01:34:16AM +0200, Jiri Denemark wrote:
Make MIGRATION_OUT use the new helper methods.
This also introduces new protection to the migration v3 process: the migration job is held from Begin to Confirm to avoid changes to a domain during migration (esp. between the Begin and Perform phases). This change, however, brings a small issue when the Prepare step fails during normal (not p2p or tunneled) migration when an old client talks to a new libvirtd. The old client doesn't call Confirm (with cancelled == 1) to end the migration job. I'm open to suggestions on how best to solve this. My idea is to use flags for Begin3 and Perform3 that would enable this extra protection by moving the start of the migration job from the Perform to the Begin phase while maintaining the old behavior if the flags are not set. Note that virDomainAbortJob will be able to cancel migration in any phase (i.e., not only in the Perform phase).
What do you mean by 'old client' in this context? A v3 migration client from the 0.9.3 release, or a v2 migration client from a < 0.9.3 release?
A v3 migration client from 0.9.3. V2 migration is OK in this respect since it has no Begin API (but that, on the other hand, means that the domain can change between DumpXML and Perform). Jirka

--- src/qemu/qemu_process.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++- 1 files changed, 109 insertions(+), 1 deletions(-) diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 49625b5..ddb29c1 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -38,6 +38,7 @@ #include "qemu_hostdev.h" #include "qemu_hotplug.h" #include "qemu_bridge_filter.h" +#include "qemu_migration.h" #if HAVE_NUMACTL # define NUMA_VERSION1_COMPATIBILITY 1 @@ -2224,6 +2225,111 @@ qemuProcessUpdateState(struct qemud_driver *driver, virDomainObjPtr vm) } static int +qemuProcessRecoverMigration(struct qemud_driver *driver, + virDomainObjPtr vm, + virConnectPtr conn, + enum qemuDomainAsyncJob job, + enum qemuMigrationJobPhase phase, + virDomainState state, + int reason) +{ + if (job == QEMU_ASYNC_JOB_MIGRATION_IN) { + switch (phase) { + case QEMU_MIGRATION_PHASE_NONE: + case QEMU_MIGRATION_PHASE_PERFORM2: + case QEMU_MIGRATION_PHASE_BEGIN3: + case QEMU_MIGRATION_PHASE_PERFORM3: + case QEMU_MIGRATION_PHASE_PERFORM3_DONE: + case QEMU_MIGRATION_PHASE_CONFIRM3_CANCELLED: + case QEMU_MIGRATION_PHASE_CONFIRM3: + case QEMU_MIGRATION_PHASE_LAST: + break; + + case QEMU_MIGRATION_PHASE_PREPARE: + VIR_DEBUG("Killing unfinished incoming migration for domain %s", + vm->def->name); + return -1; + + case QEMU_MIGRATION_PHASE_FINISH2: + /* source domain is already killed so let's just resume the domain + * and hope we are all set */ + VIR_DEBUG("Incoming migration finished, resuming domain %s", + vm->def->name); + if (qemuProcessStartCPUs(driver, vm, conn, + VIR_DOMAIN_RUNNING_UNPAUSED) < 0) { + VIR_WARN("Could not resume domain %s", vm->def->name); + } + break; + + case QEMU_MIGRATION_PHASE_FINISH3: + /* migration finished, we started resuming the domain but didn't + * confirm success or failure yet; killing it seems safest */ + VIR_DEBUG("Killing migrated domain %s", vm->def->name); + return -1; + } + } else if (job == QEMU_ASYNC_JOB_MIGRATION_OUT) { + switch (phase) { + case QEMU_MIGRATION_PHASE_NONE: + case QEMU_MIGRATION_PHASE_PREPARE: + case QEMU_MIGRATION_PHASE_FINISH2: + case QEMU_MIGRATION_PHASE_FINISH3: + case QEMU_MIGRATION_PHASE_LAST: + break; + + case QEMU_MIGRATION_PHASE_BEGIN3: + /* nothing happen so far, just forget we were about to migrate the + * domain */ + break; + + case QEMU_MIGRATION_PHASE_PERFORM2: + case QEMU_MIGRATION_PHASE_PERFORM3: + /* migration is still in progress, let's cancel it and resume the + * domain */ + VIR_DEBUG("Canceling unfinished outgoing migration of domain %s", + vm->def->name); + /* TODO cancel possibly running migrate operation */ + /* resume the domain but only if it was paused as a result of + * migration */ + if (state == VIR_DOMAIN_PAUSED && + (reason == VIR_DOMAIN_PAUSED_MIGRATION || + reason == VIR_DOMAIN_PAUSED_UNKNOWN)) { + if (qemuProcessStartCPUs(driver, vm, conn, + VIR_DOMAIN_RUNNING_UNPAUSED) < 0) { + VIR_WARN("Could not resume domain %s", vm->def->name); + } + } + break; + + case QEMU_MIGRATION_PHASE_PERFORM3_DONE: + /* migration finished but we didn't have a chance to get the result + * of Finish3 step; third party needs to check what to do next + */ + break; + + case QEMU_MIGRATION_PHASE_CONFIRM3_CANCELLED: + /* Finish3 failed, we need to resume the domain */ + VIR_DEBUG("Resuming domain %s after failed migration", + vm->def->name); + if (state == VIR_DOMAIN_PAUSED && + (reason == VIR_DOMAIN_PAUSED_MIGRATION || + reason == VIR_DOMAIN_PAUSED_UNKNOWN)) { + if (qemuProcessStartCPUs(driver, vm, conn, + VIR_DOMAIN_RUNNING_UNPAUSED) < 0) { + 
VIR_WARN("Could not resume domain %s", vm->def->name); + } + } + break; + + case QEMU_MIGRATION_PHASE_CONFIRM3: + /* migration completed, we need to kill the domain here */ + return -1; + } + } + + return 0; +} + +static int qemuProcessRecoverJob(struct qemud_driver *driver, virDomainObjPtr vm, virConnectPtr conn, @@ -2237,7 +2343,9 @@ qemuProcessRecoverJob(struct qemud_driver *driver, switch (job->asyncJob) { case QEMU_ASYNC_JOB_MIGRATION_OUT: case QEMU_ASYNC_JOB_MIGRATION_IN: - /* we don't know what to do yet */ + if (qemuProcessRecoverMigration(driver, vm, conn, job->asyncJob, + job->phase, state, reason) < 0) + return -1; break; case QEMU_ASYNC_JOB_SAVE: -- 1.7.6

When monitor is entered with qemuDomainObjEnterMonitorWithDriver, the correct method for leaving and unlocking the monitor is qemuDomainObjExitMonitorWithDriver. --- src/qemu/qemu_hotplug.c | 12 ++++++------ 1 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c index 06e2c84..c86f128 100644 --- a/src/qemu/qemu_hotplug.c +++ b/src/qemu/qemu_hotplug.c @@ -1240,14 +1240,14 @@ int qemuDomainDetachPciDiskDevice(struct qemud_driver *driver, ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); if (qemuCapsGet(priv->qemuCaps, QEMU_CAPS_DEVICE)) { if (qemuMonitorDelDevice(priv->mon, detach->info.alias) < 0) { - qemuDomainObjExitMonitor(driver, vm); + qemuDomainObjExitMonitorWithDriver(driver, vm); qemuAuditDisk(vm, detach, NULL, "detach", false); goto cleanup; } } else { if (qemuMonitorRemovePCIDevice(priv->mon, &detach->info.addr.pci) < 0) { - qemuDomainObjExitMonitor(driver, vm); + qemuDomainObjExitMonitorWithDriver(driver, vm); qemuAuditDisk(vm, detach, NULL, "detach", false); goto cleanup; } @@ -1335,7 +1335,7 @@ int qemuDomainDetachDiskDevice(struct qemud_driver *driver, ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); if (qemuMonitorDelDevice(priv->mon, detach->info.alias) < 0) { - qemuDomainObjExitMonitor(driver, vm); + qemuDomainObjExitMonitorWithDriver(driver, vm); qemuAuditDisk(vm, detach, NULL, "detach", false); goto cleanup; } @@ -1474,13 +1474,13 @@ int qemuDomainDetachPciControllerDevice(struct qemud_driver *driver, ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); if (qemuCapsGet(priv->qemuCaps, QEMU_CAPS_DEVICE)) { if (qemuMonitorDelDevice(priv->mon, detach->info.alias)) { - qemuDomainObjExitMonitor(driver, vm); + qemuDomainObjExitMonitorWithDriver(driver, vm); goto cleanup; } } else { if (qemuMonitorRemovePCIDevice(priv->mon, &detach->info.addr.pci) < 0) { - qemuDomainObjExitMonitor(driver, vm); + qemuDomainObjExitMonitorWithDriver(driver, vm); goto cleanup; } } @@ -1569,7 +1569,7 @@ int qemuDomainDetachNetDevice(struct qemud_driver *driver, ignore_value(qemuDomainObjEnterMonitorWithDriver(driver, vm)); if (qemuCapsGet(priv->qemuCaps, QEMU_CAPS_DEVICE)) { if (qemuMonitorDelDevice(priv->mon, detach->info.alias) < 0) { - qemuDomainObjExitMonitor(driver, vm); + qemuDomainObjExitMonitorWithDriver(driver, vm); qemuAuditNet(vm, detach, NULL, "detach", false); goto cleanup; } -- 1.7.6

On 07/07/2011 05:34 PM, Jiri Denemark wrote:
When monitor is entered with qemuDomainObjEnterMonitorWithDriver, the correct method for leaving and unlocking the monitor is qemuDomainObjExitMonitorWithDriver. --- src/qemu/qemu_hotplug.c | 12 ++++++------ 1 files changed, 6 insertions(+), 6 deletions(-)
ACK - and this one is probably worth applying sooner rather than later. -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org

Like other query commands, this can now be called directly during migration. --- src/qemu/qemu_domain.h | 4 ---- src/qemu/qemu_driver.c | 42 ++++++++++++------------------------------ src/qemu/qemu_migration.c | 14 -------------- 3 files changed, 12 insertions(+), 48 deletions(-) diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index 387c64c..0958bf2 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -71,7 +71,6 @@ enum qemuDomainJobSignals { QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME = 1 << 2, /* Request migration downtime change */ QEMU_JOB_SIGNAL_MIGRATE_SPEED = 1 << 3, /* Request migration speed change */ QEMU_JOB_SIGNAL_BLKSTAT = 1 << 4, /* Request blkstat during migration */ - QEMU_JOB_SIGNAL_BLKINFO = 1 << 5, /* Request blkinfo during migration */ }; struct qemuDomainJobSignalsData { @@ -80,9 +79,6 @@ struct qemuDomainJobSignalsData { char *statDevName; /* Device name used by blkstat calls */ virDomainBlockStatsPtr blockStat; /* Block statistics for QEMU_JOB_SIGNAL_BLKSTAT */ int *statRetCode; /* Return code for the blkstat calls */ - char *infoDevName; /* Device name used by blkinfo calls */ - virDomainBlockInfoPtr blockInfo; /* Block information for QEMU_JOB_SIGNAL_BLKINFO */ - int *infoRetCode; /* Return code for the blkinfo calls */ }; struct qemuDomainJobObj { diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index e317c5b..76dfbd8 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -6455,39 +6455,21 @@ static int qemuDomainGetBlockInfo(virDomainPtr dom, virDomainObjIsActive(vm)) { qemuDomainObjPrivatePtr priv = vm->privateData; - if ((priv->job.asyncJob == QEMU_ASYNC_JOB_MIGRATION_OUT) - || (priv->job.asyncJob == QEMU_ASYNC_JOB_SAVE)) { - virDomainObjRef(vm); - while (priv->job.signals & QEMU_JOB_SIGNAL_BLKINFO) - ignore_value(virCondWait(&priv->job.signalCond, &vm->lock)); - - priv->job.signalsData.infoDevName = disk->info.alias; - priv->job.signalsData.blockInfo = info; - priv->job.signalsData.infoRetCode = &ret; - priv->job.signals |= QEMU_JOB_SIGNAL_BLKINFO; - - while (priv->job.signals & QEMU_JOB_SIGNAL_BLKINFO) - ignore_value(virCondWait(&priv->job.signalCond, &vm->lock)); + if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_QUERY) < 0) + goto cleanup; - if (virDomainObjUnref(vm) == 0) - vm = NULL; + if (virDomainObjIsActive(vm)) { + ignore_value(qemuDomainObjEnterMonitor(driver, vm)); + ret = qemuMonitorGetBlockExtent(priv->mon, + disk->info.alias, + &info->allocation); + qemuDomainObjExitMonitor(driver, vm); } else { - if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_QUERY) < 0) - goto cleanup; - - if (virDomainObjIsActive(vm)) { - ignore_value(qemuDomainObjEnterMonitor(driver, vm)); - ret = qemuMonitorGetBlockExtent(priv->mon, - disk->info.alias, - &info->allocation); - qemuDomainObjExitMonitor(driver, vm); - } else { - ret = 0; - } - - if (qemuDomainObjEndJob(driver, vm) == 0) - vm = NULL; + ret = 0; } + + if (qemuDomainObjEndJob(driver, vm) == 0) + vm = NULL; } else { ret = 0; } diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index bcca454..8a14d2e 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -819,20 +819,6 @@ qemuMigrationProcessJobSignals(struct qemud_driver *driver, if (ret < 0) VIR_WARN("Unable to get block statistics"); - } else if (priv->job.signals & QEMU_JOB_SIGNAL_BLKINFO) { - ret = qemuDomainObjEnterMonitorWithDriver(driver, vm); - if (ret == 0) { - ret = qemuMonitorGetBlockExtent(priv->mon, - priv->job.signalsData.infoDevName, - 
&priv->job.signalsData.blockInfo->allocation); - qemuDomainObjExitMonitorWithDriver(driver, vm); - } - - *priv->job.signalsData.infoRetCode = ret; - priv->job.signals ^= QEMU_JOB_SIGNAL_BLKINFO; - - if (ret < 0) - VIR_WARN("Unable to get block information"); } else { ret = 0; } -- 1.7.6

Like other query commands, this can now be called directly during migration. --- src/qemu/qemu_domain.h | 4 --- src/qemu/qemu_driver.c | 54 +++++++++++++++------------------------------ src/qemu/qemu_migration.c | 18 --------------- 3 files changed, 18 insertions(+), 58 deletions(-) diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index 0958bf2..c2ad456 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -70,15 +70,11 @@ enum qemuDomainJobSignals { QEMU_JOB_SIGNAL_SUSPEND = 1 << 1, /* Request VM suspend to finish live migration offline */ QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME = 1 << 2, /* Request migration downtime change */ QEMU_JOB_SIGNAL_MIGRATE_SPEED = 1 << 3, /* Request migration speed change */ - QEMU_JOB_SIGNAL_BLKSTAT = 1 << 4, /* Request blkstat during migration */ }; struct qemuDomainJobSignalsData { unsigned long long migrateDowntime; /* Data for QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME */ unsigned long migrateBandwidth; /* Data for QEMU_JOB_SIGNAL_MIGRATE_SPEED */ - char *statDevName; /* Device name used by blkstat calls */ - virDomainBlockStatsPtr blockStat; /* Block statistics for QEMU_JOB_SIGNAL_BLKSTAT */ - int *statRetCode; /* Return code for the blkstat calls */ }; struct qemuDomainJobObj { diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 76dfbd8..cd64f85 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -6008,46 +6008,28 @@ qemudDomainBlockStats (virDomainPtr dom, } priv = vm->privateData; - if ((priv->job.asyncJob == QEMU_ASYNC_JOB_MIGRATION_OUT) - || (priv->job.asyncJob == QEMU_ASYNC_JOB_SAVE)) { - virDomainObjRef(vm); - while (priv->job.signals & QEMU_JOB_SIGNAL_BLKSTAT) - ignore_value(virCondWait(&priv->job.signalCond, &vm->lock)); - - priv->job.signalsData.statDevName = disk->info.alias; - priv->job.signalsData.blockStat = stats; - priv->job.signalsData.statRetCode = &ret; - priv->job.signals |= QEMU_JOB_SIGNAL_BLKSTAT; - - while (priv->job.signals & QEMU_JOB_SIGNAL_BLKSTAT) - ignore_value(virCondWait(&priv->job.signalCond, &vm->lock)); - - if (virDomainObjUnref(vm) == 0) - vm = NULL; - } else { - if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_QUERY) < 0) - goto cleanup; + if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_QUERY) < 0) + goto cleanup; - if (!virDomainObjIsActive(vm)) { - qemuReportError(VIR_ERR_OPERATION_INVALID, - "%s", _("domain is not running")); - goto endjob; - } + if (!virDomainObjIsActive(vm)) { + qemuReportError(VIR_ERR_OPERATION_INVALID, + "%s", _("domain is not running")); + goto endjob; + } - ignore_value(qemuDomainObjEnterMonitor(driver, vm)); - ret = qemuMonitorGetBlockStatsInfo(priv->mon, - disk->info.alias, - &stats->rd_req, - &stats->rd_bytes, - &stats->wr_req, - &stats->wr_bytes, - &stats->errs); - qemuDomainObjExitMonitor(driver, vm); + ignore_value(qemuDomainObjEnterMonitor(driver, vm)); + ret = qemuMonitorGetBlockStatsInfo(priv->mon, + disk->info.alias, + &stats->rd_req, + &stats->rd_bytes, + &stats->wr_req, + &stats->wr_bytes, + &stats->errs); + qemuDomainObjExitMonitor(driver, vm); endjob: - if (qemuDomainObjEndJob(driver, vm) == 0) - vm = NULL; - } + if (qemuDomainObjEndJob(driver, vm) == 0) + vm = NULL; cleanup: if (vm) diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 8a14d2e..a7566f4 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -801,24 +801,6 @@ qemuMigrationProcessJobSignals(struct qemud_driver *driver, } if (ret < 0) VIR_WARN("Unable to set migration speed"); - } else if (priv->job.signals & QEMU_JOB_SIGNAL_BLKSTAT) { - ret = 
qemuDomainObjEnterMonitorWithDriver(driver, vm); - if (ret == 0) { - ret = qemuMonitorGetBlockStatsInfo(priv->mon, - priv->job.signalsData.statDevName, - &priv->job.signalsData.blockStat->rd_req, - &priv->job.signalsData.blockStat->rd_bytes, - &priv->job.signalsData.blockStat->wr_req, - &priv->job.signalsData.blockStat->wr_bytes, - &priv->job.signalsData.blockStat->errs); - qemuDomainObjExitMonitorWithDriver(driver, vm); - } - - *priv->job.signalsData.statRetCode = ret; - priv->job.signals ^= QEMU_JOB_SIGNAL_BLKSTAT; - - if (ret < 0) - VIR_WARN("Unable to get block statistics"); } else { ret = 0; } -- 1.7.6
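With the signal-based special cases removed, a monitoring client can issue these query APIs directly while a migration or save job is running, since they now take an ordinary QEMU_JOB_QUERY job. A minimal sketch follows, assuming a domain named "guest" with a disk target "vda" (both placeholders):

#include <stdio.h>
#include <libvirt/libvirt.h>

int main(void)
{
    virConnectPtr conn = virConnectOpen("qemu:///system");
    virDomainPtr dom;
    virDomainBlockInfo info;
    virDomainBlockStatsStruct stats;

    if (!conn)
        return 1;
    if (!(dom = virDomainLookupByName(conn, "guest"))) {
        virConnectClose(conn);
        return 1;
    }

    /* Both calls work even while the domain is in the middle of a
     * migration or save job. */
    if (virDomainGetBlockInfo(dom, "vda", &info, 0) == 0)
        printf("allocation: %llu of %llu bytes\n",
               info.allocation, info.capacity);

    if (virDomainBlockStats(dom, "vda", &stats, sizeof(stats)) == 0)
        printf("rd_bytes: %lld wr_bytes: %lld\n",
               stats.rd_bytes, stats.wr_bytes);

    virDomainFree(dom);
    virConnectClose(conn);
    return 0;
}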

Call qemu monitor command directly within a special job that is only allowed during outgoing migration. --- src/qemu/qemu_domain.c | 1 + src/qemu/qemu_domain.h | 3 +-- src/qemu/qemu_driver.c | 23 +++++++++++++++-------- src/qemu/qemu_migration.c | 21 +++++---------------- src/qemu/qemu_process.c | 1 + 5 files changed, 23 insertions(+), 26 deletions(-) diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index 2e4228a..c47ab60 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -52,6 +52,7 @@ VIR_ENUM_IMPL(qemuDomainJob, QEMU_JOB_LAST, "destroy", "suspend", "modify", + "migration operation", "none", /* async job is never stored in job.active */ "async nested", ); diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index c2ad456..7e6b522 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -43,6 +43,7 @@ enum qemuDomainJob { QEMU_JOB_DESTROY, /* Destroys the domain (cannot be masked out) */ QEMU_JOB_SUSPEND, /* Suspends (stops vCPUs) the domain */ QEMU_JOB_MODIFY, /* May change state */ + QEMU_JOB_MIGRATION_OP, /* Operation influencing outgoing migration */ /* The following two items must always be the last items before JOB_LAST */ QEMU_JOB_ASYNC, /* Asynchronous job */ @@ -69,12 +70,10 @@ enum qemuDomainJobSignals { QEMU_JOB_SIGNAL_CANCEL = 1 << 0, /* Request job cancellation */ QEMU_JOB_SIGNAL_SUSPEND = 1 << 1, /* Request VM suspend to finish live migration offline */ QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME = 1 << 2, /* Request migration downtime change */ - QEMU_JOB_SIGNAL_MIGRATE_SPEED = 1 << 3, /* Request migration speed change */ }; struct qemuDomainJobSignalsData { unsigned long long migrateDowntime; /* Data for QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME */ - unsigned long migrateBandwidth; /* Data for QEMU_JOB_SIGNAL_MIGRATE_SPEED */ }; struct qemuDomainJobObj { diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index cd64f85..1235787 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -7432,19 +7432,23 @@ qemuDomainMigrateSetMaxSpeed(virDomainPtr dom, qemuDriverLock(driver); vm = virDomainFindByUUID(&driver->domains, dom->uuid); + qemuDriverUnlock(driver); if (!vm) { char uuidstr[VIR_UUID_STRING_BUFLEN]; virUUIDFormat(dom->uuid, uuidstr); qemuReportError(VIR_ERR_NO_DOMAIN, _("no domain with matching uuid '%s'"), uuidstr); - goto cleanup; + return -1; } + if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_MIGRATION_OP) < 0) + goto cleanup; + if (!virDomainObjIsActive(vm)) { qemuReportError(VIR_ERR_OPERATION_INVALID, "%s", _("domain is not running")); - goto cleanup; + goto endjob; } priv = vm->privateData; @@ -7452,18 +7456,21 @@ qemuDomainMigrateSetMaxSpeed(virDomainPtr dom, if (priv->job.asyncJob != QEMU_ASYNC_JOB_MIGRATION_OUT) { qemuReportError(VIR_ERR_OPERATION_INVALID, "%s", _("domain is not being migrated")); - goto cleanup; + goto endjob; } - VIR_DEBUG("Requesting migration speed change to %luMbs", bandwidth); - priv->job.signalsData.migrateBandwidth = bandwidth; - priv->job.signals |= QEMU_JOB_SIGNAL_MIGRATE_SPEED; - ret = 0; + VIR_DEBUG("Setting migration bandwidth to %luMbs", bandwidth); + ignore_value(qemuDomainObjEnterMonitor(driver, vm)); + ret = qemuMonitorSetMigrationSpeed(priv->mon, bandwidth); + qemuDomainObjExitMonitor(driver, vm); + +endjob: + if (qemuDomainObjEndJob(driver, vm) == 0) + vm = NULL; cleanup: if (vm) virDomainObjUnlock(vm); - qemuDriverUnlock(driver); return ret; } diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index a7566f4..d7ad97f 100644 --- a/src/qemu/qemu_migration.c +++ 
b/src/qemu/qemu_migration.c @@ -788,19 +788,6 @@ qemuMigrationProcessJobSignals(struct qemud_driver *driver, } if (ret < 0) VIR_WARN("Unable to set migration downtime"); - } else if (priv->job.signals & QEMU_JOB_SIGNAL_MIGRATE_SPEED) { - unsigned long bandwidth = priv->job.signalsData.migrateBandwidth; - - priv->job.signals ^= QEMU_JOB_SIGNAL_MIGRATE_SPEED; - priv->job.signalsData.migrateBandwidth = 0; - VIR_DEBUG("Setting migration bandwidth to %luMbs", bandwidth); - ret = qemuDomainObjEnterMonitorWithDriver(driver, vm); - if (ret == 0) { - ret = qemuMonitorSetMigrationSpeed(priv->mon, bandwidth); - qemuDomainObjExitMonitorWithDriver(driver, vm); - } - if (ret < 0) - VIR_WARN("Unable to set migration speed"); } else { ret = 0; } @@ -2823,10 +2810,12 @@ qemuMigrationJobStart(struct qemud_driver *driver, if (qemuDomainObjBeginAsyncJobWithDriver(driver, vm, job) < 0) return -1; - if (job == QEMU_ASYNC_JOB_MIGRATION_IN) + if (job == QEMU_ASYNC_JOB_MIGRATION_IN) { qemuDomainObjSetAsyncJobMask(vm, QEMU_JOB_NONE); - else - qemuDomainObjSetAsyncJobMask(vm, DEFAULT_JOB_MASK); + } else { + qemuDomainObjSetAsyncJobMask(vm, DEFAULT_JOB_MASK | + JOB_MASK(QEMU_JOB_MIGRATION_OP)); + } priv->job.info.type = VIR_DOMAIN_JOB_UNBOUNDED; diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index ddb29c1..9126c71 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -2394,6 +2394,7 @@ qemuProcessRecoverJob(struct qemud_driver *driver, */ break; + case QEMU_JOB_MIGRATION_OP: case QEMU_JOB_ASYNC: case QEMU_JOB_ASYNC_NESTED: /* async job was already handled above */ -- 1.7.6

Call qemu monitor command directly within a special job that is only allowed during outgoing migration. --- src/qemu/qemu_domain.c | 1 - src/qemu/qemu_domain.h | 6 ------ src/qemu/qemu_driver.c | 23 +++++++++++++++-------- src/qemu/qemu_migration.c | 13 ------------- 4 files changed, 15 insertions(+), 28 deletions(-) diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index c47ab60..deaf9fd 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -186,7 +186,6 @@ qemuDomainObjResetAsyncJob(qemuDomainObjPrivatePtr priv) job->start = 0; memset(&job->info, 0, sizeof(job->info)); job->signals = 0; - memset(&job->signalsData, 0, sizeof(job->signalsData)); } void diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index 7e6b522..71cefd9 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -69,11 +69,6 @@ enum qemuDomainAsyncJob { enum qemuDomainJobSignals { QEMU_JOB_SIGNAL_CANCEL = 1 << 0, /* Request job cancellation */ QEMU_JOB_SIGNAL_SUSPEND = 1 << 1, /* Request VM suspend to finish live migration offline */ - QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME = 1 << 2, /* Request migration downtime change */ -}; - -struct qemuDomainJobSignalsData { - unsigned long long migrateDowntime; /* Data for QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME */ }; struct qemuDomainJobObj { @@ -89,7 +84,6 @@ struct qemuDomainJobObj { virCond signalCond; /* Use to coordinate the safe queries during migration */ unsigned int signals; /* Signals for running job */ - struct qemuDomainJobSignalsData signalsData; /* Signal specific data */ }; typedef struct _qemuDomainPCIAddressSet qemuDomainPCIAddressSet; diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 1235787..d62e99d 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -7383,19 +7383,23 @@ qemuDomainMigrateSetMaxDowntime(virDomainPtr dom, qemuDriverLock(driver); vm = virDomainFindByUUID(&driver->domains, dom->uuid); + qemuDriverUnlock(driver); if (!vm) { char uuidstr[VIR_UUID_STRING_BUFLEN]; virUUIDFormat(dom->uuid, uuidstr); qemuReportError(VIR_ERR_NO_DOMAIN, _("no domain with matching uuid '%s'"), uuidstr); - goto cleanup; + return -1; } + if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_MIGRATION_OP) < 0) + goto cleanup; + if (!virDomainObjIsActive(vm)) { qemuReportError(VIR_ERR_OPERATION_INVALID, "%s", _("domain is not running")); - goto cleanup; + goto endjob; } priv = vm->privateData; @@ -7403,18 +7407,21 @@ qemuDomainMigrateSetMaxDowntime(virDomainPtr dom, if (priv->job.asyncJob != QEMU_ASYNC_JOB_MIGRATION_OUT) { qemuReportError(VIR_ERR_OPERATION_INVALID, "%s", _("domain is not being migrated")); - goto cleanup; + goto endjob; } - VIR_DEBUG("Requesting migration downtime change to %llums", downtime); - priv->job.signalsData.migrateDowntime = downtime; - priv->job.signals |= QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME; - ret = 0; + VIR_DEBUG("Setting migration downtime to %llums", downtime); + ignore_value(qemuDomainObjEnterMonitor(driver, vm)); + ret = qemuMonitorSetMigrationDowntime(priv->mon, downtime); + qemuDomainObjExitMonitor(driver, vm); + +endjob: + if (qemuDomainObjEndJob(driver, vm) == 0) + vm = NULL; cleanup: if (vm) virDomainObjUnlock(vm); - qemuDriverUnlock(driver); return ret; } diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index d7ad97f..ee2c5a0 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -775,19 +775,6 @@ qemuMigrationProcessJobSignals(struct qemud_driver *driver, VIR_DEBUG("Pausing domain for non-live migration"); if (qemuMigrationSetOffline(driver, vm) < 
0) VIR_WARN("Unable to pause domain"); - } else if (priv->job.signals & QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME) { - unsigned long long ms = priv->job.signalsData.migrateDowntime; - - priv->job.signals ^= QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME; - priv->job.signalsData.migrateDowntime = 0; - VIR_DEBUG("Setting migration downtime to %llums", ms); - ret = qemuDomainObjEnterMonitorWithDriver(driver, vm); - if (ret == 0) { - ret = qemuMonitorSetMigrationDowntime(priv->mon, ms); - qemuDomainObjExitMonitorWithDriver(driver, vm); - } - if (ret < 0) - VIR_WARN("Unable to set migration downtime"); } else { ret = 0; } -- 1.7.6
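Since downtime and bandwidth tuning now run as regular QEMU_JOB_MIGRATION_OP jobs, a second connection can adjust an ongoing outgoing migration directly. A short sketch, with the domain name and values chosen purely for illustration:

#include <stdio.h>
#include <libvirt/libvirt.h>

int main(void)
{
    virConnectPtr conn = virConnectOpen("qemu:///system");
    virDomainPtr dom;

    if (!conn)
        return 1;
    if (!(dom = virDomainLookupByName(conn, "guest"))) {
        virConnectClose(conn);
        return 1;
    }

    /* Both calls are only valid while an outgoing migration job is active;
     * otherwise the driver reports "domain is not being migrated". */
    if (virDomainMigrateSetMaxDowntime(dom, 50, 0) < 0)   /* 50 ms */
        fprintf(stderr, "failed to set max downtime\n");
    if (virDomainMigrateSetMaxSpeed(dom, 100, 0) < 0)     /* 100 MiB/s */
        fprintf(stderr, "failed to set max speed\n");

    virDomainFree(dom);
    virConnectClose(conn);
    return 0;
}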

--- src/qemu/qemu_domain.h | 1 - src/qemu/qemu_driver.c | 46 ++++++++++++++++++++++---------------------- src/qemu/qemu_migration.c | 6 +---- 3 files changed, 24 insertions(+), 29 deletions(-) diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index 71cefd9..3da7931 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -68,7 +68,6 @@ enum qemuDomainAsyncJob { enum qemuDomainJobSignals { QEMU_JOB_SIGNAL_CANCEL = 1 << 0, /* Request job cancellation */ - QEMU_JOB_SIGNAL_SUSPEND = 1 << 1, /* Request VM suspend to finish live migration offline */ }; struct qemuDomainJobObj { diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index d62e99d..8b186f7 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -1319,6 +1319,8 @@ static int qemudDomainSuspend(virDomainPtr dom) { int ret = -1; virDomainEventPtr event = NULL; qemuDomainObjPrivatePtr priv; + virDomainPausedReason reason; + int eventDetail; qemuDriverLock(driver); vm = virDomainFindByUUID(&driver->domains, dom->uuid); @@ -1345,34 +1347,32 @@ static int qemudDomainSuspend(virDomainPtr dom) { priv = vm->privateData; if (priv->job.asyncJob == QEMU_ASYNC_JOB_MIGRATION_OUT) { - if (virDomainObjGetState(vm, NULL) != VIR_DOMAIN_PAUSED) { - VIR_DEBUG("Requesting domain pause on %s", - vm->def->name); - priv->job.signals |= QEMU_JOB_SIGNAL_SUSPEND; - } - ret = 0; - goto cleanup; + reason = VIR_DOMAIN_PAUSED_MIGRATION; + eventDetail = VIR_DOMAIN_EVENT_SUSPENDED_MIGRATED; } else { - if (qemuDomainObjBeginJobWithDriver(driver, vm, QEMU_JOB_SUSPEND) < 0) - goto cleanup; + reason = VIR_DOMAIN_PAUSED_USER; + eventDetail = VIR_DOMAIN_EVENT_SUSPENDED_PAUSED; + } - if (!virDomainObjIsActive(vm)) { - qemuReportError(VIR_ERR_OPERATION_INVALID, - "%s", _("domain is not running")); + if (qemuDomainObjBeginJobWithDriver(driver, vm, QEMU_JOB_SUSPEND) < 0) + goto cleanup; + + if (!virDomainObjIsActive(vm)) { + qemuReportError(VIR_ERR_OPERATION_INVALID, + "%s", _("domain is not running")); + goto endjob; + } + if (virDomainObjGetState(vm, NULL) != VIR_DOMAIN_PAUSED) { + if (qemuProcessStopCPUs(driver, vm, reason) < 0) { goto endjob; } - if (virDomainObjGetState(vm, NULL) != VIR_DOMAIN_PAUSED) { - if (qemuProcessStopCPUs(driver, vm, VIR_DOMAIN_PAUSED_USER) < 0) { - goto endjob; - } - event = virDomainEventNewFromObj(vm, - VIR_DOMAIN_EVENT_SUSPENDED, - VIR_DOMAIN_EVENT_SUSPENDED_PAUSED); - } - if (virDomainSaveStatus(driver->caps, driver->stateDir, vm) < 0) - goto endjob; - ret = 0; + event = virDomainEventNewFromObj(vm, + VIR_DOMAIN_EVENT_SUSPENDED, + eventDetail); } + if (virDomainSaveStatus(driver->caps, driver->stateDir, vm) < 0) + goto endjob; + ret = 0; endjob: if (qemuDomainObjEndJob(driver, vm) == 0) diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index ee2c5a0..de00811 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -770,11 +770,6 @@ qemuMigrationProcessJobSignals(struct qemud_driver *driver, if (ret < 0) { VIR_WARN("Unable to cancel job"); } - } else if (priv->job.signals & QEMU_JOB_SIGNAL_SUSPEND) { - priv->job.signals ^= QEMU_JOB_SIGNAL_SUSPEND; - VIR_DEBUG("Pausing domain for non-live migration"); - if (qemuMigrationSetOffline(driver, vm) < 0) - VIR_WARN("Unable to pause domain"); } else { ret = 0; } @@ -2801,6 +2796,7 @@ qemuMigrationJobStart(struct qemud_driver *driver, qemuDomainObjSetAsyncJobMask(vm, QEMU_JOB_NONE); } else { qemuDomainObjSetAsyncJobMask(vm, DEFAULT_JOB_MASK | + JOB_MASK(QEMU_JOB_SUSPEND) | JOB_MASK(QEMU_JOB_MIGRATION_OP)); } -- 1.7.6

This doesn't abort migration job in any phase, yet. --- src/qemu/qemu_domain.c | 9 +------- src/qemu/qemu_domain.h | 14 ++++-------- src/qemu/qemu_driver.c | 36 ++++++++++++++++++++++----------- src/qemu/qemu_migration.c | 48 --------------------------------------------- src/qemu/qemu_process.c | 12 +++++++++- 5 files changed, 40 insertions(+), 79 deletions(-) diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index deaf9fd..7e43238 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -52,6 +52,7 @@ VIR_ENUM_IMPL(qemuDomainJob, QEMU_JOB_LAST, "destroy", "suspend", "modify", + "abort", "migration operation", "none", /* async job is never stored in job.active */ "async nested", @@ -158,12 +159,6 @@ qemuDomainObjInitJob(qemuDomainObjPrivatePtr priv) return -1; } - if (virCondInit(&priv->job.signalCond) < 0) { - ignore_value(virCondDestroy(&priv->job.cond)); - ignore_value(virCondDestroy(&priv->job.asyncCond)); - return -1; - } - return 0; } @@ -185,7 +180,6 @@ qemuDomainObjResetAsyncJob(qemuDomainObjPrivatePtr priv) job->mask = DEFAULT_JOB_MASK; job->start = 0; memset(&job->info, 0, sizeof(job->info)); - job->signals = 0; } void @@ -208,7 +202,6 @@ qemuDomainObjFreeJob(qemuDomainObjPrivatePtr priv) { ignore_value(virCondDestroy(&priv->job.cond)); ignore_value(virCondDestroy(&priv->job.asyncCond)); - ignore_value(virCondDestroy(&priv->job.signalCond)); } diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index 3da7931..234acab 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -31,8 +31,10 @@ # include "bitmap.h" #define JOB_MASK(job) (1 << (job - 1)) -#define DEFAULT_JOB_MASK \ - (JOB_MASK(QEMU_JOB_QUERY) | JOB_MASK(QEMU_JOB_DESTROY)) +#define DEFAULT_JOB_MASK \ + (JOB_MASK(QEMU_JOB_QUERY) | \ + JOB_MASK(QEMU_JOB_DESTROY) | \ + JOB_MASK(QEMU_JOB_ABORT)) /* Only 1 job is allowed at any time * A job includes *all* monitor commands, even those just querying @@ -43,6 +45,7 @@ enum qemuDomainJob { QEMU_JOB_DESTROY, /* Destroys the domain (cannot be masked out) */ QEMU_JOB_SUSPEND, /* Suspends (stops vCPUs) the domain */ QEMU_JOB_MODIFY, /* May change state */ + QEMU_JOB_ABORT, /* Abort current async job */ QEMU_JOB_MIGRATION_OP, /* Operation influencing outgoing migration */ /* The following two items must always be the last items before JOB_LAST */ @@ -66,10 +69,6 @@ enum qemuDomainAsyncJob { QEMU_ASYNC_JOB_LAST }; -enum qemuDomainJobSignals { - QEMU_JOB_SIGNAL_CANCEL = 1 << 0, /* Request job cancellation */ -}; - struct qemuDomainJobObj { virCond cond; /* Use to coordinate jobs */ enum qemuDomainJob active; /* Currently running job */ @@ -80,9 +79,6 @@ struct qemuDomainJobObj { unsigned long long mask; /* Jobs allowed during async job */ unsigned long long start; /* When the async job started */ virDomainJobInfo info; /* Async job progress data */ - - virCond signalCond; /* Use to coordinate the safe queries during migration */ - unsigned int signals; /* Signals for running job */ }; typedef struct _qemuDomainPCIAddressSet qemuDomainPCIAddressSet; diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 8b186f7..6fd6019 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -7343,24 +7343,36 @@ static int qemuDomainAbortJob(virDomainPtr dom) { goto cleanup; } - priv = vm->privateData; + if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_ABORT) < 0) + goto cleanup; if (virDomainObjIsActive(vm)) { - if (priv->job.asyncJob) { - VIR_DEBUG("Requesting cancellation of job on vm %s", vm->def->name); - priv->job.signals |= 
QEMU_JOB_SIGNAL_CANCEL; - } else { - qemuReportError(VIR_ERR_OPERATION_INVALID, - "%s", _("no job is active on the domain")); - goto cleanup; - } - } else { qemuReportError(VIR_ERR_OPERATION_INVALID, "%s", _("domain is not running")); - goto cleanup; + goto endjob; } - ret = 0; + priv = vm->privateData; + + if (!priv->job.asyncJob) { + qemuReportError(VIR_ERR_OPERATION_INVALID, + "%s", _("no job is active on the domain")); + goto endjob; + } else if (priv->job.asyncJob == QEMU_ASYNC_JOB_MIGRATION_IN) { + qemuReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("cannot abort incoming migration;" + " use virDomainDestroy instead")); + goto endjob; + } + + VIR_DEBUG("Cancelling job at client request"); + ignore_value(qemuDomainObjEnterMonitor(driver, vm)); + ret = qemuMonitorMigrateCancel(priv->mon); + qemuDomainObjExitMonitor(driver, vm); + +endjob: + if (qemuDomainObjEndJob(driver, vm) == 0) + vm = NULL; cleanup: if (vm) diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index de00811..c1c91d5 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -743,42 +743,6 @@ qemuMigrationSetOffline(struct qemud_driver *driver, static int -qemuMigrationProcessJobSignals(struct qemud_driver *driver, - virDomainObjPtr vm, - const char *job, - bool cleanup) -{ - qemuDomainObjPrivatePtr priv = vm->privateData; - int ret = -1; - - if (!virDomainObjIsActive(vm)) { - qemuReportError(VIR_ERR_INTERNAL_ERROR, _("%s: %s"), - job, _("guest unexpectedly quit")); - if (cleanup) - priv->job.signals = 0; - return -1; - } - - if (priv->job.signals & QEMU_JOB_SIGNAL_CANCEL) { - priv->job.signals ^= QEMU_JOB_SIGNAL_CANCEL; - VIR_DEBUG("Cancelling job at client request"); - ret = qemuDomainObjEnterMonitorWithDriver(driver, vm); - if (ret == 0) { - ret = qemuMonitorMigrateCancel(priv->mon); - qemuDomainObjExitMonitorWithDriver(driver, vm); - } - if (ret < 0) { - VIR_WARN("Unable to cancel job"); - } - } else { - ret = 0; - } - - return ret; -} - - -static int qemuMigrationUpdateJobStatus(struct qemud_driver *driver, virDomainObjPtr vm, const char *job) @@ -878,17 +842,10 @@ qemuMigrationWaitForCompletion(struct qemud_driver *driver, virDomainObjPtr vm) while (priv->job.info.type == VIR_DOMAIN_JOB_UNBOUNDED) { /* Poll every 50ms for progress & to allow cancellation */ struct timespec ts = { .tv_sec = 0, .tv_nsec = 50 * 1000 * 1000ull }; - while (priv->job.signals) { - if (qemuMigrationProcessJobSignals(driver, vm, job, false) < 0) - goto cleanup; - } - - virCondSignal(&priv->job.signalCond); if (qemuMigrationUpdateJobStatus(driver, vm, job) < 0) goto cleanup; - virDomainObjUnlock(vm); qemuDriverUnlock(driver); @@ -899,11 +856,6 @@ qemuMigrationWaitForCompletion(struct qemud_driver *driver, virDomainObjPtr vm) } cleanup: - while (priv->job.signals) { - qemuMigrationProcessJobSignals(driver, vm, job, true); - } - virCondBroadcast(&priv->job.signalCond); - if (priv->job.info.type == VIR_DOMAIN_JOB_COMPLETED) return 0; else diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 9126c71..962f2dc 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -2233,6 +2233,8 @@ qemuProcessRecoverMigration(struct qemud_driver *driver, virDomainState state, int reason) { + qemuDomainObjPrivatePtr priv = vm->privateData; + if (job == QEMU_ASYNC_JOB_MIGRATION_IN) { switch (phase) { case QEMU_MIGRATION_PHASE_NONE: @@ -2287,7 +2289,9 @@ qemuProcessRecoverMigration(struct qemud_driver *driver, * domain */ VIR_DEBUG("Canceling unfinished outgoing migration of domain %s", vm->def->name); 
- /* TODO cancel possibly running migrate operation */ + ignore_value(qemuDomainObjEnterMonitor(driver, vm)); + ignore_value(qemuMonitorMigrateCancel(priv->mon)); + qemuDomainObjExitMonitor(driver, vm); /* resume the domain but only if it was paused as a result of * migration */ if (state == VIR_DOMAIN_PAUSED && @@ -2335,6 +2339,7 @@ qemuProcessRecoverJob(struct qemud_driver *driver, virConnectPtr conn, const struct qemuDomainJobObj *job) { + qemuDomainObjPrivatePtr priv = vm->privateData; virDomainState state; int reason; @@ -2350,7 +2355,9 @@ qemuProcessRecoverJob(struct qemud_driver *driver, case QEMU_ASYNC_JOB_SAVE: case QEMU_ASYNC_JOB_DUMP: - /* TODO cancel possibly running migrate operation */ + ignore_value(qemuDomainObjEnterMonitor(driver, vm)); + ignore_value(qemuMonitorMigrateCancel(priv->mon)); + qemuDomainObjExitMonitor(driver, vm); /* resume the domain but only if it was paused as a result of * running save/dump operation */ if (state == VIR_DOMAIN_PAUSED && @@ -2395,6 +2402,7 @@ qemuProcessRecoverJob(struct qemud_driver *driver, break; case QEMU_JOB_MIGRATION_OP: + case QEMU_JOB_ABORT: case QEMU_JOB_ASYNC: case QEMU_JOB_ASYNC_NESTED: /* async job was already handled above */ -- 1.7.6
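Together with the previous patch, this lets a client use the ordinary APIs to force a live migration to complete offline or to abort the async job entirely. A minimal sketch, with "guest" as a placeholder domain name (a real client would do one or the other, not both):

#include <stdio.h>
#include <libvirt/libvirt.h>

int main(void)
{
    virConnectPtr conn = virConnectOpen("qemu:///system");
    virDomainPtr dom;
    virDomainJobInfo jobinfo;

    if (!conn)
        return 1;
    if (!(dom = virDomainLookupByName(conn, "guest"))) {
        virConnectClose(conn);
        return 1;
    }

    if (virDomainGetJobInfo(dom, &jobinfo) == 0 &&
        jobinfo.type == VIR_DOMAIN_JOB_UNBOUNDED) {
        /* Pausing the guest lets a live migration finish offline; this
         * now runs as a regular QEMU_JOB_SUSPEND job. */
        if (virDomainSuspend(dom) < 0)
            fprintf(stderr, "suspend failed\n");

        /* Or cancel the whole async job (outgoing migration, save or
         * dump) via the new QEMU_JOB_ABORT job. */
        if (virDomainAbortJob(dom) < 0)
            fprintf(stderr, "abort failed\n");
    }

    virDomainFree(dom);
    virConnectClose(conn);
    return 0;
}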

On 07/07/2011 05:34 PM, Jiri Denemark wrote:
This series is also available at https://gitorious.org/~jirka/libvirt/jirka-staging/commits/migration-recover...
The series does several things: - persists current job and its phase in status xml - allows safe monitor commands to be run during migration/save/dump jobs - implements recovery when libvirtd is restarted while a job is active - consolidates some code and fixes bugs I found when working in the area
Sounds cool! -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org

On Fri, Jul 08, 2011 at 01:34:05AM +0200, Jiri Denemark wrote:
This series is also available at https://gitorious.org/~jirka/libvirt/jirka-staging/commits/migration-recover...
The series does several things: - persists current job and its phase in status xml - allows safe monitor commands to be run during migration/save/dump jobs - implements recovery when libvirtd is restarted while a job is active - consolidates some code and fixes bugs I found when working in the area
The series is not perfect and still needs some corner cases to be fixed but I think it's better to send the series for review now and add small additional fixes in the next version(s) instead of waiting for it to be perfect.
What level of testing has it had so far? In particular, does it pass the giant migration test case in the TCK? And if so, what combinations of client/source/dest libvirt were tested? Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On Mon, Jul 11, 2011 at 21:33:41 +0100, Daniel P. Berrange wrote:
On Fri, Jul 08, 2011 at 01:34:05AM +0200, Jiri Denemark wrote:
This series is also available at https://gitorious.org/~jirka/libvirt/jirka-staging/commits/migration-recover...
The series does several things: - persists current job and its phase in status xml - allows safe monitor commands to be run during migration/save/dump jobs - implements recovery when libvirtd is restarted while a job is active - consolidates some code and fixes bugs I found when working in the area
The series is not perfect and still needs some corner cases to be fixed but I think it's better to send the series for review now and add small additional fixes in the next version(s) instead of waiting for it to be perfect.
What level of testing has it had so far? In particular, does it pass the giant migration test case in the TCK?
Only manual testing so far, with a special focus on failed migration and recovery/rollback. I tested successful migrations as well, but only a limited set of options for normal/p2p migration, and libvirt was exactly the same version for client/source/dest. I still need to run this through the migration test case, but I don't have a setup for that yet. Could you easily run this through the beast? That would be perfect, but if not, I'll create the setup for it and run it myself. Jirka

On Fri, Jul 08, 2011 at 01:34:05 +0200, Jiri Denemark wrote:
This series is also available at https://gitorious.org/~jirka/libvirt/jirka-staging/commits/migration-recover...
The series does several things: - persists current job and its phase in status xml - allows safe monitor commands to be run during migration/save/dump jobs - implements recovery when libvirtd is restarted while a job is active - consolidates some code and fixes bugs I found when working in the area
The series is not perfect and still needs some corner cases to be fixed but I think it's better to send the series for review now and add small additional fixes in the next version(s) instead of waiting for it to be perfect.
OK, I pushed the following patches (01-08/19 and 13/19), which were already ACKed. When doing so, I also updated the documentation in src/qemu/THREADS.txt as part of the "qemu: Allow all query commands to be run during long jobs" patch. The diff to THREADS.txt is attached.
qemu: Separate job related data into a new object
qemu: Consolidate BeginJob{,WithDriver} into a single method
qemu: Consolidate {Enter,Exit}Monitor{,WithDriver}
qemu: Allow all query commands to be run during long jobs
qemu: Save job type in domain status XML
qemu: Recover from interrupted jobs
qemu: Add support for job phase
qemu: Consolidate qemuMigrationPrepare{Direct,Tunnel}
qemu: Fix monitor unlocking in some error paths
Jirka

On 07/07/2011 05:34 PM, Jiri Denemark wrote:
This series is also available at https://gitorious.org/~jirka/libvirt/jirka-staging/commits/migration-recover...
The series does several things: - persists current job and its phase in status xml - allows safe monitor commands to be run during migration/save/dump jobs - implements recovery when libvirtd is restarted while a job is active - consolidates some code and fixes bugs I found when working in the area
git bisect is pointing to this series as the cause of a regression in 'virsh managedsave dom' triggering libvirtd core dumps if some other process is actively making queries on domain at the same time (virt-manager is a great process for fitting that bill). I'm trying to further narrow down which patch introduced the regression, and see if I can plug the race (probably a case of not checking whether the monitor still exists when getting the condition for an asynchronous job, since the whole point of virsh [managed]save is that the domain will go away when the save completes, but that it is time-consuming enough that we want to query domain state in the meantime). Program received signal SIGSEGV, Segmentation fault. [Switching to Thread 0x7ffff06d0700 (LWP 11419)] 0x00000000004b9ad8 in qemuMonitorSend (mon=0x7fffe815c060, msg=0x7ffff06cf380) at qemu/qemu_monitor.c:801 801 while (!mon->msg->finished) { (gdb) bt #0 0x00000000004b9ad8 in qemuMonitorSend (mon=0x7fffe815c060, msg=0x7ffff06cf380) at qemu/qemu_monitor.c:801 #1 0x00000000004c77ae in qemuMonitorJSONCommandWithFd (mon=0x7fffe815c060, cmd=0x7fffd8000940, scm_fd=-1, reply=0x7ffff06cf480) at qemu/qemu_monitor_json.c:225 #2 0x00000000004c78e5 in qemuMonitorJSONCommand (mon=0x7fffe815c060, cmd=0x7fffd8000940, reply=0x7ffff06cf480) at qemu/qemu_monitor_json.c:254 #3 0x00000000004cc19c in qemuMonitorJSONGetMigrationStatus ( mon=0x7fffe815c060, status=0x7ffff06cf580, transferred=0x7ffff06cf570, remaining=0x7ffff06cf568, total=0x7ffff06cf560) at qemu/qemu_monitor_json.c:1920 #4 0x00000000004bc1b3 in qemuMonitorGetMigrationStatus (mon=0x7fffe815c060, status=0x7ffff06cf580, transferred=0x7ffff06cf570, remaining=0x7ffff06cf568, total=0x7ffff06cf560) at qemu/qemu_monitor.c:1532 #5 0x00000000004b201b in qemuMigrationUpdateJobStatus (driver=0x7fffe80089f0, vm=0x7fffe8015cd0, job=0x5427b6 "domain save job") at qemu/qemu_migration.c:765 #6 0x00000000004b2383 in qemuMigrationWaitForCompletion ( driver=0x7fffe80089f0, vm=0x7fffe8015cd0) at qemu/qemu_migration.c:846 #7 0x00000000004b7806 in qemuMigrationToFile (driver=0x7fffe80089f0, vm=0x7fffe8015cd0, fd=27, offset=4096, path=0x7fffd8000990 "/var/lib/libvirt/qemu/save/fedora_12.save", compressor=0x0, is_reg=true, bypassSecurityDriver=true) at qemu/qemu_migration.c:2766 #8 0x000000000046a90d in qemuDomainSaveInternal (driver=0x7fffe80089f0, dom=0x7fffd8000ad0, vm=0x7fffe8015cd0, path=0x7fffd8000990 "/var/lib/libvirt/qemu/save/fedora_12.save", compressed=0, bypass_cache=false) at qemu/qemu_driver.c:2386 -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org

At 07/28/2011 05:41 AM, Eric Blake Write:
On 07/07/2011 05:34 PM, Jiri Denemark wrote:
This series is also available at https://gitorious.org/~jirka/libvirt/jirka-staging/commits/migration-recover...
The series does several things: - persists current job and its phase in status xml - allows safe monitor commands to be run during migration/save/dump jobs - implements recovery when libvirtd is restarted while a job is active - consolidates some code and fixes bugs I found when working in the area
git bisect is pointing to this series as the cause of a regression in 'virsh managedsave dom' triggering libvirtd core dumps if some other process is actively making queries on domain at the same time (virt-manager is a great process for fitting that bill). I'm trying to further narrow down which patch introduced the regression, and see if I can plug the race (probably a case of not checking whether the monitor still exists when getting the condition for an asynchronous job, since the whole point of virsh [managed]save is that the domain will go away when the save completes, but that it is time-consuming enough that we want to query domain state in the meantime).
I can reproduce this bug.
Program received signal SIGSEGV, Segmentation fault. [Switching to Thread 0x7ffff06d0700 (LWP 11419)] 0x00000000004b9ad8 in qemuMonitorSend (mon=0x7fffe815c060, msg=0x7ffff06cf380) at qemu/qemu_monitor.c:801 801 while (!mon->msg->finished) {
The reason is that mon->msg is NULL. I added some debug code and found that we send a monitor command while the last command has not finished, and then libvirtd crashes. After reading the code, I think something is wrong in the function qemuDomainObjEnterMonitorInternal():

    if (priv->job.active == QEMU_JOB_NONE && priv->job.asyncJob) {
        if (qemuDomainObjBeginNestedJob(driver, obj) < 0)

We can run a query job while an asyncJob is running. When we query the migration's status, priv->job.active is not QEMU_JOB_NONE (it is held by the concurrent query job), so we do not wait for the query job to finish. As a result we send a monitor command while the last command has not finished yet, which is very dangerous. When we run an async job, we cannot tell whether the job is a nested async job from priv->job.active's value. I think we should introduce four functions for async nested jobs:

qemuDomainObjAsyncEnterMonitor()
qemuDomainObjAsyncEnterMonitorWithDriver()
qemuDomainObjAsyncExitMonitor()
qemuDomainObjAsyncExitMonitorWithDriver()

The caller of qemuDomainObjEnterMonitorInternal() should pass a bool value telling qemuDomainObjEnterMonitorInternal() whether the job is an async nested job. Thanks Wen Congyang.
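For illustration only, the proposal could look roughly like the following self-contained toy sketch; this is plain C with simplified stand-in types, not actual libvirt code, and all names are invented for the example:

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-ins for the real job-tracking state in qemuDomainObjPrivate. */
enum job { JOB_NONE, JOB_QUERY, JOB_ASYNC_NESTED };

struct dom {
    enum job active;   /* currently running synchronous job */
    bool asyncJob;     /* an async job (migration/save/dump) is in progress */
};

static int begin_nested_job(struct dom *d)
{
    /* The real code waits on a condition variable until no sync job runs;
     * here we just report that we would have to wait. */
    if (d->active != JOB_NONE)
        return -1;
    d->active = JOB_ASYNC_NESTED;
    return 0;
}

/* Proposed shape: the caller states whether it runs inside the async job,
 * instead of the helper guessing from d->active alone. */
static int enter_monitor(struct dom *d, bool async_nested)
{
    if (async_nested)
        return begin_nested_job(d);
    return 0;   /* a sync job already owns the job lock */
}

int main(void)
{
    /* A query job is running while the async migration loop wants the
     * monitor: the nested job must wait instead of racing ahead. */
    struct dom d = { .active = JOB_QUERY, .asyncJob = true };

    if (enter_monitor(&d, true) < 0)
        printf("async job must wait for the query job to finish\n");
    return 0;
}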

At 07/28/2011 03:26 PM, Wen Congyang wrote:
I think we should introduce four functions for async nested jobs:
Some functions (for example qemuProcessStopCPUs()) can be called by both sync jobs and async jobs, so when we enter such a function we do not know which type of job is calling it. We support running a sync job and an async job at the same time, which means the monitor commands of the two jobs can be issued in any order. Another way to fix this bug is: if we try to send a monitor command while the last command is not finished, wait for the last monitor command to finish.
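A minimal sketch of that alternative, assuming only what qemuMonitorSend() already uses (the in-flight mon->msg and the monitor's notify condition, with mon->lock held by the caller); the helper name is made up and this is illustrative, not a proposed patch:

    /* Call with mon->lock held, before installing a new mon->msg. */
    static void
    qemuMonitorWaitForLastCommand(qemuMonitorPtr mon)
    {
        /* Block until any in-flight command finishes instead of stomping on
         * mon->msg while another thread is still waiting on it. */
        while (mon->msg && !mon->msg->finished)
            ignore_value(virCondWait(&mon->notify, &mon->lock));
    }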
    qemuDomainObjAsyncEnterMonitor()
    qemuDomainObjAsyncEnterMonitorWithDriver()
    qemuDomainObjAsyncExitMonitor()
    qemuDomainObjAsyncExitMonitorWithDriver()
The callers of qemuDomainObjEnterMonitorInternal() should pass a bool value telling it whether the job is an async nested job.
Thanks,
Wen Congyang