[libvirt] [PATCH] qemu: allow blkstat/blkinfo calls during migration

References: - https://www.redhat.com/archives/libvir-list/2011-May/msg00210.html - https://www.redhat.com/archives/libvir-list/2011-May/msg00287.html --- src/qemu/qemu_domain.c | 6 +++ src/qemu/qemu_domain.h | 7 ++++ src/qemu/qemu_driver.c | 86 +++++++++++++++++++++++++++++++-------------- src/qemu/qemu_migration.c | 31 ++++++++++++++++ 4 files changed, 103 insertions(+), 27 deletions(-) diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index c61f9bf..d4e53c4 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -526,6 +526,12 @@ int qemuDomainObjBeginJob(virDomainObjPtr obj) priv->jobStart = timeval_to_ms(now); memset(&priv->jobInfo, 0, sizeof(priv->jobInfo)); + if (virCondInit(&priv->signalCond) < 0) { + virReportSystemError(errno, + "%s", _("cannot initialize signal condition")); + return -1; + } + return 0; } diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index 6d24f53..b82986c 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -47,11 +47,17 @@ enum qemuDomainJobSignals { QEMU_JOB_SIGNAL_SUSPEND = 1 << 1, /* Request VM suspend to finish live migration offline */ QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME = 1 << 2, /* Request migration downtime change */ QEMU_JOB_SIGNAL_MIGRATE_SPEED = 1 << 3, /* Request migration speed change */ + QEMU_JOB_SIGNAL_BLKSTAT = 1 << 4, /* Request blkstat during migration */ + QEMU_JOB_SIGNAL_BLKINFO = 1 << 5, /* Request blkinfo during migration */ }; struct qemuDomainJobSignalsData { unsigned long long migrateDowntime; /* Data for QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME */ unsigned long migrateBandwidth; /* Data for QEMU_JOB_SIGNAL_MIGRATE_SPEED */ + char *devname; /* Device name used by blkstat/blkinfo calls */ + int returnCode; /* Return code for the blkstat/blkinfo calls */ + virDomainBlockStatsPtr blockStat; /* Block statistics for QEMU_JOB_SIGNAL_BLKSTAT */ + virDomainBlockInfoPtr blockInfo; /* Block information for QEMU_JOB_SIGNAL_BLKINFO */ }; typedef struct _qemuDomainPCIAddressSet 
qemuDomainPCIAddressSet; @@ -61,6 +67,7 @@ typedef struct _qemuDomainObjPrivate qemuDomainObjPrivate; typedef qemuDomainObjPrivate *qemuDomainObjPrivatePtr; struct _qemuDomainObjPrivate { virCond jobCond; /* Use in conjunction with main virDomainObjPtr lock */ + virCond signalCond; /* Use in conjunction with main virDomainObjPtr lock */ enum qemuDomainJob jobActive; /* Currently running job */ unsigned int jobSignals; /* Signals for running job */ struct qemuDomainJobSignalsData jobSignalsData; /* Signal specific data */ diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 0fd0f10..f9f5e83 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -5031,13 +5031,10 @@ qemudDomainBlockStats (virDomainPtr dom, goto cleanup; } - if (qemuDomainObjBeginJob(vm) < 0) - goto cleanup; - if (!virDomainObjIsActive(vm)) { qemuReportError(VIR_ERR_OPERATION_INVALID, "%s", _("domain is not running")); - goto endjob; + goto cleanup; } for (i = 0 ; i < vm->def->ndisks ; i++) { @@ -5050,29 +5047,48 @@ qemudDomainBlockStats (virDomainPtr dom, if (!disk) { qemuReportError(VIR_ERR_INVALID_ARG, _("invalid path: %s"), path); - goto endjob; + goto cleanup; } if (!disk->info.alias) { qemuReportError(VIR_ERR_INTERNAL_ERROR, _("missing disk device alias name for %s"), disk->dst); - goto endjob; + goto cleanup; } priv = vm->privateData; - qemuDomainObjEnterMonitor(vm); - ret = qemuMonitorGetBlockStatsInfo(priv->mon, - disk->info.alias, - &stats->rd_req, - &stats->rd_bytes, - &stats->wr_req, - &stats->wr_bytes, - &stats->errs); - qemuDomainObjExitMonitor(vm); + if ((priv->jobActive == QEMU_JOB_MIGRATION_OUT) + || (priv->jobActive == QEMU_JOB_SAVE)) { -endjob: - if (qemuDomainObjEndJob(vm) == 0) - vm = NULL; + while (priv->jobSignals) + ignore_value(virCondWait(&priv->signalCond, &vm->lock)); + + priv->jobSignals |= QEMU_JOB_SIGNAL_BLKSTAT; + priv->jobSignalsData.devname = disk->info.alias; + priv->jobSignalsData.blockStat = stats; + priv->jobSignalsData.returnCode = -1; 
+ + while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKSTAT) + ignore_value(virCondWait(&priv->signalCond, &vm->lock)); + + ret = priv->jobSignalsData.returnCode; + } else { + if (qemuDomainObjBeginJob(vm) < 0) + goto cleanup; + + qemuDomainObjEnterMonitor(vm); + ret = qemuMonitorGetBlockStatsInfo(priv->mon, + disk->info.alias, + &stats->rd_req, + &stats->rd_bytes, + &stats->wr_req, + &stats->wr_bytes, + &stats->errs); + qemuDomainObjExitMonitor(vm); + + if (qemuDomainObjEndJob(vm) == 0) + vm = NULL; + } cleanup: if (vm) @@ -5473,23 +5489,39 @@ static int qemuDomainGetBlockInfo(virDomainPtr dom, disk format and on a block device, then query highest allocated extent from QEMU */ if (disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && - format != VIR_STORAGE_FILE_RAW && - S_ISBLK(sb.st_mode)) { + format != VIR_STORAGE_FILE_RAW && S_ISBLK(sb.st_mode)) { qemuDomainObjPrivatePtr priv = vm->privateData; - if (qemuDomainObjBeginJob(vm) < 0) - goto cleanup; - if (!virDomainObjIsActive(vm)) + + if (!virDomainObjIsActive(vm)) { ret = 0; - else { + } else if ((priv->jobActive == QEMU_JOB_MIGRATION_OUT) + || (priv->jobActive == QEMU_JOB_SAVE)) { + + while (priv->jobSignals) + ignore_value(virCondWait(&priv->signalCond, &vm->lock)); + + priv->jobSignals |= QEMU_JOB_SIGNAL_BLKINFO; + priv->jobSignalsData.devname = disk->info.alias; + priv->jobSignalsData.blockInfo = info; + priv->jobSignalsData.returnCode = -1; + + while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKINFO) + ignore_value(virCondWait(&priv->signalCond, &vm->lock)); + + ret = priv->jobSignalsData.returnCode; + } else { + if (qemuDomainObjBeginJob(vm) < 0) + goto cleanup; + qemuDomainObjEnterMonitor(vm); ret = qemuMonitorGetBlockExtent(priv->mon, disk->info.alias, &info->allocation); qemuDomainObjExitMonitor(vm); - } - if (qemuDomainObjEndJob(vm) == 0) - vm = NULL; + if (qemuDomainObjEndJob(vm) == 0) + vm = NULL; + } } else { ret = 0; } diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 6738a53..54e41f9 100644 --- 
a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -156,6 +156,34 @@ qemuMigrationWaitForCompletion(struct qemud_driver *driver, virDomainObjPtr vm) qemuDomainObjExitMonitorWithDriver(driver, vm); if (rc < 0) VIR_WARN0("Unable to set migration speed"); + } else if (priv->jobSignals & QEMU_JOB_SIGNAL_BLKSTAT) { + qemuDomainObjEnterMonitorWithDriver(driver, vm); + rc = qemuMonitorGetBlockStatsInfo(priv->mon, + priv->jobSignalsData.devname, + &priv->jobSignalsData.blockStat->rd_req, + &priv->jobSignalsData.blockStat->rd_bytes, + &priv->jobSignalsData.blockStat->wr_req, + &priv->jobSignalsData.blockStat->wr_bytes, + &priv->jobSignalsData.blockStat->errs); + qemuDomainObjExitMonitorWithDriver(driver, vm); + + priv->jobSignalsData.returnCode = rc; + priv->jobSignals ^= QEMU_JOB_SIGNAL_BLKSTAT; + + if (rc < 0) + VIR_WARN0("Unable to get block statistics"); + } else if (priv->jobSignals & QEMU_JOB_SIGNAL_BLKINFO) { + qemuDomainObjEnterMonitorWithDriver(driver, vm); + rc = qemuMonitorGetBlockExtent(priv->mon, + priv->jobSignalsData.devname, + &priv->jobSignalsData.blockInfo->allocation); + qemuDomainObjExitMonitorWithDriver(driver, vm); + + priv->jobSignalsData.returnCode = rc; + priv->jobSignals ^= QEMU_JOB_SIGNAL_BLKINFO; + + if (rc < 0) + VIR_WARN0("Unable to get block information"); } /* Repeat check because the job signals might have caused @@ -223,6 +251,8 @@ qemuMigrationWaitForCompletion(struct qemud_driver *driver, virDomainObjPtr vm) break; } + virCondSignal(&priv->signalCond); + virDomainObjUnlock(vm); qemuDriverUnlock(driver); @@ -233,6 +263,7 @@ qemuMigrationWaitForCompletion(struct qemud_driver *driver, virDomainObjPtr vm) } cleanup: + virCondBroadcast(&priv->signalCond); return ret; } -- 1.7.1

On 05/10/2011 09:57 AM, Federico Simoncelli wrote:
References: - https://www.redhat.com/archives/libvir-list/2011-May/msg00210.html - https://www.redhat.com/archives/libvir-list/2011-May/msg00287.html
Can you also provide a summary of these messages, as well as a summary of your code changes, in your commit message, so that future 'git log' readers won't have to fire up a browser? Something like: Add some additional job signal flags for doing blkstat/blkinfo during a migration. Add a condition variable that can be used to efficiently wait for the migration code to clear the signal flag.
--- src/qemu/qemu_domain.c | 6 +++ src/qemu/qemu_domain.h | 7 ++++ src/qemu/qemu_driver.c | 86 +++++++++++++++++++++++++++++++-------------- src/qemu/qemu_migration.c | 31 ++++++++++++++++ 4 files changed, 103 insertions(+), 27 deletions(-)
You've previously contributed under a gmail account; any preference which of the two accounts should be listed in AUTHORS?
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index c61f9bf..d4e53c4 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -526,6 +526,12 @@ int qemuDomainObjBeginJob(virDomainObjPtr obj) priv->jobStart = timeval_to_ms(now); memset(&priv->jobInfo, 0, sizeof(priv->jobInfo));
+ if (virCondInit(&priv->signalCond) < 0) { + virReportSystemError(errno, + "%s", _("cannot initialize signal condition")); + return -1; + } +
This attempts to initialize the condition variable on every call to qemuDomainObjBeginJob, which is undefined behavior after the first initialization. Also, I don't see any counterpart virCondDestroy in your patch, which is a resource leak. I think you meant to add the init in qemuDomainObjPrivateAlloc, and a counterpart destroy in qemuDomainObjPrivateFree. EEK! When did we ever initialize priv->jobCond? I'm impressed that the qemu driver even works in the first place, since it is calling virCondWaitUntil on an uninitialized condition! [That is, we're implicitly relying on PTHREAD_COND_INITIALIZER just _happening_ to be an all-zeros initialization in Linux, where qemuDomainObjPrivateAlloc zero-initialized the condition variable, but relying on this implicit behavior violates POSIX]
return 0; }
diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index 6d24f53..b82986c 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -47,11 +47,17 @@ enum qemuDomainJobSignals { QEMU_JOB_SIGNAL_SUSPEND = 1 << 1, /* Request VM suspend to finish live migration offline */ QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME = 1 << 2, /* Request migration downtime change */ QEMU_JOB_SIGNAL_MIGRATE_SPEED = 1 << 3, /* Request migration speed change */ + QEMU_JOB_SIGNAL_BLKSTAT = 1 << 4, /* Request blkstat during migration */ + QEMU_JOB_SIGNAL_BLKINFO = 1 << 5, /* Request blkinfo during migration */ };
struct qemuDomainJobSignalsData { unsigned long long migrateDowntime; /* Data for QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME */ unsigned long migrateBandwidth; /* Data for QEMU_JOB_SIGNAL_MIGRATE_SPEED */ + char *devname; /* Device name used by blkstat/blkinfo calls */ + int returnCode; /* Return code for the blkstat/blkinfo calls */ + virDomainBlockStatsPtr blockStat; /* Block statistics for QEMU_JOB_SIGNAL_BLKSTAT */ + virDomainBlockInfoPtr blockInfo; /* Block information for QEMU_JOB_SIGNAL_BLKINFO */
What happens if both APIs are called in the window before either can be acted on? Shouldn't you have two returnCode fields, one for each QEMU_JOB_SIGNAL_BLK* bit, so they aren't stomping on each other?
};
typedef struct _qemuDomainPCIAddressSet qemuDomainPCIAddressSet; @@ -61,6 +67,7 @@ typedef struct _qemuDomainObjPrivate qemuDomainObjPrivate; typedef qemuDomainObjPrivate *qemuDomainObjPrivatePtr; struct _qemuDomainObjPrivate { virCond jobCond; /* Use in conjunction with main virDomainObjPtr lock */ + virCond signalCond; /* Use in conjunction with main virDomainObjPtr lock */
Comment is a bit misleading; it is used to coordinate between migration and safe query APIs.
enum qemuDomainJob jobActive; /* Currently running job */ unsigned int jobSignals; /* Signals for running job */ struct qemuDomainJobSignalsData jobSignalsData; /* Signal specific data */ diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 0fd0f10..f9f5e83 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -5031,13 +5031,10 @@ qemudDomainBlockStats (virDomainPtr dom, goto cleanup; }
- if (qemuDomainObjBeginJob(vm) < 0) - goto cleanup; - if (!virDomainObjIsActive(vm)) { qemuReportError(VIR_ERR_OPERATION_INVALID, "%s", _("domain is not running")); - goto endjob; + goto cleanup; }
for (i = 0 ; i < vm->def->ndisks ; i++) { @@ -5050,29 +5047,48 @@ qemudDomainBlockStats (virDomainPtr dom, if (!disk) { qemuReportError(VIR_ERR_INVALID_ARG, _("invalid path: %s"), path); - goto endjob; + goto cleanup; }
if (!disk->info.alias) { qemuReportError(VIR_ERR_INTERNAL_ERROR, _("missing disk device alias name for %s"), disk->dst); - goto endjob; + goto cleanup; }
priv = vm->privateData; - qemuDomainObjEnterMonitor(vm); - ret = qemuMonitorGetBlockStatsInfo(priv->mon, - disk->info.alias, - &stats->rd_req, - &stats->rd_bytes, - &stats->wr_req, - &stats->wr_bytes, - &stats->errs); - qemuDomainObjExitMonitor(vm); + if ((priv->jobActive == QEMU_JOB_MIGRATION_OUT) + || (priv->jobActive == QEMU_JOB_SAVE)) {
Odd indentation.
-endjob: - if (qemuDomainObjEndJob(vm) == 0) - vm = NULL; + while (priv->jobSignals) + ignore_value(virCondWait(&priv->signalCond, &vm->lock));
Per src/qemu/THREADS.txt, we need to increase the reference count prior to virCondWait, and decrement it after we read jobSignalsData.returnCode. Otherwise, because the condition wait is dropping the vm lock, it is possible that the vm could go away in the meantime, and without holding an extra reference to the vm, we risk trying to regain vm->lock after it has been garbage collected by another thread. The qemuDomainObjBeginJob for the non-migration case already took care of that for us.
+ + priv->jobSignals |= QEMU_JOB_SIGNAL_BLKSTAT; + priv->jobSignalsData.devname = disk->info.alias; + priv->jobSignalsData.blockStat = stats; + priv->jobSignalsData.returnCode = -1; + + while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKSTAT) + ignore_value(virCondWait(&priv->signalCond, &vm->lock)); + + ret = priv->jobSignalsData.returnCode; + } else { + if (qemuDomainObjBeginJob(vm) < 0) + goto cleanup; + + qemuDomainObjEnterMonitor(vm);
Missing a check that the vm is still active (it could have died in the time it took to get the job condition).
+ ret = qemuMonitorGetBlockStatsInfo(priv->mon, + disk->info.alias, + &stats->rd_req, + &stats->rd_bytes, + &stats->wr_req, + &stats->wr_bytes, + &stats->errs); + qemuDomainObjExitMonitor(vm); + + if (qemuDomainObjEndJob(vm) == 0) + vm = NULL; + }
cleanup: if (vm) @@ -5473,23 +5489,39 @@ static int qemuDomainGetBlockInfo(virDomainPtr dom, disk format and on a block device, then query highest allocated extent from QEMU */ if (disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && - format != VIR_STORAGE_FILE_RAW && - S_ISBLK(sb.st_mode)) { + format != VIR_STORAGE_FILE_RAW && S_ISBLK(sb.st_mode)) {
Spurious whitespace change.
qemuDomainObjPrivatePtr priv = vm->privateData; - if (qemuDomainObjBeginJob(vm) < 0) - goto cleanup; - if (!virDomainObjIsActive(vm)) + + if (!virDomainObjIsActive(vm)) { ret = 0; - else { + } else if ((priv->jobActive == QEMU_JOB_MIGRATION_OUT) + || (priv->jobActive == QEMU_JOB_SAVE)) {
Inconsistent indentation.
+ + while (priv->jobSignals) + ignore_value(virCondWait(&priv->signalCond, &vm->lock));
Same problem with needing to temporarily increase vm's reference count.
+ + priv->jobSignals |= QEMU_JOB_SIGNAL_BLKINFO; + priv->jobSignalsData.devname = disk->info.alias; + priv->jobSignalsData.blockInfo = info; + priv->jobSignalsData.returnCode = -1; + + while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKINFO) + ignore_value(virCondWait(&priv->signalCond, &vm->lock)); + + ret = priv->jobSignalsData.returnCode;
Yep, I've definitely convinced myself that you need two returnCode slots. Thread 1 starts a migration. Thread 2 queries BlockStats, waits for the condition variable and priv->jobSignals to be 0. Thread 2 sets priv->jobSignals to non-zero, then waits for condition variable and priv->jobSignals to clear BLKSTATS. Thread 3 queries BlockInfo. Thread 1 processes the BlockStats query, and clears priv->jobSignals Thread 3 then gets condition lock instead of thread 2, and requests blkinfo Thread 1 then gets condition lock instead of thread 2, and nukes the returnCode Thread 2 finally gets condition lock, but sees the answer intended for thread 3.
+ } else { + if (qemuDomainObjBeginJob(vm) < 0) + goto cleanup; +
Missing a check if vm is still active.
qemuDomainObjEnterMonitor(vm); ret = qemuMonitorGetBlockExtent(priv->mon, disk->info.alias, &info->allocation); qemuDomainObjExitMonitor(vm); - }
- if (qemuDomainObjEndJob(vm) == 0) - vm = NULL; + if (qemuDomainObjEndJob(vm) == 0) + vm = NULL; + } } else { ret = 0; } diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 6738a53..54e41f9 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -156,6 +156,34 @@ qemuMigrationWaitForCompletion(struct qemud_driver *driver, virDomainObjPtr vm) qemuDomainObjExitMonitorWithDriver(driver, vm); if (rc < 0) VIR_WARN0("Unable to set migration speed"); + } else if (priv->jobSignals & QEMU_JOB_SIGNAL_BLKSTAT) { + qemuDomainObjEnterMonitorWithDriver(driver, vm); + rc = qemuMonitorGetBlockStatsInfo(priv->mon, + priv->jobSignalsData.devname, + &priv->jobSignalsData.blockStat->rd_req, + &priv->jobSignalsData.blockStat->rd_bytes, + &priv->jobSignalsData.blockStat->wr_req, + &priv->jobSignalsData.blockStat->wr_bytes, + &priv->jobSignalsData.blockStat->errs); + qemuDomainObjExitMonitorWithDriver(driver, vm); + + priv->jobSignalsData.returnCode = rc; + priv->jobSignals ^= QEMU_JOB_SIGNAL_BLKSTAT; + + if (rc < 0) + VIR_WARN0("Unable to get block statistics"); + } else if (priv->jobSignals & QEMU_JOB_SIGNAL_BLKINFO) {
Pre-existing, but I think we should process all signals on a single pass, rather than just the first signal we encounter (which then delays the remaining signals for the next sleep and time around the loop).
+ qemuDomainObjEnterMonitorWithDriver(driver, vm); + rc = qemuMonitorGetBlockExtent(priv->mon, + priv->jobSignalsData.devname, + &priv->jobSignalsData.blockInfo->allocation); + qemuDomainObjExitMonitorWithDriver(driver, vm); + + priv->jobSignalsData.returnCode = rc; + priv->jobSignals ^= QEMU_JOB_SIGNAL_BLKINFO; + + if (rc < 0) + VIR_WARN0("Unable to get block information"); }
/* Repeat check because the job signals might have caused @@ -223,6 +251,8 @@ qemuMigrationWaitForCompletion(struct qemud_driver *driver, virDomainObjPtr vm) break; }
+ virCondSignal(&priv->signalCond); + virDomainObjUnlock(vm); qemuDriverUnlock(driver);
@@ -233,6 +263,7 @@ qemuMigrationWaitForCompletion(struct qemud_driver *driver, virDomainObjPtr vm) }
cleanup: + virCondBroadcast(&priv->signalCond); return ret; }
Overall, I like the idea, but we need a v2. Do you need me to help? -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org

Hi Eric, thank you for the quick and detailed patch review! I think I fixed all the problems that you described but if you find anything that I missed or anything new I'll be happy to accept the help you offered :) You can update my email address in AUTHORS to the redhat one. commit a0ca40ce07f6c54901aac4e32bcfd573980bd38f Author: Federico Simoncelli <fsimonce@redhat.com> Date: Tue May 10 11:36:48 2011 +0100 qemu: allow blkstat/blkinfo calls during migration Originally most of libvirt domain-specific calls were blocking during a migration. A new mechanism to allow specific calls (blkstat/blkinfo) to be executed in such condition has been implemented. In the long term it'd be desirable to get a more general solution to mark further APIs as migration safe, without needing special case code. * src/qemu/qemu_migration.c: add some additional job signal flags for doing blkstat/blkinfo during a migration * src/qemu/qemu_domain.c: add a condition variable that can be used to efficiently wait for the migration code to clear the signal flag * src/qemu/qemu_driver.c: execute blkstat/blkinfo using the job signal flags during migration --- src/qemu/qemu_domain.c | 6 +++ src/qemu/qemu_domain.h | 9 ++++ src/qemu/qemu_driver.c | 103 ++++++++++++++++++++++++++++++++------------- src/qemu/qemu_migration.c | 35 +++++++++++++++ 4 files changed, 123 insertions(+), 30 deletions(-) -- Federico.

Originally most of libvirt domain-specific calls were blocking during a migration. A new mechanism to allow specific calls (blkstat/blkinfo) to be executed in such condition has been implemented. In the long term it'd be desirable to get a more general solution to mark further APIs as migration safe, without needing special case code. * src/qemu/qemu_migration.c: add some additional job signal flags for doing blkstat/blkinfo during a migration * src/qemu/qemu_domain.c: add a condition variable that can be used to efficiently wait for the migration code to clear the signal flag * src/qemu/qemu_driver.c: execute blkstat/blkinfo using the job signal flags during migration --- src/qemu/qemu_domain.c | 6 +++ src/qemu/qemu_domain.h | 9 ++++ src/qemu/qemu_driver.c | 103 ++++++++++++++++++++++++++++++++------------- src/qemu/qemu_migration.c | 35 +++++++++++++++ 4 files changed, 123 insertions(+), 30 deletions(-) diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index c61f9bf..570d65e 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -110,6 +110,11 @@ static void *qemuDomainObjPrivateAlloc(void) if (VIR_ALLOC(priv) < 0) return NULL; + if (virCondInit(&priv->signalCond) < 0) { + VIR_FREE(priv); + return NULL; + } + return priv; } @@ -122,6 +127,7 @@ static void qemuDomainObjPrivateFree(void *data) qemuDomainPCIAddressSetFree(priv->pciaddrs); virDomainChrSourceDefFree(priv->monConfig); VIR_FREE(priv->vcpupids); + ignore_value(virCondDestroy(&priv->signalCond)); /* This should never be non-NULL if we get here, but just in case... 
*/ if (priv->mon) { diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index 6d24f53..af513e7 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -47,11 +47,19 @@ enum qemuDomainJobSignals { QEMU_JOB_SIGNAL_SUSPEND = 1 << 1, /* Request VM suspend to finish live migration offline */ QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME = 1 << 2, /* Request migration downtime change */ QEMU_JOB_SIGNAL_MIGRATE_SPEED = 1 << 3, /* Request migration speed change */ + QEMU_JOB_SIGNAL_BLKSTAT = 1 << 4, /* Request blkstat during migration */ + QEMU_JOB_SIGNAL_BLKINFO = 1 << 5, /* Request blkinfo during migration */ }; struct qemuDomainJobSignalsData { unsigned long long migrateDowntime; /* Data for QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME */ unsigned long migrateBandwidth; /* Data for QEMU_JOB_SIGNAL_MIGRATE_SPEED */ + char *statDevName; /* Device name used by blkstat calls */ + virDomainBlockStatsPtr blockStat; /* Block statistics for QEMU_JOB_SIGNAL_BLKSTAT */ + int statRetCode; /* Return code for the blkstat calls */ + char *infoDevName; /* Device name used by blkinfo calls */ + virDomainBlockInfoPtr blockInfo; /* Block information for QEMU_JOB_SIGNAL_BLKINFO */ + int infoRetCode; /* Return code for the blkinfo calls */ }; typedef struct _qemuDomainPCIAddressSet qemuDomainPCIAddressSet; @@ -61,6 +69,7 @@ typedef struct _qemuDomainObjPrivate qemuDomainObjPrivate; typedef qemuDomainObjPrivate *qemuDomainObjPrivatePtr; struct _qemuDomainObjPrivate { virCond jobCond; /* Use in conjunction with main virDomainObjPtr lock */ + virCond signalCond; /* Use to coordinate the safe queries during migration */ enum qemuDomainJob jobActive; /* Currently running job */ unsigned int jobSignals; /* Signals for running job */ struct qemuDomainJobSignalsData jobSignalsData; /* Signal specific data */ diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 4f288d3..f7b2cb4 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -5082,15 +5082,6 @@ qemudDomainBlockStats 
(virDomainPtr dom, goto cleanup; } - if (qemuDomainObjBeginJob(vm) < 0) - goto cleanup; - - if (!virDomainObjIsActive(vm)) { - qemuReportError(VIR_ERR_OPERATION_INVALID, - "%s", _("domain is not running")); - goto endjob; - } - for (i = 0 ; i < vm->def->ndisks ; i++) { if (STREQ(path, vm->def->disks[i]->dst)) { disk = vm->def->disks[i]; @@ -5101,29 +5092,57 @@ qemudDomainBlockStats (virDomainPtr dom, if (!disk) { qemuReportError(VIR_ERR_INVALID_ARG, _("invalid path: %s"), path); - goto endjob; + goto cleanup; } if (!disk->info.alias) { qemuReportError(VIR_ERR_INTERNAL_ERROR, _("missing disk device alias name for %s"), disk->dst); - goto endjob; + goto cleanup; } priv = vm->privateData; - qemuDomainObjEnterMonitor(vm); - ret = qemuMonitorGetBlockStatsInfo(priv->mon, - disk->info.alias, - &stats->rd_req, - &stats->rd_bytes, - &stats->wr_req, - &stats->wr_bytes, - &stats->errs); - qemuDomainObjExitMonitor(vm); + if ((priv->jobActive == QEMU_JOB_MIGRATION_OUT) + || (priv->jobActive == QEMU_JOB_SAVE)) { + virDomainObjRef(vm); + while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKSTAT) + ignore_value(virCondWait(&priv->signalCond, &vm->lock)); + + priv->jobSignals |= QEMU_JOB_SIGNAL_BLKSTAT; + priv->jobSignalsData.statDevName = disk->info.alias; + priv->jobSignalsData.blockStat = stats; + priv->jobSignalsData.statRetCode = -1; + + while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKSTAT) + ignore_value(virCondWait(&priv->signalCond, &vm->lock)); + + ret = priv->jobSignalsData.statRetCode; + if (virDomainObjUnref(vm) == 0) + vm = NULL; + } else { + if (qemuDomainObjBeginJob(vm) < 0) + goto cleanup; + + if (!virDomainObjIsActive(vm)) { + qemuReportError(VIR_ERR_OPERATION_INVALID, + "%s", _("domain is not running")); + goto endjob; + } + + qemuDomainObjEnterMonitor(vm); + ret = qemuMonitorGetBlockStatsInfo(priv->mon, + disk->info.alias, + &stats->rd_req, + &stats->rd_bytes, + &stats->wr_req, + &stats->wr_bytes, + &stats->errs); + qemuDomainObjExitMonitor(vm); endjob: - if 
(qemuDomainObjEndJob(vm) == 0) - vm = NULL; + if (qemuDomainObjEndJob(vm) == 0) + vm = NULL; + } cleanup: if (vm) @@ -5527,20 +5546,44 @@ static int qemuDomainGetBlockInfo(virDomainPtr dom, format != VIR_STORAGE_FILE_RAW && S_ISBLK(sb.st_mode)) { qemuDomainObjPrivatePtr priv = vm->privateData; - if (qemuDomainObjBeginJob(vm) < 0) - goto cleanup; - if (!virDomainObjIsActive(vm)) - ret = 0; - else { + + if ((priv->jobActive == QEMU_JOB_MIGRATION_OUT) + || (priv->jobActive == QEMU_JOB_SAVE)) { + virDomainObjRef(vm); + while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKINFO) + ignore_value(virCondWait(&priv->signalCond, &vm->lock)); + + priv->jobSignals |= QEMU_JOB_SIGNAL_BLKINFO; + priv->jobSignalsData.infoDevName = disk->info.alias; + priv->jobSignalsData.blockInfo = info; + priv->jobSignalsData.infoRetCode = -1; + + while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKINFO) + ignore_value(virCondWait(&priv->signalCond, &vm->lock)); + + ret = priv->jobSignalsData.infoRetCode; + if (virDomainObjUnref(vm) == 0) + vm = NULL; + } else { + if (qemuDomainObjBeginJob(vm) < 0) + goto cleanup; + + if (!virDomainObjIsActive(vm)) { + qemuReportError(VIR_ERR_OPERATION_INVALID, + "%s", _("domain is not running")); + goto endjob; + } + qemuDomainObjEnterMonitor(vm); ret = qemuMonitorGetBlockExtent(priv->mon, disk->info.alias, &info->allocation); qemuDomainObjExitMonitor(vm); - } - if (qemuDomainObjEndJob(vm) == 0) - vm = NULL; +endjob: + if (qemuDomainObjEndJob(vm) == 0) + vm = NULL; + } } else { ret = 0; } diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 6738a53..21fdb4c 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -158,6 +158,38 @@ qemuMigrationWaitForCompletion(struct qemud_driver *driver, virDomainObjPtr vm) VIR_WARN0("Unable to set migration speed"); } + if (priv->jobSignals & QEMU_JOB_SIGNAL_BLKSTAT) { + qemuDomainObjEnterMonitorWithDriver(driver, vm); + rc = qemuMonitorGetBlockStatsInfo(priv->mon, + priv->jobSignalsData.statDevName, + 
&priv->jobSignalsData.blockStat->rd_req, + &priv->jobSignalsData.blockStat->rd_bytes, + &priv->jobSignalsData.blockStat->wr_req, + &priv->jobSignalsData.blockStat->wr_bytes, + &priv->jobSignalsData.blockStat->errs); + qemuDomainObjExitMonitorWithDriver(driver, vm); + + priv->jobSignalsData.statRetCode = rc; + priv->jobSignals ^= QEMU_JOB_SIGNAL_BLKSTAT; + + if (rc < 0) + VIR_WARN0("Unable to get block statistics"); + } + + if (priv->jobSignals & QEMU_JOB_SIGNAL_BLKINFO) { + qemuDomainObjEnterMonitorWithDriver(driver, vm); + rc = qemuMonitorGetBlockExtent(priv->mon, + priv->jobSignalsData.infoDevName, + &priv->jobSignalsData.blockInfo->allocation); + qemuDomainObjExitMonitorWithDriver(driver, vm); + + priv->jobSignalsData.infoRetCode = rc; + priv->jobSignals ^= QEMU_JOB_SIGNAL_BLKINFO; + + if (rc < 0) + VIR_WARN0("Unable to get block information"); + } + /* Repeat check because the job signals might have caused * guest to die */ @@ -223,6 +255,8 @@ qemuMigrationWaitForCompletion(struct qemud_driver *driver, virDomainObjPtr vm) break; } + virCondSignal(&priv->signalCond); + virDomainObjUnlock(vm); qemuDriverUnlock(driver); @@ -233,6 +267,7 @@ qemuMigrationWaitForCompletion(struct qemud_driver *driver, virDomainObjPtr vm) } cleanup: + virCondBroadcast(&priv->signalCond); return ret; } -- 1.7.1

On 05/11/2011 07:26 AM, Federico Simoncelli wrote:
Originally most of libvirt domain-specific calls were blocking during a migration. A new mechanism to allow specific calls (blkstat/blkinfo) to be executed in such condition has been implemented. In the long term it'd be desirable to get a more general solution to mark further APIs as migration safe, without needing special case code.
* src/qemu/qemu_migration.c: add some additional job signal flags for doing blkstat/blkinfo during a migration * src/qemu/qemu_domain.c: add a condition variable that can be used to efficiently wait for the migration code to clear the signal flag * src/qemu/qemu_driver.c: execute blkstat/blkinfo using the job signal flags during migration --- src/qemu/qemu_domain.c | 6 +++ src/qemu/qemu_domain.h | 9 ++++ src/qemu/qemu_driver.c | 103 ++++++++++++++++++++++++++++++++------------- src/qemu/qemu_migration.c | 35 +++++++++++++++
This conflicts with patch 10/16 in Dan's migration patches [1]. I'll hold off pushing it until Dan's patches are in, and hopefully the rebase is not too difficult. You could try the rebase now on his preview repository [2] [1] https://www.redhat.com/archives/libvir-list/2011-May/msg00771.html [2] http://gitorious.org/~berrange/libvirt/staging/commits/migrate-locking-3 That said, though, it looks like you addressed my findings from v1. Unfortunately, I still see a couple of places for improvement.
+++ b/src/qemu/qemu_domain.c @@ -110,6 +110,11 @@ static void *qemuDomainObjPrivateAlloc(void) if (VIR_ALLOC(priv) < 0) return NULL;
+ if (virCondInit(&priv->signalCond) < 0) { + VIR_FREE(priv); + return NULL;
As long as we're touching this function, let's also fix the bug where priv->jobCond was never initialized.
+ if ((priv->jobActive == QEMU_JOB_MIGRATION_OUT) + || (priv->jobActive == QEMU_JOB_SAVE)) { + virDomainObjRef(vm); + while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKSTAT) + ignore_value(virCondWait(&priv->signalCond, &vm->lock));
Hmm, should we mark priv->jobSignals as volatile in the header, to ensure the compiler won't optimize this into an infinite loop?
+++ b/src/qemu/qemu_migration.c @@ -158,6 +158,38 @@ qemuMigrationWaitForCompletion(struct qemud_driver *driver, virDomainObjPtr vm) VIR_WARN0("Unable to set migration speed"); }
The earlier 'else if' chain should likewise be broken into consecutive 'if's. In Dan's patch, this moved into the new method qemuMigrationProcessJobSignals
+ if (priv->jobSignals & QEMU_JOB_SIGNAL_BLKSTAT) { + qemuDomainObjEnterMonitorWithDriver(driver, vm); + rc = qemuMonitorGetBlockStatsInfo(priv->mon,
I'm still wondering if we need to hold the signalCond condition for the duration in which we drop the driver lock to call out to the monitor. I can't convince myself that we need to, but I also can't convince myself that your code is safe without it (I guess it goes to show that I haven't done much programming on condition variables in any prior job - they're cool tools, but hard to wrap your head around when first learning them).
+ priv->jobSignalsData.statDevName, + &priv->jobSignalsData.blockStat->rd_req, + &priv->jobSignalsData.blockStat->rd_bytes, + &priv->jobSignalsData.blockStat->wr_req, + &priv->jobSignalsData.blockStat->wr_bytes, + &priv->jobSignalsData.blockStat->errs); + qemuDomainObjExitMonitorWithDriver(driver, vm); + + priv->jobSignalsData.statRetCode = rc; + priv->jobSignals ^= QEMU_JOB_SIGNAL_BLKSTAT; + + if (rc < 0) + VIR_WARN0("Unable to get block statistics"); + } + + if (priv->jobSignals & QEMU_JOB_SIGNAL_BLKINFO) {
Oh dear, I just realized a bug. By breaking 'else if' into separate 'if', we now need to check if the VM is alive after every time we regain the driver lock (that is, after every qemuDomainObjExitMonitorWithDriver). Hmm, maybe the better approach is to keep things as an 'else if' chain, but to make qemuMigrationProcessJobSignals be a while loop that iterates until all bits are serviced (so that the live vm check is factored to only appear once in the loop body). Back when these checks were part of the larger qemuMigrationWaitForCompletion function, the overall loop also included a sleep, which is what I didn't like; but with Dan's patch in place, a loop with no sleep seems reasonable.
+ qemuDomainObjEnterMonitorWithDriver(driver, vm); + rc = qemuMonitorGetBlockExtent(priv->mon, + priv->jobSignalsData.infoDevName, + &priv->jobSignalsData.blockInfo->allocation); + qemuDomainObjExitMonitorWithDriver(driver, vm); + + priv->jobSignalsData.infoRetCode = rc; + priv->jobSignals ^= QEMU_JOB_SIGNAL_BLKINFO; + + if (rc < 0) + VIR_WARN0("Unable to get block information");
Hmm, your rebase still hasn't picked up the latest patches; you need to s/VIR_WARN0/VIR_WARN/ per commit b65f37a. -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org

----- Original Message -----
From: "Eric Blake" <eblake@redhat.com> To: "Federico Simoncelli" <fsimonce@redhat.com> Cc: libvir-list@redhat.com Sent: Friday, May 13, 2011 1:38:45 AM Subject: Re: [PATCH] qemu: allow blkstat/blkinfo calls during migration On 05/11/2011 07:26 AM, Federico Simoncelli wrote:
+ if ((priv->jobActive == QEMU_JOB_MIGRATION_OUT) + || (priv->jobActive == QEMU_JOB_SAVE)) { + virDomainObjRef(vm); + while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKSTAT) + ignore_value(virCondWait(&priv->signalCond, &vm->lock));
Hmm, should we mark priv->jobSignals as volatile in the header, to ensure the compiler won't optimize this into an infinite loop?
Not sure. http://software.intel.com/en-us/blogs/2007/11/30/volatile-almost-useless-for-multi-threaded-programming/
+ if (priv->jobSignals & QEMU_JOB_SIGNAL_BLKSTAT) { + qemuDomainObjEnterMonitorWithDriver(driver, vm); + rc = qemuMonitorGetBlockStatsInfo(priv->mon,
I'm still wondering if we need to hold the signalLock condition during the duration where we drop driver lock to call out to the monitor. I can't convince myself that we need to, but I also can't convince myself that your code is safe without it (I guess it goes to show that I haven't done much programming on condition variables in any prior job - they're cool tools, but hard to wrap your head around when first learning them).
There is no signalLock; on the other hand, the signalCond is used to wake up a thread (using virCondSignal/virCondBroadcast) that is currently sleeping in virCondWait (and therefore not holding vm->lock). -- Federico

Originally most of libvirt domain-specific calls were blocking during a migration. A new mechanism to allow specific calls (blkstat/blkinfo) to be executed in such condition has been implemented. In the long term it'd be desirable to get a more general solution to mark further APIs as migration safe, without needing special case code. * src/qemu/qemu_migration.c: add some additional job signal flags for doing blkstat/blkinfo during a migration * src/qemu/qemu_domain.c: add a condition variable that can be used to efficiently wait for the migration code to clear the signal flag * src/qemu/qemu_driver.c: execute blkstat/blkinfo using the job signal flags during migration --- src/qemu/qemu_domain.c | 13 ++++++ src/qemu/qemu_domain.h | 9 ++++ src/qemu/qemu_driver.c | 103 ++++++++++++++++++++++++++++++++------------- src/qemu/qemu_migration.c | 38 ++++++++++++++++- 4 files changed, 131 insertions(+), 32 deletions(-) diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index 0ac4861..ee029c5 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -110,7 +110,19 @@ static void *qemuDomainObjPrivateAlloc(void) if (VIR_ALLOC(priv) < 0) return NULL; + if (virCondInit(&priv->jobCond) < 0) + goto initfail; + + if (virCondInit(&priv->signalCond) < 0) { + ignore_value(virCondDestroy(&priv->jobCond)); + goto initfail; + } + return priv; + +initfail: + VIR_FREE(priv); + return NULL; } static void qemuDomainObjPrivateFree(void *data) @@ -123,6 +135,7 @@ static void qemuDomainObjPrivateFree(void *data) virDomainChrSourceDefFree(priv->monConfig); VIR_FREE(priv->vcpupids); VIR_FREE(priv->lockState); + ignore_value(virCondDestroy(&priv->signalCond)); /* This should never be non-NULL if we get here, but just in case... 
*/ if (priv->mon) { diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index 0fca974..a07a500 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -47,11 +47,19 @@ enum qemuDomainJobSignals { QEMU_JOB_SIGNAL_SUSPEND = 1 << 1, /* Request VM suspend to finish live migration offline */ QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME = 1 << 2, /* Request migration downtime change */ QEMU_JOB_SIGNAL_MIGRATE_SPEED = 1 << 3, /* Request migration speed change */ + QEMU_JOB_SIGNAL_BLKSTAT = 1 << 4, /* Request blkstat during migration */ + QEMU_JOB_SIGNAL_BLKINFO = 1 << 5, /* Request blkinfo during migration */ }; struct qemuDomainJobSignalsData { unsigned long long migrateDowntime; /* Data for QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME */ unsigned long migrateBandwidth; /* Data for QEMU_JOB_SIGNAL_MIGRATE_SPEED */ + char *statDevName; /* Device name used by blkstat calls */ + virDomainBlockStatsPtr blockStat; /* Block statistics for QEMU_JOB_SIGNAL_BLKSTAT */ + int statRetCode; /* Return code for the blkstat calls */ + char *infoDevName; /* Device name used by blkinfo calls */ + virDomainBlockInfoPtr blockInfo; /* Block information for QEMU_JOB_SIGNAL_BLKINFO */ + int infoRetCode; /* Return code for the blkinfo calls */ }; typedef struct _qemuDomainPCIAddressSet qemuDomainPCIAddressSet; @@ -61,6 +69,7 @@ typedef struct _qemuDomainObjPrivate qemuDomainObjPrivate; typedef qemuDomainObjPrivate *qemuDomainObjPrivatePtr; struct _qemuDomainObjPrivate { virCond jobCond; /* Use in conjunction with main virDomainObjPtr lock */ + virCond signalCond; /* Use to coordinate the safe queries during migration */ enum qemuDomainJob jobActive; /* Currently running job */ unsigned int jobSignals; /* Signals for running job */ struct qemuDomainJobSignalsData jobSignalsData; /* Signal specific data */ diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 30ee79e..5987ed4 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -5104,15 +5104,6 @@ qemudDomainBlockStats 
(virDomainPtr dom, goto cleanup; } - if (qemuDomainObjBeginJob(vm) < 0) - goto cleanup; - - if (!virDomainObjIsActive(vm)) { - qemuReportError(VIR_ERR_OPERATION_INVALID, - "%s", _("domain is not running")); - goto endjob; - } - for (i = 0 ; i < vm->def->ndisks ; i++) { if (STREQ(path, vm->def->disks[i]->dst)) { disk = vm->def->disks[i]; @@ -5123,29 +5114,57 @@ qemudDomainBlockStats (virDomainPtr dom, if (!disk) { qemuReportError(VIR_ERR_INVALID_ARG, _("invalid path: %s"), path); - goto endjob; + goto cleanup; } if (!disk->info.alias) { qemuReportError(VIR_ERR_INTERNAL_ERROR, _("missing disk device alias name for %s"), disk->dst); - goto endjob; + goto cleanup; } priv = vm->privateData; - qemuDomainObjEnterMonitor(vm); - ret = qemuMonitorGetBlockStatsInfo(priv->mon, - disk->info.alias, - &stats->rd_req, - &stats->rd_bytes, - &stats->wr_req, - &stats->wr_bytes, - &stats->errs); - qemuDomainObjExitMonitor(vm); + if ((priv->jobActive == QEMU_JOB_MIGRATION_OUT) + || (priv->jobActive == QEMU_JOB_SAVE)) { + virDomainObjRef(vm); + while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKSTAT) + ignore_value(virCondWait(&priv->signalCond, &vm->lock)); + + priv->jobSignals |= QEMU_JOB_SIGNAL_BLKSTAT; + priv->jobSignalsData.statDevName = disk->info.alias; + priv->jobSignalsData.blockStat = stats; + priv->jobSignalsData.statRetCode = -1; + + while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKSTAT) + ignore_value(virCondWait(&priv->signalCond, &vm->lock)); + + ret = priv->jobSignalsData.statRetCode; + if (virDomainObjUnref(vm) == 0) + vm = NULL; + } else { + if (qemuDomainObjBeginJob(vm) < 0) + goto cleanup; + + if (!virDomainObjIsActive(vm)) { + qemuReportError(VIR_ERR_OPERATION_INVALID, + "%s", _("domain is not running")); + goto endjob; + } + + qemuDomainObjEnterMonitor(vm); + ret = qemuMonitorGetBlockStatsInfo(priv->mon, + disk->info.alias, + &stats->rd_req, + &stats->rd_bytes, + &stats->wr_req, + &stats->wr_bytes, + &stats->errs); + qemuDomainObjExitMonitor(vm); endjob: - if 
(qemuDomainObjEndJob(vm) == 0) - vm = NULL; + if (qemuDomainObjEndJob(vm) == 0) + vm = NULL; + } cleanup: if (vm) @@ -5549,20 +5568,44 @@ static int qemuDomainGetBlockInfo(virDomainPtr dom, format != VIR_STORAGE_FILE_RAW && S_ISBLK(sb.st_mode)) { qemuDomainObjPrivatePtr priv = vm->privateData; - if (qemuDomainObjBeginJob(vm) < 0) - goto cleanup; - if (!virDomainObjIsActive(vm)) - ret = 0; - else { + + if ((priv->jobActive == QEMU_JOB_MIGRATION_OUT) + || (priv->jobActive == QEMU_JOB_SAVE)) { + virDomainObjRef(vm); + while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKINFO) + ignore_value(virCondWait(&priv->signalCond, &vm->lock)); + + priv->jobSignals |= QEMU_JOB_SIGNAL_BLKINFO; + priv->jobSignalsData.infoDevName = disk->info.alias; + priv->jobSignalsData.blockInfo = info; + priv->jobSignalsData.infoRetCode = -1; + + while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKINFO) + ignore_value(virCondWait(&priv->signalCond, &vm->lock)); + + ret = priv->jobSignalsData.infoRetCode; + if (virDomainObjUnref(vm) == 0) + vm = NULL; + } else { + if (qemuDomainObjBeginJob(vm) < 0) + goto cleanup; + + if (!virDomainObjIsActive(vm)) { + qemuReportError(VIR_ERR_OPERATION_INVALID, + "%s", _("domain is not running")); + goto endjob; + } + qemuDomainObjEnterMonitor(vm); ret = qemuMonitorGetBlockExtent(priv->mon, disk->info.alias, &info->allocation); qemuDomainObjExitMonitor(vm); - } - if (qemuDomainObjEndJob(vm) == 0) - vm = NULL; +endjob: + if (qemuDomainObjEndJob(vm) == 0) + vm = NULL; + } } else { ret = 0; } diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 8f18390..3bb0890 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -681,6 +681,34 @@ qemuMigrationProcessJobSignals(struct qemud_driver *driver, qemuDomainObjExitMonitorWithDriver(driver, vm); if (ret < 0) VIR_WARN("Unable to set migration speed"); + } else if (priv->jobSignals & QEMU_JOB_SIGNAL_BLKSTAT) { + qemuDomainObjEnterMonitorWithDriver(driver, vm); + ret = 
qemuMonitorGetBlockStatsInfo(priv->mon, + priv->jobSignalsData.statDevName, + &priv->jobSignalsData.blockStat->rd_req, + &priv->jobSignalsData.blockStat->rd_bytes, + &priv->jobSignalsData.blockStat->wr_req, + &priv->jobSignalsData.blockStat->wr_bytes, + &priv->jobSignalsData.blockStat->errs); + qemuDomainObjExitMonitorWithDriver(driver, vm); + + priv->jobSignalsData.statRetCode = ret; + priv->jobSignals ^= QEMU_JOB_SIGNAL_BLKSTAT; + + if (ret < 0) + VIR_WARN("Unable to get block statistics"); + } else if (priv->jobSignals & QEMU_JOB_SIGNAL_BLKINFO) { + qemuDomainObjEnterMonitorWithDriver(driver, vm); + ret = qemuMonitorGetBlockExtent(priv->mon, + priv->jobSignalsData.infoDevName, + &priv->jobSignalsData.blockInfo->allocation); + qemuDomainObjExitMonitorWithDriver(driver, vm); + + priv->jobSignalsData.infoRetCode = ret; + priv->jobSignals ^= QEMU_JOB_SIGNAL_BLKINFO; + + if (ret < 0) + VIR_WARN("Unable to get block information"); } else { ret = 0; } @@ -796,8 +824,12 @@ qemuMigrationWaitForCompletion(struct qemud_driver *driver, virDomainObjPtr vm) job = _("job"); } - if (qemuMigrationProcessJobSignals(driver, vm, job) < 0) - goto cleanup; + while (priv->jobSignals) { + if (qemuMigrationProcessJobSignals(driver, vm, job) < 0) + goto cleanup; + } + + virCondSignal(&priv->signalCond); if (qemuMigrationUpdateJobStatus(driver, vm, job) < 0) goto cleanup; @@ -813,6 +845,8 @@ qemuMigrationWaitForCompletion(struct qemud_driver *driver, virDomainObjPtr vm) } cleanup: + virCondBroadcast(&priv->signalCond); + if (priv->jobInfo.type == VIR_DOMAIN_JOB_COMPLETED) return 0; else -- 1.7.1

On 05/13/2011 04:11 AM, Federico Simoncelli wrote:
Originally most of libvirt domain-specific calls were blocking during a migration. A new mechanism to allow specific calls (blkstat/blkinfo) to be executed in such condition has been implemented. In the long term it'd be desirable to get a more general solution to mark further APIs as migration safe, without needing special case code.
* src/qemu/qemu_migration.c: add some additional job signal flags for doing blkstat/blkinfo during a migration * src/qemu/qemu_domain.c: add a condition variable that can be used to efficiently wait for the migration code to clear the signal flag * src/qemu/qemu_driver.c: execute blkstat/blkinfo using the job signal flags during migration --- src/qemu/qemu_domain.c | 13 ++++++ src/qemu/qemu_domain.h | 9 ++++ src/qemu/qemu_driver.c | 103 ++++++++++++++++++++++++++++++++------------- src/qemu/qemu_migration.c | 38 ++++++++++++++++- 4 files changed, 131 insertions(+), 32 deletions(-)
ACK; but pending danpb's commits actually going in. -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org

On Fri, May 13, 2011 at 05:05:14PM -0600, Eric Blake wrote:
On 05/13/2011 04:11 AM, Federico Simoncelli wrote:
Originally most of libvirt domain-specific calls were blocking during a migration. A new mechanism to allow specific calls (blkstat/blkinfo) to be executed in such condition has been implemented. In the long term it'd be desirable to get a more general solution to mark further APIs as migration safe, without needing special case code.
* src/qemu/qemu_migration.c: add some additional job signal flags for doing blkstat/blkinfo during a migration * src/qemu/qemu_domain.c: add a condition variable that can be used to efficiently wait for the migration code to clear the signal flag * src/qemu/qemu_driver.c: execute blkstat/blkinfo using the job signal flags during migration --- src/qemu/qemu_domain.c | 13 ++++++ src/qemu/qemu_domain.h | 9 ++++ src/qemu/qemu_driver.c | 103 ++++++++++++++++++++++++++++++++------------- src/qemu/qemu_migration.c | 38 ++++++++++++++++- 4 files changed, 131 insertions(+), 32 deletions(-)
ACK; but pending danpb's commits actually going in.
I've pushed my migration series, so this can be rebased now Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 05/13/2011 04:11 AM, Federico Simoncelli wrote:
Originally most of libvirt domain-specific calls were blocking during a migration. A new mechanism to allow specific calls (blkstat/blkinfo) to be executed in such condition has been implemented. In the long term it'd be desirable to get a more general solution to mark further APIs as migration safe, without needing special case code.
Actually, we now need a v4, although I'll try to post one shortly.
@@ -123,6 +135,7 @@ static void qemuDomainObjPrivateFree(void *data) virDomainChrSourceDefFree(priv->monConfig); VIR_FREE(priv->vcpupids); VIR_FREE(priv->lockState); + ignore_value(virCondDestroy(&priv->signalCond));
We should also add a virCondDestroy for priv->jobCond.
+ if ((priv->jobActive == QEMU_JOB_MIGRATION_OUT) + || (priv->jobActive == QEMU_JOB_SAVE)) { + virDomainObjRef(vm); + while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKSTAT) + ignore_value(virCondWait(&priv->signalCond, &vm->lock)); + + priv->jobSignals |= QEMU_JOB_SIGNAL_BLKSTAT; + priv->jobSignalsData.statDevName = disk->info.alias; + priv->jobSignalsData.blockStat = stats; + priv->jobSignalsData.statRetCode = -1; +
For safety sake, I'd rather see the jobSignals assignment after the jobSignalsData assignments. I'm not sure if we need some sort of memory barrier to ensure that a compiler (and/or hardware) won't rearrange the assignments to complete out of order, but we really don't want the migration thread to see the jobSignalsData contents until after they are stable.
@@ -796,8 +824,12 @@ qemuMigrationWaitForCompletion(struct qemud_driver *driver, virDomainObjPtr vm) job = _("job"); }
- if (qemuMigrationProcessJobSignals(driver, vm, job) < 0) - goto cleanup; + while (priv->jobSignals) { + if (qemuMigrationProcessJobSignals(driver, vm, job) < 0) + goto cleanup; + }
This while loop continues until priv->jobSignals is 0 _or_ an error occurs...
+ + virCondSignal(&priv->signalCond);
if (qemuMigrationUpdateJobStatus(driver, vm, job) < 0) goto cleanup; @@ -813,6 +845,8 @@ qemuMigrationWaitForCompletion(struct qemud_driver *driver, virDomainObjPtr vm) }
cleanup: + virCondBroadcast(&priv->signalCond); +
...therefore, on error, this can raise the condition variable while jobSignals is non-zero. We've just introduced a deadlock: thread 1 starts a migration thread 2 queries block info, sets the jobSignals bit, and waits for the cond variable and the bit to be clear something goes wrong with migration (suppose qemu is killed externally) thread 1 finally gets to qemuMigrationUpdateJobStatus, but sees that vm is no longer live so it exits with error thread 2 is now stuck waiting for the jobSignals to clear, but thread 1 is no longer going to clear it I'm still working on the right way to fix it, but I think that qemuMigrationProcessJobSignals needs a bool parameter that says whether it is in cleanup mode, in which case it always clears at least one jobSignals bit even on error, and for blkinfo/blkstat, it sets the ret value to -1 before clearing the bit. That way, qemuMigrationWaitForCompletion always ends with jobSignals == 0 and driver lock held, and as long as the driver lock is then not released until jobActive has been reset, then no new qemudDomainBlockStats will start, and the existing one in thread 2 will correctly fail for the same reason that migration in thread 1 failed (that is, the vm went away early). -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org

From: Federico Simoncelli <fsimonce@redhat.com> Originally most of libvirt domain-specific calls were blocking during a migration. A new mechanism to allow specific calls (blkstat/blkinfo) to be executed in such condition has been implemented. In the long term it'd be desirable to get a more general solution to mark further APIs as migration safe, without needing special case code. * src/qemu/qemu_migration.c: add some additional job signal flags for doing blkstat/blkinfo during a migration * src/qemu/qemu_domain.c: add a condition variable that can be used to efficiently wait for the migration code to clear the signal flag * src/qemu/qemu_driver.c: execute blkstat/blkinfo using the job signal flags during migration --- My changes in v4: Fix the mailmap (elided from this mail) for Federico's preferred address. Add cleanup for jobCond. Defer changes to jobSignals until after jobSignalData is stable. Alter qemuMigrationProcessJobSignals to have a mode of operation that guarantees that a jobSignals bit is cleared, regardless of other errors. 
.mailmap | 1 + AUTHORS | 2 +- src/qemu/qemu_domain.c | 14 ++++++ src/qemu/qemu_domain.h | 9 ++++ src/qemu/qemu_driver.c | 107 +++++++++++++++++++++++++++++++------------- src/qemu/qemu_migration.c | 76 +++++++++++++++++++++++++------- 6 files changed, 159 insertions(+), 50 deletions(-) diff --git a/.mailmap b/.mailmap index f73e26b..2b98322 100644 --- a/.mailmap +++ b/.mailmap diff --git a/AUTHORS b/AUTHORS index 1bb1f0f..47860a9 100644 --- a/AUTHORS +++ b/AUTHORS diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index bcacb18..8f4915c 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -89,7 +89,19 @@ static void *qemuDomainObjPrivateAlloc(void) if (VIR_ALLOC(priv) < 0) return NULL; + if (virCondInit(&priv->jobCond) < 0) + goto initfail; + + if (virCondInit(&priv->signalCond) < 0) { + ignore_value(virCondDestroy(&priv->jobCond)); + goto initfail; + } + return priv; + +initfail: + VIR_FREE(priv); + return NULL; } static void qemuDomainObjPrivateFree(void *data) @@ -101,6 +113,8 @@ static void qemuDomainObjPrivateFree(void *data) qemuDomainPCIAddressSetFree(priv->pciaddrs); virDomainChrSourceDefFree(priv->monConfig); VIR_FREE(priv->vcpupids); + ignore_value(virCondDestroy(&priv->jobCond)); + ignore_value(virCondDestroy(&priv->signalCond)); /* This should never be non-NULL if we get here, but just in case... 
*/ if (priv->mon) { diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index 6d24f53..af513e7 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -47,11 +47,19 @@ enum qemuDomainJobSignals { QEMU_JOB_SIGNAL_SUSPEND = 1 << 1, /* Request VM suspend to finish live migration offline */ QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME = 1 << 2, /* Request migration downtime change */ QEMU_JOB_SIGNAL_MIGRATE_SPEED = 1 << 3, /* Request migration speed change */ + QEMU_JOB_SIGNAL_BLKSTAT = 1 << 4, /* Request blkstat during migration */ + QEMU_JOB_SIGNAL_BLKINFO = 1 << 5, /* Request blkinfo during migration */ }; struct qemuDomainJobSignalsData { unsigned long long migrateDowntime; /* Data for QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME */ unsigned long migrateBandwidth; /* Data for QEMU_JOB_SIGNAL_MIGRATE_SPEED */ + char *statDevName; /* Device name used by blkstat calls */ + virDomainBlockStatsPtr blockStat; /* Block statistics for QEMU_JOB_SIGNAL_BLKSTAT */ + int statRetCode; /* Return code for the blkstat calls */ + char *infoDevName; /* Device name used by blkinfo calls */ + virDomainBlockInfoPtr blockInfo; /* Block information for QEMU_JOB_SIGNAL_BLKINFO */ + int infoRetCode; /* Return code for the blkinfo calls */ }; typedef struct _qemuDomainPCIAddressSet qemuDomainPCIAddressSet; @@ -61,6 +69,7 @@ typedef struct _qemuDomainObjPrivate qemuDomainObjPrivate; typedef qemuDomainObjPrivate *qemuDomainObjPrivatePtr; struct _qemuDomainObjPrivate { virCond jobCond; /* Use in conjunction with main virDomainObjPtr lock */ + virCond signalCond; /* Use to coordinate the safe queries during migration */ enum qemuDomainJob jobActive; /* Currently running job */ unsigned int jobSignals; /* Signals for running job */ struct qemuDomainJobSignalsData jobSignalsData; /* Signal specific data */ diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index ccaae66..2bd4d0b 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -5205,15 +5205,6 @@ qemudDomainBlockStats 
(virDomainPtr dom, goto cleanup; } - if (qemuDomainObjBeginJob(vm) < 0) - goto cleanup; - - if (!virDomainObjIsActive(vm)) { - qemuReportError(VIR_ERR_OPERATION_INVALID, - "%s", _("domain is not running")); - goto endjob; - } - for (i = 0 ; i < vm->def->ndisks ; i++) { if (STREQ(path, vm->def->disks[i]->dst)) { disk = vm->def->disks[i]; @@ -5224,29 +5215,57 @@ qemudDomainBlockStats (virDomainPtr dom, if (!disk) { qemuReportError(VIR_ERR_INVALID_ARG, _("invalid path: %s"), path); - goto endjob; + goto cleanup; } if (!disk->info.alias) { qemuReportError(VIR_ERR_INTERNAL_ERROR, _("missing disk device alias name for %s"), disk->dst); - goto endjob; + goto cleanup; } priv = vm->privateData; - qemuDomainObjEnterMonitor(vm); - ret = qemuMonitorGetBlockStatsInfo(priv->mon, - disk->info.alias, - &stats->rd_req, - &stats->rd_bytes, - &stats->wr_req, - &stats->wr_bytes, - &stats->errs); - qemuDomainObjExitMonitor(vm); + if ((priv->jobActive == QEMU_JOB_MIGRATION_OUT) + || (priv->jobActive == QEMU_JOB_SAVE)) { + virDomainObjRef(vm); + while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKSTAT) + ignore_value(virCondWait(&priv->signalCond, &vm->lock)); + + priv->jobSignalsData.statDevName = disk->info.alias; + priv->jobSignalsData.blockStat = stats; + priv->jobSignalsData.statRetCode = -1; + priv->jobSignals |= QEMU_JOB_SIGNAL_BLKSTAT; + + while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKSTAT) + ignore_value(virCondWait(&priv->signalCond, &vm->lock)); + + ret = priv->jobSignalsData.statRetCode; + if (virDomainObjUnref(vm) == 0) + vm = NULL; + } else { + if (qemuDomainObjBeginJob(vm) < 0) + goto cleanup; + + if (!virDomainObjIsActive(vm)) { + qemuReportError(VIR_ERR_OPERATION_INVALID, + "%s", _("domain is not running")); + goto endjob; + } + + qemuDomainObjEnterMonitor(vm); + ret = qemuMonitorGetBlockStatsInfo(priv->mon, + disk->info.alias, + &stats->rd_req, + &stats->rd_bytes, + &stats->wr_req, + &stats->wr_bytes, + &stats->errs); + qemuDomainObjExitMonitor(vm); endjob: - if 
(qemuDomainObjEndJob(vm) == 0) - vm = NULL; + if (qemuDomainObjEndJob(vm) == 0) + vm = NULL; + } cleanup: if (vm) @@ -5650,20 +5669,44 @@ static int qemuDomainGetBlockInfo(virDomainPtr dom, format != VIR_STORAGE_FILE_RAW && S_ISBLK(sb.st_mode)) { qemuDomainObjPrivatePtr priv = vm->privateData; - if (qemuDomainObjBeginJob(vm) < 0) - goto cleanup; - if (!virDomainObjIsActive(vm)) - ret = 0; - else { + + if ((priv->jobActive == QEMU_JOB_MIGRATION_OUT) + || (priv->jobActive == QEMU_JOB_SAVE)) { + virDomainObjRef(vm); + while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKINFO) + ignore_value(virCondWait(&priv->signalCond, &vm->lock)); + + priv->jobSignalsData.infoDevName = disk->info.alias; + priv->jobSignalsData.blockInfo = info; + priv->jobSignalsData.infoRetCode = -1; + priv->jobSignals |= QEMU_JOB_SIGNAL_BLKINFO; + + while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKINFO) + ignore_value(virCondWait(&priv->signalCond, &vm->lock)); + + ret = priv->jobSignalsData.infoRetCode; + if (virDomainObjUnref(vm) == 0) + vm = NULL; + } else { + if (qemuDomainObjBeginJob(vm) < 0) + goto cleanup; + + if (!virDomainObjIsActive(vm)) { + qemuReportError(VIR_ERR_OPERATION_INVALID, + "%s", _("domain is not running")); + goto endjob; + } + qemuDomainObjEnterMonitor(vm); ret = qemuMonitorGetBlockExtent(priv->mon, disk->info.alias, &info->allocation); qemuDomainObjExitMonitor(vm); - } - if (qemuDomainObjEndJob(vm) == 0) - vm = NULL; +endjob: + if (qemuDomainObjEndJob(vm) == 0) + vm = NULL; + } } else { ret = 0; } @@ -6543,8 +6586,8 @@ qemuDomainMigrateSetMaxDowntime(virDomainPtr dom, } VIR_DEBUG("Requesting migration downtime change to %llums", downtime); - priv->jobSignals |= QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME; priv->jobSignalsData.migrateDowntime = downtime; + priv->jobSignals |= QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME; ret = 0; cleanup: @@ -6592,8 +6635,8 @@ qemuDomainMigrateSetMaxSpeed(virDomainPtr dom, } VIR_DEBUG("Requesting migration speed change to %luMbs", bandwidth); - priv->jobSignals |= 
QEMU_JOB_SIGNAL_MIGRATE_SPEED; priv->jobSignalsData.migrateBandwidth = bandwidth; + priv->jobSignals |= QEMU_JOB_SIGNAL_MIGRATE_SPEED; ret = 0; cleanup: diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index cb408ac..a091e2a 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -586,7 +586,8 @@ qemuMigrationSetOffline(struct qemud_driver *driver, static int qemuMigrationProcessJobSignals(struct qemud_driver *driver, virDomainObjPtr vm, - const char *job) + const char *job, + bool cleanup) { qemuDomainObjPrivatePtr priv = vm->privateData; int ret = -1; @@ -594,6 +595,11 @@ qemuMigrationProcessJobSignals(struct qemud_driver *driver, if (!virDomainObjIsActive(vm)) { qemuReportError(VIR_ERR_INTERNAL_ERROR, _("%s: %s"), job, _("guest unexpectedly quit")); + if (cleanup) { + priv->jobSignals = 0; + priv->jobSignalsData.statRetCode = -1; + priv->jobSignalsData.infoRetCode = -1; + } return -1; } @@ -633,6 +639,34 @@ qemuMigrationProcessJobSignals(struct qemud_driver *driver, qemuDomainObjExitMonitorWithDriver(driver, vm); if (ret < 0) VIR_WARN("Unable to set migration speed"); + } else if (priv->jobSignals & QEMU_JOB_SIGNAL_BLKSTAT) { + qemuDomainObjEnterMonitorWithDriver(driver, vm); + ret = qemuMonitorGetBlockStatsInfo(priv->mon, + priv->jobSignalsData.statDevName, + &priv->jobSignalsData.blockStat->rd_req, + &priv->jobSignalsData.blockStat->rd_bytes, + &priv->jobSignalsData.blockStat->wr_req, + &priv->jobSignalsData.blockStat->wr_bytes, + &priv->jobSignalsData.blockStat->errs); + qemuDomainObjExitMonitorWithDriver(driver, vm); + + priv->jobSignalsData.statRetCode = ret; + priv->jobSignals ^= QEMU_JOB_SIGNAL_BLKSTAT; + + if (ret < 0) + VIR_WARN("Unable to get block statistics"); + } else if (priv->jobSignals & QEMU_JOB_SIGNAL_BLKINFO) { + qemuDomainObjEnterMonitorWithDriver(driver, vm); + ret = qemuMonitorGetBlockExtent(priv->mon, + priv->jobSignalsData.infoDevName, + &priv->jobSignalsData.blockInfo->allocation); + 
qemuDomainObjExitMonitorWithDriver(driver, vm); + + priv->jobSignalsData.infoRetCode = ret; + priv->jobSignals ^= QEMU_JOB_SIGNAL_BLKINFO; + + if (ret < 0) + VIR_WARN("Unable to get block information"); } else { ret = 0; } @@ -726,30 +760,33 @@ int qemuMigrationWaitForCompletion(struct qemud_driver *driver, virDomainObjPtr vm) { qemuDomainObjPrivatePtr priv = vm->privateData; + const char *job; + + switch (priv->jobActive) { + case QEMU_JOB_MIGRATION_OUT: + job = _("migration job"); + break; + case QEMU_JOB_SAVE: + job = _("domain save job"); + break; + case QEMU_JOB_DUMP: + job = _("domain core dump job"); + break; + default: + job = _("job"); + } priv->jobInfo.type = VIR_DOMAIN_JOB_UNBOUNDED; while (priv->jobInfo.type == VIR_DOMAIN_JOB_UNBOUNDED) { /* Poll every 50ms for progress & to allow cancellation */ struct timespec ts = { .tv_sec = 0, .tv_nsec = 50 * 1000 * 1000ull }; - const char *job; - - switch (priv->jobActive) { - case QEMU_JOB_MIGRATION_OUT: - job = _("migration job"); - break; - case QEMU_JOB_SAVE: - job = _("domain save job"); - break; - case QEMU_JOB_DUMP: - job = _("domain core dump job"); - break; - default: - job = _("job"); + while (priv->jobSignals) { + if (qemuMigrationProcessJobSignals(driver, vm, job, false) < 0) + goto cleanup; } - if (qemuMigrationProcessJobSignals(driver, vm, job) < 0) - goto cleanup; + virCondSignal(&priv->signalCond); if (qemuMigrationUpdateJobStatus(driver, vm, job) < 0) goto cleanup; @@ -765,6 +802,11 @@ qemuMigrationWaitForCompletion(struct qemud_driver *driver, virDomainObjPtr vm) } cleanup: + while (priv->jobSignals) { + qemuMigrationProcessJobSignals(driver, vm, job, true); + } + virCondBroadcast(&priv->signalCond); + if (priv->jobInfo.type == VIR_DOMAIN_JOB_COMPLETED) return 0; else -- 1.7.4.4

On 05/16/2011 07:30 PM, Eric Blake wrote:
+ while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKSTAT) + ignore_value(virCondWait(&priv->signalCond, &vm->lock)); + + priv->jobSignalsData.statDevName = disk->info.alias; + priv->jobSignalsData.blockStat = stats; + priv->jobSignalsData.statRetCode = -1; + priv->jobSignals |= QEMU_JOB_SIGNAL_BLKSTAT; + + while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKSTAT) + ignore_value(virCondWait(&priv->signalCond, &vm->lock));
I'm not sure that the original problem with jobSignals is fixed (the one that you solved by splitting the retcode field). A similar race can happen with two threads, both requesting blkstat. Thread 1 starts a migration. Thread 2 queries BlockStats, waits for the condition variable and priv->jobSignals to be 0. Thread 2 sets priv->jobSignals to non-zero, then waits for condition variable and priv->jobSignals to clear BLKSTATS. Thread 1 processes thread 2's query, and clears priv->jobSignals Thread 3 sees priv->jobSignals == 0 and queries BlockStats. Thread 1 processes thread 3's query, and clears priv->jobSignals Thread 2 gets condition lock, but sees the answer intended for thread 3. At this point I'm not even sure if there is a deadlock or what. I think the code should use two condition variables. signalCond is broadcast when you _can start_ a job, resultCond is broadcast when a job has finished. while ((priv->jobSignals | priv->jobResults) & QEMU_JOB_SIGNAL_BLKSTAT) ignore_value(virCondWait(&priv->signalCond, &vm->lock)); priv->jobSignalsData.statDevName = disk->info.alias; priv->jobSignalsData.blockStat = stats; priv->jobSignalsData.statRetCode = -1; priv->jobSignals |= QEMU_JOB_SIGNAL_BLKSTAT; while ((priv->jobResults & QEMU_JOB_SIGNAL_BLKSTAT) == 0) ignore_value(virCondWait(&priv->resultCond, &vm->lock)); ret = priv->jobSignalsData.statRetCode; priv->jobResults ^= QEMU_JOB_SIGNAL_BLKSTAT; virCondBroadcast(&priv->signalCond); ... priv->jobSignalsData.statRetCode = ret; priv->jobResults |= QEMU_JOB_SIGNAL_BLKSTAT; priv->jobSignals ^= QEMU_JOB_SIGNAL_BLKSTAT; virCondBroadcast(&priv->resultCond); ... while (priv->jobSignals & ~priv->jobResults) { if (qemuMigrationProcessJobSignals(driver, vm, job, false) < 0) goto cleanup; } ... cleanup: while (priv->jobSignals & ~priv->jobResults) { qemuMigrationProcessJobSignals(driver, vm, job, false); }

On 05/20/2011 03:11 PM, Paolo Bonzini wrote:
On 05/16/2011 07:30 PM, Eric Blake wrote:
+ while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKSTAT) + ignore_value(virCondWait(&priv->signalCond, &vm->lock)); + + priv->jobSignalsData.statDevName = disk->info.alias; + priv->jobSignalsData.blockStat = stats; + priv->jobSignalsData.statRetCode = -1; + priv->jobSignals |= QEMU_JOB_SIGNAL_BLKSTAT; + + while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKSTAT) + ignore_value(virCondWait(&priv->signalCond, &vm->lock));
I'm not sure that the original problem with jobSignals is fixed (the one that you solved by splitting the retcode field). A similar race can happen with two threads, both requesting blkstat.
Thread 1 starts a migration. Thread 2 queries BlockStats, waits for the condition variable and priv->jobSignals to be 0. Thread 2 sets priv->jobSignals to non-zero, then waits for the condition variable and priv->jobSignals to clear BLKSTAT. Thread 1 processes thread 2's query, and clears priv->jobSignals. Thread 3 sees priv->jobSignals == 0 and queries BlockStats. Thread 1 processes thread 3's query, and clears priv->jobSignals. Thread 2 gets the condition lock, but sees the answer intended for thread 3.
I think we're safe. It shouldn't matter if thread 2 reads thread 3's answer (because they are both read-only queries, and should be getting the same answer; or even if the answers differ, reading the newer answer is not horrible because it is still an accurate answer). The harder problem is if thread 3 starts overwriting the result area to NULL before setting the jobSignals request, and before thread 2 can read the result area; but we aren't doing a memset (that is, the query code is only ever reading the result data, and it is only ever being modified while jobSignals is set). However,
I think the code should use two condition variables. signalCond is broadcast when you _can start_ a job, resultCond is broadcast when a job has finished.
you probably have a point here. Using two variables will make it much easier to prove correctness, especially if we add other interfaces. -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org

On 05/21/2011 12:11 AM, Eric Blake wrote:
I think we're safe. It shouldn't matter if thread 2 reads thread 3's answer (because they are both read-only queries, and should be getting the same answer; or even if the answers differ, reading the newer answer is not horrible because it is still an accurate answer).
The queries could be for different disks, and one might fail. The problem is mostly if the first fails and the second succeeds, and the program sees a success instead of a failure. ... but a much simpler fix is to make statRetCode a pointer, so that there is nothing to do on the producer side after the flag has been reset. I strongly suggest doing this, as it is a very simple change that Cannot Make Things Worse (TM). Paolo

On 05/21/2011 01:21 AM, Paolo Bonzini wrote:
On 05/21/2011 12:11 AM, Eric Blake wrote:
I think we're safe. It shouldn't matter if thread 2 reads thread 3's answer (because they are both read-only queries, and should be getting the same answer; or even if the answers differ, reading the newer answer is not horrible because it is still an accurate answer).
The queries could be for different disks, and one might fail. The problem is mostly if the first fails and the second succeeds, and the program sees a success instead of a failure.
... but a much simpler fix is to make statRetCode a pointer, so that there is nothing to do on the producer side after the flag has been reset. I strongly suggest doing this, as it is a very simple change that Cannot Make Things Worse (TM).
Ah, that does make sense. Instead of storing the answers directly in the memory shared between query and migration loop, the shared memory should instead store pointers to stack-local memory in the query code. Thus, the migration loop writes back to unique memory, and the moment it clears the condition flag, it doesn't matter if some other thread starts overwriting the pointers, because the first caller now knows that its answers are safe. Federico, do you want to tackle this, or shall I? -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org

If this is correct I'll squash it with the v4 patch. --- src/qemu/qemu_domain.h | 4 ++-- src/qemu/qemu_driver.c | 6 ++---- src/qemu/qemu_migration.c | 8 ++++---- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index af513e7..effaebc 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -56,10 +56,10 @@ struct qemuDomainJobSignalsData { unsigned long migrateBandwidth; /* Data for QEMU_JOB_SIGNAL_MIGRATE_SPEED */ char *statDevName; /* Device name used by blkstat calls */ virDomainBlockStatsPtr blockStat; /* Block statistics for QEMU_JOB_SIGNAL_BLKSTAT */ - int statRetCode; /* Return code for the blkstat calls */ + int *statRetCode; /* Return code for the blkstat calls */ char *infoDevName; /* Device name used by blkinfo calls */ virDomainBlockInfoPtr blockInfo; /* Block information for QEMU_JOB_SIGNAL_BLKINFO */ - int infoRetCode; /* Return code for the blkinfo calls */ + int *infoRetCode; /* Return code for the blkinfo calls */ }; typedef struct _qemuDomainPCIAddressSet qemuDomainPCIAddressSet; diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index a4e430b..d4287dc 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -5298,13 +5298,12 @@ qemudDomainBlockStats (virDomainPtr dom, priv->jobSignalsData.statDevName = disk->info.alias; priv->jobSignalsData.blockStat = stats; - priv->jobSignalsData.statRetCode = -1; + priv->jobSignalsData.statRetCode = &ret; priv->jobSignals |= QEMU_JOB_SIGNAL_BLKSTAT; while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKSTAT) ignore_value(virCondWait(&priv->signalCond, &vm->lock)); - ret = priv->jobSignalsData.statRetCode; if (virDomainObjUnref(vm) == 0) vm = NULL; } else { @@ -5745,13 +5744,12 @@ static int qemuDomainGetBlockInfo(virDomainPtr dom, priv->jobSignalsData.infoDevName = disk->info.alias; priv->jobSignalsData.blockInfo = info; - priv->jobSignalsData.infoRetCode = -1; + priv->jobSignalsData.infoRetCode = &ret; priv->jobSignals |= 
QEMU_JOB_SIGNAL_BLKINFO; while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKINFO) ignore_value(virCondWait(&priv->signalCond, &vm->lock)); - ret = priv->jobSignalsData.infoRetCode; if (virDomainObjUnref(vm) == 0) vm = NULL; } else { diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 28b168c..b767fb7 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -659,8 +659,8 @@ qemuMigrationProcessJobSignals(struct qemud_driver *driver, job, _("guest unexpectedly quit")); if (cleanup) { priv->jobSignals = 0; - priv->jobSignalsData.statRetCode = -1; - priv->jobSignalsData.infoRetCode = -1; + priv->jobSignalsData.statRetCode = NULL; + priv->jobSignalsData.infoRetCode = NULL; } return -1; } @@ -712,7 +712,7 @@ qemuMigrationProcessJobSignals(struct qemud_driver *driver, &priv->jobSignalsData.blockStat->errs); qemuDomainObjExitMonitorWithDriver(driver, vm); - priv->jobSignalsData.statRetCode = ret; + *priv->jobSignalsData.statRetCode = ret; priv->jobSignals ^= QEMU_JOB_SIGNAL_BLKSTAT; if (ret < 0) @@ -724,7 +724,7 @@ qemuMigrationProcessJobSignals(struct qemud_driver *driver, &priv->jobSignalsData.blockInfo->allocation); qemuDomainObjExitMonitorWithDriver(driver, vm); - priv->jobSignalsData.infoRetCode = ret; + *priv->jobSignalsData.infoRetCode = ret; priv->jobSignals ^= QEMU_JOB_SIGNAL_BLKINFO; if (ret < 0) -- 1.7.1

On 05/26/2011 03:45 AM, Federico Simoncelli wrote:
If this is correct I'll squash it with the v4 patch.
--- src/qemu/qemu_domain.h | 4 ++-- src/qemu/qemu_driver.c | 6 ++---- src/qemu/qemu_migration.c | 8 ++++---- 3 files changed, 8 insertions(+), 10 deletions(-)
Almost!
@@ -5745,13 +5744,12 @@ static int qemuDomainGetBlockInfo(virDomainPtr dom,
priv->jobSignalsData.infoDevName = disk->info.alias; priv->jobSignalsData.blockInfo = info; - priv->jobSignalsData.infoRetCode = -1; + priv->jobSignalsData.infoRetCode = &ret; priv->jobSignals |= QEMU_JOB_SIGNAL_BLKINFO;
So far, so good - ret starts life as -1, and we are now off-loading the responsibility for setting ret to any other value onto qemuMigrationProcessJobSignals after a successful query.
+++ b/src/qemu/qemu_migration.c @@ -659,8 +659,8 @@ qemuMigrationProcessJobSignals(struct qemud_driver *driver, job, _("guest unexpectedly quit")); if (cleanup) { priv->jobSignals = 0; - priv->jobSignalsData.statRetCode = -1; - priv->jobSignalsData.infoRetCode = -1; + priv->jobSignalsData.statRetCode = NULL; + priv->jobSignalsData.infoRetCode = NULL;
But here, there's no need to set the pointers to NULL; by clearing jobSignals, we guarantee that qemuDomainGetBlockInfo will eventually wake up on the condition, at which point it will notice that ret is still -1 because migration never set it. And since migration is aborting, we will never again access priv->jobSignalsData.statRetCode to try and dereference it, so this is a dead assignment. So ACK with that nit fixed, and I've pushed the modified v4. -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org
participants (4)
-
Daniel P. Berrange
-
Eric Blake
-
Federico Simoncelli
-
Paolo Bonzini