Quoting Jim Fehlig <jfehlig(a)novell.com>:
> Markus Groß wrote:
>> For core dumping to work correctly the following patch
>> for xen is needed:
>>
http://lists.xensource.com/archives/html/xen-devel/2011-05/msg01469.html
>>
>>
>> This patch is in xen-unstable and is considered for backport to
>> the xen stable branches. Without this patch the mapped
>> memory pages of the pv guest are not unmapped after core-dump.
>>
>> ---
>> src/libxl/libxl_driver.c | 87
>> ++++++++++++++++++++++++++++++++++++++++++++++
>> 1 files changed, 87 insertions(+), 0 deletions(-)
>>
>> diff --git a/src/libxl/libxl_driver.c b/src/libxl/libxl_driver.c
>> index b2cc0e8..75008db 100644
>> --- a/src/libxl/libxl_driver.c
>> +++ b/src/libxl/libxl_driver.c
>> @@ -1627,6 +1627,92 @@ libxlDomainGetState(virDomainPtr dom,
>> }
>>
>> static int
>> +libxlDomainCoreDump(virDomainPtr dom, const char *to, int flags)
>> +{
>> + libxlDriverPrivatePtr driver = dom->conn->privateData;
>> + libxlDomainObjPrivatePtr priv;
>> + virDomainObjPtr vm;
>> + virDomainEventPtr event = NULL;
>> + int paused = 0;
>> + int ret = -1;
>> +
>> + virCheckFlags(VIR_DUMP_LIVE | VIR_DUMP_CRASH, -1);
>> +
>> + libxlDriverLock(driver);
>> + vm = virDomainFindByUUID(&driver->domains, dom->uuid);
>> +
>> + if (!vm) {
>> + char uuidstr[VIR_UUID_STRING_BUFLEN];
>> + virUUIDFormat(dom->uuid, uuidstr);
>> + libxlError(VIR_ERR_NO_DOMAIN,
>> + _("No domain with matching uuid '%s'"),
uuidstr);
>> + goto cleanup;
>> + }
>> +
>> + if (!virDomainObjIsActive(vm)) {
>> + libxlError(VIR_ERR_OPERATION_INVALID, "%s", _("Domain is
>> not running"));
>> + goto cleanup;
>> + }
>> +
>> + priv = vm->privateData;
>> +
>> + if (!(flags & VIR_DUMP_LIVE) &&
>> + virDomainObjGetState(vm, NULL) == VIR_DOMAIN_RUNNING) {
>> + if (libxl_domain_pause(&priv->ctx, dom->id) != 0) {
>> + libxlError(VIR_ERR_INTERNAL_ERROR,
>> + _("Failed to suspend domain '%d' with
>> libxenlight"),
>> + dom->id);
>>
>
> I think there could be a little more info in that error message, e.g.
> "Before dumping core, failed to suspend ...".
>
>> + goto cleanup;
>> + }
>> + virDomainObjSetState(vm, VIR_DOMAIN_PAUSED,
>> VIR_DOMAIN_PAUSED_DUMP);
>> + paused = 1;
>> + }
>> +
>> + if (libxl_domain_core_dump(&priv->ctx, dom->id, to) != 0) {
>> + libxlError(VIR_ERR_INTERNAL_ERROR,
>> + _("Failed to dump core of domain '%d' with
>> libxenlight"),
>> + dom->id);
>> + goto cleanup;
>> + }
>>
>
> If core dumping fails and the domain was paused, it won't be unpaused by
> jumping to cleanup.
>
>> +
>> + if (flags & VIR_DUMP_CRASH) {
>> + if (libxlVmReap(driver, vm, 1, VIR_DOMAIN_SHUTOFF_CRASHED)
>> != 0) {
>> + libxlError(VIR_ERR_INTERNAL_ERROR,
>> + _("Failed to destroy domain '%d'"),
dom->id);
>> + goto cleanup;
>> + }
>> +
>> + event = virDomainEventNewFromObj(vm, VIR_DOMAIN_EVENT_STOPPED,
>> +
>> VIR_DOMAIN_EVENT_STOPPED_CRASHED);
>> +
>> + } else if (paused) {
>> + if (libxl_domain_unpause(&priv->ctx, dom->id) != 0) {
>> + libxlError(VIR_ERR_INTERNAL_ERROR,
>> + _("Failed to resume domain '%d' with
>> libxenlight"),
>> + dom->id);
>>
>
> Here too more info in the error message, e.g. "After dumping core,
> failed to resume ..."
>
>> + goto cleanup;
>> + }
>> + virDomainObjSetState(vm, VIR_DOMAIN_RUNNING,
>> + VIR_DOMAIN_RUNNING_UNPAUSED);
>> + }
>> +
>> + if ((flags & VIR_DUMP_CRASH) && !vm->persistent) {
>> + virDomainRemoveInactive(&driver->domains, vm);
>> + vm = NULL;
>> + }
>> +
>> + ret = 0;
>> +
>> +cleanup:
>> + if (vm)
>> + virDomainObjUnlock(vm);
>> + if (event)
>> + libxlDomainEventQueue(driver, event);
>> + libxlDriverUnlock(driver);
>>
>
> Do we need to have the driver locked during the core dump? Most of the
> driver functions use this pattern
>
> libxlDriverLock(driver);
> vm = virDomainFindByUUID(&driver->domains, dom->uuid);
> libxlDriverUnlock(driver);
>
> Regards,
> Jim
I think we do, since the calls to libxlVmReap and
virDomainRemoveInactive both use the driver as parameter and at least
virDomainRemoveInactive is able to modify the contents of it.
But if you can convince me otherwise we can unlock the driver
directly
after obtaining the vm object.
A core dump could take quite some time on a large memory domain. How
about unlocking before invoking this long running operation and then
reacquiring?
I will post a reworked version of this patch on monday.
Ok, thanks! Monday is a US holiday, but hopefully I'll have some time
in the evening to review this and your other patches. We're running out
of time before 0.9.2 release ...
Regards,
Jim