[libvirt] [PATCH 0/4] Multiple problems with saving to block devices

This patch series makes it possible to save to a block device, instead of a plain file. There were multiple problems - WHen save failed, we might de-reference a NULL pointer - When save failed, we unlinked the device node !! - The approach of using >> to append, doesn't work with block devices - CGroups was blocking QEMU access to the block device when enabled One remaining problem is not in libvirt, but rather QEMU. The QEMU exec: based migration often fails to detect failure of the command and will thus hang forever attempting a migration that'll never succeed! Fortunately you can now work around this in libvirt using the virsh domjobabort command

If a transient QEMU crashes during save attempt, then the virDomainPtr object may be freed. If a persistent QEMU crashes during save, then the 'priv->mon' field is no longer valid since it will be inactive. * src/qemu/qemu_driver.c: Fix two crashes when QEMU exits during a save attempt --- src/qemu/qemu_driver.c | 36 ++++++++++++++++++++++-------------- 1 files changed, 22 insertions(+), 14 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 39feac7..91fe963 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -4997,19 +4997,20 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, } endjob: - if (ret != 0 && header.was_running) { - qemuDomainObjEnterMonitorWithDriver(driver, vm); - rc = qemuMonitorStartCPUs(priv->mon, dom->conn); - qemuDomainObjExitMonitorWithDriver(driver, vm); - if (rc < 0) - VIR_WARN0("Unable to resume guest CPUs after save failure"); - else - vm->state = VIR_DOMAIN_RUNNING; - } + if (vm) { + if (ret != 0 && header.was_running && priv->mon) { + qemuDomainObjEnterMonitorWithDriver(driver, vm); + rc = qemuMonitorStartCPUs(priv->mon, dom->conn); + qemuDomainObjExitMonitorWithDriver(driver, vm); + if (rc < 0) + VIR_WARN0("Unable to resume guest CPUs after save failure"); + else + vm->state = VIR_DOMAIN_RUNNING; + } - if (vm && - qemuDomainObjEndJob(vm) == 0) + if (qemuDomainObjEndJob(vm) == 0) vm = NULL; + } cleanup: VIR_FREE(xml); @@ -7185,9 +7186,16 @@ static int qemudDomainAttachNetDevice(virConnectPtr conn, } /* FIXME - need to support vhost-net here (5th arg) */ - if (!(netstr = qemuBuildHostNetStr(net, ' ', - vlan, tapfd_name, 0))) - goto try_tapfd_close; + if ((qemuCmdFlags & QEMUD_CMD_FLAG_NETDEV) && + (qemuCmdFlags & QEMUD_CMD_FLAG_DEVICE)) { + if (!(netstr = qemuBuildHostNetStr(net, ',', + -1, tapfd_name, 0))) + goto try_tapfd_close; + } else { + if (!(netstr = qemuBuildHostNetStr(net, ' ', + vlan, tapfd_name, 0))) + goto try_tapfd_close; + } qemuDomainObjEnterMonitorWithDriver(driver, vm); if ((qemuCmdFlags & QEMUD_CMD_FLAG_NETDEV) && -- 1.6.5.2

On Wed, Apr 21, 2010 at 05:56:10PM +0100, Daniel P. Berrange wrote:
If a transient QEMU crashes during save attempt, then the virDomainPtr object may be freed. If a persistent QEMU crashes during save, then the 'priv->mon' field is no longer valid since it will be inactive.
* src/qemu/qemu_driver.c: Fix two crashes when QEMU exits during a save attempt --- src/qemu/qemu_driver.c | 36 ++++++++++++++++++++++-------------- 1 files changed, 22 insertions(+), 14 deletions(-)
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 39feac7..91fe963 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -4997,19 +4997,20 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, }
endjob: - if (ret != 0 && header.was_running) { - qemuDomainObjEnterMonitorWithDriver(driver, vm); - rc = qemuMonitorStartCPUs(priv->mon, dom->conn); - qemuDomainObjExitMonitorWithDriver(driver, vm); - if (rc < 0) - VIR_WARN0("Unable to resume guest CPUs after save failure"); - else - vm->state = VIR_DOMAIN_RUNNING; - } + if (vm) { + if (ret != 0 && header.was_running && priv->mon) { + qemuDomainObjEnterMonitorWithDriver(driver, vm); + rc = qemuMonitorStartCPUs(priv->mon, dom->conn); + qemuDomainObjExitMonitorWithDriver(driver, vm); + if (rc < 0) + VIR_WARN0("Unable to resume guest CPUs after save failure"); + else + vm->state = VIR_DOMAIN_RUNNING; + }
- if (vm && - qemuDomainObjEndJob(vm) == 0) + if (qemuDomainObjEndJob(vm) == 0) vm = NULL; + }
cleanup: VIR_FREE(xml); @@ -7185,9 +7186,16 @@ static int qemudDomainAttachNetDevice(virConnectPtr conn, }
/* FIXME - need to support vhost-net here (5th arg) */ - if (!(netstr = qemuBuildHostNetStr(net, ' ', - vlan, tapfd_name, 0))) - goto try_tapfd_close; + if ((qemuCmdFlags & QEMUD_CMD_FLAG_NETDEV) && + (qemuCmdFlags & QEMUD_CMD_FLAG_DEVICE)) { + if (!(netstr = qemuBuildHostNetStr(net, ',', + -1, tapfd_name, 0))) + goto try_tapfd_close; + } else { + if (!(netstr = qemuBuildHostNetStr(net, ' ', + vlan, tapfd_name, 0))) + goto try_tapfd_close; + }
qemuDomainObjEnterMonitorWithDriver(driver, vm); if ((qemuCmdFlags & QEMUD_CMD_FLAG_NETDEV) &&
ACK, Daniel -- Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/ daniel@veillard.com | Rpmfind RPM search engine http://rpmfind.net/ http://veillard.com/ | virtualization library http://libvirt.org/

It is possible to use block devices with domain save/restore. Upon failure QEMU unlinks the path being saved to. This isn't good when it is a block device ! * src/qemu/qemu_driver.c: Don't unlink block devices if save fails --- src/qemu/qemu_driver.c | 22 ++++++++++++++++++++-- 1 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 91fe963..50cd982 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -4787,6 +4787,8 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, int rc; virDomainEventPtr event = NULL; qemuDomainObjPrivatePtr priv; + struct stat sb; + int is_bdev = 0; memset(&header, 0, sizeof(header)); memcpy(header.magic, QEMUD_SAVE_MAGIC, sizeof(header.magic)); @@ -4840,6 +4842,21 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, } header.xml_len = strlen(xml) + 1; + /* path might be a pre-existing block dev, in which case + * we need to skip the create step, and also avoid unlink + * in the failure case */ + if (stat(path, &sb) < 0) { + if (errno != ENOENT) { + virReportSystemError(errno, _("unable to access %s"), path); + goto endjob; + } else { + is_bdev = 0; + } + } else { + is_bdev = S_ISBLK(sb.st_mode); + } + + /* Setup hook data needed by virFileOperation hook function */ hdata.dom = dom; hdata.path = path; @@ -4849,7 +4866,8 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, /* Write header to file, followed by XML */ /* First try creating the file as root */ - if ((rc = virFileOperation(path, O_CREAT|O_TRUNC|O_WRONLY, + if (!is_bdev && + (rc = virFileOperation(path, O_CREAT|O_TRUNC|O_WRONLY, S_IRUSR|S_IWUSR, getuid(), getgid(), qemudDomainSaveFileOpHook, &hdata, @@ -5014,7 +5032,7 @@ endjob: cleanup: VIR_FREE(xml); - if (ret != 0) + if (ret != 0 && !is_bdev) unlink(path); if (vm) virDomainObjUnlock(vm); -- 1.6.5.2

On Wed, Apr 21, 2010 at 05:56:11PM +0100, Daniel P. Berrange wrote:
It is possible to use block devices with domain save/restore. Upon failure QEMU unlinks the path being saved to. This isn't good when it is a block device !
* src/qemu/qemu_driver.c: Don't unlink block devices if save fails --- src/qemu/qemu_driver.c | 22 ++++++++++++++++++++-- 1 files changed, 20 insertions(+), 2 deletions(-)
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 91fe963..50cd982 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -4787,6 +4787,8 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, int rc; virDomainEventPtr event = NULL; qemuDomainObjPrivatePtr priv; + struct stat sb; + int is_bdev = 0;
memset(&header, 0, sizeof(header)); memcpy(header.magic, QEMUD_SAVE_MAGIC, sizeof(header.magic)); @@ -4840,6 +4842,21 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, } header.xml_len = strlen(xml) + 1;
+ /* path might be a pre-existing block dev, in which case + * we need to skip the create step, and also avoid unlink + * in the failure case */ + if (stat(path, &sb) < 0) { + if (errno != ENOENT) { + virReportSystemError(errno, _("unable to access %s"), path); + goto endjob; + } else { + is_bdev = 0; + }
what about EACCESS can we fail stat() but still be able to append to the file ? That sounds unlikely though...
+ } else { + is_bdev = S_ISBLK(sb.st_mode); + } + + /* Setup hook data needed by virFileOperation hook function */ hdata.dom = dom; hdata.path = path; @@ -4849,7 +4866,8 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, /* Write header to file, followed by XML */
/* First try creating the file as root */ - if ((rc = virFileOperation(path, O_CREAT|O_TRUNC|O_WRONLY, + if (!is_bdev && + (rc = virFileOperation(path, O_CREAT|O_TRUNC|O_WRONLY, S_IRUSR|S_IWUSR, getuid(), getgid(), qemudDomainSaveFileOpHook, &hdata, @@ -5014,7 +5032,7 @@ endjob:
cleanup: VIR_FREE(xml); - if (ret != 0) + if (ret != 0 && !is_bdev) unlink(path); if (vm) virDomainObjUnlock(vm);
ACK, Daniel -- Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/ daniel@veillard.com | Rpmfind RPM search engine http://rpmfind.net/ http://veillard.com/ | virtualization library http://libvirt.org/

On Wed, Apr 21, 2010 at 10:38:28PM +0200, Daniel Veillard wrote:
On Wed, Apr 21, 2010 at 05:56:11PM +0100, Daniel P. Berrange wrote:
It is possible to use block devices with domain save/restore. Upon failure QEMU unlinks the path being saved to. This isn't good when it is a block device !
* src/qemu/qemu_driver.c: Don't unlink block devices if save fails --- src/qemu/qemu_driver.c | 22 ++++++++++++++++++++-- 1 files changed, 20 insertions(+), 2 deletions(-)
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 91fe963..50cd982 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -4787,6 +4787,8 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, int rc; virDomainEventPtr event = NULL; qemuDomainObjPrivatePtr priv; + struct stat sb; + int is_bdev = 0;
memset(&header, 0, sizeof(header)); memcpy(header.magic, QEMUD_SAVE_MAGIC, sizeof(header.magic)); @@ -4840,6 +4842,21 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, } header.xml_len = strlen(xml) + 1;
+ /* path might be a pre-existing block dev, in which case + * we need to skip the create step, and also avoid unlink + * in the failure case */ + if (stat(path, &sb) < 0) { + if (errno != ENOENT) { + virReportSystemError(errno, _("unable to access %s"), path); + goto endjob; + } else { + is_bdev = 0; + }
what about EACCESS can we fail stat() but still be able to append to the file ? That sounds unlikely though...
IMHO, if we get EACCESS we're better just reporting the error because its not a normal situation to be in.
+ } else { + is_bdev = S_ISBLK(sb.st_mode); + } + + /* Setup hook data needed by virFileOperation hook function */ hdata.dom = dom; hdata.path = path; @@ -4849,7 +4866,8 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, /* Write header to file, followed by XML */
/* First try creating the file as root */ - if ((rc = virFileOperation(path, O_CREAT|O_TRUNC|O_WRONLY, + if (!is_bdev && + (rc = virFileOperation(path, O_CREAT|O_TRUNC|O_WRONLY, S_IRUSR|S_IWUSR, getuid(), getgid(), qemudDomainSaveFileOpHook, &hdata, @@ -5014,7 +5032,7 @@ endjob:
cleanup: VIR_FREE(xml); - if (ret != 0) + if (ret != 0 && !is_bdev) unlink(path); if (vm) virDomainObjUnlock(vm);
ACK,
Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://deltacloud.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

On 04/21/2010 10:56 AM, Daniel P. Berrange wrote:
It is possible to use block devices with domain save/restore. Upon failure QEMU unlinks the path being saved to. This isn't good when it is a block device !
+ struct stat sb; + int is_bdev = 0;
Should this be bool instead of int?
+ /* path might be a pre-existing block dev, in which case + * we need to skip the create step, and also avoid unlink + * in the failure case */ + if (stat(path, &sb) < 0) { + if (errno != ENOENT) { + virReportSystemError(errno, _("unable to access %s"), path); + goto endjob; + } else { + is_bdev = 0; + } + } else { + is_bdev = S_ISBLK(sb.st_mode);
although if you use bool, you'd have to explicitly compare against 0 here, based on the gnulib restraints on stdbool.h. Also, what about other non-regular files? A directory will fail on the open, but what about named fifos (on the other hand, is anyone crazy enough to try a non-seekable file as the file to open?). So I'm wondering if it's better to check for S_ISREG, changing is_bdev to is_reg, and changing the cleanup logic to only attempt unlink() if is_reg, rather than skipping if is_bdev.
+ } + + /* Setup hook data needed by virFileOperation hook function */ hdata.dom = dom; hdata.path = path; @@ -4849,7 +4866,8 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, /* Write header to file, followed by XML */
/* First try creating the file as root */ - if ((rc = virFileOperation(path, O_CREAT|O_TRUNC|O_WRONLY, + if (!is_bdev && + (rc = virFileOperation(path, O_CREAT|O_TRUNC|O_WRONLY,
This is racy. You stat() prior to open()ing. Wouldn't it be better to open(), then fstat()? -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org

On Wed, Apr 21, 2010 at 02:49:42PM -0600, Eric Blake wrote:
On 04/21/2010 10:56 AM, Daniel P. Berrange wrote:
It is possible to use block devices with domain save/restore. Upon failure QEMU unlinks the path being saved to. This isn't good when it is a block device !
+ struct stat sb; + int is_bdev = 0;
Should this be bool instead of int?
+ /* path might be a pre-existing block dev, in which case + * we need to skip the create step, and also avoid unlink + * in the failure case */ + if (stat(path, &sb) < 0) { + if (errno != ENOENT) { + virReportSystemError(errno, _("unable to access %s"), path); + goto endjob; + } else { + is_bdev = 0; + } + } else { + is_bdev = S_ISBLK(sb.st_mode);
although if you use bool, you'd have to explicitly compare against 0 here, based on the gnulib restraints on stdbool.h. Also, what about other non-regular files? A directory will fail on the open, but what about named fifos (on the other hand, is anyone crazy enough to try a non-seekable file as the file to open?). So I'm wondering if it's better to check for S_ISREG, changing is_bdev to is_reg, and changing the cleanup logic to only attempt unlink() if is_reg, rather than skipping if is_bdev.
+ } + + /* Setup hook data needed by virFileOperation hook function */ hdata.dom = dom; hdata.path = path; @@ -4849,7 +4866,8 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, /* Write header to file, followed by XML */
/* First try creating the file as root */ - if ((rc = virFileOperation(path, O_CREAT|O_TRUNC|O_WRONLY, + if (!is_bdev && + (rc = virFileOperation(path, O_CREAT|O_TRUNC|O_WRONLY,
This is racy. You stat() prior to open()ing. Wouldn't it be better to open(), then fstat()?
That would require significant changes to the virFileOperation code which is not something I think is worth the effort here. Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://deltacloud.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

On Thu, Apr 22, 2010 at 11:35:17AM +0100, Daniel P. Berrange wrote:
On Wed, Apr 21, 2010 at 02:49:42PM -0600, Eric Blake wrote:
On 04/21/2010 10:56 AM, Daniel P. Berrange wrote:
It is possible to use block devices with domain save/restore. Upon failure QEMU unlinks the path being saved to. This isn't good when it is a block device !
+ struct stat sb; + int is_bdev = 0;
Should this be bool instead of int?
+ /* path might be a pre-existing block dev, in which case + * we need to skip the create step, and also avoid unlink + * in the failure case */ + if (stat(path, &sb) < 0) { + if (errno != ENOENT) { + virReportSystemError(errno, _("unable to access %s"), path); + goto endjob; + } else { + is_bdev = 0; + } + } else { + is_bdev = S_ISBLK(sb.st_mode);
although if you use bool, you'd have to explicitly compare against 0 here, based on the gnulib restraints on stdbool.h. Also, what about other non-regular files? A directory will fail on the open, but what about named fifos (on the other hand, is anyone crazy enough to try a non-seekable file as the file to open?). So I'm wondering if it's better to check for S_ISREG, changing is_bdev to is_reg, and changing the cleanup logic to only attempt unlink() if is_reg, rather than skipping if is_bdev.
+ } + + /* Setup hook data needed by virFileOperation hook function */ hdata.dom = dom; hdata.path = path; @@ -4849,7 +4866,8 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, /* Write header to file, followed by XML */
/* First try creating the file as root */ - if ((rc = virFileOperation(path, O_CREAT|O_TRUNC|O_WRONLY, + if (!is_bdev && + (rc = virFileOperation(path, O_CREAT|O_TRUNC|O_WRONLY,
This is racy. You stat() prior to open()ing. Wouldn't it be better to open(), then fstat()?
That would require significant changes to the virFileOperation code which is not something I think is worth the effort here.
Actually, the whole thing is racy regardless of what we do, since QEMU itself will spawn another program which will re-open this file. To avoid a race, we'd need to switch to QEMU's 'fd:' based migration and pass the pre-opened file handle to it Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://deltacloud.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

The save process was relying on use of the shell >> append operator to ensure the save data was placed after the libvirt header + XML. This doesn't work for block devices though. Replace this code with use of 'dd' and its 'seek' parameter. The qemuMonitorMigateToCommand() monitor API is used for both save/coredump, and migration via UNIX socket. We can't simply switch this to use 'dd' since this causes problems with the migration usage. Thus, create a dedicated qemuMonitorMigateToFile which can accept an filename + offset, and remove the filename from the current qemuMonitorMigateToCommand() API * src/qemu/qemu_driver.c: Switch to qemuMonitorMigateToFile for save and core dump * src/qemu/qemu_monitor.c, src/qemu/qemu_monitor.h, src/qemu/qemu_monitor_json.c, src/qemu/qemu_monitor_json.h, src/qemu/qemu_monitor_text.c, src/qemu/qemu_monitor_text.h: Create a new qemuMonitorMigateToFile, separate from the existing qemuMonitorMigateToCommand to allow handling file offsets --- src/qemu/qemu_driver.c | 172 +++++++++++++++++++++++------------------- src/qemu/qemu_monitor.c | 28 ++++++-- src/qemu/qemu_monitor.h | 9 ++- src/qemu/qemu_monitor_json.c | 35 ++++++++- src/qemu/qemu_monitor_json.h | 9 ++- src/qemu/qemu_monitor_text.c | 35 ++++++++- src/qemu/qemu_monitor_text.h | 9 ++- 7 files changed, 201 insertions(+), 96 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 50cd982..1f1e703 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -4789,6 +4789,7 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, qemuDomainObjPrivatePtr priv; struct stat sb; int is_bdev = 0; + unsigned long long offset; memset(&header, 0, sizeof(header)); memcpy(header.magic, QEMUD_SAVE_MAGIC, sizeof(header.magic)); @@ -4862,104 +4863,119 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, hdata.path = path; hdata.xml = xml; hdata.header = &header; + offset = sizeof(header) + header.xml_len; /* Write header to file, followed by XML */ /* First try creating the file as root */ - if (!is_bdev && - (rc = virFileOperation(path, O_CREAT|O_TRUNC|O_WRONLY, - S_IRUSR|S_IWUSR, - getuid(), getgid(), - qemudDomainSaveFileOpHook, &hdata, - 0)) != 0) { - - /* If we failed as root, and the error was permission-denied - (EACCES), assume it's on a network-connected share where - root access is restricted (eg, root-squashed NFS). If the - qemu user (driver->user) is non-root, just set a flag to - bypass security driver shenanigans, and retry the operation - after doing setuid to qemu user */ - - if ((rc != EACCES) || - driver->user == getuid()) { - virReportSystemError(rc, _("Failed to create domain save file '%s'"), - path); + if (is_bdev) { + int fd = open(path, O_WRONLY | O_TRUNC); + if (fd < 0) { + virReportSystemError(errno, _("unable to open %s"), path); goto endjob; } - -#ifdef __linux__ - /* On Linux we can also verify the FS-type of the directory. */ - char *dirpath, *p; - struct statfs st; - int statfs_ret; - - if ((dirpath = strdup(path)) == NULL) { - virReportOOMError(); + if ((rc = qemudDomainSaveFileOpHook(fd, &hdata)) != 0) { + close(fd); goto endjob; } + if (close(fd) < 0) { + virReportSystemError(errno, _("unable to close %s"), path); + goto endjob; + } + } else { + if ((rc = virFileOperation(path, O_CREAT|O_TRUNC|O_WRONLY, + S_IRUSR|S_IWUSR, + getuid(), getgid(), + qemudDomainSaveFileOpHook, &hdata, + 0)) != 0) { + /* If we failed as root, and the error was permission-denied + (EACCES), assume it's on a network-connected share where + root access is restricted (eg, root-squashed NFS). If the + qemu user (driver->user) is non-root, just set a flag to + bypass security driver shenanigans, and retry the operation + after doing setuid to qemu user */ + + if ((rc != EACCES) || + driver->user == getuid()) { + virReportSystemError(rc, _("Failed to create domain save file '%s'"), + path); + goto endjob; + } - do { - // Try less and less of the path until we get to a - // directory we can stat. Even if we don't have 'x' - // permission on any directory in the path on the NFS - // server (assuming it's NFS), we will be able to stat the - // mount point, and that will properly tell us if the - // fstype is NFS. +#ifdef __linux__ + /* On Linux we can also verify the FS-type of the directory. */ + char *dirpath, *p; + struct statfs st; + int statfs_ret; - if ((p = strrchr(dirpath, '/')) == NULL) { - qemuReportError(VIR_ERR_INVALID_ARG, - _("Invalid relative path '%s' for domain save file"), - path); - VIR_FREE(dirpath); + if ((dirpath = strdup(path)) == NULL) { + virReportOOMError(); goto endjob; } - if (p == dirpath) - *(p+1) = '\0'; - else - *p = '\0'; + do { + // Try less and less of the path until we get to a + // directory we can stat. Even if we don't have 'x' + // permission on any directory in the path on the NFS + // server (assuming it's NFS), we will be able to stat the + // mount point, and that will properly tell us if the + // fstype is NFS. + + if ((p = strrchr(dirpath, '/')) == NULL) { + qemuReportError(VIR_ERR_INVALID_ARG, + _("Invalid relative path '%s' for domain save file"), + path); + VIR_FREE(dirpath); + goto endjob; + } - statfs_ret = statfs(dirpath, &st); + if (p == dirpath) + *(p+1) = '\0'; + else + *p = '\0'; - } while ((statfs_ret == -1) && (p != dirpath)); + statfs_ret = statfs(dirpath, &st); - if (statfs_ret == -1) { - virReportSystemError(errno, - _("Failed to create domain save file '%s'" - " statfs of all elements of path failed."), - path); - VIR_FREE(dirpath); - goto endjob; - } + } while ((statfs_ret == -1) && (p != dirpath)); - if (st.f_type != NFS_SUPER_MAGIC) { - virReportSystemError(rc, - _("Failed to create domain save file '%s'" - " (fstype of '%s' is 0x%X"), - path, dirpath, (unsigned int) st.f_type); + if (statfs_ret == -1) { + virReportSystemError(errno, + _("Failed to create domain save file '%s'" + " statfs of all elements of path failed."), + path); + VIR_FREE(dirpath); + goto endjob; + } + + if (st.f_type != NFS_SUPER_MAGIC) { + virReportSystemError(rc, + _("Failed to create domain save file '%s'" + " (fstype of '%s' is 0x%X"), + path, dirpath, (unsigned int) st.f_type); + VIR_FREE(dirpath); + goto endjob; + } VIR_FREE(dirpath); - goto endjob; - } - VIR_FREE(dirpath); #endif - /* Retry creating the file as driver->user */ + /* Retry creating the file as driver->user */ - if ((rc = virFileOperation(path, O_CREAT|O_TRUNC|O_WRONLY, - S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP, - driver->user, driver->group, - qemudDomainSaveFileOpHook, &hdata, - VIR_FILE_OP_AS_UID)) != 0) { - virReportSystemError(rc, _("Error from child process creating '%s'"), + if ((rc = virFileOperation(path, O_CREAT|O_TRUNC|O_WRONLY, + S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP, + driver->user, driver->group, + qemudDomainSaveFileOpHook, &hdata, + VIR_FILE_OP_AS_UID)) != 0) { + virReportSystemError(rc, _("Error from child process creating '%s'"), path); - goto endjob; - } + goto endjob; + } - /* Since we had to setuid to create the file, and the fstype - is NFS, we assume it's a root-squashing NFS share, and that - the security driver stuff would have failed anyway */ + /* Since we had to setuid to create the file, and the fstype + is NFS, we assume it's a root-squashing NFS share, and that + the security driver stuff would have failed anyway */ - bypassSecurityDriver = 1; + bypassSecurityDriver = 1; + } } @@ -4972,7 +4988,7 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, if (header.compressed == QEMUD_SAVE_FORMAT_RAW) { const char *args[] = { "cat", NULL }; qemuDomainObjEnterMonitorWithDriver(driver, vm); - rc = qemuMonitorMigrateToCommand(priv->mon, 1, args, path); + rc = qemuMonitorMigrateToFile(priv->mon, 1, args, path, offset); qemuDomainObjExitMonitorWithDriver(driver, vm); } else { const char *prog = qemudSaveCompressionTypeToString(header.compressed); @@ -4982,7 +4998,7 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, NULL }; qemuDomainObjEnterMonitorWithDriver(driver, vm); - rc = qemuMonitorMigrateToCommand(priv->mon, 1, args, path); + rc = qemuMonitorMigrateToFile(priv->mon, 1, args, path, offset); qemuDomainObjExitMonitorWithDriver(driver, vm); } @@ -5275,7 +5291,7 @@ static int qemudDomainCoreDump(virDomainPtr dom, } qemuDomainObjEnterMonitor(vm); - ret = qemuMonitorMigrateToCommand(priv->mon, 1, args, path); + ret = qemuMonitorMigrateToFile(priv->mon, 1, args, path, 0); qemuDomainObjExitMonitor(vm); if (ret < 0) @@ -10043,7 +10059,7 @@ static int doTunnelMigrate(virDomainPtr dom, internalret = qemuMonitorMigrateToUnix(priv->mon, 1, unixfile); else if (qemuCmdFlags & QEMUD_CMD_FLAG_MIGRATE_QEMU_EXEC) { const char *args[] = { "nc", "-U", unixfile, NULL }; - internalret = qemuMonitorMigrateToCommand(priv->mon, 1, args, "/dev/null"); + internalret = qemuMonitorMigrateToCommand(priv->mon, 1, args); } else { internalret = -1; } diff --git a/src/qemu/qemu_monitor.c b/src/qemu/qemu_monitor.c index 32c3cd5..ebb47ce 100644 --- a/src/qemu/qemu_monitor.c +++ b/src/qemu/qemu_monitor.c @@ -1202,17 +1202,33 @@ int qemuMonitorMigrateToHost(qemuMonitorPtr mon, int qemuMonitorMigrateToCommand(qemuMonitorPtr mon, int background, - const char * const *argv, - const char *target) + const char * const *argv) { int ret; - DEBUG("mon=%p, fd=%d argv=%p target=%s", - mon, mon->fd, argv, target); + DEBUG("mon=%p, fd=%d argv=%p", + mon, mon->fd, argv); if (mon->json) - ret = qemuMonitorJSONMigrateToCommand(mon, background, argv, target); + ret = qemuMonitorJSONMigrateToCommand(mon, background, argv); else - ret = qemuMonitorTextMigrateToCommand(mon, background, argv, target); + ret = qemuMonitorTextMigrateToCommand(mon, background, argv); + return ret; +} + +int qemuMonitorMigrateToFile(qemuMonitorPtr mon, + int background, + const char * const *argv, + const char *target, + unsigned long long offset) +{ + int ret; + DEBUG("mon=%p, fd=%d argv=%p target=%s offset=%llu", + mon, mon->fd, argv, target, offset); + + if (mon->json) + ret = qemuMonitorJSONMigrateToFile(mon, background, argv, target, offset); + else + ret = qemuMonitorTextMigrateToFile(mon, background, argv, target, offset); return ret; } diff --git a/src/qemu/qemu_monitor.h b/src/qemu/qemu_monitor.h index a0f198b..0d29499 100644 --- a/src/qemu/qemu_monitor.h +++ b/src/qemu/qemu_monitor.h @@ -251,8 +251,13 @@ int qemuMonitorMigrateToHost(qemuMonitorPtr mon, int qemuMonitorMigrateToCommand(qemuMonitorPtr mon, int background, - const char * const *argv, - const char *target); + const char * const *argv); + +int qemuMonitorMigrateToFile(qemuMonitorPtr mon, + int background, + const char * const *argv, + const char *target, + unsigned long long offset); int qemuMonitorMigrateToUnix(qemuMonitorPtr mon, int background, diff --git a/src/qemu/qemu_monitor_json.c b/src/qemu/qemu_monitor_json.c index b0773ab..c3e9830 100644 --- a/src/qemu/qemu_monitor_json.c +++ b/src/qemu/qemu_monitor_json.c @@ -1652,8 +1652,36 @@ int qemuMonitorJSONMigrateToHost(qemuMonitorPtr mon, int qemuMonitorJSONMigrateToCommand(qemuMonitorPtr mon, int background, - const char * const *argv, - const char *target) + const char * const *argv) +{ + char *argstr; + char *dest = NULL; + int ret = -1; + + argstr = virArgvToString(argv); + if (!argstr) { + virReportOOMError(); + goto cleanup; + } + + if (virAsprintf(&dest, "exec:%s", argstr) < 0) { + virReportOOMError(); + goto cleanup; + } + + ret = qemuMonitorJSONMigrate(mon, background, dest); + +cleanup: + VIR_FREE(argstr); + VIR_FREE(dest); + return ret; +} + +int qemuMonitorJSONMigrateToFile(qemuMonitorPtr mon, + int background, + const char * const *argv, + const char *target, + unsigned long long offset) { char *argstr; char *dest = NULL; @@ -1673,7 +1701,8 @@ int qemuMonitorJSONMigrateToCommand(qemuMonitorPtr mon, goto cleanup; } - if (virAsprintf(&dest, "exec:%s >>%s 2>/dev/null", argstr, safe_target) < 0) { + if (virAsprintf(&dest, "exec:%s | dd of=%s seek=%llub", + argstr, safe_target, offset) < 0) { virReportOOMError(); goto cleanup; } diff --git a/src/qemu/qemu_monitor_json.h b/src/qemu/qemu_monitor_json.h index 157a5c0..4dcb3e0 100644 --- a/src/qemu/qemu_monitor_json.h +++ b/src/qemu/qemu_monitor_json.h @@ -104,8 +104,13 @@ int qemuMonitorJSONMigrateToHost(qemuMonitorPtr mon, int qemuMonitorJSONMigrateToCommand(qemuMonitorPtr mon, int background, - const char * const *argv, - const char *target); + const char * const *argv); + +int qemuMonitorJSONMigrateToFile(qemuMonitorPtr mon, + int background, + const char * const *argv, + const char *target, + unsigned long long offset); int qemuMonitorJSONMigrateToUnix(qemuMonitorPtr mon, int background, diff --git a/src/qemu/qemu_monitor_text.c b/src/qemu/qemu_monitor_text.c index 6490ea6..e7f5949 100644 --- a/src/qemu/qemu_monitor_text.c +++ b/src/qemu/qemu_monitor_text.c @@ -1202,8 +1202,36 @@ int qemuMonitorTextMigrateToHost(qemuMonitorPtr mon, int qemuMonitorTextMigrateToCommand(qemuMonitorPtr mon, int background, - const char * const *argv, - const char *target) + const char * const *argv) +{ + char *argstr; + char *dest = NULL; + int ret = -1; + + argstr = virArgvToString(argv); + if (!argstr) { + virReportOOMError(); + goto cleanup; + } + + if (virAsprintf(&dest, "exec:%s", argstr) < 0) { + virReportOOMError(); + goto cleanup; + } + + ret = qemuMonitorTextMigrate(mon, background, dest); + +cleanup: + VIR_FREE(argstr); + VIR_FREE(dest); + return ret; +} + +int qemuMonitorTextMigrateToFile(qemuMonitorPtr mon, + int background, + const char * const *argv, + const char *target, + unsigned long long offset) { char *argstr; char *dest = NULL; @@ -1223,7 +1251,8 @@ int qemuMonitorTextMigrateToCommand(qemuMonitorPtr mon, goto cleanup; } - if (virAsprintf(&dest, "exec:%s >>%s 2>/dev/null", argstr, safe_target) < 0) { + if (virAsprintf(&dest, "exec:%s | dd of=%s seek=%llub", + argstr, safe_target, offset) < 0) { virReportOOMError(); goto cleanup; } diff --git a/src/qemu/qemu_monitor_text.h b/src/qemu/qemu_monitor_text.h index c80957e..25be828 100644 --- a/src/qemu/qemu_monitor_text.h +++ b/src/qemu/qemu_monitor_text.h @@ -99,8 +99,13 @@ int qemuMonitorTextMigrateToHost(qemuMonitorPtr mon, int qemuMonitorTextMigrateToCommand(qemuMonitorPtr mon, int background, - const char * const *argv, - const char *target); + const char * const *argv); + +int qemuMonitorTextMigrateToFile(qemuMonitorPtr mon, + int background, + const char * const *argv, + const char *target, + unsigned long long offset); int qemuMonitorTextMigrateToUnix(qemuMonitorPtr mon, int background, -- 1.6.5.2

On Wed, Apr 21, 2010 at 05:56:12PM +0100, Daniel P. Berrange wrote:
The save process was relying on use of the shell >> append operator to ensure the save data was placed after the libvirt header + XML. This doesn't work for block devices though. Replace this code with use of 'dd' and its 'seek' parameter.
The qemuMonitorMigateToCommand() monitor API is used for both save/coredump, and migration via UNIX socket. We can't simply switch this to use 'dd' since this causes problems with the migration usage. Thus, create a dedicated qemuMonitorMigateToFile which can accept an filename + offset, and remove the filename from the current qemuMonitorMigateToCommand() API
* src/qemu/qemu_driver.c: Switch to qemuMonitorMigateToFile for save and core dump * src/qemu/qemu_monitor.c, src/qemu/qemu_monitor.h, src/qemu/qemu_monitor_json.c, src/qemu/qemu_monitor_json.h, src/qemu/qemu_monitor_text.c, src/qemu/qemu_monitor_text.h: Create a new qemuMonitorMigateToFile, separate from the existing qemuMonitorMigateToCommand to allow handling file offsets --- src/qemu/qemu_driver.c | 172 +++++++++++++++++++++++------------------- src/qemu/qemu_monitor.c | 28 ++++++-- src/qemu/qemu_monitor.h | 9 ++- src/qemu/qemu_monitor_json.c | 35 ++++++++- src/qemu/qemu_monitor_json.h | 9 ++- src/qemu/qemu_monitor_text.c | 35 ++++++++- src/qemu/qemu_monitor_text.h | 9 ++- 7 files changed, 201 insertions(+), 96 deletions(-)
[...]
- if (virAsprintf(&dest, "exec:%s >>%s 2>/dev/null", argstr, safe_target) < 0) { + if (virAsprintf(&dest, "exec:%s | dd of=%s seek=%llub", + argstr, safe_target, offset) < 0) {
hum %llu will be converted to the value and then 'b' is appended. But my reading is that 'b' means block i.e. 512 bytes, and what we need is skeep to offset bytes, i.e. use seek=%lluc since 'c' means characters, or am I mistaken ?
virReportOOMError(); goto cleanup; }
[...]
@@ -1223,7 +1251,8 @@ int qemuMonitorTextMigrateToCommand(qemuMonitorPtr mon, goto cleanup; }
- if (virAsprintf(&dest, "exec:%s >>%s 2>/dev/null", argstr, safe_target) < 0) { + if (virAsprintf(&dest, "exec:%s | dd of=%s seek=%llub", + argstr, safe_target, offset) < 0) { virReportOOMError(); goto cleanup;
same question. The main part of the patch is hard to read but I don't spot anything, however I wonder about the dd seek argument. ACK once resolved one way or another :-) Daniel -- Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/ daniel@veillard.com | Rpmfind RPM search engine http://rpmfind.net/ http://veillard.com/ | virtualization library http://libvirt.org/

On Wed, Apr 21, 2010 at 11:02:33PM +0200, Daniel Veillard wrote:
On Wed, Apr 21, 2010 at 05:56:12PM +0100, Daniel P. Berrange wrote:
The save process was relying on use of the shell >> append operator to ensure the save data was placed after the libvirt header + XML. This doesn't work for block devices though. Replace this code with use of 'dd' and its 'seek' parameter.
The qemuMonitorMigateToCommand() monitor API is used for both save/coredump, and migration via UNIX socket. We can't simply switch this to use 'dd' since this causes problems with the migration usage. Thus, create a dedicated qemuMonitorMigateToFile which can accept an filename + offset, and remove the filename from the current qemuMonitorMigateToCommand() API
* src/qemu/qemu_driver.c: Switch to qemuMonitorMigateToFile for save and core dump * src/qemu/qemu_monitor.c, src/qemu/qemu_monitor.h, src/qemu/qemu_monitor_json.c, src/qemu/qemu_monitor_json.h, src/qemu/qemu_monitor_text.c, src/qemu/qemu_monitor_text.h: Create a new qemuMonitorMigateToFile, separate from the existing qemuMonitorMigateToCommand to allow handling file offsets --- src/qemu/qemu_driver.c | 172 +++++++++++++++++++++++------------------- src/qemu/qemu_monitor.c | 28 ++++++-- src/qemu/qemu_monitor.h | 9 ++- src/qemu/qemu_monitor_json.c | 35 ++++++++- src/qemu/qemu_monitor_json.h | 9 ++- src/qemu/qemu_monitor_text.c | 35 ++++++++- src/qemu/qemu_monitor_text.h | 9 ++- 7 files changed, 201 insertions(+), 96 deletions(-)
[...]
- if (virAsprintf(&dest, "exec:%s >>%s 2>/dev/null", argstr, safe_target) < 0) { + if (virAsprintf(&dest, "exec:%s | dd of=%s seek=%llub", + argstr, safe_target, offset) < 0) {
hum %llu will be converted to the value and then 'b' is appended. But my reading is that 'b' means block i.e. 512 bytes, and what we need is skeep to offset bytes, i.e. use seek=%lluc since 'c' means characters, or am I mistaken ?
Hmm, yes I think you are correct. Oddly QEMU was happy enough restoring from this.
virReportOOMError(); goto cleanup; }
[...]
@@ -1223,7 +1251,8 @@ int qemuMonitorTextMigrateToCommand(qemuMonitorPtr mon, goto cleanup; }
- if (virAsprintf(&dest, "exec:%s >>%s 2>/dev/null", argstr, safe_target) < 0) { + if (virAsprintf(&dest, "exec:%s | dd of=%s seek=%llub", + argstr, safe_target, offset) < 0) { virReportOOMError(); goto cleanup;
same question.
The main part of the patch is hard to read but I don't spot anything, however I wonder about the dd seek argument.
ACK once resolved one way or another :-)
Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://deltacloud.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

On Thu, Apr 22, 2010 at 11:36:06AM +0100, Daniel P. Berrange wrote:
On Wed, Apr 21, 2010 at 11:02:33PM +0200, Daniel Veillard wrote:
On Wed, Apr 21, 2010 at 05:56:12PM +0100, Daniel P. Berrange wrote:
The save process was relying on use of the shell >> append operator to ensure the save data was placed after the libvirt header + XML. This doesn't work for block devices though. Replace this code with use of 'dd' and its 'seek' parameter.
The qemuMonitorMigateToCommand() monitor API is used for both save/coredump, and migration via UNIX socket. We can't simply switch this to use 'dd' since this causes problems with the migration usage. Thus, create a dedicated qemuMonitorMigateToFile which can accept an filename + offset, and remove the filename from the current qemuMonitorMigateToCommand() API
* src/qemu/qemu_driver.c: Switch to qemuMonitorMigateToFile for save and core dump * src/qemu/qemu_monitor.c, src/qemu/qemu_monitor.h, src/qemu/qemu_monitor_json.c, src/qemu/qemu_monitor_json.h, src/qemu/qemu_monitor_text.c, src/qemu/qemu_monitor_text.h: Create a new qemuMonitorMigateToFile, separate from the existing qemuMonitorMigateToCommand to allow handling file offsets --- src/qemu/qemu_driver.c | 172 +++++++++++++++++++++++------------------- src/qemu/qemu_monitor.c | 28 ++++++-- src/qemu/qemu_monitor.h | 9 ++- src/qemu/qemu_monitor_json.c | 35 ++++++++- src/qemu/qemu_monitor_json.h | 9 ++- src/qemu/qemu_monitor_text.c | 35 ++++++++- src/qemu/qemu_monitor_text.h | 9 ++- 7 files changed, 201 insertions(+), 96 deletions(-)
[...]
- if (virAsprintf(&dest, "exec:%s >>%s 2>/dev/null", argstr, safe_target) < 0) { + if (virAsprintf(&dest, "exec:%s | dd of=%s seek=%llub", + argstr, safe_target, offset) < 0) {
hum %llu will be converted to the value and then 'b' is appended. But my reading is that 'b' means block i.e. 512 bytes, and what we need is skeep to offset bytes, i.e. use seek=%lluc since 'c' means characters, or am I mistaken ?
Hmm, yes I think you are correct. Oddly QEMU was happy enough restoring from this.
I assume dd will just push at the end if the offset is greater than the file size instead of creating a sparse file, so we end up with what we expect :-) Daniel -- Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/ daniel@veillard.com | Rpmfind RPM search engine http://rpmfind.net/ http://veillard.com/ | virtualization library http://libvirt.org/

On Thu, Apr 22, 2010 at 01:16:15PM +0200, Daniel Veillard wrote:
On Thu, Apr 22, 2010 at 11:36:06AM +0100, Daniel P. Berrange wrote:
On Wed, Apr 21, 2010 at 11:02:33PM +0200, Daniel Veillard wrote:
On Wed, Apr 21, 2010 at 05:56:12PM +0100, Daniel P. Berrange wrote:
The save process was relying on use of the shell >> append operator to ensure the save data was placed after the libvirt header + XML. This doesn't work for block devices though. Replace this code with use of 'dd' and its 'seek' parameter.
The qemuMonitorMigateToCommand() monitor API is used for both save/coredump, and migration via UNIX socket. We can't simply switch this to use 'dd' since this causes problems with the migration usage. Thus, create a dedicated qemuMonitorMigateToFile which can accept an filename + offset, and remove the filename from the current qemuMonitorMigateToCommand() API
* src/qemu/qemu_driver.c: Switch to qemuMonitorMigateToFile for save and core dump * src/qemu/qemu_monitor.c, src/qemu/qemu_monitor.h, src/qemu/qemu_monitor_json.c, src/qemu/qemu_monitor_json.h, src/qemu/qemu_monitor_text.c, src/qemu/qemu_monitor_text.h: Create a new qemuMonitorMigateToFile, separate from the existing qemuMonitorMigateToCommand to allow handling file offsets --- src/qemu/qemu_driver.c | 172 +++++++++++++++++++++++------------------- src/qemu/qemu_monitor.c | 28 ++++++-- src/qemu/qemu_monitor.h | 9 ++- src/qemu/qemu_monitor_json.c | 35 ++++++++- src/qemu/qemu_monitor_json.h | 9 ++- src/qemu/qemu_monitor_text.c | 35 ++++++++- src/qemu/qemu_monitor_text.h | 9 ++- 7 files changed, 201 insertions(+), 96 deletions(-)
[...]
- if (virAsprintf(&dest, "exec:%s >>%s 2>/dev/null", argstr, safe_target) < 0) { + if (virAsprintf(&dest, "exec:%s | dd of=%s seek=%llub", + argstr, safe_target, offset) < 0) {
hum %llu will be converted to the value and then 'b' is appended. But my reading is that 'b' means block i.e. 512 bytes, and what we need is skeep to offset bytes, i.e. use seek=%lluc since 'c' means characters, or am I mistaken ?
Hmm, yes I think you are correct. Oddly QEMU was happy enough restoring from this.
I assume dd will just push at the end if the offset is greater than the file size instead of creating a sparse file, so we end up with what we expect :-)
It turns out that QEMU wasn't happy at all. I forgot that when QEMU has problems loading a save file, it just prints a message to stderr and carries on with life as if nothing were wrong :-( Also 'seek' doesn't work in the way I expected. Adding a suffix 'c' does not mean that it interprets the value as bytes. It always interprets it as the block size (512), and uses the suffix as a multiplier :-( So there is no way to seek an arbitrary number of bytes that isn't a multiple of the block size. We don't want dd reading/writing in 1 byte sizes, so I'm adding padding instead. Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://deltacloud.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

On 04/21/2010 10:56 AM, Daniel P. Berrange wrote:
The save process was relying on use of the shell >> append operator to ensure the save data was placed after the libvirt header + XML. This doesn't work for block devices though. Replace this code with use of 'dd' and its 'seek' parameter.
The qemuMonitorMigateToCommand() monitor API is used for both save/coredump, and migration via UNIX socket. We can't simply switch this to use 'dd' since this causes problems with the migration usage. Thus, create a dedicated qemuMonitorMigateToFile which can accept an filename + offset, and remove the filename from the current qemuMonitorMigateToCommand() API
+++ b/src/qemu/qemu_driver.c @@ -4789,6 +4789,7 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, qemuDomainObjPrivatePtr priv; struct stat sb; int is_bdev = 0; + unsigned long long offset;
Should this be off_t instead of ull?
@@ -1673,7 +1701,8 @@ int qemuMonitorJSONMigrateToCommand(qemuMonitorPtr mon, goto cleanup; }
- if (virAsprintf(&dest, "exec:%s >>%s 2>/dev/null", argstr, safe_target) < 0) { + if (virAsprintf(&dest, "exec:%s | dd of=%s seek=%llub", + argstr, safe_target, offset) < 0) {
Don't you still need to silence stderr, particularly since dd writes to stderr even on success? (2 instances) -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org

On Wed, Apr 21, 2010 at 03:03:43PM -0600, Eric Blake wrote:
On 04/21/2010 10:56 AM, Daniel P. Berrange wrote:
The save process was relying on use of the shell >> append operator to ensure the save data was placed after the libvirt header + XML. This doesn't work for block devices though. Replace this code with use of 'dd' and its 'seek' parameter.
The qemuMonitorMigateToCommand() monitor API is used for both save/coredump, and migration via UNIX socket. We can't simply switch this to use 'dd' since this causes problems with the migration usage. Thus, create a dedicated qemuMonitorMigateToFile which can accept an filename + offset, and remove the filename from the current qemuMonitorMigateToCommand() API
+++ b/src/qemu/qemu_driver.c @@ -4789,6 +4789,7 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, qemuDomainObjPrivatePtr priv; struct stat sb; int is_bdev = 0; + unsigned long long offset;
Should this be off_t instead of ull?
@@ -1673,7 +1701,8 @@ int qemuMonitorJSONMigrateToCommand(qemuMonitorPtr mon, goto cleanup; }
- if (virAsprintf(&dest, "exec:%s >>%s 2>/dev/null", argstr, safe_target) < 0) { + if (virAsprintf(&dest, "exec:%s | dd of=%s seek=%llub", + argstr, safe_target, offset) < 0) {
Don't you still need to silence stderr, particularly since dd writes to stderr even on success? (2 instances)
I didn't want to silence stderr, because I want it to end up in the QEMU logfile if anything goes wrong. So i really need a way to make dd keep quiet on success, rather than throwing away stderr Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://deltacloud.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

[adding bug-coreutils] On 04/22/2010 04:37 AM, Daniel P. Berrange wrote:
- if (virAsprintf(&dest, "exec:%s >>%s 2>/dev/null", argstr, safe_target) < 0) { + if (virAsprintf(&dest, "exec:%s | dd of=%s seek=%llub", + argstr, safe_target, offset) < 0) {
Don't you still need to silence stderr, particularly since dd writes to stderr even on success? (2 instances)
I didn't want to silence stderr, because I want it to end up in the QEMU logfile if anything goes wrong. So i really need a way to make dd keep quiet on success, rather than throwing away stderr
Coreutils comes with an extension 'dd status=noxfer' which silences some (but not all) output to stderr, but you'd have to test whether you are targetting coreutils' dd (if dd comes from somewhere else, like busybox, you'll cause a syntax error that prevents dd from doing anything at all). There was a patch submitted to coreutils [1] that would add status=noinfo, but it is currently held up by copyright status and lack of documentation. Maybe I should revive that patch (or rather, write it from scratch, to avoid copyright taint). But even so, you are still up against the issue of testing whether you are targetting new-enough dd. [1] http://lists.gnu.org/archive/html/bug-coreutils/2010-02/msg00161.html About the best you can portably do, then, is capture stderr, then check the exit status of dd; if the exit status is 0, discard the captured stderr; otherwise, pass the stderr on to the logfile: foo | dd of=a seek=$n 2>b; st=$?; if test $st != 0; then cat b >&2; \ fi && rm -f b && exit $st -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org

On Thu, Apr 22, 2010 at 08:39:26AM -0600, Eric Blake wrote:
[adding bug-coreutils]
On 04/22/2010 04:37 AM, Daniel P. Berrange wrote:
- if (virAsprintf(&dest, "exec:%s >>%s 2>/dev/null", argstr, safe_target) < 0) { + if (virAsprintf(&dest, "exec:%s | dd of=%s seek=%llub", + argstr, safe_target, offset) < 0) {
Don't you still need to silence stderr, particularly since dd writes to stderr even on success? (2 instances)
I didn't want to silence stderr, because I want it to end up in the QEMU logfile if anything goes wrong. So i really need a way to make dd keep quiet on success, rather than throwing away stderr
Coreutils comes with an extension 'dd status=noxfer' which silences some (but not all) output to stderr, but you'd have to test whether you are targetting coreutils' dd (if dd comes from somewhere else, like busybox, you'll cause a syntax error that prevents dd from doing anything at all).
Yep, just looked at status=noxfer, but there's not much point in it since it still prints other stats unconditionally.
There was a patch submitted to coreutils [1] that would add status=noinfo, but it is currently held up by copyright status and lack of documentation. Maybe I should revive that patch (or rather, write it from scratch, to avoid copyright taint). But even so, you are still up against the issue of testing whether you are targetting new-enough dd.
[1] http://lists.gnu.org/archive/html/bug-coreutils/2010-02/msg00161.html
About the best you can portably do, then, is capture stderr, then check the exit status of dd; if the exit status is 0, discard the captured stderr; otherwise, pass the stderr on to the logfile:
foo | dd of=a seek=$n 2>b; st=$?; if test $st != 0; then cat b >&2; \ fi && rm -f b && exit $st
It really isn't worth the bother. A couple of lines of dd output in the logfile is no harm to anything. Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://deltacloud.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

tags 6004 + notabug close 6004 thanks Eric Blake wrote:
[adding bug-coreutils]
On 04/22/2010 04:37 AM, Daniel P. Berrange wrote:
- if (virAsprintf(&dest, "exec:%s >>%s 2>/dev/null", argstr, safe_target) < 0) { + if (virAsprintf(&dest, "exec:%s | dd of=%s seek=%llub", + argstr, safe_target, offset) < 0) {
Don't you still need to silence stderr, particularly since dd writes to stderr even on success? (2 instances)
I didn't want to silence stderr, because I want it to end up in the QEMU logfile if anything goes wrong. So i really need a way to make dd keep quiet on success, rather than throwing away stderr
Coreutils comes with an extension 'dd status=noxfer' which silences some (but not all) output to stderr, but you'd have to test whether you are targetting coreutils' dd (if dd comes from somewhere else, like busybox, you'll cause a syntax error that prevents dd from doing anything at all).
There was a patch submitted to coreutils [1] that would add status=noinfo, but it is currently held up by copyright status and lack of documentation. Maybe I should revive that patch (or rather, write it from scratch, to avoid copyright taint). But even so, you are still up against the issue of testing whether you are targetting new-enough dd.
[1] http://lists.gnu.org/archive/html/bug-coreutils/2010-02/msg00161.html ...
As of coreutils-8.20, dd now accepts 'status=none' to suppress all informational output. so I've closed this.

When cgroups is enabled, access to block devices is likely to be restricted to a whitelist. Prior to saving a guest to a block device, it is neccessary to add the block device to the whitelist. THis is not required upon restore, since QEMU reads from stdin * src/qemu/qemu_driver.c: Add block device to cgroups whitelist if neccessary during dmoain save. --- src/qemu/qemu_driver.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 files changed, 39 insertions(+), 0 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 1f1e703..d5fc4d1 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -4790,6 +4790,7 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, struct stat sb; int is_bdev = 0; unsigned long long offset; + virCgroupPtr cgroup = NULL; memset(&header, 0, sizeof(header)); memcpy(header.magic, QEMUD_SAVE_MAGIC, sizeof(header.magic)); @@ -4979,6 +4980,23 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, } + if (is_bdev && + qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_DEVICES)) { + if (virCgroupForDomain(driver->cgroup, vm->def->name, &cgroup, 0) != 0) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + _("Unable to find cgroup for %s\n"), + vm->def->name); + goto endjob; + } + rc = virCgroupAllowDevicePath(cgroup, path); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to allow device %s for %s"), + path, vm->def->name); + goto endjob; + } + } + if ((!bypassSecurityDriver) && driver->securityDriver && driver->securityDriver->domainSetSavedStateLabel && @@ -5016,6 +5034,16 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, driver->securityDriver->domainRestoreSavedStateLabel(vm, path) == -1) goto endjob; + if (is_bdev && cgroup != NULL) { + rc = virCgroupDenyDevicePath(cgroup, path); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to deny device %s for %s"), + path, vm->def->name); + goto endjob; + } + } + ret = 0; /* Shut it down */ @@ -5042,6 +5070,16 @@ endjob: vm->state = VIR_DOMAIN_RUNNING; } + if (ret != 0 && is_bdev && cgroup != NULL) { + rc = virCgroupDenyDevicePath(cgroup, path); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to deny device %s for %s"), + path, vm->def->name); + goto endjob; + } + } + if (qemuDomainObjEndJob(vm) == 0) vm = NULL; } @@ -5054,6 +5092,7 @@ cleanup: virDomainObjUnlock(vm); if (event) qemuDomainEventQueue(driver, event); + virCgroupFree(&cgroup); qemuDriverUnlock(driver); return ret; } -- 1.6.5.2

On Wed, Apr 21, 2010 at 05:56:13PM +0100, Daniel P. Berrange wrote:
When cgroups is enabled, access to block devices is likely to be restricted to a whitelist. Prior to saving a guest to a block device, it is neccessary to add the block device to the whitelist. THis is not required upon restore, since QEMU reads from stdin
* src/qemu/qemu_driver.c: Add block device to cgroups whitelist if neccessary during dmoain save. --- src/qemu/qemu_driver.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 files changed, 39 insertions(+), 0 deletions(-)
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 1f1e703..d5fc4d1 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -4790,6 +4790,7 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, struct stat sb; int is_bdev = 0; unsigned long long offset; + virCgroupPtr cgroup = NULL;
memset(&header, 0, sizeof(header)); memcpy(header.magic, QEMUD_SAVE_MAGIC, sizeof(header.magic)); @@ -4979,6 +4980,23 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, }
+ if (is_bdev && + qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_DEVICES)) { + if (virCgroupForDomain(driver->cgroup, vm->def->name, &cgroup, 0) != 0) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + _("Unable to find cgroup for %s\n"), + vm->def->name); + goto endjob; + } + rc = virCgroupAllowDevicePath(cgroup, path); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to allow device %s for %s"), + path, vm->def->name); + goto endjob; + } + } + if ((!bypassSecurityDriver) && driver->securityDriver && driver->securityDriver->domainSetSavedStateLabel && @@ -5016,6 +5034,16 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, driver->securityDriver->domainRestoreSavedStateLabel(vm, path) == -1) goto endjob;
+ if (is_bdev && cgroup != NULL) { + rc = virCgroupDenyDevicePath(cgroup, path); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to deny device %s for %s"), + path, vm->def->name); + goto endjob; + } + } + ret = 0;
/* Shut it down */ @@ -5042,6 +5070,16 @@ endjob: vm->state = VIR_DOMAIN_RUNNING; }
+ if (ret != 0 && is_bdev && cgroup != NULL) { + rc = virCgroupDenyDevicePath(cgroup, path); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to deny device %s for %s"), + path, vm->def->name); + goto endjob; + } + } + if (qemuDomainObjEndJob(vm) == 0) vm = NULL; } @@ -5054,6 +5092,7 @@ cleanup: virDomainObjUnlock(vm); if (event) qemuDomainEventQueue(driver, event); + virCgroupFree(&cgroup); qemuDriverUnlock(driver); return ret; }
ACK, Daniel -- Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/ daniel@veillard.com | Rpmfind RPM search engine http://rpmfind.net/ http://veillard.com/ | virtualization library http://libvirt.org/

On 04/21/2010 10:56 AM, Daniel P. Berrange wrote:
When cgroups is enabled, access to block devices is likely to be restricted to a whitelist. Prior to saving a guest to a block device, it is neccessary to add the block device to the whitelist. THis is
s/neccessary/necessary/g; s/THis/This/
not required upon restore, since QEMU reads from stdin
* src/qemu/qemu_driver.c: Add block device to cgroups whitelist if neccessary during dmoain save.
s/dmoain/domain/ -- Eric Blake eblake@redhat.com +1-801-349-2682 Libvirt virtualization library http://libvirt.org
participants (4)
-
Daniel P. Berrange
-
Daniel Veillard
-
Eric Blake
-
Jim Meyering