[libvirt] [PATCH 0/4] Multiple problems with saving to block devices (v2)

Update of https://www.redhat.com/archives/libvir-list/2010-April/msg00912.html Changes this time: - Pad header/xml to multiple of 512 - Set dd block size & seek as a muliplier of that - Invert check for block dev, to check for regular file instead

If a transient QEMU crashes during save attempt, then the virDomainPtr object may be freed. If a persistent QEMU crashes during save, then the 'priv->mon' field is no longer valid since it will be inactive. * src/qemu/qemu_driver.c: Fix two crashes when QEMU exits during a save attempt --- src/qemu/qemu_driver.c | 36 ++++++++++++++++++++++-------------- 1 files changed, 22 insertions(+), 14 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 39feac7..91fe963 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -4997,19 +4997,20 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, } endjob: - if (ret != 0 && header.was_running) { - qemuDomainObjEnterMonitorWithDriver(driver, vm); - rc = qemuMonitorStartCPUs(priv->mon, dom->conn); - qemuDomainObjExitMonitorWithDriver(driver, vm); - if (rc < 0) - VIR_WARN0("Unable to resume guest CPUs after save failure"); - else - vm->state = VIR_DOMAIN_RUNNING; - } + if (vm) { + if (ret != 0 && header.was_running && priv->mon) { + qemuDomainObjEnterMonitorWithDriver(driver, vm); + rc = qemuMonitorStartCPUs(priv->mon, dom->conn); + qemuDomainObjExitMonitorWithDriver(driver, vm); + if (rc < 0) + VIR_WARN0("Unable to resume guest CPUs after save failure"); + else + vm->state = VIR_DOMAIN_RUNNING; + } - if (vm && - qemuDomainObjEndJob(vm) == 0) + if (qemuDomainObjEndJob(vm) == 0) vm = NULL; + } cleanup: VIR_FREE(xml); @@ -7185,9 +7186,16 @@ static int qemudDomainAttachNetDevice(virConnectPtr conn, } /* FIXME - need to support vhost-net here (5th arg) */ - if (!(netstr = qemuBuildHostNetStr(net, ' ', - vlan, tapfd_name, 0))) - goto try_tapfd_close; + if ((qemuCmdFlags & QEMUD_CMD_FLAG_NETDEV) && + (qemuCmdFlags & QEMUD_CMD_FLAG_DEVICE)) { + if (!(netstr = qemuBuildHostNetStr(net, ',', + -1, tapfd_name, 0))) + goto try_tapfd_close; + } else { + if (!(netstr = qemuBuildHostNetStr(net, ' ', + vlan, tapfd_name, 0))) + goto try_tapfd_close; + } qemuDomainObjEnterMonitorWithDriver(driver, vm); if ((qemuCmdFlags & QEMUD_CMD_FLAG_NETDEV) && -- 1.6.5.2

It is possible to use block devices with domain save/restore. Upon failure QEMU unlinks the path being saved to. This isn't good when it is a block device ! * src/qemu/qemu_driver.c: Don't unlink block devices if save fails --- src/qemu/qemu_driver.c | 22 ++++++++++++++++++++-- 1 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 91fe963..41a516c 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -4787,6 +4787,8 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, int rc; virDomainEventPtr event = NULL; qemuDomainObjPrivatePtr priv; + struct stat sb; + int is_reg = 0; memset(&header, 0, sizeof(header)); memcpy(header.magic, QEMUD_SAVE_MAGIC, sizeof(header.magic)); @@ -4840,6 +4842,21 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, } header.xml_len = strlen(xml) + 1; + /* path might be a pre-existing block dev, in which case + * we need to skip the create step, and also avoid unlink + * in the failure case */ + if (stat(path, &sb) < 0) { + if (errno != ENOENT) { + virReportSystemError(errno, _("unable to access %s"), path); + goto endjob; + } else { + is_reg = 1; + } + } else { + is_reg = S_ISREG(sb.st_mode); + } + + /* Setup hook data needed by virFileOperation hook function */ hdata.dom = dom; hdata.path = path; @@ -4849,7 +4866,8 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, /* Write header to file, followed by XML */ /* First try creating the file as root */ - if ((rc = virFileOperation(path, O_CREAT|O_TRUNC|O_WRONLY, + if (is_reg && + (rc = virFileOperation(path, O_CREAT|O_TRUNC|O_WRONLY, S_IRUSR|S_IWUSR, getuid(), getgid(), qemudDomainSaveFileOpHook, &hdata, @@ -5014,7 +5032,7 @@ endjob: cleanup: VIR_FREE(xml); - if (ret != 0) + if (ret != 0 && is_reg) unlink(path); if (vm) virDomainObjUnlock(vm); -- 1.6.5.2

On 04/22/2010 07:43 AM, Daniel P. Berrange wrote:
It is possible to use block devices with domain save/restore. Upon failure QEMU unlinks the path being saved to. This isn't good when it is a block device !
* src/qemu/qemu_driver.c: Don't unlink block devices if save fails --- src/qemu/qemu_driver.c | 22 ++++++++++++++++++++-- 1 files changed, 20 insertions(+), 2 deletions(-)
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 91fe963..41a516c 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -4787,6 +4787,8 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, int rc; virDomainEventPtr event = NULL; qemuDomainObjPrivatePtr priv; + struct stat sb; + int is_reg = 0;
memset(&header, 0, sizeof(header)); memcpy(header.magic, QEMUD_SAVE_MAGIC, sizeof(header.magic)); @@ -4840,6 +4842,21 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, } header.xml_len = strlen(xml) + 1;
+ /* path might be a pre-existing block dev, in which case + * we need to skip the create step, and also avoid unlink + * in the failure case */ + if (stat(path,&sb)< 0) { + if (errno != ENOENT) {
In the case of a target on a root-squashed NFS server, this stat() will fail if the directory is not marked as w+rx. In this case, it will return EACCES. Changing the if to also check for (errno != EACCES) allows it to pass this point (and shouldn't hurt in any other circumstances). I tried it with this small modification, and save/restore to root-squash NFS works properly.

The save process was relying on use of the shell >> append operator to ensure the save data was placed after the libvirt header + XML. This doesn't work for block devices though. Replace this code with use of 'dd' and its 'seek' parameter. This means that we need to pad the header + XML out to a multiple of dd block size (in this case we choose 512). The qemuMonitorMigateToCommand() monitor API is used for both save/coredump, and migration via UNIX socket. We can't simply switch this to use 'dd' since this causes problems with the migration usage. Thus, create a dedicated qemuMonitorMigateToFile which can accept an filename + offset, and remove the filename from the current qemuMonitorMigateToCommand() API * src/qemu/qemu_driver.c: Switch to qemuMonitorMigateToFile for save and core dump * src/qemu/qemu_monitor.c, src/qemu/qemu_monitor.h, src/qemu/qemu_monitor_json.c, src/qemu/qemu_monitor_json.h, src/qemu/qemu_monitor_text.c, src/qemu/qemu_monitor_text.h: Create a new qemuMonitorMigateToFile, separate from the existing qemuMonitorMigateToCommand to allow handling file offsets --- src/qemu/qemu_driver.c | 190 +++++++++++++++++++++++++----------------- src/qemu/qemu_monitor.c | 35 +++++++-- src/qemu/qemu_monitor.h | 11 ++- src/qemu/qemu_monitor_json.c | 37 ++++++++- src/qemu/qemu_monitor_json.h | 9 ++- src/qemu/qemu_monitor_text.c | 37 ++++++++- src/qemu/qemu_monitor_text.h | 9 ++- 7 files changed, 232 insertions(+), 96 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 41a516c..09b6493 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -4789,6 +4789,7 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, qemuDomainObjPrivatePtr priv; struct stat sb; int is_reg = 0; + unsigned long long offset; memset(&header, 0, sizeof(header)); memcpy(header.magic, QEMUD_SAVE_MAGIC, sizeof(header.magic)); @@ -4862,104 +4863,137 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, hdata.path = path; hdata.xml = xml; hdata.header = &header; + offset = sizeof(header) + header.xml_len; + + /* Due to way we append QEMU state on our header with dd, + * we need to ensure there's a 512 byte boundary. Unfortunately + * we don't have an explicit offset in the header, so we fake + * it by padding the XML string with NULLs */ + if (offset % QEMU_MONITOR_MIGRATE_TO_FILE_BS) { + unsigned long long pad = + QEMU_MONITOR_MIGRATE_TO_FILE_BS - + (offset % QEMU_MONITOR_MIGRATE_TO_FILE_BS); + + if (VIR_REALLOC_N(xml, header.xml_len + pad) < 0) { + virReportOOMError(); + goto endjob; + } + memset(xml + header.xml_len, 0, pad); + offset += pad; + header.xml_len += pad; + } /* Write header to file, followed by XML */ /* First try creating the file as root */ - if (is_reg && - (rc = virFileOperation(path, O_CREAT|O_TRUNC|O_WRONLY, - S_IRUSR|S_IWUSR, - getuid(), getgid(), - qemudDomainSaveFileOpHook, &hdata, - 0)) != 0) { - - /* If we failed as root, and the error was permission-denied - (EACCES), assume it's on a network-connected share where - root access is restricted (eg, root-squashed NFS). If the - qemu user (driver->user) is non-root, just set a flag to - bypass security driver shenanigans, and retry the operation - after doing setuid to qemu user */ - - if ((rc != EACCES) || - driver->user == getuid()) { - virReportSystemError(rc, _("Failed to create domain save file '%s'"), - path); + if (!is_reg) { + int fd = open(path, O_WRONLY | O_TRUNC); + if (fd < 0) { + virReportSystemError(errno, _("unable to open %s"), path); goto endjob; } - -#ifdef __linux__ - /* On Linux we can also verify the FS-type of the directory. */ - char *dirpath, *p; - struct statfs st; - int statfs_ret; - - if ((dirpath = strdup(path)) == NULL) { - virReportOOMError(); + if ((rc = qemudDomainSaveFileOpHook(fd, &hdata)) != 0) { + close(fd); + goto endjob; + } + if (close(fd) < 0) { + virReportSystemError(errno, _("unable to close %s"), path); goto endjob; } + } else { + if ((rc = virFileOperation(path, O_CREAT|O_TRUNC|O_WRONLY, + S_IRUSR|S_IWUSR, + getuid(), getgid(), + qemudDomainSaveFileOpHook, &hdata, + 0)) != 0) { + /* If we failed as root, and the error was permission-denied + (EACCES), assume it's on a network-connected share where + root access is restricted (eg, root-squashed NFS). If the + qemu user (driver->user) is non-root, just set a flag to + bypass security driver shenanigans, and retry the operation + after doing setuid to qemu user */ + + if ((rc != EACCES) || + driver->user == getuid()) { + virReportSystemError(rc, _("Failed to create domain save file '%s'"), + path); + goto endjob; + } - do { - // Try less and less of the path until we get to a - // directory we can stat. Even if we don't have 'x' - // permission on any directory in the path on the NFS - // server (assuming it's NFS), we will be able to stat the - // mount point, and that will properly tell us if the - // fstype is NFS. +#ifdef __linux__ + /* On Linux we can also verify the FS-type of the directory. */ + char *dirpath, *p; + struct statfs st; + int statfs_ret; - if ((p = strrchr(dirpath, '/')) == NULL) { - qemuReportError(VIR_ERR_INVALID_ARG, - _("Invalid relative path '%s' for domain save file"), - path); - VIR_FREE(dirpath); + if ((dirpath = strdup(path)) == NULL) { + virReportOOMError(); goto endjob; } - if (p == dirpath) - *(p+1) = '\0'; - else - *p = '\0'; + do { + // Try less and less of the path until we get to a + // directory we can stat. Even if we don't have 'x' + // permission on any directory in the path on the NFS + // server (assuming it's NFS), we will be able to stat the + // mount point, and that will properly tell us if the + // fstype is NFS. + + if ((p = strrchr(dirpath, '/')) == NULL) { + qemuReportError(VIR_ERR_INVALID_ARG, + _("Invalid relative path '%s' for domain save file"), + path); + VIR_FREE(dirpath); + goto endjob; + } - statfs_ret = statfs(dirpath, &st); + if (p == dirpath) + *(p+1) = '\0'; + else + *p = '\0'; - } while ((statfs_ret == -1) && (p != dirpath)); + statfs_ret = statfs(dirpath, &st); - if (statfs_ret == -1) { - virReportSystemError(errno, - _("Failed to create domain save file '%s'" - " statfs of all elements of path failed."), - path); - VIR_FREE(dirpath); - goto endjob; - } + } while ((statfs_ret == -1) && (p != dirpath)); - if (st.f_type != NFS_SUPER_MAGIC) { - virReportSystemError(rc, - _("Failed to create domain save file '%s'" - " (fstype of '%s' is 0x%X"), - path, dirpath, (unsigned int) st.f_type); + if (statfs_ret == -1) { + virReportSystemError(errno, + _("Failed to create domain save file '%s'" + " statfs of all elements of path failed."), + path); + VIR_FREE(dirpath); + goto endjob; + } + + if (st.f_type != NFS_SUPER_MAGIC) { + virReportSystemError(rc, + _("Failed to create domain save file '%s'" + " (fstype of '%s' is 0x%X"), + path, dirpath, (unsigned int) st.f_type); + VIR_FREE(dirpath); + goto endjob; + } VIR_FREE(dirpath); - goto endjob; - } - VIR_FREE(dirpath); #endif - /* Retry creating the file as driver->user */ + /* Retry creating the file as driver->user */ - if ((rc = virFileOperation(path, O_CREAT|O_TRUNC|O_WRONLY, - S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP, - driver->user, driver->group, - qemudDomainSaveFileOpHook, &hdata, - VIR_FILE_OP_AS_UID)) != 0) { - virReportSystemError(rc, _("Error from child process creating '%s'"), + if ((rc = virFileOperation(path, O_CREAT|O_TRUNC|O_WRONLY, + S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP, + driver->user, driver->group, + qemudDomainSaveFileOpHook, &hdata, + VIR_FILE_OP_AS_UID)) != 0) { + virReportSystemError(rc, _("Error from child process creating '%s'"), path); - goto endjob; - } + goto endjob; + } - /* Since we had to setuid to create the file, and the fstype - is NFS, we assume it's a root-squashing NFS share, and that - the security driver stuff would have failed anyway */ + /* Since we had to setuid to create the file, and the fstype + is NFS, we assume it's a root-squashing NFS share, and that + the security driver stuff would have failed anyway */ - bypassSecurityDriver = 1; + bypassSecurityDriver = 1; + } } @@ -4972,7 +5006,7 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, if (header.compressed == QEMUD_SAVE_FORMAT_RAW) { const char *args[] = { "cat", NULL }; qemuDomainObjEnterMonitorWithDriver(driver, vm); - rc = qemuMonitorMigrateToCommand(priv->mon, 1, args, path); + rc = qemuMonitorMigrateToFile(priv->mon, 1, args, path, offset); qemuDomainObjExitMonitorWithDriver(driver, vm); } else { const char *prog = qemudSaveCompressionTypeToString(header.compressed); @@ -4982,7 +5016,7 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, NULL }; qemuDomainObjEnterMonitorWithDriver(driver, vm); - rc = qemuMonitorMigrateToCommand(priv->mon, 1, args, path); + rc = qemuMonitorMigrateToFile(priv->mon, 1, args, path, offset); qemuDomainObjExitMonitorWithDriver(driver, vm); } @@ -5275,7 +5309,7 @@ static int qemudDomainCoreDump(virDomainPtr dom, } qemuDomainObjEnterMonitor(vm); - ret = qemuMonitorMigrateToCommand(priv->mon, 1, args, path); + ret = qemuMonitorMigrateToFile(priv->mon, 1, args, path, 0); qemuDomainObjExitMonitor(vm); if (ret < 0) @@ -10043,7 +10077,7 @@ static int doTunnelMigrate(virDomainPtr dom, internalret = qemuMonitorMigrateToUnix(priv->mon, 1, unixfile); else if (qemuCmdFlags & QEMUD_CMD_FLAG_MIGRATE_QEMU_EXEC) { const char *args[] = { "nc", "-U", unixfile, NULL }; - internalret = qemuMonitorMigrateToCommand(priv->mon, 1, args, "/dev/null"); + internalret = qemuMonitorMigrateToCommand(priv->mon, 1, args); } else { internalret = -1; } diff --git a/src/qemu/qemu_monitor.c b/src/qemu/qemu_monitor.c index 32c3cd5..800f3c5 100644 --- a/src/qemu/qemu_monitor.c +++ b/src/qemu/qemu_monitor.c @@ -1202,17 +1202,40 @@ int qemuMonitorMigrateToHost(qemuMonitorPtr mon, int qemuMonitorMigrateToCommand(qemuMonitorPtr mon, int background, - const char * const *argv, - const char *target) + const char * const *argv) { int ret; - DEBUG("mon=%p, fd=%d argv=%p target=%s", - mon, mon->fd, argv, target); + DEBUG("mon=%p, fd=%d argv=%p", + mon, mon->fd, argv); if (mon->json) - ret = qemuMonitorJSONMigrateToCommand(mon, background, argv, target); + ret = qemuMonitorJSONMigrateToCommand(mon, background, argv); else - ret = qemuMonitorTextMigrateToCommand(mon, background, argv, target); + ret = qemuMonitorTextMigrateToCommand(mon, background, argv); + return ret; +} + +int qemuMonitorMigrateToFile(qemuMonitorPtr mon, + int background, + const char * const *argv, + const char *target, + unsigned long long offset) +{ + int ret; + DEBUG("mon=%p, fd=%d argv=%p target=%s offset=%llu", + mon, mon->fd, argv, target, offset); + + if (offset % QEMU_MONITOR_MIGRATE_TO_FILE_BS) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + _("file offset must be a multiple of %llu"), + QEMU_MONITOR_MIGRATE_TO_FILE_BS); + return -1; + } + + if (mon->json) + ret = qemuMonitorJSONMigrateToFile(mon, background, argv, target, offset); + else + ret = qemuMonitorTextMigrateToFile(mon, background, argv, target, offset); return ret; } diff --git a/src/qemu/qemu_monitor.h b/src/qemu/qemu_monitor.h index a0f198b..a190ed7 100644 --- a/src/qemu/qemu_monitor.h +++ b/src/qemu/qemu_monitor.h @@ -251,8 +251,15 @@ int qemuMonitorMigrateToHost(qemuMonitorPtr mon, int qemuMonitorMigrateToCommand(qemuMonitorPtr mon, int background, - const char * const *argv, - const char *target); + const char * const *argv); + +#define QEMU_MONITOR_MIGRATE_TO_FILE_BS 512llu + +int qemuMonitorMigrateToFile(qemuMonitorPtr mon, + int background, + const char * const *argv, + const char *target, + unsigned long long offset); int qemuMonitorMigrateToUnix(qemuMonitorPtr mon, int background, diff --git a/src/qemu/qemu_monitor_json.c b/src/qemu/qemu_monitor_json.c index b0773ab..3e61b12 100644 --- a/src/qemu/qemu_monitor_json.c +++ b/src/qemu/qemu_monitor_json.c @@ -1652,8 +1652,36 @@ int qemuMonitorJSONMigrateToHost(qemuMonitorPtr mon, int qemuMonitorJSONMigrateToCommand(qemuMonitorPtr mon, int background, - const char * const *argv, - const char *target) + const char * const *argv) +{ + char *argstr; + char *dest = NULL; + int ret = -1; + + argstr = virArgvToString(argv); + if (!argstr) { + virReportOOMError(); + goto cleanup; + } + + if (virAsprintf(&dest, "exec:%s", argstr) < 0) { + virReportOOMError(); + goto cleanup; + } + + ret = qemuMonitorJSONMigrate(mon, background, dest); + +cleanup: + VIR_FREE(argstr); + VIR_FREE(dest); + return ret; +} + +int qemuMonitorJSONMigrateToFile(qemuMonitorPtr mon, + int background, + const char * const *argv, + const char *target, + unsigned long long offset) { char *argstr; char *dest = NULL; @@ -1673,7 +1701,10 @@ int qemuMonitorJSONMigrateToCommand(qemuMonitorPtr mon, goto cleanup; } - if (virAsprintf(&dest, "exec:%s >>%s 2>/dev/null", argstr, safe_target) < 0) { + if (virAsprintf(&dest, "exec:%s | dd of=%s bs=%llu seek=%llu", + argstr, safe_target, + QEMU_MONITOR_MIGRATE_TO_FILE_BS, + offset / QEMU_MONITOR_MIGRATE_TO_FILE_BS) < 0) { virReportOOMError(); goto cleanup; } diff --git a/src/qemu/qemu_monitor_json.h b/src/qemu/qemu_monitor_json.h index 157a5c0..4dcb3e0 100644 --- a/src/qemu/qemu_monitor_json.h +++ b/src/qemu/qemu_monitor_json.h @@ -104,8 +104,13 @@ int qemuMonitorJSONMigrateToHost(qemuMonitorPtr mon, int qemuMonitorJSONMigrateToCommand(qemuMonitorPtr mon, int background, - const char * const *argv, - const char *target); + const char * const *argv); + +int qemuMonitorJSONMigrateToFile(qemuMonitorPtr mon, + int background, + const char * const *argv, + const char *target, + unsigned long long offset); int qemuMonitorJSONMigrateToUnix(qemuMonitorPtr mon, int background, diff --git a/src/qemu/qemu_monitor_text.c b/src/qemu/qemu_monitor_text.c index 6490ea6..937f5b1 100644 --- a/src/qemu/qemu_monitor_text.c +++ b/src/qemu/qemu_monitor_text.c @@ -1202,8 +1202,36 @@ int qemuMonitorTextMigrateToHost(qemuMonitorPtr mon, int qemuMonitorTextMigrateToCommand(qemuMonitorPtr mon, int background, - const char * const *argv, - const char *target) + const char * const *argv) +{ + char *argstr; + char *dest = NULL; + int ret = -1; + + argstr = virArgvToString(argv); + if (!argstr) { + virReportOOMError(); + goto cleanup; + } + + if (virAsprintf(&dest, "exec:%s", argstr) < 0) { + virReportOOMError(); + goto cleanup; + } + + ret = qemuMonitorTextMigrate(mon, background, dest); + +cleanup: + VIR_FREE(argstr); + VIR_FREE(dest); + return ret; +} + +int qemuMonitorTextMigrateToFile(qemuMonitorPtr mon, + int background, + const char * const *argv, + const char *target, + unsigned long long offset) { char *argstr; char *dest = NULL; @@ -1223,7 +1251,10 @@ int qemuMonitorTextMigrateToCommand(qemuMonitorPtr mon, goto cleanup; } - if (virAsprintf(&dest, "exec:%s >>%s 2>/dev/null", argstr, safe_target) < 0) { + if (virAsprintf(&dest, "exec:%s | dd of=%s bs=%llu seek=%llu", + argstr, safe_target, + QEMU_MONITOR_MIGRATE_TO_FILE_BS, + offset / QEMU_MONITOR_MIGRATE_TO_FILE_BS) < 0) { virReportOOMError(); goto cleanup; } diff --git a/src/qemu/qemu_monitor_text.h b/src/qemu/qemu_monitor_text.h index c80957e..25be828 100644 --- a/src/qemu/qemu_monitor_text.h +++ b/src/qemu/qemu_monitor_text.h @@ -99,8 +99,13 @@ int qemuMonitorTextMigrateToHost(qemuMonitorPtr mon, int qemuMonitorTextMigrateToCommand(qemuMonitorPtr mon, int background, - const char * const *argv, - const char *target); + const char * const *argv); + +int qemuMonitorTextMigrateToFile(qemuMonitorPtr mon, + int background, + const char * const *argv, + const char *target, + unsigned long long offset); int qemuMonitorTextMigrateToUnix(qemuMonitorPtr mon, int background, -- 1.6.5.2

On Thu, Apr 22, 2010 at 12:43:28PM +0100, Daniel P. Berrange wrote:
The save process was relying on use of the shell >> append operator to ensure the save data was placed after the libvirt header + XML. This doesn't work for block devices though. Replace this code with use of 'dd' and its 'seek' parameter. This means that we need to pad the header + XML out to a multiple of dd block size (in this case we choose 512).
err, why ? As I pointed out dd seems just fine taking byte specified offset, not necessarilly multiple of 512. The main problem I see with this is that it does break the format and a somain saved (or managed save) with 0.8.0 would not restore with 0.8.1. I really want to understand why this is needed before going that route because I see this as a serious compatibility break. Daniel -- Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/ daniel@veillard.com | Rpmfind RPM search engine http://rpmfind.net/ http://veillard.com/ | virtualization library http://libvirt.org/

On Thu, Apr 22, 2010 at 02:00:56PM +0200, Daniel Veillard wrote:
On Thu, Apr 22, 2010 at 12:43:28PM +0100, Daniel P. Berrange wrote:
The save process was relying on use of the shell >> append operator to ensure the save data was placed after the libvirt header + XML. This doesn't work for block devices though. Replace this code with use of 'dd' and its 'seek' parameter. This means that we need to pad the header + XML out to a multiple of dd block size (in this case we choose 512).
err, why ? As I pointed out dd seems just fine taking byte specified offset, not necessarilly multiple of 512. The main problem I see with this is that it does break the format and a somain saved (or managed save) with 0.8.0 would not restore with 0.8.1. I really want to understand why this is needed before going that route because I see this as a serious compatibility break.
Okay fater soem out of band discussion, it proves to not be a compatibility problem becasue the length of the xml header is updated, ACK to the serie, Daniel -- Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/ daniel@veillard.com | Rpmfind RPM search engine http://rpmfind.net/ http://veillard.com/ | virtualization library http://libvirt.org/

On Thu, Apr 22, 2010 at 02:12:22PM +0200, Daniel Veillard wrote:
On Thu, Apr 22, 2010 at 02:00:56PM +0200, Daniel Veillard wrote:
On Thu, Apr 22, 2010 at 12:43:28PM +0100, Daniel P. Berrange wrote:
The save process was relying on use of the shell >> append operator to ensure the save data was placed after the libvirt header + XML. This doesn't work for block devices though. Replace this code with use of 'dd' and its 'seek' parameter. This means that we need to pad the header + XML out to a multiple of dd block size (in this case we choose 512).
err, why ? As I pointed out dd seems just fine taking byte specified offset, not necessarilly multiple of 512. The main problem I see with this is that it does break the format and a somain saved (or managed save) with 0.8.0 would not restore with 0.8.1. I really want to understand why this is needed before going that route because I see this as a serious compatibility break.
Okay fater soem out of band discussion, it proves to not be a compatibility problem becasue the length of the xml header is updated,
Confirming in the code, if you do dd bs=512 seek=15c Then, the way the code actually handles that is that when it processes the ARGV, it reads the value '15', then applies the multipler 'c' (1 in this case). When it actually comes to seek though, it will unconditionally apply the block size multiplier too. So there is no way you can seek a value smaller than the block size. Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://deltacloud.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

On 04/22/2010 07:43 AM, Daniel P. Berrange wrote:
The save process was relying on use of the shell>> append operator to ensure the save data was placed after the libvirt header + XML. This doesn't work for block devices though. Replace this code with use of 'dd' and its 'seek' parameter. This means that we need to pad the header + XML out to a multiple of dd block size (in this case we choose 512).
The qemuMonitorMigateToCommand() monitor API is used for both save/coredump, and migration via UNIX socket. We can't simply switch this to use 'dd' since this causes problems with the migration usage. Thus, create a dedicated qemuMonitorMigateToFile which can accept an filename + offset, and remove the filename from the current qemuMonitorMigateToCommand() API
* src/qemu/qemu_driver.c: Switch to qemuMonitorMigateToFile for save and core dump * src/qemu/qemu_monitor.c, src/qemu/qemu_monitor.h, src/qemu/qemu_monitor_json.c, src/qemu/qemu_monitor_json.h, src/qemu/qemu_monitor_text.c, src/qemu/qemu_monitor_text.h: Create a new qemuMonitorMigateToFile, separate from the existing qemuMonitorMigateToCommand to allow handling file offsets --- src/qemu/qemu_driver.c | 190 +++++++++++++++++++++++++----------------- src/qemu/qemu_monitor.c | 35 +++++++-- src/qemu/qemu_monitor.h | 11 ++- src/qemu/qemu_monitor_json.c | 37 ++++++++- src/qemu/qemu_monitor_json.h | 9 ++- src/qemu/qemu_monitor_text.c | 37 ++++++++- src/qemu/qemu_monitor_text.h | 9 ++- 7 files changed, 232 insertions(+), 96 deletions(-)
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 41a516c..09b6493 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -4789,6 +4789,7 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, qemuDomainObjPrivatePtr priv; struct stat sb; int is_reg = 0; + unsigned long long offset;
memset(&header, 0, sizeof(header)); memcpy(header.magic, QEMUD_SAVE_MAGIC, sizeof(header.magic)); @@ -4862,104 +4863,137 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, hdata.path = path; hdata.xml = xml; hdata.header =&header; + offset = sizeof(header) + header.xml_len; + + /* Due to way we append QEMU state on our header with dd, + * we need to ensure there's a 512 byte boundary. Unfortunately + * we don't have an explicit offset in the header, so we fake + * it by padding the XML string with NULLs */ + if (offset % QEMU_MONITOR_MIGRATE_TO_FILE_BS) { + unsigned long long pad = + QEMU_MONITOR_MIGRATE_TO_FILE_BS - + (offset % QEMU_MONITOR_MIGRATE_TO_FILE_BS); + + if (VIR_REALLOC_N(xml, header.xml_len + pad)< 0) { + virReportOOMError(); + goto endjob; + } + memset(xml + header.xml_len, 0, pad); + offset += pad; + header.xml_len += pad; + }
Is it really necessary to add this padding even when we *aren't* using dd? (ie, when is_reg == 1).

On 04/24/2010 12:50 AM, Laine Stump wrote:
Is it really necessary to add this padding even when we *aren't* using dd? (ie, when is_reg == 1).
Nevermind. Now that I've actual RTFC, I see that this new code *always* use dd. However, I just noticed an SELinux complaint about dd attempting to write to a file on an NFS-mounted directory. My system is running SELinux in permissive mode, so it succeeded, but won't this be a problem if it's in enforcing mode?

On Sun, Apr 25, 2010 at 03:04:21AM -0400, Laine Stump wrote:
On 04/24/2010 12:50 AM, Laine Stump wrote:
Is it really necessary to add this padding even when we *aren't* using dd? (ie, when is_reg == 1).
Nevermind. Now that I've actual RTFC, I see that this new code *always* use dd.
However, I just noticed an SELinux complaint about dd attempting to write to a file on an NFS-mounted directory. My system is running SELinux in permissive mode, so it succeeded, but won't this be a problem if it's in enforcing mode?
If there is a SELinux problem I don't think it can be related to this patch. Both before & after this change we're running a child process to actually write the data. Previously cat, now dd. So SELinux would impact them equally badly/well. Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://deltacloud.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

On 04/28/2010 08:49 AM, Daniel P. Berrange wrote:
On Sun, Apr 25, 2010 at 03:04:21AM -0400, Laine Stump wrote:
On 04/24/2010 12:50 AM, Laine Stump wrote:
Is it really necessary to add this padding even when we *aren't* using dd? (ie, when is_reg == 1).
Nevermind. Now that I've actual RTFC, I see that this new code *always* use dd.
However, I just noticed an SELinux complaint about dd attempting to write to a file on an NFS-mounted directory. My system is running SELinux in permissive mode, so it succeeded, but won't this be a problem if it's in enforcing mode?
If there is a SELinux problem I don't think it can be related to this patch. Both before& after this change we're running a child process to actually write the data. Previously cat, now dd. So SELinux would impact them equally badly/well.
Correct (that it's a problem with dd breaking an SELinux policy, not us). I don't recall if there was previously a complaint about cat doing it, but it seems probable that SELinux would be setup to not complain about a cat of a file on an NFS-mounted directory, yet complain loudly if someone used dd. So while nothing needs changing in this code, it's one of those things that we need to inform the SELinux people about - it really is foreseeable that someone would want to access an NFS-mounted file with dd.

When cgroups is enabled, access to block devices is likely to be restricted to a whitelist. Prior to saving a guest to a block device, it is necessary to add the block device to the whitelist. This is not required upon restore, since QEMU reads from stdin * src/qemu/qemu_driver.c: Add block device to cgroups whitelist if neccessary during domain save. --- src/qemu/qemu_driver.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 files changed, 39 insertions(+), 0 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 09b6493..3bea7e7 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -4790,6 +4790,7 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, struct stat sb; int is_reg = 0; unsigned long long offset; + virCgroupPtr cgroup = NULL; memset(&header, 0, sizeof(header)); memcpy(header.magic, QEMUD_SAVE_MAGIC, sizeof(header.magic)); @@ -4997,6 +4998,23 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, } + if (!is_reg && + qemuCgroupControllerActive(driver, VIR_CGROUP_CONTROLLER_DEVICES)) { + if (virCgroupForDomain(driver->cgroup, vm->def->name, &cgroup, 0) != 0) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + _("Unable to find cgroup for %s\n"), + vm->def->name); + goto endjob; + } + rc = virCgroupAllowDevicePath(cgroup, path); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to allow device %s for %s"), + path, vm->def->name); + goto endjob; + } + } + if ((!bypassSecurityDriver) && driver->securityDriver && driver->securityDriver->domainSetSavedStateLabel && @@ -5034,6 +5052,16 @@ static int qemudDomainSaveFlag(virDomainPtr dom, const char *path, driver->securityDriver->domainRestoreSavedStateLabel(vm, path) == -1) goto endjob; + if (cgroup != NULL) { + rc = virCgroupDenyDevicePath(cgroup, path); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to deny device %s for %s"), + path, vm->def->name); + goto endjob; + } + } + ret = 0; /* Shut it down */ @@ -5060,6 +5088,16 @@ endjob: vm->state = VIR_DOMAIN_RUNNING; } + if (ret != 0 && cgroup != NULL) { + rc = virCgroupDenyDevicePath(cgroup, path); + if (rc != 0) { + virReportSystemError(-rc, + _("Unable to deny device %s for %s"), + path, vm->def->name); + goto endjob; + } + } + if (qemuDomainObjEndJob(vm) == 0) vm = NULL; } @@ -5072,6 +5110,7 @@ cleanup: virDomainObjUnlock(vm); if (event) qemuDomainEventQueue(driver, event); + virCgroupFree(&cgroup); qemuDriverUnlock(driver); return ret; } -- 1.6.5.2
participants (3)
-
Daniel P. Berrange
-
Daniel Veillard
-
Laine Stump