[libvirt] [PATCH 0/8 v7] Unprivileged SG_IO support

As a result of RFC [1], this implements the unprivleged SG_IO support. v6 - v7: * No restoring of unpriv_sgio per Daniel's thought. * Use "major:minor" as the hash key per Jirka's suggestion. Osier Yang (8): util: Prepare helpers for unpriv_sgio setting qemu: Add a hash table for the shared disks docs: Add docs and rng schema for new XML cdbfilter conf: Parse and format the new XML tag cdbfilter qemu: set unpriv_sgio when domain starting qemu: Check if the shared disk's cdbfilter conflicts with others qemu: Set unpriv_sgio when attaching disk qemu: Error out if the shared disk conf conflicts with others when attaching docs/formatdomain.html.in | 13 ++- docs/schemas/domaincommon.rng | 52 ++++-- src/conf/domain_conf.c | 58 +++++-- src/conf/domain_conf.h | 10 + src/libvirt_private.syms | 4 + src/qemu/qemu_conf.c | 92 ++++++++++ src/qemu/qemu_conf.h | 12 ++ src/qemu/qemu_driver.c | 38 ++++ src/qemu/qemu_process.c | 75 ++++++++ src/qemu/qemu_process.h | 2 + src/util/util.c | 180 ++++++++++++++++++++ src/util/util.h | 8 + ...ml2argv-disk-scsi-lun-passthrough-cdbfilter.xml | 32 ++++ tests/qemuxml2xmltest.c | 1 + 14 files changed, 547 insertions(+), 30 deletions(-) create mode 100644 tests/qemuxml2argvdata/qemuxml2argv-disk-scsi-lun-passthrough-cdbfilter.xml [1] https://www.redhat.com/archives/libvir-list/2012-November/msg00988.html [2] https://www.redhat.com/archives/libvir-list/2012-December/msg00325.html Regards, Osier

"virGetDevice{Major,Minor}" could be used across the sources, but it doesn't relate with this series, and could be done later. * src/util/util.h: (Declare virGetDevice{Major,Minor}, and vir{Get,Set}DeviceUnprivSGIO) * src/util/util.c: (New internal helper virCanonicalizeDiskPath to canonicalize the disk path; Implement virGetDevice{Major,Minor} and vir{Get,Set}DeviceUnprivSGIO) * src/libvirt_private.syms: Export private symbols of upper helpers --- src/libvirt_private.syms | 4 + src/util/util.c | 180 ++++++++++++++++++++++++++++++++++++++++++++++ src/util/util.h | 8 ++ 3 files changed, 192 insertions(+), 0 deletions(-) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 9288ad3..908bfb7 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1281,6 +1281,9 @@ virFileWaitForDevices; virFileWriteStr; virFindFileInPath; virFormatIntDecimal; +virGetDeviceMajor; +virGetDeviceMinor; +virGetDeviceUnprivSGIO; virGetGroupID; virGetGroupName; virGetHostname; @@ -1299,6 +1302,7 @@ virPipeReadUntilEOF; virScaleInteger; virSetBlocking; virSetCloseExec; +virSetDeviceUnprivSGIO; virSetInherit; virSetNonBlock; virSetUIDGID; diff --git a/src/util/util.c b/src/util/util.c index 05e7ca7..8c6d43c 100644 --- a/src/util/util.c +++ b/src/util/util.c @@ -3129,3 +3129,183 @@ virStrIsPrint(const char *str) return true; } + +static char * +virCanonicalizeDiskPath(const char *path) +{ + char *canonical_path = NULL; + + if (STRPREFIX(path, "/dev/disk/by-path") || + STRPREFIX(path, "/dev/disk/by-uuid") || + STRPREFIX(path, "/dev/disk/by-id")) { + if (virFileResolveLink(path, &canonical_path) < 0) + return NULL; + } else { + canonical_path = strdup(path); + } + + return canonical_path; +} + +#if defined(major) && defined(minor) +int +virGetDeviceMajor(const char *path) +{ + struct stat sb; + char *canonical_path = NULL; + + if (!(canonical_path = virCanonicalizeDiskPath(path))) + return -errno; + + if (stat(canonical_path, &sb) < 0) { + VIR_FREE(canonical_path); + return -errno; + } + + if (!S_ISBLK(sb.st_mode)) { + VIR_FREE(canonical_path); + return -EINVAL; + } + + VIR_FREE(canonical_path); + return major(sb.st_rdev); +} + +int +virGetDeviceMinor(const char *path) +{ + struct stat sb; + char *canonical_path = NULL; + + if (!(canonical_path = virCanonicalizeDiskPath(path))) + return -errno; + + if (stat(canonical_path, &sb) < 0) { + VIR_FREE(canonical_path); + return -errno; + } + + if (!S_ISBLK(sb.st_mode)) { + VIR_FREE(canonical_path); + return -EINVAL; + } + + VIR_FREE(canonical_path); + return minor(sb.st_rdev); +} +#else +int +virGetDeviceMajor(const char *path) +{ + return -ENOSYS; +} + +int +virGetDeviceMinor(const char *path) +{ + return -ENOSYS; +} +#endif + +static char * +virGetUnprivSGIOSysfsPath(const char *path) +{ + int major, minor; + char *sysfs_path = NULL; + + if ((major = virGetDeviceMajor(path)) < 0) { + virReportSystemError(-major, + _("Unable to get major number of device '%s'"), + path); + return NULL; + } + + if ((minor = virGetDeviceMinor(path)) < 0) { + virReportSystemError(-minor, + _("Unable to get minor number of device '%s'"), + path); + return NULL; + } + + if (virAsprintf(&sysfs_path, "/sys/dev/block/%d:%d/queue/unpriv_sgio", + major, minor) < 0) { + virReportOOMError(); + return NULL; + } + + return sysfs_path; +} + +int +virSetDeviceUnprivSGIO(const char *path, + int unpriv_sgio) +{ + char *sysfs_path = NULL; + char *val = NULL; + int ret = -1; + int rc = -1; + + if (!(sysfs_path = virGetUnprivSGIOSysfsPath(path))) + return -1; + + if (!virFileExists(sysfs_path)) { + virReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("unpriv_sgio is not supported by this kernel")); + goto cleanup; + } + + if (virAsprintf(&val, "%d", unpriv_sgio) < 0) { + virReportOOMError(); + goto cleanup; + } + + if ((rc = virFileWriteStr(sysfs_path, val, 0)) < 0) { + virReportSystemError(-rc, _("failed to set %s"), sysfs_path); + goto cleanup; + } + + ret = 0; +cleanup: + VIR_FREE(sysfs_path); + VIR_FREE(val); + return ret; +} + +int +virGetDeviceUnprivSGIO(const char *path, + int *unpriv_sgio) +{ + char *sysfs_path = NULL; + char *buf = NULL; + char *tmp = NULL; + int ret = -1; + int rc = -1; + + if (!(sysfs_path = virGetUnprivSGIOSysfsPath(path))) + return -1; + + if (!virFileExists(sysfs_path)) { + virReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("unpriv_sgio is not supported by this kernel")); + goto cleanup; + } + + if (virFileReadAll(sysfs_path, 1024, &buf) < 0) + goto cleanup; + + if ((tmp = strchr(buf, '\n'))) + *tmp = '\0'; + + if ((rc = virStrToLong_i(buf, NULL, 10, + unpriv_sgio)) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("failed to parse value of %s"), sysfs_path); + goto cleanup; + } + + ret = 0; +cleanup: + VIR_FREE(sysfs_path); + VIR_FREE(buf); + return ret; +} diff --git a/src/util/util.h b/src/util/util.h index 6d5dd03..e00681a 100644 --- a/src/util/util.h +++ b/src/util/util.h @@ -281,4 +281,12 @@ bool virIsDevMapperDevice(const char *dev_name) ATTRIBUTE_NONNULL(1); bool virValidateWWN(const char *wwn); bool virStrIsPrint(const char *str); + +int virGetDeviceMajor(const char *path); +int virGetDeviceMinor(const char *path); +int virSetDeviceUnprivSGIO(const char *path, + int unpriv_sgio); +int virGetDeviceUnprivSGIO(const char *path, + int *unpriv_sgio); + #endif /* __VIR_UTIL_H__ */ -- 1.7.7.6

On Fri, Dec 14, 2012 at 03:05:36 +0800, Osier Yang wrote:
"virGetDevice{Major,Minor}" could be used across the sources, but it doesn't relate with this series, and could be done later.
* src/util/util.h: (Declare virGetDevice{Major,Minor}, and vir{Get,Set}DeviceUnprivSGIO) * src/util/util.c: (New internal helper virCanonicalizeDiskPath to canonicalize the disk path; Implement virGetDevice{Major,Minor} and vir{Get,Set}DeviceUnprivSGIO) * src/libvirt_private.syms: Export private symbols of upper helpers --- src/libvirt_private.syms | 4 + src/util/util.c | 180 ++++++++++++++++++++++++++++++++++++++++++++++ src/util/util.h | 8 ++ 3 files changed, 192 insertions(+), 0 deletions(-) ... diff --git a/src/util/util.c b/src/util/util.c index 05e7ca7..8c6d43c 100644 --- a/src/util/util.c +++ b/src/util/util.c @@ -3129,3 +3129,183 @@ virStrIsPrint(const char *str)
return true; } + +static char * +virCanonicalizeDiskPath(const char *path) +{ + char *canonical_path = NULL; + + if (STRPREFIX(path, "/dev/disk/by-path") || + STRPREFIX(path, "/dev/disk/by-uuid") || + STRPREFIX(path, "/dev/disk/by-id")) { + if (virFileResolveLink(path, &canonical_path) < 0) + return NULL; + } else { + canonical_path = strdup(path); + } + + return canonical_path; +}
There's no reason to restrict virFileResolveLink only to disk paths that start with /dev/disk/by-*. Symlinks to devices can be anywhere; just call it unconditionally.
+ +#if defined(major) && defined(minor) +int +virGetDeviceMajor(const char *path) +{ + struct stat sb; + char *canonical_path = NULL; + + if (!(canonical_path = virCanonicalizeDiskPath(path))) + return -errno; + + if (stat(canonical_path, &sb) < 0) { + VIR_FREE(canonical_path); + return -errno; + } + + if (!S_ISBLK(sb.st_mode)) { + VIR_FREE(canonical_path); + return -EINVAL; + } + + VIR_FREE(canonical_path); + return major(sb.st_rdev); +} + +int +virGetDeviceMinor(const char *path) +{ + struct stat sb; + char *canonical_path = NULL; + + if (!(canonical_path = virCanonicalizeDiskPath(path))) + return -errno; + + if (stat(canonical_path, &sb) < 0) { + VIR_FREE(canonical_path); + return -errno; + } + + if (!S_ISBLK(sb.st_mode)) { + VIR_FREE(canonical_path); + return -EINVAL; + } + + VIR_FREE(canonical_path); + return minor(sb.st_rdev); +}
Oh, this is horrible. Why did you create two functions for getting major and minor numbers? Creating int virGetDeviceID(const char *path, int *major, int *minor) that would extract both at one go (since you want to get both of them anyway and if not, you can allow NULL to be passed for either) seems to be a better approach. And while at it, you can just remove the virCanonicalizeDiskPath helper and call virFileResolveLink directly in virGetDeviceID.
+#else +int +virGetDeviceMajor(const char *path) +{ + return -ENOSYS; +} + +int +virGetDeviceMinor(const char *path) +{ + return -ENOSYS; +} +#endif + +static char * +virGetUnprivSGIOSysfsPath(const char *path) +{ + int major, minor; + char *sysfs_path = NULL; + + if ((major = virGetDeviceMajor(path)) < 0) { + virReportSystemError(-major, + _("Unable to get major number of device '%s'"), + path); + return NULL; + } + + if ((minor = virGetDeviceMinor(path)) < 0) { + virReportSystemError(-minor, + _("Unable to get minor number of device '%s'"), + path); + return NULL; + } + + if (virAsprintf(&sysfs_path, "/sys/dev/block/%d:%d/queue/unpriv_sgio", + major, minor) < 0) { + virReportOOMError(); + return NULL; + } + + return sysfs_path; +} + +int +virSetDeviceUnprivSGIO(const char *path, + int unpriv_sgio) +{ + char *sysfs_path = NULL; + char *val = NULL; + int ret = -1; + int rc = -1;
Nit: no need to initalize rc.
+ + if (!(sysfs_path = virGetUnprivSGIOSysfsPath(path))) + return -1; + + if (!virFileExists(sysfs_path)) { + virReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("unpriv_sgio is not supported by this kernel")); + goto cleanup; + } + + if (virAsprintf(&val, "%d", unpriv_sgio) < 0) { + virReportOOMError(); + goto cleanup; + } + + if ((rc = virFileWriteStr(sysfs_path, val, 0)) < 0) { + virReportSystemError(-rc, _("failed to set %s"), sysfs_path); + goto cleanup; + } + + ret = 0; +cleanup: + VIR_FREE(sysfs_path); + VIR_FREE(val); + return ret; +} + +int +virGetDeviceUnprivSGIO(const char *path, + int *unpriv_sgio) +{ + char *sysfs_path = NULL; + char *buf = NULL; + char *tmp = NULL;
Nit: sysfs_path and tmp do not really need be initialized.
+ int ret = -1; + int rc = -1; + + if (!(sysfs_path = virGetUnprivSGIOSysfsPath(path))) + return -1; + + if (!virFileExists(sysfs_path)) { + virReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("unpriv_sgio is not supported by this kernel")); + goto cleanup; + } + + if (virFileReadAll(sysfs_path, 1024, &buf) < 0) + goto cleanup; + + if ((tmp = strchr(buf, '\n'))) + *tmp = '\0'; + + if ((rc = virStrToLong_i(buf, NULL, 10, + unpriv_sgio)) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("failed to parse value of %s"), sysfs_path); + goto cleanup; + }
rc is completely useless in this function.
+ ret = 0; +cleanup: + VIR_FREE(sysfs_path); + VIR_FREE(buf); + return ret; +}
Jirka

On 2012年12月14日 21:13, Jiri Denemark wrote:
On Fri, Dec 14, 2012 at 03:05:36 +0800, Osier Yang wrote:
"virGetDevice{Major,Minor}" could be used across the sources, but it doesn't relate with this series, and could be done later.
* src/util/util.h: (Declare virGetDevice{Major,Minor}, and vir{Get,Set}DeviceUnprivSGIO) * src/util/util.c: (New internal helper virCanonicalizeDiskPath to canonicalize the disk path; Implement virGetDevice{Major,Minor} and vir{Get,Set}DeviceUnprivSGIO) * src/libvirt_private.syms: Export private symbols of upper helpers --- src/libvirt_private.syms | 4 + src/util/util.c | 180 ++++++++++++++++++++++++++++++++++++++++++++++ src/util/util.h | 8 ++ 3 files changed, 192 insertions(+), 0 deletions(-) ... diff --git a/src/util/util.c b/src/util/util.c index 05e7ca7..8c6d43c 100644 --- a/src/util/util.c +++ b/src/util/util.c @@ -3129,3 +3129,183 @@ virStrIsPrint(const char *str)
return true; } + +static char * +virCanonicalizeDiskPath(const char *path) +{ + char *canonical_path = NULL; + + if (STRPREFIX(path, "/dev/disk/by-path") || + STRPREFIX(path, "/dev/disk/by-uuid") || + STRPREFIX(path, "/dev/disk/by-id")) { + if (virFileResolveLink(path,&canonical_path)< 0) + return NULL; + } else { + canonical_path = strdup(path); + } + + return canonical_path; +}
There's no reason to restrict virFileResolveLink only to disk paths that start with /dev/disk/by-*. Symlinks to devices can be anywhere; just call it unconditionally.
+ +#if defined(major)&& defined(minor) +int +virGetDeviceMajor(const char *path) +{ + struct stat sb; + char *canonical_path = NULL; + + if (!(canonical_path = virCanonicalizeDiskPath(path))) + return -errno; + + if (stat(canonical_path,&sb)< 0) { + VIR_FREE(canonical_path); + return -errno; + } + + if (!S_ISBLK(sb.st_mode)) { + VIR_FREE(canonical_path); + return -EINVAL; + } + + VIR_FREE(canonical_path); + return major(sb.st_rdev); +} + +int +virGetDeviceMinor(const char *path) +{ + struct stat sb; + char *canonical_path = NULL; + + if (!(canonical_path = virCanonicalizeDiskPath(path))) + return -errno; + + if (stat(canonical_path,&sb)< 0) { + VIR_FREE(canonical_path); + return -errno; + } + + if (!S_ISBLK(sb.st_mode)) { + VIR_FREE(canonical_path); + return -EINVAL; + } + + VIR_FREE(canonical_path); + return minor(sb.st_rdev); +}
Oh, this is horrible. Why did you create two functions for getting major and minor numbers? Creating
int virGetDeviceID(const char *path, int *major, int *minor)
that would extract both at one go (since you want to get both of them anyway and if not, you can allow NULL to be passed for either) seems to be a better approach. And while at it, you can just remove the virCanonicalizeDiskPath helper and call virFileResolveLink directly in virGetDeviceID.
Okay, it's indeed for getting only one of them purpose. It won't be only used by this patch set. But yes, I agreed with using one function, for the potential race pointed by Eric.
+#else +int +virGetDeviceMajor(const char *path) +{ + return -ENOSYS; +} + +int +virGetDeviceMinor(const char *path) +{ + return -ENOSYS; +} +#endif + +static char * +virGetUnprivSGIOSysfsPath(const char *path) +{ + int major, minor; + char *sysfs_path = NULL; + + if ((major = virGetDeviceMajor(path))< 0) { + virReportSystemError(-major, + _("Unable to get major number of device '%s'"), + path); + return NULL; + } + + if ((minor = virGetDeviceMinor(path))< 0) { + virReportSystemError(-minor, + _("Unable to get minor number of device '%s'"), + path); + return NULL; + } + + if (virAsprintf(&sysfs_path, "/sys/dev/block/%d:%d/queue/unpriv_sgio", + major, minor)< 0) { + virReportOOMError(); + return NULL; + } + + return sysfs_path; +} + +int +virSetDeviceUnprivSGIO(const char *path, + int unpriv_sgio) +{ + char *sysfs_path = NULL; + char *val = NULL; + int ret = -1; + int rc = -1;
Nit: no need to initalize rc.
Okay.
+ + if (!(sysfs_path = virGetUnprivSGIOSysfsPath(path))) + return -1; + + if (!virFileExists(sysfs_path)) { + virReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("unpriv_sgio is not supported by this kernel")); + goto cleanup; + } + + if (virAsprintf(&val, "%d", unpriv_sgio)< 0) { + virReportOOMError(); + goto cleanup; + } + + if ((rc = virFileWriteStr(sysfs_path, val, 0))< 0) { + virReportSystemError(-rc, _("failed to set %s"), sysfs_path); + goto cleanup; + } + + ret = 0; +cleanup: + VIR_FREE(sysfs_path); + VIR_FREE(val); + return ret; +} + +int +virGetDeviceUnprivSGIO(const char *path, + int *unpriv_sgio) +{ + char *sysfs_path = NULL; + char *buf = NULL; + char *tmp = NULL;
Nit: sysfs_path and tmp do not really need be initialized.
That's just personal habit.
+ int ret = -1; + int rc = -1; + + if (!(sysfs_path = virGetUnprivSGIOSysfsPath(path))) + return -1; + + if (!virFileExists(sysfs_path)) { + virReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("unpriv_sgio is not supported by this kernel")); + goto cleanup; + } + + if (virFileReadAll(sysfs_path, 1024,&buf)< 0) + goto cleanup; + + if ((tmp = strchr(buf, '\n'))) + *tmp = '\0'; + + if ((rc = virStrToLong_i(buf, NULL, 10, + unpriv_sgio))< 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("failed to parse value of %s"), sysfs_path); + goto cleanup; + }
rc is completely useless in this function.
Okay.
+ ret = 0; +cleanup: + VIR_FREE(sysfs_path); + VIR_FREE(buf); + return ret; +}
Jirka

On 12/13/2012 12:05 PM, Osier Yang wrote:
"virGetDevice{Major,Minor}" could be used across the sources, but it doesn't relate with this series, and could be done later.
* src/util/util.h: (Declare virGetDevice{Major,Minor}, and
You generally want both values in one go; calling stat() twice because you have two functions is not only a waste, but a racy window (if someone is modifying the pathname in the meantime).
+static char * +virGetUnprivSGIOSysfsPath(const char *path) +{ + int major, minor; + char *sysfs_path = NULL; + + if ((major = virGetDeviceMajor(path)) < 0) { + virReportSystemError(-major, + _("Unable to get major number of device '%s'"), + path); + return NULL; + } + + if ((minor = virGetDeviceMinor(path)) < 0) { + virReportSystemError(-minor, + _("Unable to get minor number of device '%s'"), + path); + return NULL; + } + + if (virAsprintf(&sysfs_path, "/sys/dev/block/%d:%d/queue/unpriv_sgio", + major, minor) < 0) {
This is hard-coded to probe the actual kernel. If you instead make it use a configurable prefix, then we could default to the kernel path, but also allow our testsuite to pass in a prefix from the testsuite, so that we can test this functionality even on kernels that don't support the feature (similar to how we have tests/nodeinfodata for faked cpu and node information). I'm not yet sure whether we'll need to fake this information in any of our tests, but it's food for thought.
+int +virSetDeviceUnprivSGIO(const char *path, + int unpriv_sgio)
Is this value only ever going to be 0 or 1? If so, a bool might be more appropriate.
+{ + char *sysfs_path = NULL; + char *val = NULL; + int ret = -1; + int rc = -1; + + if (!(sysfs_path = virGetUnprivSGIOSysfsPath(path))) + return -1; + + if (!virFileExists(sysfs_path)) { + virReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("unpriv_sgio is not supported by this kernel")); + goto cleanup; + } + + if (virAsprintf(&val, "%d", unpriv_sgio) < 0) {
If indeed this is bool, then you could avoid the virAsprintf,
+ virReportOOMError(); + goto cleanup; + } + + if ((rc = virFileWriteStr(sysfs_path, val, 0)) < 0) {
and instead write a single '0' or '1' with less malloc pressure.
+ virReportSystemError(-rc, _("failed to set %s"), sysfs_path); + goto cleanup; + } + + ret = 0; +cleanup: + VIR_FREE(sysfs_path); + VIR_FREE(val); + return ret; +} + +int +virGetDeviceUnprivSGIO(const char *path, + int *unpriv_sgio)
Again, to we expect the kernel to ever return more than 0 or 1? Would this interface be any simpler as: /* -1 for error, 0 for off, 1 for on */ int virGetDeviceUnprivSGIO(const char *path) -- Eric Blake eblake redhat com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On 2012年12月14日 21:23, Eric Blake wrote:
On 12/13/2012 12:05 PM, Osier Yang wrote:
"virGetDevice{Major,Minor}" could be used across the sources, but it doesn't relate with this series, and could be done later.
* src/util/util.h: (Declare virGetDevice{Major,Minor}, and
You generally want both values in one go; calling stat() twice because you have two functions is not only a waste, but a racy window (if someone is modifying the pathname in the meantime).
Okay, agreed.
+static char * +virGetUnprivSGIOSysfsPath(const char *path) +{ + int major, minor; + char *sysfs_path = NULL; + + if ((major = virGetDeviceMajor(path))< 0) { + virReportSystemError(-major, + _("Unable to get major number of device '%s'"), + path); + return NULL; + } + + if ((minor = virGetDeviceMinor(path))< 0) { + virReportSystemError(-minor, + _("Unable to get minor number of device '%s'"), + path); + return NULL; + } + + if (virAsprintf(&sysfs_path, "/sys/dev/block/%d:%d/queue/unpriv_sgio", + major, minor)< 0) {
This is hard-coded to probe the actual kernel. If you instead make it use a configurable prefix, then we could default to the kernel path, but also allow our testsuite to pass in a prefix from the testsuite, so that we can test this functionality even on kernels that don't support the feature (similar to how we have tests/nodeinfodata for faked cpu and node information). I'm not yet sure whether we'll need to fake this information in any of our tests, but it's food for thought.
+int +virSetDeviceUnprivSGIO(const char *path, + int unpriv_sgio)
Is this value only ever going to be 0 or 1?
Not sure, and as far as I get from the kernel patches thread, I think it possibly could be other values too.
If so, a bool might be more appropriate.
Returning int here doesn't here anway, so I'd keep it in case of new values for unpriv_sgio.
+{ + char *sysfs_path = NULL; + char *val = NULL; + int ret = -1; + int rc = -1; + + if (!(sysfs_path = virGetUnprivSGIOSysfsPath(path))) + return -1; + + if (!virFileExists(sysfs_path)) { + virReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("unpriv_sgio is not supported by this kernel")); + goto cleanup; + } + + if (virAsprintf(&val, "%d", unpriv_sgio)< 0) {
If indeed this is bool, then you could avoid the virAsprintf,
+ virReportOOMError(); + goto cleanup; + } + + if ((rc = virFileWriteStr(sysfs_path, val, 0))< 0) {
and instead write a single '0' or '1' with less malloc pressure.
+ virReportSystemError(-rc, _("failed to set %s"), sysfs_path); + goto cleanup; + } + + ret = 0; +cleanup: + VIR_FREE(sysfs_path); + VIR_FREE(val); + return ret; +} + +int +virGetDeviceUnprivSGIO(const char *path, + int *unpriv_sgio)
Again, to we expect the kernel to ever return more than 0 or 1?
Yes, I'd like keep it unchanged.
Would this interface be any simpler as:
/* -1 for error, 0 for off, 1 for on */ int virGetDeviceUnprivSGIO(const char *path)

On 2012年12月14日 21:23, Eric Blake wrote:
On 12/13/2012 12:05 PM, Osier Yang wrote:
"virGetDevice{Major,Minor}" could be used across the sources, but it doesn't relate with this series, and could be done later.
* src/util/util.h: (Declare virGetDevice{Major,Minor}, and
You generally want both values in one go; calling stat() twice because you have two functions is not only a waste, but a racy window (if someone is modifying the pathname in the meantime).
+static char * +virGetUnprivSGIOSysfsPath(const char *path) +{ + int major, minor; + char *sysfs_path = NULL; + + if ((major = virGetDeviceMajor(path))< 0) { + virReportSystemError(-major, + _("Unable to get major number of device '%s'"), + path); + return NULL; + } + + if ((minor = virGetDeviceMinor(path))< 0) { + virReportSystemError(-minor, + _("Unable to get minor number of device '%s'"), + path); + return NULL; + } + + if (virAsprintf(&sysfs_path, "/sys/dev/block/%d:%d/queue/unpriv_sgio", + major, minor)< 0) {
This is hard-coded to probe the actual kernel. If you instead make it use a configurable prefix, then we could default to the kernel path, but also allow our testsuite to pass in a prefix from the testsuite, so that we can test this functionality even on kernels that don't support the feature (similar to how we have tests/nodeinfodata for faked cpu and node information). I'm not yet sure whether we'll need to fake this information in any of our tests, but it's food for thought.
Okay, nice suggestion.

"virGetDeviceID" could be used across the sources, but it doesn't relate with this series, and could be done later. * src/util/util.h: (Declare virGetDeviceID, and vir{Get,Set}DeviceUnprivSGIO) * src/util/util.c: (Implement virGetDeviceID and vir{Get,Set}DeviceUnprivSGIO) * src/libvirt_private.syms: Export private symbols of upper helpers --- src/libvirt_private.syms | 3 + src/util/util.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++ src/util/util.h | 11 ++++ 3 files changed, 154 insertions(+), 0 deletions(-) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 9288ad3..f1e1631 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1281,6 +1281,8 @@ virFileWaitForDevices; virFileWriteStr; virFindFileInPath; virFormatIntDecimal; +virGetDeviceID; +virGetDeviceUnprivSGIO; virGetGroupID; virGetGroupName; virGetHostname; @@ -1299,6 +1301,7 @@ virPipeReadUntilEOF; virScaleInteger; virSetBlocking; virSetCloseExec; +virSetDeviceUnprivSGIO; virSetInherit; virSetNonBlock; virSetUIDGID; diff --git a/src/util/util.c b/src/util/util.c index 05e7ca7..baca182 100644 --- a/src/util/util.c +++ b/src/util/util.c @@ -3129,3 +3129,143 @@ virStrIsPrint(const char *str) return true; } + +#if defined(major) && defined(minor) +int +virGetDeviceID(const char *path, int *major, int *minor) +{ + struct stat sb; + char *canonical_path = NULL; + + if (virFileResolveLink(path, &canonical_path) < 0) + return -errno; + + if (stat(canonical_path, &sb) < 0) { + VIR_FREE(canonical_path); + return -errno; + } + + if (!S_ISBLK(sb.st_mode)) { + VIR_FREE(canonical_path); + return -EINVAL; + } + + if (major) + *major = major(sb.st_rdev); + if (minor) + *minor = minor(sb.st_rdev); + + VIR_FREE(canonical_path); + return 0; +} +#else +int +virGetDeviceID(const char *path ATRRIBUTE_UNUSED, + int *major ATRRIBUTE_UNUSED, + int *minor ATRRIBUTE_UNUSED) +{ + + return -ENOSYS; +} +#endif + +#define SYSFS_DEV_BLOCK_PATH "/sys/dev/block" + +static char * +virGetUnprivSGIOSysfsPath(const char *path, + const char *sysfs_dir) +{ + int major, minor; + char *sysfs_path = NULL; + int rc; + + if ((rc = virGetDeviceID(path, &major, &minor)) < 0) { + virReportSystemError(-rc, + _("Unable to get device ID '%s'"), + path); + return NULL; + } + + if (virAsprintf(&sysfs_path, "%s/%d:%d/queue/unpriv_sgio", + sysfs_dir ? sysfs_dir : SYSFS_DEV_BLOCK_PATH, + major, minor) < 0) { + virReportOOMError(); + return NULL; + } + + return sysfs_path; +} + +int +virSetDeviceUnprivSGIO(const char *path, + const char *sysfs_dir, + int unpriv_sgio) +{ + char *sysfs_path = NULL; + char *val = NULL; + int ret = -1; + int rc; + + if (!(sysfs_path = virGetUnprivSGIOSysfsPath(path, sysfs_dir))) + return -1; + + if (!virFileExists(sysfs_path)) { + virReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("unpriv_sgio is not supported by this kernel")); + goto cleanup; + } + + if (virAsprintf(&val, "%d", unpriv_sgio) < 0) { + virReportOOMError(); + goto cleanup; + } + + if ((rc = virFileWriteStr(sysfs_path, val, 0)) < 0) { + virReportSystemError(-rc, _("failed to set %s"), sysfs_path); + goto cleanup; + } + + ret = 0; +cleanup: + VIR_FREE(sysfs_path); + VIR_FREE(val); + return ret; +} + +int +virGetDeviceUnprivSGIO(const char *path, + const char *sysfs_dir, + int *unpriv_sgio) +{ + char *sysfs_path = NULL; + char *buf = NULL; + char *tmp = NULL; + int ret = -1; + + if (!(sysfs_path = virGetUnprivSGIOSysfsPath(path, sysfs_dir))) + return -1; + + if (!virFileExists(sysfs_path)) { + virReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("unpriv_sgio is not supported by this kernel")); + goto cleanup; + } + + if (virFileReadAll(sysfs_path, 1024, &buf) < 0) + goto cleanup; + + if ((tmp = strchr(buf, '\n'))) + *tmp = '\0'; + + if (virStrToLong_i(buf, NULL, 10, unpriv_sgio) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("failed to parse value of %s"), sysfs_path); + goto cleanup; + } + + ret = 0; +cleanup: + VIR_FREE(sysfs_path); + VIR_FREE(buf); + return ret; +} diff --git a/src/util/util.h b/src/util/util.h index 6d5dd03..79c88a8 100644 --- a/src/util/util.h +++ b/src/util/util.h @@ -281,4 +281,15 @@ bool virIsDevMapperDevice(const char *dev_name) ATTRIBUTE_NONNULL(1); bool virValidateWWN(const char *wwn); bool virStrIsPrint(const char *str); + +int virGetDeviceID(const char *path, + int *major, + int *minor); +int virSetDeviceUnprivSGIO(const char *path, + const char *sysfs_dir, + int unpriv_sgio); +int virGetDeviceUnprivSGIO(const char *path, + const char *sysfs_dir, + int *unpriv_sgio); + #endif /* __VIR_UTIL_H__ */ -- 1.7.7.6

On 12/14/2012 08:41 AM, Osier Yang wrote:
"virGetDeviceID" could be used across the sources, but it doesn't relate with this series, and could be done later.
* src/util/util.h: (Declare virGetDeviceID, and vir{Get,Set}DeviceUnprivSGIO) * src/util/util.c: (Implement virGetDeviceID and vir{Get,Set}DeviceUnprivSGIO) * src/libvirt_private.syms: Export private symbols of upper helpers --- src/libvirt_private.syms | 3 + src/util/util.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++ src/util/util.h | 11 ++++ 3 files changed, 154 insertions(+), 0 deletions(-)
Wait until post-release, but for this version, I'm comfortable giving: ACK.
+#if defined(major) && defined(minor) +int +virGetDeviceID(const char *path, int *major, int *minor)
The compiler didn't warn about your use of a local variable named 'major' conflicting with a macro-like function 'major()'? It might be worth using 'maj' and 'min' for the local variables, if only to avoid potential confusion. But since things compiled warning-free, that's cosmetic.
+#define SYSFS_DEV_BLOCK_PATH "/sys/dev/block" + +static char * +virGetUnprivSGIOSysfsPath(const char *path, + const char *sysfs_dir) +{ + int major, minor;
Same idea on local naming. -- Eric Blake eblake redhat com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On 2012年12月15日 02:56, Eric Blake wrote:
On 12/14/2012 08:41 AM, Osier Yang wrote:
"virGetDeviceID" could be used across the sources, but it doesn't relate with this series, and could be done later.
* src/util/util.h: (Declare virGetDeviceID, and vir{Get,Set}DeviceUnprivSGIO) * src/util/util.c: (Implement virGetDeviceID and vir{Get,Set}DeviceUnprivSGIO) * src/libvirt_private.syms: Export private symbols of upper helpers --- src/libvirt_private.syms | 3 + src/util/util.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++ src/util/util.h | 11 ++++ 3 files changed, 154 insertions(+), 0 deletions(-)
Wait until post-release, but for this version, I'm comfortable giving:
ACK.
+#if defined(major)&& defined(minor) +int +virGetDeviceID(const char *path, int *major, int *minor)
The compiler didn't warn about your use of a local variable named 'major' conflicting with a macro-like function 'major()'? It might be worth using 'maj' and 'min' for the local variables, if only to avoid potential confusion. But since things compiled warning-free, that's cosmetic.
Reasonable, I will udpate it when pushing.
+#define SYSFS_DEV_BLOCK_PATH "/sys/dev/block" + +static char * +virGetUnprivSGIOSysfsPath(const char *path, + const char *sysfs_dir) +{ + int major, minor;
Same idea on local naming.

This introduces a hash table for qemu driver, to store the shared disk's info as (@disk_path, @ref_count). @ref_count is the number of domains which shares the disk. Since we only care about the disk's cdbfilter (see later patch) setting for shared disk currently, and cdbfilter is only valid for block disk, this patch only manages (add/remove hash entry) the shared disk for block disk. * src/qemu/qemu_conf.h: (Add member 'sharedDisks' of type virHashTablePtr; Declare helpers qemuGetSharedDiskKey, qemuAddSharedDisk and qemuRemoveSharedDisk) * src/qemu/qemu_conf.c (Implement the 3 helpers) * src/qemu/qemu_process.c (Update 'sharedDisks' when domain starting and shutdown) * src/qemu/qemu_driver.c (Update 'sharedDisks' when attaching or detaching disk). --- src/qemu/qemu_conf.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_conf.h | 12 ++++++ src/qemu/qemu_driver.c | 22 +++++++++++ src/qemu/qemu_process.c | 15 ++++++++ 4 files changed, 141 insertions(+), 0 deletions(-) diff --git a/src/qemu/qemu_conf.c b/src/qemu/qemu_conf.c index a1b1d04..5126a0f 100644 --- a/src/qemu/qemu_conf.c +++ b/src/qemu/qemu_conf.c @@ -553,3 +553,95 @@ qemuDriverCloseCallbackRunAll(virQEMUDriverPtr driver, virHashForEach(driver->closeCallbacks, qemuDriverCloseCallbackRun, &data); } + +/* Construct the hash key for sharedDisks as "major:minor" */ +char * +qemuGetSharedDiskKey(const char *disk_path) +{ + int major, minor; + char *key = NULL; + + if ((major = virGetDeviceMajor(disk_path)) < 0) { + virReportSystemError(-major, + _("Unable to get major number of device '%s'"), + disk_path); + return NULL; + } + + if ((minor = virGetDeviceMinor(disk_path)) < 0) { + virReportSystemError(-minor, + _("Unable to get minor number of device '%s'"), + disk_path); + return NULL; + } + + if (virAsprintf(&key, "%d:%d", major, minor) < 0) { + virReportOOMError(); + return NULL; + } + + return key; +} + +/* Increase ref count if the entry already exists, otherwise + * add a new entry. + */ +int +qemuAddSharedDisk(virHashTablePtr sharedDisks, + const char *disk_path) +{ + size_t *ref = NULL; + char *key = NULL; + + if (!(key = qemuGetSharedDiskKey(disk_path))) + return -1; + + if ((ref = virHashLookup(sharedDisks, key))) { + if (virHashUpdateEntry(sharedDisks, key, ++ref) < 0) { + VIR_FREE(key); + return -1; + } + } else { + if (virHashAddEntry(sharedDisks, key, (void *)0x1)) { + VIR_FREE(key); + return -1; + } + } + + VIR_FREE(key); + return 0; +} + +/* Decrease the ref count if the entry already exists, otherwise + * remove the entry. + */ +int +qemuRemoveSharedDisk(virHashTablePtr sharedDisks, + const char *disk_path) +{ + size_t *ref = NULL; + char *key = NULL; + + if (!(key = qemuGetSharedDiskKey(disk_path))) + return -1; + + if (!(ref = virHashLookup(sharedDisks, key))) { + VIR_FREE(key); + return -1; + } + + if (ref != (void *)1) { + if (virHashUpdateEntry(sharedDisks, key, --ref) < 0) { + VIR_FREE(key); + return -1; + } + } else { + if (virHashRemoveEntry(sharedDisks, key) < 0) { + VIR_FREE(key); + return -1; + } + } + + VIR_FREE(key); + return 0; +} diff --git a/src/qemu/qemu_conf.h b/src/qemu/qemu_conf.h index 1a39946..225ba55 100644 --- a/src/qemu/qemu_conf.h +++ b/src/qemu/qemu_conf.h @@ -148,6 +148,8 @@ struct _virQEMUDriver { /* The devices which is are not in use by the host or any guest. */ pciDeviceList *inactivePciHostdevs; + virHashTablePtr sharedDisks; + virBitmapPtr reservedRemotePorts; virSysinfoDefPtr hostsysinfo; @@ -212,4 +214,14 @@ qemuDriverCloseCallback qemuDriverCloseCallbackGet(virQEMUDriverPtr driver, void qemuDriverCloseCallbackRunAll(virQEMUDriverPtr driver, virConnectPtr conn); +int qemuAddSharedDisk(virHashTablePtr sharedDisks, + const char *disk_path) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2); + +int qemuRemoveSharedDisk(virHashTablePtr sharedDisks, + const char *disk_path) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2); +char * qemuGetSharedDiskKey(const char *disk_path) + ATTRIBUTE_NONNULL(1); + #endif /* __QEMUD_CONF_H */ diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 6f0849c..a2700de 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -844,6 +844,9 @@ qemuStartup(bool privileged, if ((qemu_driver->inactivePciHostdevs = pciDeviceListNew()) == NULL) goto error; + if (!(qemu_driver->sharedDisks = virHashCreate(30, NULL))) + goto error; + if (privileged) { if (chown(qemu_driver->libDir, qemu_driver->user, qemu_driver->group) < 0) { virReportSystemError(errno, @@ -1097,6 +1100,7 @@ qemuShutdown(void) { pciDeviceListFree(qemu_driver->activePciHostdevs); pciDeviceListFree(qemu_driver->inactivePciHostdevs); usbDeviceListFree(qemu_driver->activeUsbHostdevs); + virHashFree(qemu_driver->sharedDisks); virCapabilitiesFree(qemu_driver->caps); qemuCapsCacheFree(qemu_driver->capsCache); @@ -6041,6 +6045,15 @@ qemuDomainAttachDeviceDiskLive(virConnectPtr conn, VIR_WARN("Failed to teardown cgroup for disk path %s", NULLSTR(disk->src)); } + + if (ret == 0 && + disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && + disk->shared) { + if (qemuAddSharedDisk(driver->sharedDisks, disk->src) < 0) + VIR_WARN("Failed to add disk '%s' to shared disk table", + disk->src); + } + end: if (cgroup) virCgroupFree(&cgroup); @@ -6155,6 +6168,15 @@ qemuDomainDetachDeviceDiskLive(virQEMUDriverPtr driver, virDomainDiskDeviceTypeToString(disk->type)); break; } + + if (ret == 0 && + disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && + disk->shared) { + if (qemuRemoveSharedDisk(driver->sharedDisks, disk->src) < 0) + VIR_WARN("Failed to remove disk '%s' from shared disk table", + disk->src); + } + return ret; } diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index cc0e947..c3ecf6f 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3720,6 +3720,8 @@ int qemuProcessStart(virConnectPtr conn, /* in case a certain disk is desirous of CAP_SYS_RAWIO, add this */ for (i = 0; i < vm->def->ndisks; i++) { + virDomainDiskDefPtr disk = vm->def->disks[i]; + if (vm->def->disks[i]->rawio == 1) #ifdef CAP_SYS_RAWIO virCommandAllowCap(cmd, CAP_SYS_RAWIO); @@ -3727,6 +3729,11 @@ int qemuProcessStart(virConnectPtr conn, virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", _("Raw I/O is not supported on this platform")); #endif + + if (disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && disk->shared) { + if (qemuAddSharedDisk(driver->sharedDisks, disk->src) < 0) + goto cleanup; + } } virCommandSetPreExecHook(cmd, qemuProcessHook, &hookData); @@ -4123,6 +4130,14 @@ void qemuProcessStop(virQEMUDriverPtr driver, flags & VIR_QEMU_PROCESS_STOP_MIGRATED); virSecurityManagerReleaseLabel(driver->securityManager, vm->def); + for (i = 0; i < vm->def->ndisks; i++) { + virDomainDiskDefPtr disk = vm->def->disks[i]; + + if (disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && disk->shared) { + ignore_value(qemuRemoveSharedDisk(driver->sharedDisks, disk->src)); + } + } + /* Clear out dynamically assigned labels */ for (i = 0; i < vm->def->nseclabels; i++) { if (vm->def->seclabels[i]->type == VIR_DOMAIN_SECLABEL_DYNAMIC) { -- 1.7.7.6

This introduces a hash table for qemu driver, to store the shared disk's info as (@disk_path, @ref_count). @ref_count is the number of domains which shares the disk. Since we only care about the disk's cdbfilter (see later patch) setting for shared disk currently, and cdbfilter is only valid for block disk, this patch only manages (add/remove hash entry) the shared disk for block disk. * src/qemu/qemu_conf.h: (Add member 'sharedDisks' of type virHashTablePtr; Declare helpers qemuGetSharedDiskKey, qemuAddSharedDisk and qemuRemoveSharedDisk) * src/qemu/qemu_conf.c (Implement the 3 helpers) * src/qemu/qemu_process.c (Update 'sharedDisks' when domain starting and shutdown) * src/qemu/qemu_driver.c (Update 'sharedDisks' when attaching or detaching disk). --- src/qemu/qemu_conf.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_conf.h | 12 ++++++ src/qemu/qemu_driver.c | 22 ++++++++++++ src/qemu/qemu_process.c | 15 ++++++++ 4 files changed, 135 insertions(+), 0 deletions(-) diff --git a/src/qemu/qemu_conf.c b/src/qemu/qemu_conf.c index a1b1d04..e25376d 100644 --- a/src/qemu/qemu_conf.c +++ b/src/qemu/qemu_conf.c @@ -553,3 +553,89 @@ qemuDriverCloseCallbackRunAll(virQEMUDriverPtr driver, virHashForEach(driver->closeCallbacks, qemuDriverCloseCallbackRun, &data); } + +/* Construct the hash key for sharedDisks as "major:minor" */ +char * +qemuGetSharedDiskKey(const char *disk_path) +{ + int major, minor; + char *key = NULL; + int rc; + + if ((rc = virGetDeviceID(disk_path, &major, &minor)) < 0) { + virReportSystemError(-rc, + _("Unable to get minor number of device '%s'"), + disk_path); + return NULL; + } + + if (virAsprintf(&key, "%d:%d", major, minor) < 0) { + virReportOOMError(); + return NULL; + } + + return key; +} + +/* Increase ref count if the entry already exists, otherwise + * add a new entry. + */ +int +qemuAddSharedDisk(virHashTablePtr sharedDisks, + const char *disk_path) +{ + size_t *ref = NULL; + char *key = NULL; + + if (!(key = qemuGetSharedDiskKey(disk_path))) + return -1; + + if ((ref = virHashLookup(sharedDisks, key))) { + if (virHashUpdateEntry(sharedDisks, key, ++ref) < 0) { + VIR_FREE(key); + return -1; + } + } else { + if (virHashAddEntry(sharedDisks, key, (void *)0x1)) { + VIR_FREE(key); + return -1; + } + } + + VIR_FREE(key); + return 0; +} + +/* Decrease the ref count if the entry already exists, otherwise + * remove the entry. + */ +int +qemuRemoveSharedDisk(virHashTablePtr sharedDisks, + const char *disk_path) +{ + size_t *ref = NULL; + char *key = NULL; + + if (!(key = qemuGetSharedDiskKey(disk_path))) + return -1; + + if (!(ref = virHashLookup(sharedDisks, key))) { + VIR_FREE(key); + return -1; + } + + if (ref != (void *)0x1) { + if (virHashUpdateEntry(sharedDisks, key, --ref) < 0) { + VIR_FREE(key); + return -1; + } + } else { + if (virHashRemoveEntry(sharedDisks, key) < 0) { + VIR_FREE(key); + return -1; + } + } + + VIR_FREE(key); + return 0; +} diff --git a/src/qemu/qemu_conf.h b/src/qemu/qemu_conf.h index 1a39946..225ba55 100644 --- a/src/qemu/qemu_conf.h +++ b/src/qemu/qemu_conf.h @@ -148,6 +148,8 @@ struct _virQEMUDriver { /* The devices which is are not in use by the host or any guest. */ pciDeviceList *inactivePciHostdevs; + virHashTablePtr sharedDisks; + virBitmapPtr reservedRemotePorts; virSysinfoDefPtr hostsysinfo; @@ -212,4 +214,14 @@ qemuDriverCloseCallback qemuDriverCloseCallbackGet(virQEMUDriverPtr driver, void qemuDriverCloseCallbackRunAll(virQEMUDriverPtr driver, virConnectPtr conn); +int qemuAddSharedDisk(virHashTablePtr sharedDisks, + const char *disk_path) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2); + +int qemuRemoveSharedDisk(virHashTablePtr sharedDisks, + const char *disk_path) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2); +char * qemuGetSharedDiskKey(const char *disk_path) + ATTRIBUTE_NONNULL(1); + #endif /* __QEMUD_CONF_H */ diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 6f0849c..a2700de 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -844,6 +844,9 @@ qemuStartup(bool privileged, if ((qemu_driver->inactivePciHostdevs = pciDeviceListNew()) == NULL) goto error; + if (!(qemu_driver->sharedDisks = virHashCreate(30, NULL))) + goto error; + if (privileged) { if (chown(qemu_driver->libDir, qemu_driver->user, qemu_driver->group) < 0) { virReportSystemError(errno, @@ -1097,6 +1100,7 @@ qemuShutdown(void) { pciDeviceListFree(qemu_driver->activePciHostdevs); pciDeviceListFree(qemu_driver->inactivePciHostdevs); usbDeviceListFree(qemu_driver->activeUsbHostdevs); + virHashFree(qemu_driver->sharedDisks); virCapabilitiesFree(qemu_driver->caps); qemuCapsCacheFree(qemu_driver->capsCache); @@ -6041,6 +6045,15 @@ qemuDomainAttachDeviceDiskLive(virConnectPtr conn, VIR_WARN("Failed to teardown cgroup for disk path %s", NULLSTR(disk->src)); } + + if (ret == 0 && + disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && + disk->shared) { + if (qemuAddSharedDisk(driver->sharedDisks, disk->src) < 0) + VIR_WARN("Failed to add disk '%s' to shared disk table", + disk->src); + } + end: if (cgroup) virCgroupFree(&cgroup); @@ -6155,6 +6168,15 @@ qemuDomainDetachDeviceDiskLive(virQEMUDriverPtr driver, virDomainDiskDeviceTypeToString(disk->type)); break; } + + if (ret == 0 && + disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && + disk->shared) { + if (qemuRemoveSharedDisk(driver->sharedDisks, disk->src) < 0) + VIR_WARN("Failed to remove disk '%s' from shared disk table", + disk->src); + } + return ret; } diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index cc0e947..c3ecf6f 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3720,6 +3720,8 @@ int qemuProcessStart(virConnectPtr conn, /* in case a certain disk is desirous of CAP_SYS_RAWIO, add this */ for (i = 0; i < vm->def->ndisks; i++) { + virDomainDiskDefPtr disk = vm->def->disks[i]; + if (vm->def->disks[i]->rawio == 1) #ifdef CAP_SYS_RAWIO virCommandAllowCap(cmd, CAP_SYS_RAWIO); @@ -3727,6 +3729,11 @@ int qemuProcessStart(virConnectPtr conn, virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", _("Raw I/O is not supported on this platform")); #endif + + if (disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && disk->shared) { + if (qemuAddSharedDisk(driver->sharedDisks, disk->src) < 0) + goto cleanup; + } } virCommandSetPreExecHook(cmd, qemuProcessHook, &hookData); @@ -4123,6 +4130,14 @@ void qemuProcessStop(virQEMUDriverPtr driver, flags & VIR_QEMU_PROCESS_STOP_MIGRATED); virSecurityManagerReleaseLabel(driver->securityManager, vm->def); + for (i = 0; i < vm->def->ndisks; i++) { + virDomainDiskDefPtr disk = vm->def->disks[i]; + + if (disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && disk->shared) { + ignore_value(qemuRemoveSharedDisk(driver->sharedDisks, disk->src)); + } + } + /* Clear out dynamically assigned labels */ for (i = 0; i < vm->def->nseclabels; i++) { if (vm->def->seclabels[i]->type == VIR_DOMAIN_SECLABEL_DYNAMIC) { -- 1.7.7.6

On 12/14/2012 08:42 AM, Osier Yang wrote:
This introduces a hash table for qemu driver, to store the shared disk's info as (@disk_path, @ref_count). @ref_count is the number
A bit out of date - the key is now major:minor, not path.
of domains which shares the disk.
Since we only care about the disk's cdbfilter (see later patch) setting for shared disk currently, and cdbfilter is only valid for block disk, this patch only manages (add/remove hash entry) the shared disk for block disk.
* src/qemu/qemu_conf.h: (Add member 'sharedDisks' of type virHashTablePtr; Declare helpers qemuGetSharedDiskKey, qemuAddSharedDisk and qemuRemoveSharedDisk) * src/qemu/qemu_conf.c (Implement the 3 helpers) * src/qemu/qemu_process.c (Update 'sharedDisks' when domain starting and shutdown) * src/qemu/qemu_driver.c (Update 'sharedDisks' when attaching or detaching disk). --- src/qemu/qemu_conf.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_conf.h | 12 ++++++ src/qemu/qemu_driver.c | 22 ++++++++++++ src/qemu/qemu_process.c | 15 ++++++++ 4 files changed, 135 insertions(+), 0 deletions(-)
diff --git a/src/qemu/qemu_conf.c b/src/qemu/qemu_conf.c index a1b1d04..e25376d 100644 --- a/src/qemu/qemu_conf.c +++ b/src/qemu/qemu_conf.c @@ -553,3 +553,89 @@ qemuDriverCloseCallbackRunAll(virQEMUDriverPtr driver,
virHashForEach(driver->closeCallbacks, qemuDriverCloseCallbackRun, &data); } + +/* Construct the hash key for sharedDisks as "major:minor" */ +char * +qemuGetSharedDiskKey(const char *disk_path) +{ + int major, minor; + char *key = NULL; + int rc; + + if ((rc = virGetDeviceID(disk_path, &major, &minor)) < 0) { + virReportSystemError(-rc, + _("Unable to get minor number of device '%s'"), + disk_path); + return NULL; + }
virGetDeviceID fails for non-block devices; does that mean I'm going to get an error message in the log for regular files? Or do I have to pre-filter for block devices before calling this function, which kind of defeats the point of virGetDeviceID also doing the filtering? I'm wondering if it would be better to make virGetDeviceID return 0 on success for a block device, and 1 on success for a regular file, and reserve negative returns for actual errors (OOM, stat() failed, ...). Then, you could blindly call this function, and be silent on the common case of a regular file, while still emitting useful error messages where they matter. [edit - read more below before you change anything...]
+ + if (virAsprintf(&key, "%d:%d", major, minor) < 0) { + virReportOOMError(); + return NULL; + } + + return key; +} + +/* Increase ref count if the entry already exists, otherwise + * add a new entry. + */ +int +qemuAddSharedDisk(virHashTablePtr sharedDisks, + const char *disk_path) +{ + size_t *ref = NULL; + char *key = NULL; + + if (!(key = qemuGetSharedDiskKey(disk_path))) + return -1;
Hmm, here you would have to be able to distinguish between a NULL because an error message was issued, vs. NULL because disk_path was a regular file rather than a block device so there is nothing to do. That may mean rewriting things to: int qemuGetSharedDiskKey(const char *disk_path, char **result)
@@ -6041,6 +6045,15 @@ qemuDomainAttachDeviceDiskLive(virConnectPtr conn, VIR_WARN("Failed to teardown cgroup for disk path %s", NULLSTR(disk->src)); } + + if (ret == 0 && + disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && + disk->shared) {
Or maybe I should just read more of your patch first. It looks like you are indeed pre-filtering and that you expect block devices from the get-go, and that it is appropriate to error out on regular files. So in spite of my musings earlier in this patch, it looks like you are doing everything correctly. ACK, but again, delay until after 1.0.1. -- Eric Blake eblake redhat com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

Since "rawio" and "cdbfilter" are only valid for "lun", this groups them together; And since both of them intend to allow the unprivledged user to use the SG_IO commands, they must be exclusive. --- docs/formatdomain.html.in | 13 +++++++++- docs/schemas/domaincommon.rng | 52 ++++++++++++++++++++++++++++------------ 2 files changed, 48 insertions(+), 17 deletions(-) diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in index 8e234fd..8c7c682 100644 --- a/docs/formatdomain.html.in +++ b/docs/formatdomain.html.in @@ -1395,7 +1395,18 @@ rawio='yes', rawio capability will be enabled for all disks in the domain (because, in the case of QEMU, this capability can only be set on a per-process basis). This attribute is only - valid when device is "lun". + valid when device is "lun". NB, <code>rawio</code> intends to + confine the capability per-device, however, current QEMU + implementation gives the domain process broader capability + than that (per-process basis, affects all the domain disks). + To confine the capability as much as possible for QEMU driver + as this stage, <code>cdbfilter</code> is recommended. + The optional <code>cdbfilter</code> attribute + (<span class="since">since 1.0.1</span>) indicates whether the + kernel will filter unprivileged SG_IO for the disk, valid settings + are "yes" or "no" (defaults to "yes"). Note that it's exclusive + with attribute <code>rawio</code>; Same with <code>rawio</code>, + <code>cdbfilter</code> is only valid for device 'lun'. The optional <code>snapshot</code> attribute indicates the default behavior of the disk during disk snapshots: "internal" requires a file format such as qcow2 that can store both the diff --git a/docs/schemas/domaincommon.rng b/docs/schemas/domaincommon.rng index 14344e2..c8cdfd7 100644 --- a/docs/schemas/domaincommon.rng +++ b/docs/schemas/domaincommon.rng @@ -971,24 +971,44 @@ --> <define name="disk"> <element name="disk"> - <optional> - <attribute name="device"> - <choice> - <value>floppy</value> - <value>disk</value> - <value>cdrom</value> - <value>lun</value> - </choice> - </attribute> - </optional> - <optional> - <attribute name="rawio"> + <choice> + <group> + <optional> + <attribute name="device"> + <choice> + <value>floppy</value> + <value>disk</value> + <value>cdrom</value> + </choice> + </attribute> + </optional> + </group> + <group> + <optional> + <attribute name="device"> + <value>lun</value> + </attribute> + </optional> <choice> - <value>yes</value> - <value>no</value> + <optional> + <attribute name="rawio"> + <choice> + <value>yes</value> + <value>no</value> + </choice> + </attribute> + </optional> + <optional> + <attribute name="cdbfilter"> + <choice> + <value>yes</value> + <value>no</value> + </choice> + </attribute> + </optional> </choice> - </attribute> - </optional> + </group> + </choice> <optional> <ref name="snapshot"/> </optional> -- 1.7.7.6

On 12/13/2012 12:05 PM, Osier Yang wrote:
Since "rawio" and "cdbfilter" are only valid for "lun", this groups them together; And since both of them intend to allow the unprivledged user to use the SG_IO commands, they must be
s/unprivledged/unprivileged/
exclusive. --- docs/formatdomain.html.in | 13 +++++++++- docs/schemas/domaincommon.rng | 52 ++++++++++++++++++++++++++++------------ 2 files changed, 48 insertions(+), 17 deletions(-)
diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in index 8e234fd..8c7c682 100644 --- a/docs/formatdomain.html.in +++ b/docs/formatdomain.html.in @@ -1395,7 +1395,18 @@ rawio='yes', rawio capability will be enabled for all disks in the domain (because, in the case of QEMU, this capability can only be set on a per-process basis). This attribute is only - valid when device is "lun". + valid when device is "lun". NB, <code>rawio</code> intends to + confine the capability per-device, however, current QEMU + implementation gives the domain process broader capability + than that (per-process basis, affects all the domain disks). + To confine the capability as much as possible for QEMU driver + as this stage, <code>cdbfilter</code> is recommended. + The optional <code>cdbfilter</code> attribute + (<span class="since">since 1.0.1</span>) indicates whether the + kernel will filter unprivileged SG_IO for the disk, valid settings + are "yes" or "no" (defaults to "yes"). Note that it's exclusive + with attribute <code>rawio</code>; Same with <code>rawio</code>, + <code>cdbfilter</code> is only valid for device 'lun'.
Hmm, this doesn't seem like the smartest of designs. If I'm understanding you right, we have: rawio cdbfilter result missing missing kernel prevents SG_IO missing no kernel prevents SG_IO missing yes SG_IO allowed for disk no missing kernel prevents SG_IO no no kernel prevents SG_IO no yes SG_IO allowed for disk yes missing SG_IO allowed for process yes no SG_IO allowed for process yes yes error Why not simplify things, and have a single attribute rawio, with multiple values? rawio result missing kernel prevents SG_IO no kernel prevents SG_IO yes SG_IO allowed for process cdb SG_IO allowed for disk where we document that 'yes' works on more kernel versions than 'cdb', but that 'cdb' (new to 1.0.1) is more secure.
The optional <code>snapshot</code> attribute indicates the default behavior of the disk during disk snapshots: "internal" requires a file format such as qcow2 that can store both the diff --git a/docs/schemas/domaincommon.rng b/docs/schemas/domaincommon.rng index 14344e2..c8cdfd7 100644 --- a/docs/schemas/domaincommon.rng +++ b/docs/schemas/domaincommon.rng @@ -971,24 +971,44 @@ --> <define name="disk"> <element name="disk"> - <optional> - <attribute name="device"> - <choice> - <value>floppy</value> - <value>disk</value> - <value>cdrom</value> - <value>lun</value> - </choice> - </attribute> - </optional> - <optional> - <attribute name="rawio"> + <choice> + <group> + <optional> + <attribute name="device"> + <choice> + <value>floppy</value> + <value>disk</value> + <value>cdrom</value> + </choice> + </attribute> + </optional> + </group>
I like that you split this up, to enforce that rawio only appears with device='lun'. However...
+ <group> + <optional> + <attribute name="device"> + <value>lun</value> + </attribute> + </optional>
This device='lun' should not be <optional>, since the default when device= is omitted is 'disk' (that is, unless you want the absence of device= and the presence of rawio='...' to imply a lun, but I think we should require an explicit use of lun before allowing rawio=).
<choice> - <value>yes</value> - <value>no</value> + <optional> + <attribute name="rawio"> + <choice> + <value>yes</value> + <value>no</value> + </choice> + </attribute> + </optional> + <optional> + <attribute name="cdbfilter"> + <choice> + <value>yes</value> + <value>no</value> + </choice>
And this part would need a major rework if you go with my idea of a tri-state attribute value for rawio instead of introducing yet another attribute.
+ </attribute> + </optional> </choice> - </attribute> - </optional> + </group> + </choice> <optional> <ref name="snapshot"/> </optional>
-- Eric Blake eblake redhat com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On 12/14/2012 01:33 PM, Eric Blake wrote:
On 12/13/2012 12:05 PM, Osier Yang wrote:
Since "rawio" and "cdbfilter" are only valid for "lun", this groups them together; And since both of them intend to allow the unprivledged user to use the SG_IO commands, they must be
s/unprivledged/unprivileged/
rawio cdbfilter result missing missing kernel prevents SG_IO missing no kernel prevents SG_IO missing yes SG_IO allowed for disk no missing kernel prevents SG_IO no no kernel prevents SG_IO no yes SG_IO allowed for disk yes missing SG_IO allowed for process yes no SG_IO allowed for process yes yes error
This table shows another reason why I don't like your naming - the defaults are screwy, when an omitted rawio means 'no', but an omitted cdbfilter means 'yes'. Corrected, the table looks like: rawio cdbfilter result missing missing kernel prevents SG_IO missing no SG_IO allowed for disk missing yes kernel prevents SG_IO no missing kernel prevents SG_IO no no error? or SG_IO allowed for disk? no yes error? or kernel prevents SG_IO? yes missing SG_IO allowed for process yes no error yes yes error? or kernel prevents SG_IO?
Why not simplify things, and have a single attribute rawio, with multiple values?
rawio result missing kernel prevents SG_IO no kernel prevents SG_IO yes SG_IO allowed for process cdb SG_IO allowed for disk
where we document that 'yes' works on more kernel versions than 'cdb', but that 'cdb' (new to 1.0.1) is more secure.
Oh, another thing, what does 'cdb' mean? It happens to be an acronym that matches the kernel implementation, but it doesn't convey much meaning on its own. Given that our existing rawio='yes' merely turns on SG_IO via a capability for the entire process, and the new filtering method enables per-disk whitelisting of which disks get to use SG_IO passthrough, would it be any better to document things as: no|yes|disk and where we additionally allow 'process' as a synonym for 'yes' on input (for back-compat, we have to output 'yes', not 'process'). That is, I don't know that exposing the term 'cdb' buys us any understanding into what is really going on. -- Eric Blake eblake redhat com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On 2012年12月15日 04:47, Eric Blake wrote:
On 12/14/2012 01:33 PM, Eric Blake wrote:
On 12/13/2012 12:05 PM, Osier Yang wrote:
Since "rawio" and "cdbfilter" are only valid for "lun", this groups them together; And since both of them intend to allow the unprivledged user to use the SG_IO commands, they must be
s/unprivledged/unprivileged/
rawio cdbfilter result missing missing kernel prevents SG_IO missing no kernel prevents SG_IO missing yes SG_IO allowed for disk no missing kernel prevents SG_IO no no kernel prevents SG_IO no yes SG_IO allowed for disk yes missing SG_IO allowed for process yes no SG_IO allowed for process yes yes error
This table shows another reason why I don't like your naming - the defaults are screwy, when an omitted rawio means 'no', but an omitted cdbfilter means 'yes'. Corrected, the table looks like:
rawio cdbfilter result missing missing kernel prevents SG_IO missing no SG_IO allowed for disk missing yes kernel prevents SG_IO no missing kernel prevents SG_IO no no error? or SG_IO allowed for disk? no yes error? or kernel prevents SG_IO? yes missing SG_IO allowed for process yes no error yes yes error? or kernel prevents SG_IO?
Why not simplify things, and have a single attribute rawio, with multiple values?
rawio result missing kernel prevents SG_IO no kernel prevents SG_IO yes SG_IO allowed for process cdb SG_IO allowed for disk
where we document that 'yes' works on more kernel versions than 'cdb', but that 'cdb' (new to 1.0.1) is more secure.
Oh, another thing, what does 'cdb' mean? It happens to be an acronym that matches the kernel implementation,
Yes, it's abstraction of "Command Descriptor Block", see: http://en.wikipedia.org/wiki/SCSI_CDB
but it doesn't convey much meaning on its own. Given that our existing rawio='yes' merely turns on SG_IO via a capability for the entire process,
However, the "rawio" tag is designed for disk-basis. Though current qemu implementation turns on the capability for the whole process, it's just a qemu implementation. And it's still possible to hack 'rawio' to enable the SG_IO capabilty for single disk in future. This is Daniel's thought, and I agreed with it. See: https://www.redhat.com/archives/libvir-list/2012-November/msg01077.html
and the new filtering method enables per-disk whitelisting of which disks get to use SG_IO passthrough, would it be any better to document things as:
no|yes|disk
As explained above, "rawio" tag is already designed for disk-basic, and the underlying implementation for "rawio" and the new sysfs knob is complete different; And as far as I get from the kernel patches, I guess it could only filter sub-set of the commands in future, not simply filtering all or not filtering all. That means we probably will have to add new values in future. For above reasons, I'm afraid that mixing them up will lead us into a mess. So I'm inclined to use a new tag, and document it as a different approach with "rawio" to enable SG_IO. For the tag "name", I don't have better idea except "cdbfilter", though I agreed that it doesn't convey meaning much on its own.
and where we additionally allow 'process' as a synonym for 'yes' on input (for back-compat, we have to output 'yes', not 'process'). That is, I don't know that exposing the term 'cdb' buys us any understanding into what is really going on.
Thanks, Osier

Setting "cdbfilter" to "no" to enable the unprivleged SG_IO, I.e. Disable the kernel CDB filtering. And "yes" to enable it. "yes" is the kernel default behaviour. Later patch will do the actual setting. --- src/conf/domain_conf.c | 58 +++++++++++++++----- src/conf/domain_conf.h | 10 ++++ ...ml2argv-disk-scsi-lun-passthrough-cdbfilter.xml | 32 +++++++++++ tests/qemuxml2xmltest.c | 1 + 4 files changed, 88 insertions(+), 13 deletions(-) create mode 100644 tests/qemuxml2argvdata/qemuxml2argv-disk-scsi-lun-passthrough-cdbfilter.xml diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 12eeb5e..495ffe5 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -242,6 +242,10 @@ VIR_ENUM_IMPL(virDomainDiskIo, VIR_DOMAIN_DISK_IO_LAST, "default", "native", "threads") +VIR_ENUM_IMPL(virDomainDiskCDBFilter, VIR_DOMAIN_DISK_CDB_FILTER_LAST, + "default", + "yes", + "no") VIR_ENUM_IMPL(virDomainIoEventFd, VIR_DOMAIN_IO_EVENT_FD_LAST, "default", "on", @@ -3515,6 +3519,7 @@ virDomainDiskDefParseXML(virCapsPtr caps, char *device = NULL; char *snapshot = NULL; char *rawio = NULL; + char *cdbfilter = NULL; char *driverName = NULL; char *driverType = NULL; char *source = NULL; @@ -3579,6 +3584,7 @@ virDomainDiskDefParseXML(virCapsPtr caps, snapshot = virXMLPropString(node, "snapshot"); rawio = virXMLPropString(node, "rawio"); + cdbfilter = virXMLPropString(node, "cdbfilter"); cur = node->children; while (cur != NULL) { @@ -4029,24 +4035,46 @@ virDomainDiskDefParseXML(virCapsPtr caps, def->snapshot = VIR_DOMAIN_SNAPSHOT_LOCATION_NONE; } + if (rawio && cdbfilter) { + virReportError(VIR_ERR_XML_ERROR, "%s", + _("rawio and cdbfilter are exclusive")); + goto error; + } + + if ((rawio || cdbfilter) && + (def->device != VIR_DOMAIN_DISK_DEVICE_LUN)) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("rawio or cdbfilter can be used only with " + "device='lun'")); + goto error; + } + if (rawio) { def->rawio_specified = true; - if (def->device == VIR_DOMAIN_DISK_DEVICE_LUN) { - if (STREQ(rawio, "yes")) { - def->rawio = 1; - } else if (STREQ(rawio, "no")) { - def->rawio = 0; - } else { - virReportError(VIR_ERR_XML_ERROR, - _("unknown disk rawio setting '%s'"), - rawio); - goto error; - } + if (STREQ(rawio, "yes")) { + def->rawio = 1; + } else if (STREQ(rawio, "no")) { + def->rawio = 0; } else { - virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", - _("rawio can be used only with device='lun'")); + virReportError(VIR_ERR_XML_ERROR, + _("unknown disk rawio setting '%s'"), + rawio); + goto error; + } + } + + if (cdbfilter) { + int cdbfilter_val = 0; + + if ((cdbfilter_val = + virDomainDiskCDBFilterTypeFromString(cdbfilter)) < 0) { + virReportError(VIR_ERR_XML_ERROR, + _("unknown disk cdbfilter setting '%s'"), + cdbfilter); goto error; } + + def->cdbfilter = cdbfilter_val; } if (bus) { @@ -4287,6 +4315,7 @@ cleanup: VIR_FREE(type); VIR_FREE(snapshot); VIR_FREE(rawio); + VIR_FREE(cdbfilter); VIR_FREE(target); VIR_FREE(source); VIR_FREE(tray); @@ -12022,6 +12051,9 @@ virDomainDiskDefFormat(virBufferPtr buf, virBufferAddLit(buf, " rawio='no'"); } } + if (def->cdbfilter) + virBufferAsprintf(buf, " cdbfilter='%s'", + virDomainDiskCDBFilterTypeToString(def->cdbfilter)); if (def->snapshot && !(def->snapshot == VIR_DOMAIN_SNAPSHOT_LOCATION_NONE && def->readonly)) virBufferAsprintf(buf, " snapshot='%s'", diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index e6659cd..5f38101 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -507,6 +507,14 @@ enum virDomainDiskIo { VIR_DOMAIN_DISK_IO_LAST }; +enum virDomainDiskCDBFilter { + VIR_DOMAIN_DISK_CDB_FILTER_DEFAULT = 0, + VIR_DOMAIN_DISK_CDB_FILTER_YES, /* reflects to 0 of sysfs unpriv_sgio */ + VIR_DOMAIN_DISK_CDB_FILTER_NO, /* reflects to 1 of sysfs unpriv_sgio */ + + VIR_DOMAIN_DISK_CDB_FILTER_LAST +}; + enum virDomainIoEventFd { VIR_DOMAIN_IO_EVENT_FD_DEFAULT = 0, VIR_DOMAIN_IO_EVENT_FD_ON, @@ -620,6 +628,7 @@ struct _virDomainDiskDef { virStorageEncryptionPtr encryption; bool rawio_specified; int rawio; /* no = 0, yes = 1 */ + int cdbfilter; size_t nseclabels; virSecurityDeviceLabelDefPtr *seclabels; @@ -2212,6 +2221,7 @@ VIR_ENUM_DECL(virDomainDiskErrorPolicy) VIR_ENUM_DECL(virDomainDiskProtocol) VIR_ENUM_DECL(virDomainDiskProtocolTransport) VIR_ENUM_DECL(virDomainDiskIo) +VIR_ENUM_DECL(virDomainDiskCDBFilter) VIR_ENUM_DECL(virDomainDiskSecretType) VIR_ENUM_DECL(virDomainDiskTray) VIR_ENUM_DECL(virDomainIoEventFd) diff --git a/tests/qemuxml2argvdata/qemuxml2argv-disk-scsi-lun-passthrough-cdbfilter.xml b/tests/qemuxml2argvdata/qemuxml2argv-disk-scsi-lun-passthrough-cdbfilter.xml new file mode 100644 index 0000000..731c9bd --- /dev/null +++ b/tests/qemuxml2argvdata/qemuxml2argv-disk-scsi-lun-passthrough-cdbfilter.xml @@ -0,0 +1,32 @@ +<domain type='qemu'> + <name>QEMUGuest1</name> + <uuid>c7a5fdbd-edaf-9455-926a-d65c16db1809</uuid> + <memory unit='KiB'>219136</memory> + <currentMemory unit='KiB'>219136</currentMemory> + <vcpu placement='static'>1</vcpu> + <os> + <type arch='i686' machine='pc'>hvm</type> + <boot dev='hd'/> + </os> + <clock offset='utc'/> + <on_poweroff>destroy</on_poweroff> + <on_reboot>restart</on_reboot> + <on_crash>destroy</on_crash> + <devices> + <emulator>/usr/bin/qemu</emulator> + <disk type='block' device='lun' cdbfilter='yes'> + <source dev='/dev/HostVG/QEMUGuest1'/> + <target dev='hda' bus='scsi'/> + <address type='drive' controller='0' bus='0' target='0' unit='0'/> + </disk> + <disk type='block' device='lun' cdbfilter='no'> + <source dev='/dev/HostVG/QEMUGuest2'/> + <target dev='hda' bus='scsi'/> + <address type='drive' controller='0' bus='0' target='1' unit='1'/> + </disk> + <controller type='scsi' index='0' model='virtio-scsi'/> + <controller type='scsi' index='1' model='lsilogic'/> + <controller type='usb' index='0'/> + <memballoon model='virtio'/> + </devices> +</domain> diff --git a/tests/qemuxml2xmltest.c b/tests/qemuxml2xmltest.c index c2d3dd3..ca2daff 100644 --- a/tests/qemuxml2xmltest.c +++ b/tests/qemuxml2xmltest.c @@ -238,6 +238,7 @@ mymain(void) DO_TEST("seclabel-static"); DO_TEST("seclabel-none"); DO_TEST("numad-static-vcpu-no-numatune"); + DO_TEST("disk-scsi-lun-passthrough-cdbfilter"); DO_TEST("disk-scsi-disk-vpd"); -- 1.7.7.6

On 12/13/2012 12:05 PM, Osier Yang wrote:
Setting "cdbfilter" to "no" to enable the unprivleged SG_IO, I.e. Disable the kernel CDB filtering. And "yes" to enable it. "yes" is the kernel default behaviour. Later patch will do the actual setting.
See my comments on 3/8; I'd much rather see us turn the existing rawio attribute into a tri-value, than to introduce a competing attribute that is mutually exclusive with rawio but has the same end goal of allowing SG_IO. This patch gets much smaller if you merely add a new value to the existing rawio enum instead of a new attribute. -- Eric Blake eblake redhat com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

Note that cdbfilter is not exactly match unpriv_sgio's value. VIR_DOMAIN_DISK_CDB_FILTER_YES mean the kernel will filter the SCSI commands, which maps to 0 of unpriv_sgio; VIR_DOMAIN_DISK_CDB_FILTER_NO maps to 1 of unpriv_sgio. --- src/qemu/qemu_process.c | 11 ++++++++++- 1 files changed, 10 insertions(+), 1 deletions(-) diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index c3ecf6f..4117251 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3730,10 +3730,19 @@ int qemuProcessStart(virConnectPtr conn, _("Raw I/O is not supported on this platform")); #endif - if (disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && disk->shared) { + if (disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && disk->shared) { if (qemuAddSharedDisk(driver->sharedDisks, disk->src) < 0) goto cleanup; } + + /* Set sysfs unpriv_sgio if cdbfilter is specified */ + if (disk->cdbfilter) { + if (virSetDeviceUnprivSGIO(disk->src, + (disk->cdbfilter == + VIR_DOMAIN_DISK_CDB_FILTER_NO) + ? 1 : 0) < 0) + goto cleanup; + } } virCommandSetPreExecHook(cmd, qemuProcessHook, &hookData); -- 1.7.7.6

Note that cdbfilter is not exactly match unpriv_sgio's value. VIR_DOMAIN_DISK_CDB_FILTER_YES mean the kernel will filter the SCSI commands, which maps to 0 of unpriv_sgio; VIR_DOMAIN_DISK_CDB_FILTER_NO maps to 1 of unpriv_sgio. --- src/qemu/qemu_process.c | 11 ++++++++++- 1 files changed, 10 insertions(+), 1 deletions(-) diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index c3ecf6f..bcea0ff 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3730,10 +3730,19 @@ int qemuProcessStart(virConnectPtr conn, _("Raw I/O is not supported on this platform")); #endif - if (disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && disk->shared) { + if (disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && disk->shared) { if (qemuAddSharedDisk(driver->sharedDisks, disk->src) < 0) goto cleanup; } + + /* Set sysfs unpriv_sgio if cdbfilter is specified */ + if (disk->cdbfilter) { + if (virSetDeviceUnprivSGIO(disk->src, NULL, + (disk->cdbfilter == + VIR_DOMAIN_DISK_CDB_FILTER_NO) + ? 1 : 0) < 0) + goto cleanup; + } } virCommandSetPreExecHook(cmd, qemuProcessHook, &hookData); -- 1.7.7.6

On 12/14/2012 08:43 AM, Osier Yang wrote:
Note that cdbfilter is not exactly match unpriv_sgio's value. VIR_DOMAIN_DISK_CDB_FILTER_YES mean the kernel will filter the SCSI commands, which maps to 0 of unpriv_sgio; VIR_DOMAIN_DISK_CDB_FILTER_NO maps to 1 of unpriv_sgio.
Again, I find this confusing to have negative logic, and think making rawio a tri-state will make it easier to reason about which disks have allowed unpriv_sgio vs. the default of being filtered.
--- src/qemu/qemu_process.c | 11 ++++++++++- 1 files changed, 10 insertions(+), 1 deletions(-)
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index c3ecf6f..bcea0ff 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3730,10 +3730,19 @@ int qemuProcessStart(virConnectPtr conn, _("Raw I/O is not supported on this platform")); #endif
- if (disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && disk->shared) { + if (disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && disk->shared) {
The re-indent could be pushed independently and prior to the release (or, if it was introduced earlier in the series, then fix that earlier patch).
if (qemuAddSharedDisk(driver->sharedDisks, disk->src) < 0) goto cleanup; } + + /* Set sysfs unpriv_sgio if cdbfilter is specified */ + if (disk->cdbfilter) { + if (virSetDeviceUnprivSGIO(disk->src, NULL, + (disk->cdbfilter == + VIR_DOMAIN_DISK_CDB_FILTER_NO) + ? 1 : 0) < 0) + goto cleanup; + }
Would it be any simpler to write: virSetDeviceUnprivSGIO(disk->src, NULL, disk->cdbfilter != VIR_DOMAIN_DISK_CDB_FILTER_NO) < 0 but you may be rewriting this anyways. -- Eric Blake eblake redhat com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On 2012年12月15日 05:36, Eric Blake wrote:
On 12/14/2012 08:43 AM, Osier Yang wrote:
Note that cdbfilter is not exactly match unpriv_sgio's value. VIR_DOMAIN_DISK_CDB_FILTER_YES mean the kernel will filter the SCSI commands, which maps to 0 of unpriv_sgio; VIR_DOMAIN_DISK_CDB_FILTER_NO maps to 1 of unpriv_sgio.
Again, I find this confusing to have negative logic, and think making rawio a tri-state will make it easier to reason about which disks have allowed unpriv_sgio vs. the default of being filtered.
--- src/qemu/qemu_process.c | 11 ++++++++++- 1 files changed, 10 insertions(+), 1 deletions(-)
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index c3ecf6f..bcea0ff 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3730,10 +3730,19 @@ int qemuProcessStart(virConnectPtr conn, _("Raw I/O is not supported on this platform")); #endif
- if (disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK&& disk->shared) { + if (disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK&& disk->shared) {
The re-indent could be pushed independently and prior to the release (or, if it was introduced earlier in the series, then fix that earlier patch).
Okay.
if (qemuAddSharedDisk(driver->sharedDisks, disk->src)< 0) goto cleanup; } + + /* Set sysfs unpriv_sgio if cdbfilter is specified */ + if (disk->cdbfilter) { + if (virSetDeviceUnprivSGIO(disk->src, NULL, + (disk->cdbfilter == + VIR_DOMAIN_DISK_CDB_FILTER_NO) + ? 1 : 0)< 0) + goto cleanup; + }
Would it be any simpler to write:
virSetDeviceUnprivSGIO(disk->src, NULL, disk->cdbfilter != VIR_DOMAIN_DISK_CDB_FILTER_NO)< 0
but you may be rewriting this anyways.
Yeah, it's better.

This prevents the domain starting if the shared disk's setting conflicts with other active domain(s), E.g. A domain with "cdbfilter" set as "yes", however, another active domain is using it set as "no". --- This assumes the lock manager will prevents one using same disk mixed with non-shared and shared modes. I.E. The following scenario won't be happen: * dom1, with disk /dev/sdb's cdbfilter="no" shared * dom2, with disk /dev/sdb's cdbfilter="yes" and non-shared Without the lock manager, dom2's staring will override the disk's unpriv_sgio to 0. --- src/qemu/qemu_process.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_process.h | 2 + 2 files changed, 53 insertions(+), 0 deletions(-) diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 4117251..ace0404 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3385,6 +3385,54 @@ qemuProcessReconnectAll(virConnectPtr conn, virQEMUDriverPtr driver) virHashForEach(driver->domains.objs, qemuProcessReconnectHelper, &data); } +/* Check if a shared disk's setting conflicts with the conf + * used by other domain(s). Currently only checks the cdbfilter + * setting. + * + * Returns 0 if no conflicts, otherwise returns -1. + */ +int +qemuCheckSharedDisk(virHashTablePtr sharedDisks, + virDomainDiskDefPtr disk) +{ + int val; + size_t *ref = NULL; + char *key = NULL; + int ret = 0; + + if (!(key = qemuGetSharedDiskKey(disk->src))) + return -1; + + /* It can't be conflict if no other domain is + * is sharing it. + */ + if (!(ref = virHashLookup(sharedDisks, key))) + goto cleanup; + + if (ref == (void *)0x1) + goto cleanup; + + if (virGetDeviceUnprivSGIO(disk->src, &val) < 0) { + ret = -1; + goto cleanup; + } + + if ((val == 0 && + disk->cdbfilter == VIR_DOMAIN_DISK_CDB_FILTER_YES) || + (val == 1 && + disk->cdbfilter == VIR_DOMAIN_DISK_CDB_FILTER_NO)) + goto cleanup; + + virReportError(VIR_ERR_INTERNAL_ERROR, + _("cdbfilter of shared disk '%s' conflicts with other " + "active domains"), disk->src); + ret = -1; + +cleanup: + VIR_FREE(key); + return ret; +} + int qemuProcessStart(virConnectPtr conn, virQEMUDriverPtr driver, virDomainObjPtr vm, @@ -3733,6 +3781,9 @@ int qemuProcessStart(virConnectPtr conn, if (disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && disk->shared) { if (qemuAddSharedDisk(driver->sharedDisks, disk->src) < 0) goto cleanup; + + if (qemuCheckSharedDisk(driver->sharedDisks, disk) < 0) + goto cleanup; } /* Set sysfs unpriv_sgio if cdbfilter is specified */ diff --git a/src/qemu/qemu_process.h b/src/qemu/qemu_process.h index c12df32..3aad08f 100644 --- a/src/qemu/qemu_process.h +++ b/src/qemu/qemu_process.h @@ -98,5 +98,7 @@ bool qemuProcessAutoDestroyActive(virQEMUDriverPtr driver, virDomainObjPtr vm); virBitmapPtr qemuPrepareCpumap(virQEMUDriverPtr driver, virBitmapPtr nodemask); +int qemuCheckSharedDisk(virHashTablePtr sharedDisks, + virDomainDiskDefPtr disk); #endif /* __QEMU_PROCESS_H__ */ -- 1.7.7.6

This prevents the domain starting if the shared disk's setting conflicts with other active domain(s), E.g. A domain with "cdbfilter" set as "yes", however, another active domain is using it set as "no". --- src/qemu/qemu_process.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_process.h | 2 + 2 files changed, 53 insertions(+), 0 deletions(-) diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index bcea0ff..1e2baac 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3385,6 +3385,54 @@ qemuProcessReconnectAll(virConnectPtr conn, virQEMUDriverPtr driver) virHashForEach(driver->domains.objs, qemuProcessReconnectHelper, &data); } +/* Check if a shared disk's setting conflicts with the conf + * used by other domain(s). Currently only checks the cdbfilter + * setting. + * + * Returns 0 if no conflicts, otherwise returns -1. + */ +int +qemuCheckSharedDisk(virHashTablePtr sharedDisks, + virDomainDiskDefPtr disk) +{ + int val; + size_t *ref = NULL; + char *key = NULL; + int ret = 0; + + if (!(key = qemuGetSharedDiskKey(disk->src))) + return -1; + + /* It can't be conflict if no other domain is + * is sharing it. + */ + if (!(ref = virHashLookup(sharedDisks, key))) + goto cleanup; + + if (ref == (void *)0x1) + goto cleanup; + + if (virGetDeviceUnprivSGIO(disk->src, NULL, &val) < 0) { + ret = -1; + goto cleanup; + } + + if ((val == 0 && + disk->cdbfilter == VIR_DOMAIN_DISK_CDB_FILTER_YES) || + (val == 1 && + disk->cdbfilter == VIR_DOMAIN_DISK_CDB_FILTER_NO)) + goto cleanup; + + virReportError(VIR_ERR_INTERNAL_ERROR, + _("cdbfilter of shared disk '%s' conflicts with other " + "active domains"), disk->src); + ret = -1; + +cleanup: + VIR_FREE(key); + return ret; +} + int qemuProcessStart(virConnectPtr conn, virQEMUDriverPtr driver, virDomainObjPtr vm, @@ -3733,6 +3781,9 @@ int qemuProcessStart(virConnectPtr conn, if (disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && disk->shared) { if (qemuAddSharedDisk(driver->sharedDisks, disk->src) < 0) goto cleanup; + + if (qemuCheckSharedDisk(driver->sharedDisks, disk) < 0) + goto cleanup; } /* Set sysfs unpriv_sgio if cdbfilter is specified */ diff --git a/src/qemu/qemu_process.h b/src/qemu/qemu_process.h index c12df32..3aad08f 100644 --- a/src/qemu/qemu_process.h +++ b/src/qemu/qemu_process.h @@ -98,5 +98,7 @@ bool qemuProcessAutoDestroyActive(virQEMUDriverPtr driver, virDomainObjPtr vm); virBitmapPtr qemuPrepareCpumap(virQEMUDriverPtr driver, virBitmapPtr nodemask); +int qemuCheckSharedDisk(virHashTablePtr sharedDisks, + virDomainDiskDefPtr disk); #endif /* __QEMU_PROCESS_H__ */ -- 1.7.7.6

On 12/14/2012 08:44 AM, Osier Yang wrote:
This prevents the domain starting if the shared disk's setting conflicts with other active domain(s), E.g. A domain with "cdbfilter" set as "yes", however, another active domain is using it set as "no". --- src/qemu/qemu_process.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_process.h | 2 + 2 files changed, 53 insertions(+), 0 deletions(-)
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index bcea0ff..1e2baac 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3385,6 +3385,54 @@ qemuProcessReconnectAll(virConnectPtr conn, virQEMUDriverPtr driver) virHashForEach(driver->domains.objs, qemuProcessReconnectHelper, &data); }
+/* Check if a shared disk's setting conflicts with the conf + * used by other domain(s). Currently only checks the cdbfilter + * setting.
Probably worth documenting that this function should only be called on disks with a block device source...
+ * + * Returns 0 if no conflicts, otherwise returns -1. + */ +int +qemuCheckSharedDisk(virHashTablePtr sharedDisks, + virDomainDiskDefPtr disk) +{ + int val; + size_t *ref = NULL; + char *key = NULL; + int ret = 0; + + if (!(key = qemuGetSharedDiskKey(disk->src))) + return -1;
since you error out on regular files. (You correctly obey that restriction for now, but without documentation, it might be possible to reuse this function from the wrong context in a future patch.)
+ + /* It can't be conflict if no other domain is + * is sharing it. + */ + if (!(ref = virHashLookup(sharedDisks, key))) + goto cleanup; + + if (ref == (void *)0x1) + goto cleanup; + + if (virGetDeviceUnprivSGIO(disk->src, NULL, &val) < 0) { + ret = -1; + goto cleanup; + } + + if ((val == 0 && + disk->cdbfilter == VIR_DOMAIN_DISK_CDB_FILTER_YES) ||
Does this do the right thing if disk->cdbfilter == VIR_DOMAIN_DISK_CDB_FILTER_DEFAULT (that is, the user didn't specify the cdbfilter attribute)? Again, this area of code may need tweaks if you take my suggestion of making rawio a three-value enum.
+ (val == 1 && + disk->cdbfilter == VIR_DOMAIN_DISK_CDB_FILTER_NO)) + goto cleanup; + + virReportError(VIR_ERR_INTERNAL_ERROR, + _("cdbfilter of shared disk '%s' conflicts with other " + "active domains"), disk->src);
INTERNAL_ERROR is wrong. I think VIR_ERR_OPERATION_INVALID fits better. -- Eric Blake eblake redhat com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On 2012年12月15日 05:49, Eric Blake wrote:
On 12/14/2012 08:44 AM, Osier Yang wrote:
This prevents the domain starting if the shared disk's setting conflicts with other active domain(s), E.g. A domain with "cdbfilter" set as "yes", however, another active domain is using it set as "no". --- src/qemu/qemu_process.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_process.h | 2 + 2 files changed, 53 insertions(+), 0 deletions(-)
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index bcea0ff..1e2baac 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3385,6 +3385,54 @@ qemuProcessReconnectAll(virConnectPtr conn, virQEMUDriverPtr driver) virHashForEach(driver->domains.objs, qemuProcessReconnectHelper,&data); }
+/* Check if a shared disk's setting conflicts with the conf + * used by other domain(s). Currently only checks the cdbfilter + * setting.
Probably worth documenting that this function should only be called on disks with a block device source...
+ * + * Returns 0 if no conflicts, otherwise returns -1. + */ +int +qemuCheckSharedDisk(virHashTablePtr sharedDisks, + virDomainDiskDefPtr disk) +{ + int val; + size_t *ref = NULL; + char *key = NULL; + int ret = 0; + + if (!(key = qemuGetSharedDiskKey(disk->src))) + return -1;
since you error out on regular files. (You correctly obey that restriction for now, but without documentation, it might be possible to reuse this function from the wrong context in a future patch.)
Okay.
+ + /* It can't be conflict if no other domain is + * is sharing it. + */ + if (!(ref = virHashLookup(sharedDisks, key))) + goto cleanup; + + if (ref == (void *)0x1) + goto cleanup; + + if (virGetDeviceUnprivSGIO(disk->src, NULL,&val)< 0) { + ret = -1; + goto cleanup; + } + + if ((val == 0&& + disk->cdbfilter == VIR_DOMAIN_DISK_CDB_FILTER_YES) ||
Does this do the right thing if disk->cdbfilter == VIR_DOMAIN_DISK_CDB_FILTER_DEFAULT (that is, the user didn't specify the cdbfilter attribute)?
Oh, right, there is problem here. It should be like: if (!disk->cdbfilter) return 0 in the early beginnng of the function. As VIR_DOMAIN_DISK_CDB_FILTER_DEFAULT implies it doesn't care about the SG_IO setting. I.E, It lives with what current setting is.
Again, this area of code may need tweaks if you take my suggestion of making rawio a three-value enum.
+ (val == 1&& + disk->cdbfilter == VIR_DOMAIN_DISK_CDB_FILTER_NO)) + goto cleanup; + + virReportError(VIR_ERR_INTERNAL_ERROR, + _("cdbfilter of shared disk '%s' conflicts with other " + "active domains"), disk->src);
INTERNAL_ERROR is wrong. I think VIR_ERR_OPERATION_INVALID fits better.
Okay, the error number is always fuzzy to use. :-)

Just like for domain starting, this sets the disk's unpriv_sgio if cdbfilter is specified when attaching disk. --- src/qemu/qemu_driver.c | 23 +++++++++++++++++------ 1 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index a2700de..7239d71 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -6046,12 +6046,23 @@ qemuDomainAttachDeviceDiskLive(virConnectPtr conn, NULLSTR(disk->src)); } - if (ret == 0 && - disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && - disk->shared) { - if (qemuAddSharedDisk(driver->sharedDisks, disk->src) < 0) - VIR_WARN("Failed to add disk '%s' to shared disk table", - disk->src); + if (ret == 0) { + if (disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && disk->shared) { + if (qemuAddSharedDisk(driver->sharedDisks, disk->src) < 0) + VIR_WARN("Failed to add disk '%s' to shared disk table", + disk->src); + } + + /* Set sysfs unpriv_sgio if cdbfilter is specified. */ + if (disk->cdbfilter) { + if (virSetDeviceUnprivSGIO(disk->src, + (disk->cdbfilter == + VIR_DOMAIN_DISK_CDB_FILTER_NO) + ? 1 : 0) < 0) { + VIR_WARN("Failed to set unpriv_sgio of disk '%s'", disk->src); + goto end; + } + } } end: -- 1.7.7.6

Just like for domain starting, this sets the disk's unpriv_sgio if cdbfilter is specified when attaching disk. --- src/qemu/qemu_driver.c | 23 +++++++++++++++++------ 1 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index a2700de..b7a59ab 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -6046,12 +6046,23 @@ qemuDomainAttachDeviceDiskLive(virConnectPtr conn, NULLSTR(disk->src)); } - if (ret == 0 && - disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && - disk->shared) { - if (qemuAddSharedDisk(driver->sharedDisks, disk->src) < 0) - VIR_WARN("Failed to add disk '%s' to shared disk table", - disk->src); + if (ret == 0) { + if (disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && disk->shared) { + if (qemuAddSharedDisk(driver->sharedDisks, disk->src) < 0) + VIR_WARN("Failed to add disk '%s' to shared disk table", + disk->src); + } + + /* Set sysfs unpriv_sgio if cdbfilter is specified. */ + if (disk->cdbfilter) { + if (virSetDeviceUnprivSGIO(disk->src, NULL, + (disk->cdbfilter == + VIR_DOMAIN_DISK_CDB_FILTER_NO) + ? 1 : 0) < 0) { + VIR_WARN("Failed to set unpriv_sgio of disk '%s'", disk->src); + goto end; + } + } } end: -- 1.7.7.6

On 12/14/2012 08:45 AM, Osier Yang wrote:
Just like for domain starting, this sets the disk's unpriv_sgio if cdbfilter is specified when attaching disk. --- src/qemu/qemu_driver.c | 23 +++++++++++++++++------ 1 files changed, 17 insertions(+), 6 deletions(-)
+ + /* Set sysfs unpriv_sgio if cdbfilter is specified. */ + if (disk->cdbfilter) { + if (virSetDeviceUnprivSGIO(disk->src, NULL, + (disk->cdbfilter == + VIR_DOMAIN_DISK_CDB_FILTER_NO) + ? 1 : 0) < 0) { + VIR_WARN("Failed to set unpriv_sgio of disk '%s'", disk->src); + goto end; + } + }
Again, does this do the right thing if some other domain already had cdbfilter='no' (aka kernel allows SG_IO) but this domain omitted the cdbfilter attribute (which defaults to cdbfilter='yes' but does not satisfy the if(disk->cdbfilter) condition)? I suspect that will be worked out when you rework things to use a tri-state rawio. -- Eric Blake eblake redhat com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On 2012年12月15日 06:05, Eric Blake wrote:
On 12/14/2012 08:45 AM, Osier Yang wrote:
Just like for domain starting, this sets the disk's unpriv_sgio if cdbfilter is specified when attaching disk. --- src/qemu/qemu_driver.c | 23 +++++++++++++++++------ 1 files changed, 17 insertions(+), 6 deletions(-)
+ + /* Set sysfs unpriv_sgio if cdbfilter is specified. */ + if (disk->cdbfilter) { + if (virSetDeviceUnprivSGIO(disk->src, NULL, + (disk->cdbfilter == + VIR_DOMAIN_DISK_CDB_FILTER_NO) + ? 1 : 0)< 0) { + VIR_WARN("Failed to set unpriv_sgio of disk '%s'", disk->src); + goto end; + } + }
Again, does this do the right thing if some other domain already had cdbfilter='no' (aka kernel allows SG_IO) but this domain omitted the cdbfilter attribute (which defaults to cdbfilter='yes' but does not satisfy the if(disk->cdbfilter) condition)?
I think it does the right thing, because not specifying "cdbfilter" should mean it doesn't care about what the kernel setting is, not means cdbfiler="yes" instead. Assuming that one used rawio="yes" for a domain, and if we defaults cdbfilter to "yes", it will cause regression, as the user still expects SG_IO commands can be executed on the disk, however, it's disabled by cdbfilter="yes".
I suspect that will be worked out when you rework things to use a tri-state rawio.
Of course, this is based on we use a separate XML tag like cdbfilter, it won't stand anymore if we finally use a tri-state rawio. Regards, Osier

On 12/16/2012 12:32 PM, Osier Yang wrote:
Again, does this do the right thing if some other domain already had cdbfilter='no' (aka kernel allows SG_IO) but this domain omitted the cdbfilter attribute (which defaults to cdbfilter='yes' but does not satisfy the if(disk->cdbfilter) condition)?
I think it does the right thing, because not specifying "cdbfilter" should mean it doesn't care about what the kernel setting is, not means cdbfiler="yes" instead.
Unfortunately, that's the wrong approach. We should be secure by default, and therefore we MUST treat an omitted attribute as though it meant the secure setting, and not that we don't care about the setting. In other words, if guest A uses cdbfilter='no' and guest B omits the cdbfilter attribute, then we must refuse to start guest B (ignoring the issue would mean that guest B could do a raw SG_IO command that would negatively impact guest A, and the premise of sVirt is that guest B cannot impact guest A unless it was explicitly documented as being permitted).
Of course, this is based on we use a separate XML tag like cdbfilter, it won't stand anymore if we finally use a tri-state rawio.
I know you reposted a newer version of this series, and that based on IRC conversations we talked about using a naming of sgio='filtered|unfiltered' instead of cdbfilter='yes|no'; I'll have to review that series. But I wanted to make sure that I left a comment in this thread (and not just IRC) why secure-by-default is the only safe way to behave when the new attribute is not present. -- Eric Blake eblake redhat com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On 2012年12月19日 02:03, Eric Blake wrote:
On 12/16/2012 12:32 PM, Osier Yang wrote:
Again, does this do the right thing if some other domain already had cdbfilter='no' (aka kernel allows SG_IO) but this domain omitted the cdbfilter attribute (which defaults to cdbfilter='yes' but does not satisfy the if(disk->cdbfilter) condition)?
I think it does the right thing, because not specifying "cdbfilter" should mean it doesn't care about what the kernel setting is, not means cdbfiler="yes" instead.
Unfortunately, that's the wrong approach. We should be secure by default, and therefore we MUST treat an omitted attribute as though it meant the secure setting, and not that we don't care about the setting. In other words, if guest A uses cdbfilter='no' and guest B omits the cdbfilter attribute, then we must refuse to start guest B (ignoring the issue would mean that guest B could do a raw SG_IO command that would negatively impact guest A, and the premise of sVirt is that guest B cannot impact guest A unless it was explicitly documented as being permitted).
Agreed.
Of course, this is based on we use a separate XML tag like cdbfilter, it won't stand anymore if we finally use a tri-state rawio.
I know you reposted a newer version of this series, and that based on IRC conversations we talked about using a naming of sgio='filtered|unfiltered' instead of cdbfilter='yes|no'; I'll have to review that series. But I wanted to make sure that I left a comment in this thread (and not just IRC) why secure-by-default is the only safe way to behave when the new attribute is not present.
Instead of defaulting the sgio to "filtered" when parsing conf. The new version defaults it in qemu driver. To distinguish the user explicit requested "filtered" and default "filtered", for the default "filtered", we will ignore it if the kernel doesn't support it, but for explictly requestd "filtered", we have to error out anyway. Otherwise we might drop into regression on the platform kernel is old enough doesn't support unpriv_sgio. Regards, Osier

Just like for domain starting, this checks if the shared disk's conf conflicts with others. --- src/qemu/qemu_driver.c | 5 +++++ 1 files changed, 5 insertions(+), 0 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 7239d71..eb0c921 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -5995,6 +5995,11 @@ qemuDomainAttachDeviceDiskLive(virConnectPtr conn, goto end; } + if (disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && + disk->shared && + (qemuCheckSharedDisk(driver->sharedDisks, disk) < 0)) + goto end; + if (qemuDomainDetermineDiskChain(driver, disk, false) < 0) goto end; -- 1.7.7.6

On 12/13/2012 12:05 PM, Osier Yang wrote:
Just like for domain starting, this checks if the shared disk's conf conflicts with others. --- src/qemu/qemu_driver.c | 5 +++++ 1 files changed, 5 insertions(+), 0 deletions(-)
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 7239d71..eb0c921 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -5995,6 +5995,11 @@ qemuDomainAttachDeviceDiskLive(virConnectPtr conn, goto end; }
+ if (disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && + disk->shared && + (qemuCheckSharedDisk(driver->sharedDisks, disk) < 0)) + goto end; +
Looks reasonable. Might be small enough to be worth squashing with one of your other patches (that is, does it buy us anything to be able to bisect to a state where starting a domain and hotplugging a disk have different logic on checking whether there is a conflict, or should we just have a single patch that introduces all conflict checks to both places). -- Eric Blake eblake redhat com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On 2012年12月15日 06:13, Eric Blake wrote:
On 12/13/2012 12:05 PM, Osier Yang wrote:
Just like for domain starting, this checks if the shared disk's conf conflicts with others. --- src/qemu/qemu_driver.c | 5 +++++ 1 files changed, 5 insertions(+), 0 deletions(-)
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 7239d71..eb0c921 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -5995,6 +5995,11 @@ qemuDomainAttachDeviceDiskLive(virConnectPtr conn, goto end; }
+ if (disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK&& + disk->shared&& + (qemuCheckSharedDisk(driver->sharedDisks, disk)< 0)) + goto end; +
Looks reasonable. Might be small enough to be worth squashing with one of your other patches (that is, does it buy us anything to be able to bisect to a state where starting a domain and hotplugging a disk have different logic on checking whether there is a conflict, or should we just have a single patch that introduces all conflict checks to both places).
Basicly I thought it's easier for reviewing. Squashing into one patch sounds fine.
participants (3)
-
Eric Blake
-
Jiri Denemark
-
Osier Yang