[libvirt] [PATCH 0/6 v9] Unprivileged SG_IO support

As a result of RFC [1], this implements the unprivleged SG_IO support. 1/6 and 2/6 are already acked. v8 - v9: * Just rebasing. v7 - v8: * Change the XML tag name from "cdbfilter" to "sgio", and to leave enough room for future values, the values of "sgio" are "filtered" and "unfiltered" now. v6 - v7: * No restoring of unpriv_sgio per Daniel's thought. * Use "major:minor" as the hash key per Jirka's suggestion. Osier Yang (6): util: Prepare helpers for unpriv_sgio setting qemu: Add a hash table for the shared disks docs: Add docs and rng schema for new XML tag sgio conf: Parse and format the new XML qemu: set unpriv_sgio when starting domain and attaching disk qemu: Check if the shared disk's cdbfilter conflicts with others docs/formatdomain.html.in | 14 ++- docs/schemas/domaincommon.rng | 54 +++++--- src/conf/domain_conf.c | 55 ++++++-- src/conf/domain_conf.h | 10 ++ src/libvirt_private.syms | 4 + src/qemu/qemu_conf.c | 86 ++++++++++++ src/qemu/qemu_conf.h | 12 ++ src/qemu/qemu_driver.c | 30 ++++ src/qemu/qemu_process.c | 99 ++++++++++++++ src/qemu/qemu_process.h | 4 + src/util/virutil.c | 140 ++++++++++++++++++++ src/util/virutil.h | 13 ++ ...qemuxml2argv-disk-scsi-lun-passthrough-sgio.xml | 32 +++++ tests/qemuxml2xmltest.c | 1 + 14 files changed, 522 insertions(+), 32 deletions(-) create mode 100644 tests/qemuxml2argvdata/qemuxml2argv-disk-scsi-lun-passthrough-sgio.xml [1] https://www.redhat.com/archives/libvir-list/2012-November/msg00988.html Regards, Osier

"virGetDeviceID" could be used across the sources, but it doesn't relate with this series, and could be done later. * src/util/virutil.h: (Declare virGetDeviceID, and vir{Get,Set}DeviceUnprivSGIO) * src/util/virutil.c: (Implement virGetDeviceID and vir{Get,Set}DeviceUnprivSGIO) * src/libvirt_private.syms: Export private symbols of upper helpers --- src/libvirt_private.syms | 3 + src/util/virutil.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++ src/util/virutil.h | 11 ++++ 3 files changed, 154 insertions(+), 0 deletions(-) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 497d5d3..6fba1d9 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1278,6 +1278,8 @@ virFileWaitForDevices; virFileWriteStr; virFindFileInPath; virFormatIntDecimal; +virGetDeviceID; +virGetDeviceUnprivSGIO; virGetGroupID; virGetGroupName; virGetHostname; @@ -1296,6 +1298,7 @@ virPipeReadUntilEOF; virScaleInteger; virSetBlocking; virSetCloseExec; +virSetDeviceUnprivSGIO; virSetInherit; virSetNonBlock; virSetUIDGID; diff --git a/src/util/virutil.c b/src/util/virutil.c index 78ca9e87..275489a 100644 --- a/src/util/virutil.c +++ b/src/util/virutil.c @@ -3129,3 +3129,143 @@ virStrIsPrint(const char *str) return true; } + +#if defined(major) && defined(minor) +int +virGetDeviceID(const char *path, int *maj, int *min) +{ + struct stat sb; + char *canonical_path = NULL; + + if (virFileResolveLink(path, &canonical_path) < 0) + return -errno; + + if (stat(canonical_path, &sb) < 0) { + VIR_FREE(canonical_path); + return -errno; + } + + if (!S_ISBLK(sb.st_mode)) { + VIR_FREE(canonical_path); + return -EINVAL; + } + + if (maj) + *maj = major(sb.st_rdev); + if (min) + *min = minor(sb.st_rdev); + + VIR_FREE(canonical_path); + return 0; +} +#else +int +virGetDeviceID(const char *path ATRRIBUTE_UNUSED, + int *maj ATRRIBUTE_UNUSED, + int *min ATRRIBUTE_UNUSED) +{ + + return -ENOSYS; +} +#endif + +#define SYSFS_DEV_BLOCK_PATH "/sys/dev/block" + +static char * +virGetUnprivSGIOSysfsPath(const char *path, + const char *sysfs_dir) +{ + int maj, min; + char *sysfs_path = NULL; + int rc; + + if ((rc = virGetDeviceID(path, &maj, &min)) < 0) { + virReportSystemError(-rc, + _("Unable to get device ID '%s'"), + path); + return NULL; + } + + if (virAsprintf(&sysfs_path, "%s/%d:%d/queue/unpriv_sgio", + sysfs_dir ? sysfs_dir : SYSFS_DEV_BLOCK_PATH, + maj, min) < 0) { + virReportOOMError(); + return NULL; + } + + return sysfs_path; +} + +int +virSetDeviceUnprivSGIO(const char *path, + const char *sysfs_dir, + int unpriv_sgio) +{ + char *sysfs_path = NULL; + char *val = NULL; + int ret = -1; + int rc; + + if (!(sysfs_path = virGetUnprivSGIOSysfsPath(path, sysfs_dir))) + return -1; + + if (!virFileExists(sysfs_path)) { + virReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("unpriv_sgio is not supported by this kernel")); + goto cleanup; + } + + if (virAsprintf(&val, "%d", unpriv_sgio) < 0) { + virReportOOMError(); + goto cleanup; + } + + if ((rc = virFileWriteStr(sysfs_path, val, 0)) < 0) { + virReportSystemError(-rc, _("failed to set %s"), sysfs_path); + goto cleanup; + } + + ret = 0; +cleanup: + VIR_FREE(sysfs_path); + VIR_FREE(val); + return ret; +} + +int +virGetDeviceUnprivSGIO(const char *path, + const char *sysfs_dir, + int *unpriv_sgio) +{ + char *sysfs_path = NULL; + char *buf = NULL; + char *tmp = NULL; + int ret = -1; + + if (!(sysfs_path = virGetUnprivSGIOSysfsPath(path, sysfs_dir))) + return -1; + + if (!virFileExists(sysfs_path)) { + virReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("unpriv_sgio is not supported by this kernel")); + goto cleanup; + } + + if (virFileReadAll(sysfs_path, 1024, &buf) < 0) + goto cleanup; + + if ((tmp = strchr(buf, '\n'))) + *tmp = '\0'; + + if (virStrToLong_i(buf, NULL, 10, unpriv_sgio) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("failed to parse value of %s"), sysfs_path); + goto cleanup; + } + + ret = 0; +cleanup: + VIR_FREE(sysfs_path); + VIR_FREE(buf); + return ret; +} diff --git a/src/util/virutil.h b/src/util/virutil.h index e5116ed..634dcaf 100644 --- a/src/util/virutil.h +++ b/src/util/virutil.h @@ -281,4 +281,15 @@ bool virIsDevMapperDevice(const char *dev_name) ATTRIBUTE_NONNULL(1); bool virValidateWWN(const char *wwn); bool virStrIsPrint(const char *str); + +int virGetDeviceID(const char *path, + int *maj, + int *min); +int virSetDeviceUnprivSGIO(const char *path, + const char *sysfs_dir, + int unpriv_sgio); +int virGetDeviceUnprivSGIO(const char *path, + const char *sysfs_dir, + int *unpriv_sgio); + #endif /* __VIR_UTIL_H__ */ -- 1.7.7.6

On 01/02/2013 07:37 AM, Osier Yang wrote:
"virGetDeviceID" could be used across the sources, but it doesn't relate with this series, and could be done later.
* src/util/virutil.h: (Declare virGetDeviceID, and vir{Get,Set}DeviceUnprivSGIO) * src/util/virutil.c: (Implement virGetDeviceID and vir{Get,Set}DeviceUnprivSGIO) * src/libvirt_private.syms: Export private symbols of upper helpers --- src/libvirt_private.syms | 3 + src/util/virutil.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++ src/util/virutil.h | 11 ++++ 3 files changed, 154 insertions(+), 0 deletions(-)
ACK.
+int +virSetDeviceUnprivSGIO(const char *path, + const char *sysfs_dir, + int unpriv_sgio) +{ + char *sysfs_path = NULL; + char *val = NULL; + int ret = -1; + int rc; + + if (!(sysfs_path = virGetUnprivSGIOSysfsPath(path, sysfs_dir))) + return -1; + + if (!virFileExists(sysfs_path)) { + virReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("unpriv_sgio is not supported by this kernel")); + goto cleanup;
If 'unpriv_sgio' is 0 here, does it make the logic in any later patches easier to return success in that case (you are setting things to the default)? But I'm okay with keeping it as a failure. -- Eric Blake eblake redhat com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On 2013年01月03日 08:01, Eric Blake wrote:
On 01/02/2013 07:37 AM, Osier Yang wrote:
"virGetDeviceID" could be used across the sources, but it doesn't relate with this series, and could be done later.
* src/util/virutil.h: (Declare virGetDeviceID, and vir{Get,Set}DeviceUnprivSGIO) * src/util/virutil.c: (Implement virGetDeviceID and vir{Get,Set}DeviceUnprivSGIO) * src/libvirt_private.syms: Export private symbols of upper helpers --- src/libvirt_private.syms | 3 + src/util/virutil.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++ src/util/virutil.h | 11 ++++ 3 files changed, 154 insertions(+), 0 deletions(-)
ACK.
+int +virSetDeviceUnprivSGIO(const char *path, + const char *sysfs_dir, + int unpriv_sgio) +{ + char *sysfs_path = NULL; + char *val = NULL; + int ret = -1; + int rc; + + if (!(sysfs_path = virGetUnprivSGIOSysfsPath(path, sysfs_dir))) + return -1; + + if (!virFileExists(sysfs_path)) { + virReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("unpriv_sgio is not supported by this kernel")); + goto cleanup;
If 'unpriv_sgio' is 0 here, does it make the logic in any later patches easier to return success in that case (you are setting things to the default)? But I'm okay with keeping it as a failure.
The "unpriv_sgio == 0" could be requested explicitly by user (not only the from the default), and in this case I'd think an error is better. Otherwise I could see one will raise bug like "sgio='filtered' succeeded, but sgio='unfiltered' failed" unless we document it somewhere. Regards, Osier

This introduces a hash table for qemu driver, to store the shared disk's info as (@major:minor, @ref_count). @ref_count is the number of domains which shares the disk. Since we only care about if the disk support unprivileged SG_IO commands, and the SG_IO commands only make sense for block disk, this patch only manages (add/remove hash entry) the shared disk for block disk. * src/qemu/qemu_conf.h: (Add member 'sharedDisks' of type virHashTablePtr; Declare helpers qemuGetSharedDiskKey, qemuAddSharedDisk and qemuRemoveSharedDisk) * src/qemu/qemu_conf.c (Implement the 3 helpers) * src/qemu/qemu_process.c (Update 'sharedDisks' when domain starting and shutdown) * src/qemu/qemu_driver.c (Update 'sharedDisks' when attaching or detaching disk). --- src/qemu/qemu_conf.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_conf.h | 12 ++++++ src/qemu/qemu_driver.c | 22 ++++++++++++ src/qemu/qemu_process.c | 15 ++++++++ 4 files changed, 135 insertions(+), 0 deletions(-) diff --git a/src/qemu/qemu_conf.c b/src/qemu/qemu_conf.c index c6deb10..8440247 100644 --- a/src/qemu/qemu_conf.c +++ b/src/qemu/qemu_conf.c @@ -552,3 +552,89 @@ qemuDriverCloseCallbackRunAll(virQEMUDriverPtr driver, virHashForEach(driver->closeCallbacks, qemuDriverCloseCallbackRun, &data); } + +/* Construct the hash key for sharedDisks as "major:minor" */ +char * +qemuGetSharedDiskKey(const char *disk_path) +{ + int major, minor; + char *key = NULL; + int rc; + + if ((rc = virGetDeviceID(disk_path, &major, &minor)) < 0) { + virReportSystemError(-rc, + _("Unable to get minor number of device '%s'"), + disk_path); + return NULL; + } + + if (virAsprintf(&key, "%d:%d", major, minor) < 0) { + virReportOOMError(); + return NULL; + } + + return key; +} + +/* Increase ref count if the entry already exists, otherwise + * add a new entry. + */ +int +qemuAddSharedDisk(virHashTablePtr sharedDisks, + const char *disk_path) +{ + size_t *ref = NULL; + char *key = NULL; + + if (!(key = qemuGetSharedDiskKey(disk_path))) + return -1; + + if ((ref = virHashLookup(sharedDisks, key))) { + if (virHashUpdateEntry(sharedDisks, key, ++ref) < 0) { + VIR_FREE(key); + return -1; + } + } else { + if (virHashAddEntry(sharedDisks, key, (void *)0x1)) { + VIR_FREE(key); + return -1; + } + } + + VIR_FREE(key); + return 0; +} + +/* Decrease the ref count if the entry already exists, otherwise + * remove the entry. + */ +int +qemuRemoveSharedDisk(virHashTablePtr sharedDisks, + const char *disk_path) +{ + size_t *ref = NULL; + char *key = NULL; + + if (!(key = qemuGetSharedDiskKey(disk_path))) + return -1; + + if (!(ref = virHashLookup(sharedDisks, key))) { + VIR_FREE(key); + return -1; + } + + if (ref != (void *)0x1) { + if (virHashUpdateEntry(sharedDisks, key, --ref) < 0) { + VIR_FREE(key); + return -1; + } + } else { + if (virHashRemoveEntry(sharedDisks, key) < 0) { + VIR_FREE(key); + return -1; + } + } + + VIR_FREE(key); + return 0; +} diff --git a/src/qemu/qemu_conf.h b/src/qemu/qemu_conf.h index 1aa56cc..965eff7 100644 --- a/src/qemu/qemu_conf.h +++ b/src/qemu/qemu_conf.h @@ -148,6 +148,8 @@ struct _virQEMUDriver { /* The devices which is are not in use by the host or any guest. */ pciDeviceList *inactivePciHostdevs; + virHashTablePtr sharedDisks; + virBitmapPtr reservedRemotePorts; virSysinfoDefPtr hostsysinfo; @@ -212,4 +214,14 @@ qemuDriverCloseCallback qemuDriverCloseCallbackGet(virQEMUDriverPtr driver, void qemuDriverCloseCallbackRunAll(virQEMUDriverPtr driver, virConnectPtr conn); +int qemuAddSharedDisk(virHashTablePtr sharedDisks, + const char *disk_path) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2); + +int qemuRemoveSharedDisk(virHashTablePtr sharedDisks, + const char *disk_path) + ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2); +char * qemuGetSharedDiskKey(const char *disk_path) + ATTRIBUTE_NONNULL(1); + #endif /* __QEMUD_CONF_H */ diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index b8bb36b..bdaabdf 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -843,6 +843,9 @@ qemuStartup(bool privileged, if ((qemu_driver->inactivePciHostdevs = pciDeviceListNew()) == NULL) goto error; + if (!(qemu_driver->sharedDisks = virHashCreate(30, NULL))) + goto error; + if (privileged) { if (chown(qemu_driver->libDir, qemu_driver->user, qemu_driver->group) < 0) { virReportSystemError(errno, @@ -1096,6 +1099,7 @@ qemuShutdown(void) { pciDeviceListFree(qemu_driver->activePciHostdevs); pciDeviceListFree(qemu_driver->inactivePciHostdevs); usbDeviceListFree(qemu_driver->activeUsbHostdevs); + virHashFree(qemu_driver->sharedDisks); virCapabilitiesFree(qemu_driver->caps); qemuCapsCacheFree(qemu_driver->capsCache); @@ -5860,6 +5864,15 @@ qemuDomainAttachDeviceDiskLive(virConnectPtr conn, VIR_WARN("Failed to teardown cgroup for disk path %s", NULLSTR(disk->src)); } + + if (ret == 0 && + disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && + disk->shared) { + if (qemuAddSharedDisk(driver->sharedDisks, disk->src) < 0) + VIR_WARN("Failed to add disk '%s' to shared disk table", + disk->src); + } + end: if (cgroup) virCgroupFree(&cgroup); @@ -5974,6 +5987,15 @@ qemuDomainDetachDeviceDiskLive(virQEMUDriverPtr driver, virDomainDiskDeviceTypeToString(disk->type)); break; } + + if (ret == 0 && + disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && + disk->shared) { + if (qemuRemoveSharedDisk(driver->sharedDisks, disk->src) < 0) + VIR_WARN("Failed to remove disk '%s' from shared disk table", + disk->src); + } + return ret; } diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index eac6553..2a1b6b7 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3719,6 +3719,8 @@ int qemuProcessStart(virConnectPtr conn, /* in case a certain disk is desirous of CAP_SYS_RAWIO, add this */ for (i = 0; i < vm->def->ndisks; i++) { + virDomainDiskDefPtr disk = vm->def->disks[i]; + if (vm->def->disks[i]->rawio == 1) #ifdef CAP_SYS_RAWIO virCommandAllowCap(cmd, CAP_SYS_RAWIO); @@ -3726,6 +3728,11 @@ int qemuProcessStart(virConnectPtr conn, virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", _("Raw I/O is not supported on this platform")); #endif + + if (disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && disk->shared) { + if (qemuAddSharedDisk(driver->sharedDisks, disk->src) < 0) + goto cleanup; + } } virCommandSetPreExecHook(cmd, qemuProcessHook, &hookData); @@ -4122,6 +4129,14 @@ void qemuProcessStop(virQEMUDriverPtr driver, flags & VIR_QEMU_PROCESS_STOP_MIGRATED); virSecurityManagerReleaseLabel(driver->securityManager, vm->def); + for (i = 0; i < vm->def->ndisks; i++) { + virDomainDiskDefPtr disk = vm->def->disks[i]; + + if (disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && disk->shared) { + ignore_value(qemuRemoveSharedDisk(driver->sharedDisks, disk->src)); + } + } + /* Clear out dynamically assigned labels */ for (i = 0; i < vm->def->nseclabels; i++) { if (vm->def->seclabels[i]->type == VIR_DOMAIN_SECLABEL_DYNAMIC) { -- 1.7.7.6

On 01/02/2013 07:37 AM, Osier Yang wrote:
This introduces a hash table for qemu driver, to store the shared disk's info as (@major:minor, @ref_count). @ref_count is the number of domains which shares the disk.
Since we only care about if the disk support unprivileged SG_IO commands, and the SG_IO commands only make sense for block disk, this patch only manages (add/remove hash entry) the shared disk for block disk.
* src/qemu/qemu_conf.h: (Add member 'sharedDisks' of type virHashTablePtr; Declare helpers qemuGetSharedDiskKey, qemuAddSharedDisk and qemuRemoveSharedDisk) * src/qemu/qemu_conf.c (Implement the 3 helpers) * src/qemu/qemu_process.c (Update 'sharedDisks' when domain starting and shutdown) * src/qemu/qemu_driver.c (Update 'sharedDisks' when attaching or detaching disk). --- src/qemu/qemu_conf.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_conf.h | 12 ++++++ src/qemu/qemu_driver.c | 22 ++++++++++++ src/qemu/qemu_process.c | 15 ++++++++ 4 files changed, 135 insertions(+), 0 deletions(-)
ACK, as I'd like to see this go in sooner rather than later, and what you have works. However...
diff --git a/src/qemu/qemu_conf.c b/src/qemu/qemu_conf.c index c6deb10..8440247 100644 --- a/src/qemu/qemu_conf.c +++ b/src/qemu/qemu_conf.c @@ -552,3 +552,89 @@ qemuDriverCloseCallbackRunAll(virQEMUDriverPtr driver,
virHashForEach(driver->closeCallbacks, qemuDriverCloseCallbackRun, &data); } + +/* Construct the hash key for sharedDisks as "major:minor" */ +char * +qemuGetSharedDiskKey(const char *disk_path) +{
Why are we converting major and minor into a malloc'd char*,...
+ int major, minor;
Here, I'd use 'maj' and 'min' to avoid any shadowing of the major() and minor() macros.
+int +qemuAddSharedDisk(virHashTablePtr sharedDisks, + const char *disk_path) +{ + size_t *ref = NULL; + char *key = NULL; + + if (!(key = qemuGetSharedDiskKey(disk_path))) + return -1; + + if ((ref = virHashLookup(sharedDisks, key))) {
...when you could just use a single int value comprising the combination of major and minor as the hash key, if only you had used virHashCreateFull() with custom comparators. That would be more efficient (no need to malloc each kay, comparisons are much faster as an integer compare instead of a strcmp, and so on). It may be worth a followup patch that re-does the hash table to be more efficient.
+++ b/src/qemu/qemu_driver.c @@ -843,6 +843,9 @@ qemuStartup(bool privileged, if ((qemu_driver->inactivePciHostdevs = pciDeviceListNew()) == NULL) goto error;
+ if (!(qemu_driver->sharedDisks = virHashCreate(30, NULL)))
Here's where the virHashCreateFull would let you avoid having to convert the key into a string. -- Eric Blake eblake redhat com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On 2013年01月04日 04:02, Eric Blake wrote:
On 01/02/2013 07:37 AM, Osier Yang wrote:
This introduces a hash table for qemu driver, to store the shared disk's info as (@major:minor, @ref_count). @ref_count is the number of domains which shares the disk.
Since we only care about if the disk support unprivileged SG_IO commands, and the SG_IO commands only make sense for block disk, this patch only manages (add/remove hash entry) the shared disk for block disk.
* src/qemu/qemu_conf.h: (Add member 'sharedDisks' of type virHashTablePtr; Declare helpers qemuGetSharedDiskKey, qemuAddSharedDisk and qemuRemoveSharedDisk) * src/qemu/qemu_conf.c (Implement the 3 helpers) * src/qemu/qemu_process.c (Update 'sharedDisks' when domain starting and shutdown) * src/qemu/qemu_driver.c (Update 'sharedDisks' when attaching or detaching disk). --- src/qemu/qemu_conf.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_conf.h | 12 ++++++ src/qemu/qemu_driver.c | 22 ++++++++++++ src/qemu/qemu_process.c | 15 ++++++++ 4 files changed, 135 insertions(+), 0 deletions(-)
ACK, as I'd like to see this go in sooner rather than later, and what you have works. However...
diff --git a/src/qemu/qemu_conf.c b/src/qemu/qemu_conf.c index c6deb10..8440247 100644 --- a/src/qemu/qemu_conf.c +++ b/src/qemu/qemu_conf.c @@ -552,3 +552,89 @@ qemuDriverCloseCallbackRunAll(virQEMUDriverPtr driver,
virHashForEach(driver->closeCallbacks, qemuDriverCloseCallbackRun,&data); } + +/* Construct the hash key for sharedDisks as "major:minor" */ +char * +qemuGetSharedDiskKey(const char *disk_path) +{
Why are we converting major and minor into a malloc'd char*,...
+ int major, minor;
Here, I'd use 'maj' and 'min' to avoid any shadowing of the major() and minor() macros.
Oh, okay, I missed that.
+int +qemuAddSharedDisk(virHashTablePtr sharedDisks, + const char *disk_path) +{ + size_t *ref = NULL; + char *key = NULL; + + if (!(key = qemuGetSharedDiskKey(disk_path))) + return -1; + + if ((ref = virHashLookup(sharedDisks, key))) {
...when you could just use a single int value comprising the combination of major and minor as the hash key, if only you had used
Sounds good, how about something like "805516", where "8" is major, and "16" is the minor? It should be small enough to be not overflowing.
virHashCreateFull() with custom comparators. That would be more efficient (no need to malloc each kay, comparisons are much faster as an integer compare instead of a strcmp, and so on). It may be worth a followup patch that re-does the hash table to be more efficient.
+++ b/src/qemu/qemu_driver.c @@ -843,6 +843,9 @@ qemuStartup(bool privileged, if ((qemu_driver->inactivePciHostdevs = pciDeviceListNew()) == NULL) goto error;
+ if (!(qemu_driver->sharedDisks = virHashCreate(30, NULL)))
Here's where the virHashCreateFull would let you avoid having to convert the key into a string.

On 01/03/2013 08:35 PM, Osier Yang wrote:
...when you could just use a single int value comprising the combination of major and minor as the hash key, if only you had used
Sounds good, how about something like "805516", where "8" is major, and "16" is the minor? It should be small enough to be not overflowing.
In fact, we already know of a single integer value large enough to contain both major and minor at the same time: the original st_rdev (of type dev_t) from stat(). We could simply add 'verify(sizeof(dev_t) <= sizeof(void*))', then reconstruct the st_rdev value by undoing the decomposition done by the major() and minor() macros. Or, you could even tweak patch 1/6 to return st_rdev as a single value instead of major/minor as two separate values. -- Eric Blake eblake redhat com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On 2013年01月04日 22:30, Eric Blake wrote:
On 01/03/2013 08:35 PM, Osier Yang wrote:
...when you could just use a single int value comprising the combination of major and minor as the hash key, if only you had used
Sounds good, how about something like "805516", where "8" is major, and "16" is the minor? It should be small enough to be not overflowing.
In fact, we already know of a single integer value large enough to contain both major and minor at the same time: the original st_rdev (of type dev_t) from stat(). We could simply add 'verify(sizeof(dev_t)<= sizeof(void*))', then reconstruct the st_rdev value by undoing the decomposition done by the major() and minor() macros. Or, you could even tweak patch 1/6 to return st_rdev as a single value instead of major/minor as two separate values.
Good, I will create a follow up patch, but later than pushing this set. Thanks, Osier

This introduces new XML tag "sgio" for disk, its valid values are "filtered" and "unfiltered", setting it as "filtered" will set the disk's unpriv_sgio to 0, and "unfiltered" to set it as 1, which allows the unprivileged SG_IO commands. --- docs/formatdomain.html.in | 14 ++++++++++- docs/schemas/domaincommon.rng | 54 +++++++++++++++++++++++++++------------- 2 files changed, 49 insertions(+), 19 deletions(-) diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in index 94df6f8..5e37b92 100644 --- a/docs/formatdomain.html.in +++ b/docs/formatdomain.html.in @@ -1395,7 +1395,19 @@ rawio='yes', rawio capability will be enabled for all disks in the domain (because, in the case of QEMU, this capability can only be set on a per-process basis). This attribute is only - valid when device is "lun". + valid when device is "lun". NB, <code>rawio</code> intends to + confine the capability per-device, however, current QEMU + implementation gives the domain process broader capability + than that (per-process basis, affects all the domain disks). + To confine the capability as much as possible for QEMU driver + as this stage, <code>sgio</code> is recommended, it's more + secure than <code>rawio</code>. + The optional <code>sgio</code> attribute indicates whether the + kernel will filter unprivileged SG_IO commands for the disk, + valid settings are "filtered" or "unfiltered". Defaults to + "filtered". Same with <code>rawio</code>, <code>sgio</code> + is only valid for device 'lun'. + <span class="since">since 1.0.2</span> The optional <code>snapshot</code> attribute indicates the default behavior of the disk during disk snapshots: "internal" requires a file format such as qcow2 that can store both the diff --git a/docs/schemas/domaincommon.rng b/docs/schemas/domaincommon.rng index 0529d62..7a97781 100644 --- a/docs/schemas/domaincommon.rng +++ b/docs/schemas/domaincommon.rng @@ -971,24 +971,42 @@ --> <define name="disk"> <element name="disk"> - <optional> - <attribute name="device"> - <choice> - <value>floppy</value> - <value>disk</value> - <value>cdrom</value> - <value>lun</value> - </choice> - </attribute> - </optional> - <optional> - <attribute name="rawio"> - <choice> - <value>yes</value> - <value>no</value> - </choice> - </attribute> - </optional> + <choice> + <group> + <optional> + <attribute name="device"> + <choice> + <value>floppy</value> + <value>disk</value> + <value>cdrom</value> + </choice> + </attribute> + </optional> + </group> + <group> + <attribute name="device"> + <choice> + <value>lun</value> + </choice> + </attribute> + <optional> + <attribute name="rawio"> + <choice> + <value>yes</value> + <value>no</value> + </choice> + </attribute> + </optional> + <optional> + <attribute name="sgio"> + <choice> + <value>filtered</value> + <value>unfiltered</value> + </choice> + </attribute> + </optional> + </group> + </choice> <optional> <ref name="snapshot"/> </optional> -- 1.7.7.6

On 01/02/2013 07:37 AM, Osier Yang wrote:
This introduces new XML tag "sgio" for disk, its valid values are "filtered" and "unfiltered", setting it as "filtered" will set the disk's unpriv_sgio to 0, and "unfiltered" to set it as 1, which allows the unprivileged SG_IO commands. --- docs/formatdomain.html.in | 14 ++++++++++- docs/schemas/domaincommon.rng | 54 +++++++++++++++++++++++++++------------- 2 files changed, 49 insertions(+), 19 deletions(-)
diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in index 94df6f8..5e37b92 100644 --- a/docs/formatdomain.html.in +++ b/docs/formatdomain.html.in @@ -1395,7 +1395,19 @@ rawio='yes', rawio capability will be enabled for all disks in the domain (because, in the case of QEMU, this capability can only be set on a per-process basis). This attribute is only - valid when device is "lun". + valid when device is "lun". NB, <code>rawio</code> intends to + confine the capability per-device, however, current QEMU + implementation gives the domain process broader capability + than that (per-process basis, affects all the domain disks). + To confine the capability as much as possible for QEMU driver + as this stage, <code>sgio</code> is recommended, it's more + secure than <code>rawio</code>. + The optional <code>sgio</code> attribute indicates whether the
For consistency with how we did it for 'rawio': The optional <code>sgio</code> attribute (<span class="since">since 1.0.2</span>) indicates...
+ kernel will filter unprivileged SG_IO commands for the disk, + valid settings are "filtered" or "unfiltered". Defaults to + "filtered". Same with <code>rawio</code>, <code>sgio</code>
s/Same with/Similar to/
+ is only valid for device 'lun'. + <span class="since">since 1.0.2</span>
...then drop the <span> here.
+ <group> + <attribute name="device"> + <choice> + <value>lun</value> + </choice>
Technically, the <choice> isn't needed here (but it doesn't hurt either). ACK with the grammar cleaned up. -- Eric Blake eblake redhat com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On 2013年01月04日 05:25, Eric Blake wrote:
On 01/02/2013 07:37 AM, Osier Yang wrote:
This introduces new XML tag "sgio" for disk, its valid values are "filtered" and "unfiltered", setting it as "filtered" will set the disk's unpriv_sgio to 0, and "unfiltered" to set it as 1, which allows the unprivileged SG_IO commands. --- docs/formatdomain.html.in | 14 ++++++++++- docs/schemas/domaincommon.rng | 54 +++++++++++++++++++++++++++------------- 2 files changed, 49 insertions(+), 19 deletions(-)
diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in index 94df6f8..5e37b92 100644 --- a/docs/formatdomain.html.in +++ b/docs/formatdomain.html.in @@ -1395,7 +1395,19 @@ rawio='yes', rawio capability will be enabled for all disks in the domain (because, in the case of QEMU, this capability can only be set on a per-process basis). This attribute is only - valid when device is "lun". + valid when device is "lun". NB,<code>rawio</code> intends to + confine the capability per-device, however, current QEMU + implementation gives the domain process broader capability + than that (per-process basis, affects all the domain disks). + To confine the capability as much as possible for QEMU driver + as this stage,<code>sgio</code> is recommended, it's more + secure than<code>rawio</code>. + The optional<code>sgio</code> attribute indicates whether the
For consistency with how we did it for 'rawio':
The optional<code>sgio</code> attribute (<span class="since">since 1.0.2</span>) indicates...
Okay,
+ kernel will filter unprivileged SG_IO commands for the disk, + valid settings are "filtered" or "unfiltered". Defaults to + "filtered". Same with<code>rawio</code>,<code>sgio</code>
s/Same with/Similar to/
Okay,
+ is only valid for device 'lun'. +<span class="since">since 1.0.2</span>
...then drop the<span> here.
+<group> +<attribute name="device"> +<choice> +<value>lun</value> +</choice>
Technically, the<choice> isn't needed here (but it doesn't hurt either).
ACK with the grammar cleaned up.
Will make the change when pushing, thanks. Osier

Like "rawio", "sgio" is only allowed for block disk of device type "lun". It doesn't default disk->sgio to "filtered" when parsing, as it won't be able to distinguish explicitly requested "filtered" and a default "filtered" in driver then. We have to error out for explicit request when the kernel doesn't support the new sysfs knob "unpriv_sgio", however, for defaulted "filtered", we can just ignore it if the kernel doesn't support "unpriv_sgio". --- src/conf/domain_conf.c | 55 +++++++++++++++----- src/conf/domain_conf.h | 10 ++++ ...qemuxml2argv-disk-scsi-lun-passthrough-sgio.xml | 32 +++++++++++ tests/qemuxml2xmltest.c | 1 + 4 files changed, 85 insertions(+), 13 deletions(-) create mode 100644 tests/qemuxml2argvdata/qemuxml2argv-disk-scsi-lun-passthrough-sgio.xml diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 79af087..b2bc57a 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -241,6 +241,12 @@ VIR_ENUM_IMPL(virDomainDiskIo, VIR_DOMAIN_DISK_IO_LAST, "default", "native", "threads") + +VIR_ENUM_IMPL(virDomainDiskSGIO, VIR_DOMAIN_DISK_SGIO_LAST, + "default", + "filtered", + "unfiltered") + VIR_ENUM_IMPL(virDomainIoEventFd, VIR_DOMAIN_IO_EVENT_FD_LAST, "default", "on", @@ -3593,6 +3599,7 @@ virDomainDiskDefParseXML(virCapsPtr caps, char *device = NULL; char *snapshot = NULL; char *rawio = NULL; + char *sgio = NULL; char *driverName = NULL; char *driverType = NULL; char *source = NULL; @@ -3657,6 +3664,7 @@ virDomainDiskDefParseXML(virCapsPtr caps, snapshot = virXMLPropString(node, "snapshot"); rawio = virXMLPropString(node, "rawio"); + sgio = virXMLPropString(node, "sgio"); cur = node->children; while (cur != NULL) { @@ -4107,22 +4115,32 @@ virDomainDiskDefParseXML(virCapsPtr caps, def->snapshot = VIR_DOMAIN_SNAPSHOT_LOCATION_NONE; } + if ((rawio || sgio) && + (def->device != VIR_DOMAIN_DISK_DEVICE_LUN)) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("rawio or sgio can be used only with " + "device='lun'")); + goto error; + } + if (rawio) { def->rawio_specified = true; - if (def->device == VIR_DOMAIN_DISK_DEVICE_LUN) { - if (STREQ(rawio, "yes")) { - def->rawio = 1; - } else if (STREQ(rawio, "no")) { - def->rawio = 0; - } else { - virReportError(VIR_ERR_XML_ERROR, - _("unknown disk rawio setting '%s'"), - rawio); - goto error; - } + if (STREQ(rawio, "yes")) { + def->rawio = 1; + } else if (STREQ(rawio, "no")) { + def->rawio = 0; } else { - virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", - _("rawio can be used only with device='lun'")); + virReportError(VIR_ERR_XML_ERROR, + _("unknown disk rawio setting '%s'"), + rawio); + goto error; + } + } + + if (sgio) { + if ((def->sgio = virDomainDiskSGIOTypeFromString(sgio)) <= 0) { + virReportError(VIR_ERR_XML_ERROR, + _("unknown disk sgio mode '%s'"), sgio); goto error; } } @@ -4365,6 +4383,7 @@ cleanup: VIR_FREE(type); VIR_FREE(snapshot); VIR_FREE(rawio); + VIR_FREE(sgio); VIR_FREE(target); VIR_FREE(source); VIR_FREE(tray); @@ -12132,6 +12151,7 @@ virDomainDiskDefFormat(virBufferPtr buf, const char *event_idx = virDomainVirtioEventIdxTypeToString(def->event_idx); const char *copy_on_read = virDomainVirtioEventIdxTypeToString(def->copy_on_read); const char *startupPolicy = virDomainStartupPolicyTypeToString(def->startupPolicy); + const char *sgio = virDomainDiskSGIOTypeToString(def->sgio); int n; char uuidstr[VIR_UUID_STRING_BUFLEN]; @@ -12161,6 +12181,11 @@ virDomainDiskDefFormat(virBufferPtr buf, _("unexpected disk io mode %d"), def->iomode); return -1; } + if (!sgio) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("Unexpected disk sgio mode '%d'"), def->sgio); + return -1; + } virBufferAsprintf(buf, " <disk type='%s' device='%s'", @@ -12172,6 +12197,10 @@ virDomainDiskDefFormat(virBufferPtr buf, virBufferAddLit(buf, " rawio='no'"); } } + + if (def->sgio) + virBufferAsprintf(buf, " sgio='%s'", sgio); + if (def->snapshot && !(def->snapshot == VIR_DOMAIN_SNAPSHOT_LOCATION_NONE && def->readonly)) virBufferAsprintf(buf, " snapshot='%s'", diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index a975a63..4d836fd 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -566,6 +566,14 @@ enum virDomainDiskSecretType { VIR_DOMAIN_DISK_SECRET_TYPE_LAST }; +enum virDomainDiskSGIO { + VIR_DOMAIN_DISK_SGIO_DEFAULT = 0, + VIR_DOMAIN_DISK_SGIO_FILTERED, + VIR_DOMAIN_DISK_SGIO_UNFILTERED, + + VIR_DOMAIN_DISK_SGIO_LAST +}; + typedef struct _virDomainBlockIoTuneInfo virDomainBlockIoTuneInfo; struct _virDomainBlockIoTuneInfo { unsigned long long total_bytes_sec; @@ -638,6 +646,7 @@ struct _virDomainDiskDef { virStorageEncryptionPtr encryption; bool rawio_specified; int rawio; /* no = 0, yes = 1 */ + int sgio; size_t nseclabels; virSecurityDeviceLabelDefPtr *seclabels; @@ -2232,6 +2241,7 @@ VIR_ENUM_DECL(virDomainDiskProtocol) VIR_ENUM_DECL(virDomainDiskProtocolTransport) VIR_ENUM_DECL(virDomainDiskIo) VIR_ENUM_DECL(virDomainDiskSecretType) +VIR_ENUM_DECL(virDomainDiskSGIO) VIR_ENUM_DECL(virDomainDiskTray) VIR_ENUM_DECL(virDomainIoEventFd) VIR_ENUM_DECL(virDomainVirtioEventIdx) diff --git a/tests/qemuxml2argvdata/qemuxml2argv-disk-scsi-lun-passthrough-sgio.xml b/tests/qemuxml2argvdata/qemuxml2argv-disk-scsi-lun-passthrough-sgio.xml new file mode 100644 index 0000000..eecf609 --- /dev/null +++ b/tests/qemuxml2argvdata/qemuxml2argv-disk-scsi-lun-passthrough-sgio.xml @@ -0,0 +1,32 @@ +<domain type='qemu'> + <name>QEMUGuest1</name> + <uuid>c7a5fdbd-edaf-9455-926a-d65c16db1809</uuid> + <memory unit='KiB'>219136</memory> + <currentMemory unit='KiB'>219136</currentMemory> + <vcpu placement='static'>1</vcpu> + <os> + <type arch='i686' machine='pc'>hvm</type> + <boot dev='hd'/> + </os> + <clock offset='utc'/> + <on_poweroff>destroy</on_poweroff> + <on_reboot>restart</on_reboot> + <on_crash>destroy</on_crash> + <devices> + <emulator>/usr/bin/qemu</emulator> + <disk type='block' device='lun' rawio='no' sgio='unfiltered'> + <source dev='/dev/HostVG/QEMUGuest1'/> + <target dev='hda' bus='scsi'/> + <address type='drive' controller='0' bus='0' target='0' unit='0'/> + </disk> + <disk type='block' device='lun' sgio='filtered'> + <source dev='/dev/HostVG/QEMUGuest2'/> + <target dev='hda' bus='scsi'/> + <address type='drive' controller='0' bus='0' target='1' unit='1'/> + </disk> + <controller type='scsi' index='0' model='virtio-scsi'/> + <controller type='scsi' index='1' model='lsilogic'/> + <controller type='usb' index='0'/> + <memballoon model='virtio'/> + </devices> +</domain> diff --git a/tests/qemuxml2xmltest.c b/tests/qemuxml2xmltest.c index c2d3dd3..160e958 100644 --- a/tests/qemuxml2xmltest.c +++ b/tests/qemuxml2xmltest.c @@ -238,6 +238,7 @@ mymain(void) DO_TEST("seclabel-static"); DO_TEST("seclabel-none"); DO_TEST("numad-static-vcpu-no-numatune"); + DO_TEST("disk-scsi-lun-passthrough-sgio"); DO_TEST("disk-scsi-disk-vpd"); -- 1.7.7.6

On 01/02/2013 07:37 AM, Osier Yang wrote:
Like "rawio", "sgio" is only allowed for block disk of device type "lun".
It doesn't default disk->sgio to "filtered" when parsing, as it won't be able to distinguish explicitly requested "filtered" and a default "filtered" in driver then. We have to error out for explicit request when the kernel doesn't support the new sysfs knob "unpriv_sgio", however, for defaulted "filtered", we can just ignore it if the kernel doesn't support "unpriv_sgio". --- src/conf/domain_conf.c | 55 +++++++++++++++----- src/conf/domain_conf.h | 10 ++++ ...qemuxml2argv-disk-scsi-lun-passthrough-sgio.xml | 32 +++++++++++ tests/qemuxml2xmltest.c | 1 + 4 files changed, 85 insertions(+), 13 deletions(-) create mode 100644 tests/qemuxml2argvdata/qemuxml2argv-disk-scsi-lun-passthrough-sgio.xml
+enum virDomainDiskSGIO { + VIR_DOMAIN_DISK_SGIO_DEFAULT = 0, + VIR_DOMAIN_DISK_SGIO_FILTERED, + VIR_DOMAIN_DISK_SGIO_UNFILTERED, + + VIR_DOMAIN_DISK_SGIO_LAST +}; + typedef struct _virDomainBlockIoTuneInfo virDomainBlockIoTuneInfo; struct _virDomainBlockIoTuneInfo { unsigned long long total_bytes_sec; @@ -638,6 +646,7 @@ struct _virDomainDiskDef { virStorageEncryptionPtr encryption; bool rawio_specified; int rawio; /* no = 0, yes = 1 */
Don't know why we didn't make this 'bool', but that's pre-existing and would be a separate cleanup patch.
+ int sgio;
I'd add /* enum virDomainDiskSGIO */, to make it easier to see what goes in this int. ACK. -- Eric Blake eblake redhat com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On 2013年01月04日 08:21, Eric Blake wrote:
On 01/02/2013 07:37 AM, Osier Yang wrote:
Like "rawio", "sgio" is only allowed for block disk of device type "lun".
It doesn't default disk->sgio to "filtered" when parsing, as it won't be able to distinguish explicitly requested "filtered" and a default "filtered" in driver then. We have to error out for explicit request when the kernel doesn't support the new sysfs knob "unpriv_sgio", however, for defaulted "filtered", we can just ignore it if the kernel doesn't support "unpriv_sgio". --- src/conf/domain_conf.c | 55 +++++++++++++++----- src/conf/domain_conf.h | 10 ++++ ...qemuxml2argv-disk-scsi-lun-passthrough-sgio.xml | 32 +++++++++++ tests/qemuxml2xmltest.c | 1 + 4 files changed, 85 insertions(+), 13 deletions(-) create mode 100644 tests/qemuxml2argvdata/qemuxml2argv-disk-scsi-lun-passthrough-sgio.xml
+enum virDomainDiskSGIO { + VIR_DOMAIN_DISK_SGIO_DEFAULT = 0, + VIR_DOMAIN_DISK_SGIO_FILTERED, + VIR_DOMAIN_DISK_SGIO_UNFILTERED, + + VIR_DOMAIN_DISK_SGIO_LAST +}; + typedef struct _virDomainBlockIoTuneInfo virDomainBlockIoTuneInfo; struct _virDomainBlockIoTuneInfo { unsigned long long total_bytes_sec; @@ -638,6 +646,7 @@ struct _virDomainDiskDef { virStorageEncryptionPtr encryption; bool rawio_specified; int rawio; /* no = 0, yes = 1 */
Don't know why we didn't make this 'bool', but that's pre-existing and would be a separate cleanup patch.
+ int sgio;
I'd add /* enum virDomainDiskSGIO */, to make it easier to see what goes in this int.
Okay, will add it when pushing.
ACK.

This ignores the default "filtered" if unpriv_sgio is not supported by kernel, but for explicit request "filtered", it error out for domain starting. --- src/libvirt_private.syms | 1 + src/qemu/qemu_driver.c | 15 +++++++++------ src/qemu/qemu_process.c | 31 +++++++++++++++++++++++++++++++ src/qemu/qemu_process.h | 1 + src/util/virutil.c | 2 +- src/util/virutil.h | 2 ++ 6 files changed, 45 insertions(+), 7 deletions(-) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 6fba1d9..7472254 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1283,6 +1283,7 @@ virGetDeviceUnprivSGIO; virGetGroupID; virGetGroupName; virGetHostname; +virGetUnprivSGIOSysfsPath; virGetUserCacheDirectory; virGetUserConfigDirectory; virGetUserDirectory; diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index bdaabdf..9ea484c 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -5865,12 +5865,15 @@ qemuDomainAttachDeviceDiskLive(virConnectPtr conn, NULLSTR(disk->src)); } - if (ret == 0 && - disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && - disk->shared) { - if (qemuAddSharedDisk(driver->sharedDisks, disk->src) < 0) - VIR_WARN("Failed to add disk '%s' to shared disk table", - disk->src); + if (ret == 0) { + if (disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && disk->shared) { + if (qemuAddSharedDisk(driver->sharedDisks, disk->src) < 0) + VIR_WARN("Failed to add disk '%s' to shared disk table", + disk->src); + } + + if (qemuSetUnprivSGIO(disk) < 0) + VIR_WARN("Failed to set unpriv_sgio of disk '%s'", disk->src); } end: diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 2a1b6b7..47dae1a 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3384,6 +3384,34 @@ qemuProcessReconnectAll(virConnectPtr conn, virQEMUDriverPtr driver) virHashForEach(driver->domains.objs, qemuProcessReconnectHelper, &data); } +int +qemuSetUnprivSGIO(virDomainDiskDefPtr disk) +{ + int val = -1; + + if (disk->sgio) + val = (disk->sgio == VIR_DOMAIN_DISK_SGIO_UNFILTERED); + + /* Ignore the setting if unpriv_sgio is not supported by the + * kernel, otherwise defaults to filter the SG_IO commands, + * I.E. Set unpriv_sgio to 0. + */ + if (disk->sgio == VIR_DOMAIN_DISK_SGIO_DEFAULT && + disk->device == VIR_DOMAIN_DISK_DEVICE_LUN) { + char *sysfs_path = NULL; + + if ((sysfs_path = virGetUnprivSGIOSysfsPath(disk->src, NULL)) && + virFileExists(sysfs_path)) + val = 0; + VIR_FREE(sysfs_path); + } + + if (val >= 0 && virSetDeviceUnprivSGIO(disk->src, NULL, val) < 0) + return -1; + + return 0; +} + int qemuProcessStart(virConnectPtr conn, virQEMUDriverPtr driver, virDomainObjPtr vm, @@ -3733,6 +3761,9 @@ int qemuProcessStart(virConnectPtr conn, if (qemuAddSharedDisk(driver->sharedDisks, disk->src) < 0) goto cleanup; } + + if (qemuSetUnprivSGIO(disk) < 0) + goto cleanup; } virCommandSetPreExecHook(cmd, qemuProcessHook, &hookData); diff --git a/src/qemu/qemu_process.h b/src/qemu/qemu_process.h index c12df32..52a298d 100644 --- a/src/qemu/qemu_process.h +++ b/src/qemu/qemu_process.h @@ -98,5 +98,6 @@ bool qemuProcessAutoDestroyActive(virQEMUDriverPtr driver, virDomainObjPtr vm); virBitmapPtr qemuPrepareCpumap(virQEMUDriverPtr driver, virBitmapPtr nodemask); +int qemuSetUnprivSGIO(virDomainDiskDefPtr disk); #endif /* __QEMU_PROCESS_H__ */ diff --git a/src/util/virutil.c b/src/util/virutil.c index 275489a..47ab17f 100644 --- a/src/util/virutil.c +++ b/src/util/virutil.c @@ -3171,7 +3171,7 @@ virGetDeviceID(const char *path ATRRIBUTE_UNUSED, #define SYSFS_DEV_BLOCK_PATH "/sys/dev/block" -static char * +char * virGetUnprivSGIOSysfsPath(const char *path, const char *sysfs_dir) { diff --git a/src/util/virutil.h b/src/util/virutil.h index 634dcaf..5a08c81 100644 --- a/src/util/virutil.h +++ b/src/util/virutil.h @@ -291,5 +291,7 @@ int virSetDeviceUnprivSGIO(const char *path, int virGetDeviceUnprivSGIO(const char *path, const char *sysfs_dir, int *unpriv_sgio); +char * virGetUnprivSGIOSysfsPath(const char *path, + const char *sysfs_dir); #endif /* __VIR_UTIL_H__ */ -- 1.7.7.6

On 01/02/2013 07:37 AM, Osier Yang wrote:
This ignores the default "filtered" if unpriv_sgio is not supported by kernel, but for explicit request "filtered", it error out for domain starting. --- src/libvirt_private.syms | 1 + src/qemu/qemu_driver.c | 15 +++++++++------ src/qemu/qemu_process.c | 31 +++++++++++++++++++++++++++++++ src/qemu/qemu_process.h | 1 + src/util/virutil.c | 2 +- src/util/virutil.h | 2 ++ 6 files changed, 45 insertions(+), 7 deletions(-)
ACK. -- Eric Blake eblake redhat com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

This prevents domain starting and disk attaching if the shared disk's setting conflicts with other active domain(s), E.g. A domain with "sgio" set as "filtered", however, another active domain is using it set as "unfiltered". --- src/qemu/qemu_driver.c | 5 ++++ src/qemu/qemu_process.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_process.h | 3 ++ 3 files changed, 61 insertions(+), 0 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 9ea484c..d378a8d 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -5814,6 +5814,11 @@ qemuDomainAttachDeviceDiskLive(virConnectPtr conn, goto end; } + if (disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && + disk->shared && + (qemuCheckSharedDisk(driver->sharedDisks, disk) < 0)) + goto end; + if (qemuDomainDetermineDiskChain(driver, disk, false) < 0) goto end; diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 47dae1a..76cd51a 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3412,6 +3412,56 @@ qemuSetUnprivSGIO(virDomainDiskDefPtr disk) return 0; } +/* Check if a shared disk's setting conflicts with the conf + * used by other domain(s). Currently only checks the sgio + * setting. Note that this should only be called for disk with + * block source. + * + * Returns 0 if no conflicts, otherwise returns -1. + */ +int +qemuCheckSharedDisk(virHashTablePtr sharedDisks, + virDomainDiskDefPtr disk) +{ + int val; + size_t *ref = NULL; + char *key = NULL; + int ret = 0; + + if (!(key = qemuGetSharedDiskKey(disk->src))) + return -1; + + /* It can't be conflict if no other domain is + * is sharing it. + */ + if (!(ref = virHashLookup(sharedDisks, key))) + goto cleanup; + + if (ref == (void *)0x1) + goto cleanup; + + if (virGetDeviceUnprivSGIO(disk->src, NULL, &val) < 0) { + ret = -1; + goto cleanup; + } + + if ((val == 0 && + (disk->sgio == VIR_DOMAIN_DISK_SGIO_FILTERED || + disk->sgio == VIR_DOMAIN_DISK_SGIO_DEFAULT)) || + (val == 1 && + disk->sgio == VIR_DOMAIN_DISK_SGIO_UNFILTERED)) + goto cleanup; + + virReportError(VIR_ERR_OPERATION_INVALID, + _("sgio of shared disk '%s' conflicts with other " + "active domains"), disk->src); + ret = -1; + +cleanup: + VIR_FREE(key); + return ret; +} + int qemuProcessStart(virConnectPtr conn, virQEMUDriverPtr driver, virDomainObjPtr vm, @@ -3760,6 +3810,9 @@ int qemuProcessStart(virConnectPtr conn, if (disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK && disk->shared) { if (qemuAddSharedDisk(driver->sharedDisks, disk->src) < 0) goto cleanup; + + if (qemuCheckSharedDisk(driver->sharedDisks, disk) < 0) + goto cleanup; } if (qemuSetUnprivSGIO(disk) < 0) diff --git a/src/qemu/qemu_process.h b/src/qemu/qemu_process.h index 52a298d..313fa39 100644 --- a/src/qemu/qemu_process.h +++ b/src/qemu/qemu_process.h @@ -100,4 +100,7 @@ virBitmapPtr qemuPrepareCpumap(virQEMUDriverPtr driver, virBitmapPtr nodemask); int qemuSetUnprivSGIO(virDomainDiskDefPtr disk); +int qemuCheckSharedDisk(virHashTablePtr sharedDisks, + virDomainDiskDefPtr disk); + #endif /* __QEMU_PROCESS_H__ */ -- 1.7.7.6

On 01/02/2013 07:37 AM, Osier Yang wrote:
This prevents domain starting and disk attaching if the shared disk's setting conflicts with other active domain(s), E.g. A domain with "sgio" set as "filtered", however, another active domain is using it set as "unfiltered". --- src/qemu/qemu_driver.c | 5 ++++ src/qemu/qemu_process.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_process.h | 3 ++ 3 files changed, 61 insertions(+), 0 deletions(-)
ACK. -- Eric Blake eblake redhat com +1-919-301-3266 Libvirt virtualization library http://libvirt.org

On 2013年01月02日 22:37, Osier Yang wrote:
As a result of RFC [1], this implements the unprivleged SG_IO support.
1/6 and 2/6 are already acked.
v8 - v9: * Just rebasing.
v7 - v8: * Change the XML tag name from "cdbfilter" to "sgio", and to leave enough room for future values, the values of "sgio" are "filtered" and "unfiltered" now.
v6 - v7: * No restoring of unpriv_sgio per Daniel's thought. * Use "major:minor" as the hash key per Jirka's suggestion.
Osier Yang (6): util: Prepare helpers for unpriv_sgio setting qemu: Add a hash table for the shared disks docs: Add docs and rng schema for new XML tag sgio conf: Parse and format the new XML qemu: set unpriv_sgio when starting domain and attaching disk qemu: Check if the shared disk's cdbfilter conflicts with others
Pushed with the small nits pointed out by Eric fixed. Regards, Osier
participants (2)
-
Eric Blake
-
Osier Yang