[libvirt] [PATCH 00/10 v3] Unprivileged SG_IO support

Hi, As a result of RFC [1], this implements the unprivleged SG_IO support. Testing is not that enough, but I'd like see the reviewing earlier, and meanwhile I'm not going to give up the further testing. v2 - v3: * Change the XML tag name to "cdbfilter" * Maintain an internal list of shared disks for QEMU driver. Patches 1/10 ~ 4/10 are to introduce the internal list for shared disks. Osier Yang (10): qemu: Introduce a list to maintain the shared disks between domains qemu: Init/Free the list with the driver's lifecyle qemu: Add/remove the shared disk entry during domain's lifecyle qemu: Add/Remove the entry of sharedDisks when live attaching/detaching docs: Add docs and rng schema for new XML cdbfilter conf: Parse and format the new XML tag cdbfilter util: Prepare helpers for unpriv_sgio setting qemu: Manage disk's cdbfilter in domain's lifecycle qemu: Do not restore the sysfs unpriv_sgio if the disk is being shared qemu: Error out when domain starting if the cdbfilter setting conflicts docs/formatdomain.html.in | 13 ++- docs/schemas/domaincommon.rng | 52 +++++-- src/conf/domain_conf.c | 71 +++++++-- src/conf/domain_conf.h | 13 ++ src/libvirt_private.syms | 5 + src/qemu/qemu_conf.c | 166 ++++++++++++++++++++ src/qemu/qemu_conf.h | 30 ++++ src/qemu/qemu_driver.c | 28 ++++ src/qemu/qemu_process.c | 103 ++++++++++++- src/util/util.c | 145 +++++++++++++++++ src/util/util.h | 7 + ...ml2argv-disk-scsi-lun-passthrough-cdbfilter.xml | 32 ++++ tests/qemuxml2xmltest.c | 1 + 13 files changed, 634 insertions(+), 32 deletions(-) create mode 100644 tests/qemuxml2argvdata/qemuxml2argv-disk-scsi-lun-passthrough-cdbfilter.xml Regards, Osier

A single entry of the list is constructed by the _disk path_ and a sub-list of domain names which are using the disk. * src/qemu/qemu_conf.h (New struct qemuSharedDisk, qemuSharedDiskList; New type qemuSharedDiskPtr, qemuSharedDiskListPtr; Declare the helpers qemuSharedDiskListFree, qemuSharedDiskListAdd, qemuSharedDiskListFind, qemuSharedDiskListDel) * src/qemu/qemu_conf.c (Implement the helpers) --- src/qemu/qemu_conf.c | 160 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/qemu/qemu_conf.h | 30 +++++++++ 2 files changed, 190 insertions(+), 0 deletions(-) diff --git a/src/qemu/qemu_conf.c b/src/qemu/qemu_conf.c index dc4d680..98eb1b5 100644 --- a/src/qemu/qemu_conf.c +++ b/src/qemu/qemu_conf.c @@ -745,3 +745,163 @@ qemuDriverCloseCallbackRunAll(struct qemud_driver *driver, virHashForEach(driver->closeCallbacks, qemuDriverCloseCallbackRun, &data); } + +void +qemuSharedDiskListFree(qemuSharedDiskListPtr list) +{ + int i; + int j; + + for (i = 0; i < list->ndisks; i++) { + VIR_FREE(list->disks[i]->path); + + for (j = 0; j < list->disks[i]->ndomains; j++) + VIR_FREE(list->disks[i]->domains[j]); + VIR_FREE(list->disks[i]->domains); + VIR_FREE(list->disks[i]); + } + VIR_FREE(list); +} + +/* Return the matched entry on success, with @idx set as + * the index of the matched entry. Or NULL on failure. + */ +qemuSharedDiskPtr +qemuSharedDiskListFind(qemuSharedDiskListPtr list, + const char *disk_path, + const char *domain_name, + int *idx) +{ + int i; + int j; + + for (i = 0; i < list->ndisks; i++) { + if (STREQ(disk_path, list->disks[i]->path)) { + for (j = 0; j < list->disks[i]->ndomains; j++) { + if (STREQ(domain_name, list->disks[i]->domains[j])) { + *idx = i; + return list->disks[i]; + } + } + } + } + + return NULL; +} + +int +qemuSharedDiskListAdd(qemuSharedDiskListPtr list, + const char *disk_path, + const char *domain_name) +{ + int i; + bool existed = false; + + if (qemuSharedDiskListFind(list, disk_path, domain_name, NULL)) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("disk '%s' of domain '%s' is already " + "maintained"), disk_path, domain_name); + return -1; + } + + for (i = 0; i < list->ndisks; i++) { + qemuSharedDiskPtr disk = list->disks[i]; + + /* Append the domain name to the existed entry */ + if (STREQ(disk->path, disk_path)) { + if (VIR_REALLOC_N(disk->domains, + disk->ndomains + 1) < 0) { + virReportOOMError(); + return -1; + } + + disk->domains[disk->ndomains++] = strdup(domain_name); + existed = true; + break; + } + } + + /* No existed entry for the @disk_path yet */ + if (!existed) { + if (VIR_REALLOC_N(list->disks, list->ndisks + 1) < 0) { + virReportOOMError(); + return -1; + } + + qemuSharedDiskPtr disk = list->disks[list->ndisks]; + + if (VIR_ALLOC_N(list->disks[list->ndisks]->domains, 1) < 0) { + virReportOOMError(); + return -1; + } + + disk->path = strdup(disk_path); + disk->ndomains = 1; + disk->domains[0] = strdup(domain_name); + } + + return 0; +} + +int +qemuSharedDiskListDel(qemuSharedDiskListPtr list, + const char *disk_path, + const char *domain_name) +{ + qemuSharedDiskPtr disk = NULL; + int i; + int idx; + + if (!qemuSharedDiskListFind(list, disk_path, domain_name, &idx)) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("disk '%s' of domain '%s' is not " + "maintained yet"), disk_path, domain_name); + return -1; + } + + disk = list->disks[idx]; + + if (disk->ndomains == 1) { + /* Free the disk entry if there is only one domain using it */ + if (idx != list->ndisks - 1) + memmove(&list->disks[idx], + &list->disks[idx + 1], + sizeof(*list->disks) * (list->ndisks - idx - 1)); + + if (VIR_REALLOC_N(list->disks, list->ndisks - 1) < 0) { + virReportOOMError(); + return -1; + } + + VIR_FREE(disk->domains[0]); + VIR_FREE(disk->domains); + VIR_FREE(disk->path); + VIR_FREE(disk); + list->ndisks--; + } else { + /* Otherwise just remove the domain name from the sub-list + * list->disk[i]->domains. + */ + for (i = 0; i < disk->ndomains; i++) { + if (STREQ(disk->domains[i], domain_name)) { + char *name = disk->domains[i]; + + if (i != disk->ndomains - 1) + memmove(&disk->domains[i], + &disk->domains[i + 1], + sizeof(*disk->domains) * (disk->ndomains - i - 1)); + + if (VIR_REALLOC_N(disk->domains, disk->ndomains - 1) < 0) { + virReportOOMError(); + return -1; + } + + disk->ndomains--; + VIR_FREE(name); + break; + } + } + } + + return 0; +} diff --git a/src/qemu/qemu_conf.h b/src/qemu/qemu_conf.h index 2c7f70c..45e5eaa 100644 --- a/src/qemu/qemu_conf.h +++ b/src/qemu/qemu_conf.h @@ -50,6 +50,22 @@ typedef struct _qemuDriverCloseDef qemuDriverCloseDef; typedef qemuDriverCloseDef *qemuDriverCloseDefPtr; +typedef struct _qemuSharedDisk qemuSharedDisk; +typedef struct _qemuSharedDiskList qemuSharedDiskList; +typedef qemuSharedDiskList *qemuSharedDiskListPtr; +typedef qemuSharedDisk *qemuSharedDiskPtr; + +struct _qemuSharedDisk { + char *path; /* Disk path */ + char **domains; /* List of domain names which share the disk */ + int ndomains; +}; + +struct _qemuSharedDiskList { + int ndisks; + qemuSharedDiskPtr *disks; +}; + /* Main driver state */ struct qemud_driver { virMutex lock; @@ -140,6 +156,8 @@ struct qemud_driver { /* The devices which is are not in use by the host or any guest. */ pciDeviceList *inactivePciHostdevs; + qemuSharedDiskListPtr sharedDisks; + virBitmapPtr reservedRemotePorts; virSysinfoDefPtr hostsysinfo; @@ -204,4 +222,16 @@ qemuDriverCloseCallback qemuDriverCloseCallbackGet(struct qemud_driver *driver, void qemuDriverCloseCallbackRunAll(struct qemud_driver *driver, virConnectPtr conn); +void qemuSharedDiskListFree(qemuSharedDiskListPtr list); +int qemuSharedDiskListAdd(qemuSharedDiskListPtr list, + const char *disk_path, + const char *domain_name); +qemuSharedDiskPtr qemuSharedDiskListFind(qemuSharedDiskListPtr list, + const char *disk_path, + const char *domain_name, + int *idx); +int qemuSharedDiskListDel(qemuSharedDiskListPtr list, + const char *disk_path, + const char *domain_name); + #endif /* __QEMUD_CONF_H */ -- 1.7.7.6

Lifecyle here only means starting and shutdown. --- src/qemu/qemu_driver.c | 6 ++++++ 1 files changed, 6 insertions(+), 0 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index d4cafcc..b108358 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -803,6 +803,11 @@ qemudStartup(int privileged) { if ((qemu_driver->inactivePciHostdevs = pciDeviceListNew()) == NULL) goto error; + if (VIR_ALLOC(qemu_driver->sharedDisks) < 0) { + virReportOOMError(); + goto error; + } + if (privileged) { if (chown(qemu_driver->libDir, qemu_driver->user, qemu_driver->group) < 0) { virReportSystemError(errno, @@ -1011,6 +1016,7 @@ qemudShutdown(void) { pciDeviceListFree(qemu_driver->activePciHostdevs); pciDeviceListFree(qemu_driver->inactivePciHostdevs); usbDeviceListFree(qemu_driver->activeUsbHostdevs); + qemuSharedDiskListFree(qemu_driver->sharedDisks); virCapabilitiesFree(qemu_driver->caps); qemuCapsCacheFree(qemu_driver->capsCache); -- 1.7.7.6

Lifecyle here only means starting and shutdown. --- src/qemu/qemu_process.c | 20 ++++++++++++++++++++ 1 files changed, 20 insertions(+), 0 deletions(-) diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 8743c60..962d6ea 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3700,6 +3700,13 @@ int qemuProcessStart(virConnectPtr conn, for (i = 0; i < vm->def->ndisks; i++) { if (vm->def->disks[i]->rawio == 1) virCommandAllowCap(cmd, CAP_SYS_RAWIO); + + if (vm->def->disks[i]->shared && + (qemuSharedDiskListAdd(driver->sharedDisks, + vm->def->disks[i]->src, + vm->def->name) < 0)) { + goto cleanup; + } } virCommandSetPreExecHook(cmd, qemuProcessHook, &hookData); @@ -4092,6 +4099,19 @@ void qemuProcessStop(struct qemud_driver *driver, flags & VIR_QEMU_PROCESS_STOP_MIGRATED); virSecurityManagerReleaseLabel(driver->securityManager, vm->def); + /* Remove the shared disk entry from qemud_driver->sharedDisks */ + for (i = 0; i < vm->def->ndisks; i++) { + if (vm->def->disks[i]->shared && + (qemuSharedDiskListDel(driver->sharedDisks, + vm->def->disks[i]->src, + vm->def->name) < 0)) { + VIR_WARN("Unable to remove shared disk entry for " + "disk = '%s', domain = '%s'", + vm->def->disks[i]->src, + vm->def->name); + } + } + /* Clear out dynamically assigned labels */ for (i = 0; i < vm->def->nseclabels; i++) { if (vm->def->seclabels[i]->type == VIR_DOMAIN_SECLABEL_DYNAMIC) { -- 1.7.7.6

This adds one entry to the list qemud_driver->sharedDisks when attaching a shared disk. And removing the entry from the list when detaching. --- src/qemu/qemu_driver.c | 22 ++++++++++++++++++++++ 1 files changed, 22 insertions(+), 0 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index b108358..60c3ee8 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -5967,6 +5967,18 @@ qemuDomainAttachDeviceDiskLive(virConnectPtr conn, VIR_WARN("Failed to teardown cgroup for disk path %s", NULLSTR(disk->src)); } + + /* If the attached disk is shareable, add it to the list + * qemud_driver->sharedDisks if attachment succeeded + */ + if (ret == 0 && disk->shared) { + if (qemuSharedDiskListAdd(driver->sharedDisks, + disk->src, + vm->def->name) < 0) { + VIR_DEBUG("Failed to add the disk '%s' of domain '%s' to " + "sharedDisks list", disk->src, vm->def->name); + } + } end: if (cgroup) virCgroupFree(&cgroup); @@ -6082,6 +6094,16 @@ qemuDomainDetachDeviceDiskLive(struct qemud_driver *driver, virDomainDiskDeviceTypeToString(disk->type)); break; } + + if (ret == 0 && disk->shared) { + if (qemuSharedDiskListDel(driver->sharedDisks, + disk->src, + vm->def->name) < 0) { + VIR_DEBUG("Failed to remove the entry of disk '%s', domain " + "%s from sharedDisks", disk->src, vm->def->name); + } + } + return ret; } -- 1.7.7.6

Since "rawio" and "cdbfilter" are only valid for "lun", this groups them together; And since both of them intend to allow the unprivledged user to use the SG_IO commands, they must be exclusive. --- docs/formatdomain.html.in | 13 +++++++++- docs/schemas/domaincommon.rng | 52 ++++++++++++++++++++++++++++------------ 2 files changed, 48 insertions(+), 17 deletions(-) diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in index 6a3b976..f515ea9 100644 --- a/docs/formatdomain.html.in +++ b/docs/formatdomain.html.in @@ -1395,7 +1395,18 @@ rawio='yes', rawio capability will be enabled for all disks in the domain (because, in the case of QEMU, this capability can only be set on a per-process basis). This attribute is only - valid when device is "lun". + valid when device is "lun". NB, <code>rawio</code> intends to + confine the capability per-device, however, current QEMU + implementation gives the domain process broader capability + than that (per-process basis, affects all the domain disks). + To confine the capability as much as possible for QEMU driver + as this stage, <code>cdbfilter</code> is recommended. + The optional <code>cdbfilter</code> attribute + (<span class="since">since 1.0.1</span>) indicates whether the + kernel will filter unprivileged SG_IO for the disk, valid settings + are "yes" or "no" (defaults to "yes"). Note that it's exclusive + with attribute <code>rawio</code>; Same with <code>rawio</code>, + <code>cdbfilter</code> is only valid for device 'lun'. The optional <code>snapshot</code> attribute indicates the default behavior of the disk during disk snapshots: "internal" requires a file format such as qcow2 that can store both the diff --git a/docs/schemas/domaincommon.rng b/docs/schemas/domaincommon.rng index 02ad477..a7acc92 100644 --- a/docs/schemas/domaincommon.rng +++ b/docs/schemas/domaincommon.rng @@ -957,24 +957,44 @@ --> <define name="disk"> <element name="disk"> - <optional> - <attribute name="device"> - <choice> - <value>floppy</value> - <value>disk</value> - <value>cdrom</value> - <value>lun</value> - </choice> - </attribute> - </optional> - <optional> - <attribute name="rawio"> + <choice> + <group> + <optional> + <attribute name="device"> + <choice> + <value>floppy</value> + <value>disk</value> + <value>cdrom</value> + </choice> + </attribute> + </optional> + </group> + <group> + <optional> + <attribute name="device"> + <value>lun</value> + </attribute> + </optional> <choice> - <value>yes</value> - <value>no</value> + <optional> + <attribute name="rawio"> + <choice> + <value>yes</value> + <value>no</value> + </choice> + </attribute> + </optional> + <optional> + <attribute name="cdbfilter"> + <choice> + <value>yes</value> + <value>no</value> + </choice> + </attribute> + </optional> </choice> - </attribute> - </optional> + </group> + </choice> <optional> <ref name="snapshot"/> </optional> -- 1.7.7.6

Setting "cdbfilter" to "no" to enable the unprivleged SG_IO, I.e. Disable the kernel CDB filtering. And "yes" to enable it. "yes" is the kernel default behaviour. Later patch will do the actual setting. --- src/conf/domain_conf.c | 58 +++++++++++++++----- src/conf/domain_conf.h | 10 ++++ ...ml2argv-disk-scsi-lun-passthrough-cdbfilter.xml | 32 +++++++++++ tests/qemuxml2xmltest.c | 1 + 4 files changed, 88 insertions(+), 13 deletions(-) create mode 100644 tests/qemuxml2argvdata/qemuxml2argv-disk-scsi-lun-passthrough-cdbfilter.xml diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index ed8b53f..880cb6e 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -236,6 +236,10 @@ VIR_ENUM_IMPL(virDomainDiskIo, VIR_DOMAIN_DISK_IO_LAST, "default", "native", "threads") +VIR_ENUM_IMPL(virDomainDiskCDBFilter, VIR_DOMAIN_DISK_CDB_FILTER_LAST, + "default", + "yes", + "no") VIR_ENUM_IMPL(virDomainIoEventFd, VIR_DOMAIN_IO_EVENT_FD_LAST, "default", "on", @@ -3515,6 +3519,7 @@ virDomainDiskDefParseXML(virCapsPtr caps, char *device = NULL; char *snapshot = NULL; char *rawio = NULL; + char *cdbfilter = NULL; char *driverName = NULL; char *driverType = NULL; char *source = NULL; @@ -3576,6 +3581,7 @@ virDomainDiskDefParseXML(virCapsPtr caps, snapshot = virXMLPropString(node, "snapshot"); rawio = virXMLPropString(node, "rawio"); + cdbfilter = virXMLPropString(node, "cdbfilter"); cur = node->children; while (cur != NULL) { @@ -3966,24 +3972,46 @@ virDomainDiskDefParseXML(virCapsPtr caps, def->snapshot = VIR_DOMAIN_SNAPSHOT_LOCATION_NONE; } + if (rawio && cdbfilter) { + virReportError(VIR_ERR_XML_ERROR, "%s", + _("rawio and cdbfilter are exclusive")); + goto error; + } + + if ((rawio || cdbfilter) && + (def->device != VIR_DOMAIN_DISK_DEVICE_LUN)) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("rawio or cdbfilter can be used only with " + "device='lun'")); + goto error; + } + if (rawio) { def->rawio_specified = true; - if (def->device == VIR_DOMAIN_DISK_DEVICE_LUN) { - if (STREQ(rawio, "yes")) { - def->rawio = 1; - } else if (STREQ(rawio, "no")) { - def->rawio = 0; - } else { - virReportError(VIR_ERR_XML_ERROR, - _("unknown disk rawio setting '%s'"), - rawio); - goto error; - } + if (STREQ(rawio, "yes")) { + def->rawio = 1; + } else if (STREQ(rawio, "no")) { + def->rawio = 0; } else { - virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", - _("rawio can be used only with device='lun'")); + virReportError(VIR_ERR_XML_ERROR, + _("unknown disk rawio setting '%s'"), + rawio); + goto error; + } + } + + if (cdbfilter) { + int cdbfilter_val = 0; + + if ((cdbfilter_val = + virDomainDiskCDBFilterTypeFromString(cdbfilter)) < 0) { + virReportError(VIR_ERR_XML_ERROR, + _("unknown disk cdbfilter setting '%s'"), + cdbfilter); goto error; } + + def->cdbfilter = cdbfilter_val; } if (bus) { @@ -4220,6 +4248,7 @@ cleanup: VIR_FREE(type); VIR_FREE(snapshot); VIR_FREE(rawio); + VIR_FREE(cdbfilter); VIR_FREE(target); VIR_FREE(source); VIR_FREE(tray); @@ -11897,6 +11926,9 @@ virDomainDiskDefFormat(virBufferPtr buf, virBufferAddLit(buf, " rawio='no'"); } } + if (def->cdbfilter) + virBufferAsprintf(buf, " cdbfilter='%s'", + virDomainDiskCDBFilterTypeToString(def->cdbfilter)); if (def->snapshot && !(def->snapshot == VIR_DOMAIN_SNAPSHOT_LOCATION_NONE && def->readonly)) virBufferAsprintf(buf, " snapshot='%s'", diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index c3e8c16..8d5aa34 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -496,6 +496,14 @@ enum virDomainDiskIo { VIR_DOMAIN_DISK_IO_LAST }; +enum virDomainDiskCDBFilter { + VIR_DOMAIN_DISK_CDB_FILTER_DEFAULT = 0, + VIR_DOMAIN_DISK_CDB_FILTER_YES, /* reflects to 0 of sysfs unpriv_sgio */ + VIR_DOMAIN_DISK_CDB_FILTER_NO, /* reflects to 1 of sysfs unpriv_sgio */ + + VIR_DOMAIN_DISK_CDB_FILTER_LAST +}; + enum virDomainIoEventFd { VIR_DOMAIN_IO_EVENT_FD_DEFAULT = 0, VIR_DOMAIN_IO_EVENT_FD_ON, @@ -607,6 +615,7 @@ struct _virDomainDiskDef { virStorageEncryptionPtr encryption; bool rawio_specified; int rawio; /* no = 0, yes = 1 */ + int cdbfilter; size_t nseclabels; virSecurityDeviceLabelDefPtr *seclabels; @@ -2197,6 +2206,7 @@ VIR_ENUM_DECL(virDomainDiskCache) VIR_ENUM_DECL(virDomainDiskErrorPolicy) VIR_ENUM_DECL(virDomainDiskProtocol) VIR_ENUM_DECL(virDomainDiskIo) +VIR_ENUM_DECL(virDomainDiskCDBFilter) VIR_ENUM_DECL(virDomainDiskSecretType) VIR_ENUM_DECL(virDomainDiskTray) VIR_ENUM_DECL(virDomainIoEventFd) diff --git a/tests/qemuxml2argvdata/qemuxml2argv-disk-scsi-lun-passthrough-cdbfilter.xml b/tests/qemuxml2argvdata/qemuxml2argv-disk-scsi-lun-passthrough-cdbfilter.xml new file mode 100644 index 0000000..731c9bd --- /dev/null +++ b/tests/qemuxml2argvdata/qemuxml2argv-disk-scsi-lun-passthrough-cdbfilter.xml @@ -0,0 +1,32 @@ +<domain type='qemu'> + <name>QEMUGuest1</name> + <uuid>c7a5fdbd-edaf-9455-926a-d65c16db1809</uuid> + <memory unit='KiB'>219136</memory> + <currentMemory unit='KiB'>219136</currentMemory> + <vcpu placement='static'>1</vcpu> + <os> + <type arch='i686' machine='pc'>hvm</type> + <boot dev='hd'/> + </os> + <clock offset='utc'/> + <on_poweroff>destroy</on_poweroff> + <on_reboot>restart</on_reboot> + <on_crash>destroy</on_crash> + <devices> + <emulator>/usr/bin/qemu</emulator> + <disk type='block' device='lun' cdbfilter='yes'> + <source dev='/dev/HostVG/QEMUGuest1'/> + <target dev='hda' bus='scsi'/> + <address type='drive' controller='0' bus='0' target='0' unit='0'/> + </disk> + <disk type='block' device='lun' cdbfilter='no'> + <source dev='/dev/HostVG/QEMUGuest2'/> + <target dev='hda' bus='scsi'/> + <address type='drive' controller='0' bus='0' target='1' unit='1'/> + </disk> + <controller type='scsi' index='0' model='virtio-scsi'/> + <controller type='scsi' index='1' model='lsilogic'/> + <controller type='usb' index='0'/> + <memballoon model='virtio'/> + </devices> +</domain> diff --git a/tests/qemuxml2xmltest.c b/tests/qemuxml2xmltest.c index 1d366f1..9f1c0ba 100644 --- a/tests/qemuxml2xmltest.c +++ b/tests/qemuxml2xmltest.c @@ -238,6 +238,7 @@ mymain(void) DO_TEST("seclabel-static"); DO_TEST("seclabel-none"); DO_TEST("numad-static-vcpu-no-numatune"); + DO_TEST("disk-scsi-lun-passthrough-cdbfilter"); /* These tests generate different XML */ DO_TEST_DIFFERENT("balloon-device-auto"); -- 1.7.7.6

"virGetDevice{Major,Minor}" could be used across the sources, but it doesn't relate with this series, and could be done later. * src/util/util.h: (Declare virGetDevice{Major,Minor}, and vir{Get,Set}DeviceUnprivSGIO) * src/util/util.c: (Implement virGetDevice{Major,Minor} and vir{Get,Set}DeviceUnprivSGIO) * src/libvirt_private.syms: Export private symbols of upper helpers --- I'd like keep the term "UnprivSGIO" for these helpers, as it reflects to the underlying sysfs knob name. Also it might be used in other places in future other than "cdbfilter". --- src/libvirt_private.syms | 4 + src/util/util.c | 145 ++++++++++++++++++++++++++++++++++++++++++++++ src/util/util.h | 7 ++ 3 files changed, 156 insertions(+), 0 deletions(-) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 0115db1..c756130 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1257,6 +1257,9 @@ virFileWaitForDevices; virFileWriteStr; virFindFileInPath; virFormatIntDecimal; +virGetDeviceMajor; +virGetDeviceMinor; +virGetDeviceUnprivSGIO; virGetGroupID; virGetGroupName; virGetHostname; @@ -1275,6 +1278,7 @@ virPipeReadUntilEOF; virScaleInteger; virSetBlocking; virSetCloseExec; +virSetDeviceUnprivSGIO; virSetInherit; virSetNonBlock; virSetUIDGID; diff --git a/src/util/util.c b/src/util/util.c index 2fd0f2c..4dab6ac 100644 --- a/src/util/util.c +++ b/src/util/util.c @@ -3113,3 +3113,148 @@ virValidateWWN(const char *wwn) { return true; } + +#if defined(major) && defined(minor) +int +virGetDeviceMajor(const char *path) +{ + struct stat sb; + + if (stat(path, &sb) < 0) + return -errno; + + if (!S_ISBLK(sb.st_mode)) + return -EINVAL; + + return major(sb.st_rdev); +} + +int +virGetDeviceMinor(const char *path) +{ + struct stat sb; + + if (stat(path, &sb) < 0) + return -errno; + + if (!S_ISBLK(sb.st_mode)) + return -EINVAL; + + return minor(sb.st_rdev); +} +#else +int +virGetDeviceMajor(const char *path) +{ + return -ENOSYS; +} + +int +virGetDeviceMinor(const char *path) +{ + return -ENOSYS; +} +#endif + +static char * +virGetUnprivSGIOSysfsPath(const char *path) +{ + int major, minor; + char *sysfs_path = NULL; + + if ((major = virGetDeviceMajor(path)) < 0) { + virReportSystemError(-major, + _("Unable to get major number of device '%s'"), + path); + return NULL; + } + + if ((minor = virGetDeviceMinor(path)) < 0) { + virReportSystemError(-minor, + _("Unable to get minor number of device '%s'"), + path); + return NULL; + } + + if (virAsprintf(&sysfs_path, "/sys/dev/block/%d:%d/queue/unpriv_sgio", + major, minor) < 0) { + virReportOOMError(); + return NULL; + } + + return sysfs_path; +} + +int +virSetDeviceUnprivSGIO(const char *path, + int unpriv_sgio) +{ + char *sysfs_path = NULL; + char *val = NULL; + int ret = -1; + int rc = -1; + + if (!(sysfs_path = virGetUnprivSGIOSysfsPath(path))) + return -1; + + if (!virFileExists(sysfs_path)) { + virReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("unpriv_sgio is not supported by this kernel")); + goto cleanup; + } + + if (virAsprintf(&val, "%d", unpriv_sgio) < 0) { + virReportOOMError(); + goto cleanup; + } + + if ((rc = virFileWriteStr(sysfs_path, val, 0)) < 0) { + virReportSystemError(-rc, _("failed to set %s"), sysfs_path); + goto cleanup; + } + + ret = 0; +cleanup: + VIR_FREE(sysfs_path); + VIR_FREE(val); + return ret; +} + +int +virGetDeviceUnprivSGIO(const char *path, + int *unpriv_sgio) +{ + char *sysfs_path = NULL; + char *buf = NULL; + char *tmp = NULL; + int ret = -1; + int rc = -1; + + if (!(sysfs_path = virGetUnprivSGIOSysfsPath(path))) + return -1; + + if (!virFileExists(sysfs_path)) { + virReportError(VIR_ERR_OPERATION_INVALID, "%s", + _("unpriv_sgio is not supported by this kernel")); + goto cleanup; + } + + if (virFileReadAll(sysfs_path, 1024, &buf) < 0) + goto cleanup; + + if ((tmp = strchr(buf, '\n'))) + *tmp = '\0'; + + if ((rc = virStrToLong_i(buf, NULL, 10, + unpriv_sgio)) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("failed to parse value of %s"), sysfs_path); + goto cleanup; + } + + ret = 0; +cleanup: + VIR_FREE(sysfs_path); + VIR_FREE(buf); + return ret; +} diff --git a/src/util/util.h b/src/util/util.h index 4316ab1..41146c1 100644 --- a/src/util/util.h +++ b/src/util/util.h @@ -280,4 +280,11 @@ bool virIsDevMapperDevice(const char *dev_name) ATTRIBUTE_NONNULL(1); bool virValidateWWN(const char *wwn); +int virGetDeviceMajor(const char *path); +int virGetDeviceMinor(const char *path); +int virSetDeviceUnprivSGIO(const char *path, + int unpriv_sgio); +int virGetDeviceUnprivSGIO(const char *path, + int *unpriv_sgio); + #endif /* __VIR_UTIL_H__ */ -- 1.7.7.6

Here lifecyle only means staring and shutdown. To record the original "unpriv_sgio" value, this introduces "old_cdbfilter" for disk def. When the domain is starting, the disk's "unpriv_sgio" is set with regards to the config in domain XML. And when the domain is being destroyed, it's restored to the original value ("old_cdbfilter"). --- src/conf/domain_conf.h | 1 + src/qemu/qemu_process.c | 53 +++++++++++++++++++++++++++++++++++++++------- 2 files changed, 46 insertions(+), 8 deletions(-) diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index 8d5aa34..47a16c8 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -616,6 +616,7 @@ struct _virDomainDiskDef { bool rawio_specified; int rawio; /* no = 0, yes = 1 */ int cdbfilter; + int old_cdbfilter; /* Record the original state, internal only */ size_t nseclabels; virSecurityDeviceLabelDefPtr *seclabels; diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 962d6ea..13fce78 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3696,17 +3696,39 @@ int qemuProcessStart(virConnectPtr conn, if (driver->clearEmulatorCapabilities) virCommandClearCaps(cmd); - /* in case a certain disk is desirous of CAP_SYS_RAWIO, add this */ for (i = 0; i < vm->def->ndisks; i++) { - if (vm->def->disks[i]->rawio == 1) + virDomainDiskDefPtr disk = vm->def->disks[i]; + int val; + + /* Add CAP_SYS_RAWIO if the disk is desirous of it */ + if (disk->rawio == 1) virCommandAllowCap(cmd, CAP_SYS_RAWIO); - if (vm->def->disks[i]->shared && + /* Add to qemud_driver->sharedDisks list if the disk is shared */ + if (disk->shared && (qemuSharedDiskListAdd(driver->sharedDisks, - vm->def->disks[i]->src, + disk->src, vm->def->name) < 0)) { goto cleanup; } + + if (!disk->cdbfilter) + continue; + + /* Set sysfs unpriv_sgio if cdbfilter is specified */ + if (virGetDeviceUnprivSGIO(disk->src, &val) < 0) + goto cleanup; + + if (val == 0) + disk->old_cdbfilter = VIR_DOMAIN_DISK_CDB_FILTER_YES; + else + disk->old_cdbfilter = VIR_DOMAIN_DISK_CDB_FILTER_NO; + + if (virSetDeviceUnprivSGIO(disk->src, + (disk->cdbfilter == + VIR_DOMAIN_DISK_CDB_FILTER_NO) + ? 1 : 0) < 0) + goto cleanup; } virCommandSetPreExecHook(cmd, qemuProcessHook, &hookData); @@ -4099,17 +4121,32 @@ void qemuProcessStop(struct qemud_driver *driver, flags & VIR_QEMU_PROCESS_STOP_MIGRATED); virSecurityManagerReleaseLabel(driver->securityManager, vm->def); - /* Remove the shared disk entry from qemud_driver->sharedDisks */ for (i = 0; i < vm->def->ndisks; i++) { - if (vm->def->disks[i]->shared && + virDomainDiskDefPtr disk = vm->def->disks[i]; + int val; + + /* Remove the shared disk entry from qemud_driver->sharedDisks */ + if (disk->shared && (qemuSharedDiskListDel(driver->sharedDisks, - vm->def->disks[i]->src, + disk->src, vm->def->name) < 0)) { VIR_WARN("Unable to remove shared disk entry for " "disk = '%s', domain = '%s'", - vm->def->disks[i]->src, + disk->src, vm->def->name); } + + if (!disk->cdbfilter) + continue; + + /* Restore sysfs unpriv_sgio for the disk */ + if (disk->old_cdbfilter == VIR_DOMAIN_DISK_CDB_FILTER_YES) + val = 0; + else + val = 1; + + if (virSetDeviceUnprivSGIO(disk->src, val) < 0) + VIR_WARN("Unable to restore unpriv_sgio for disk '%s'", disk->src); } /* Clear out dynamically assigned labels */ -- 1.7.7.6

This prevents restoring the unpriv_sgio if the disk is shared, and is being used by other active domain. Because we don't want to fall into the corruption situation. --- src/qemu/qemu_conf.c | 6 ++++++ src/qemu/qemu_process.c | 11 +++++++++++ 2 files changed, 17 insertions(+), 0 deletions(-) diff --git a/src/qemu/qemu_conf.c b/src/qemu/qemu_conf.c index 98eb1b5..d214ffe 100644 --- a/src/qemu/qemu_conf.c +++ b/src/qemu/qemu_conf.c @@ -765,6 +765,9 @@ qemuSharedDiskListFree(qemuSharedDiskListPtr list) /* Return the matched entry on success, with @idx set as * the index of the matched entry. Or NULL on failure. + * + * If @domain_name is passed as NULL, it simply returns + * the entry which matches the @disk_path. */ qemuSharedDiskPtr qemuSharedDiskListFind(qemuSharedDiskListPtr list, @@ -777,6 +780,9 @@ qemuSharedDiskListFind(qemuSharedDiskListPtr list, for (i = 0; i < list->ndisks; i++) { if (STREQ(disk_path, list->disks[i]->path)) { + if (!domain_name) + return list->disks[i]; + for (j = 0; j < list->disks[i]->ndomains; j++) { if (STREQ(domain_name, list->disks[i]->domains[j])) { *idx = i; diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 13fce78..7ac83b5 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -4139,6 +4139,17 @@ void qemuProcessStop(struct qemud_driver *driver, if (!disk->cdbfilter) continue; + /* Don't try to restore the unpriv_sgio if the disk is shared + * by other active domain(s). We don't want to fall into the + * corruptions. + */ + if (disk->shared && + qemuSharedDiskListFind(driver->sharedDisks, + disk->src, + NULL, + NULL)) + continue; + /* Restore sysfs unpriv_sgio for the disk */ if (disk->old_cdbfilter == VIR_DOMAIN_DISK_CDB_FILTER_YES) val = 0; -- 1.7.7.6

This prevents the domain starting if the shared disk's setting conflicts with other active domain(s), E.g. A domain with "cdbfilter" set as "yes", however, another active domain is using it set as "no". * src/conf/domain_conf.h: (Declare helper virDomainDiskFindByPath) * src/conf/domain_conf.c: (Implement virDomainDiskFindByPath) * src/libvirt_private.syms (export virDomainDiskFindByPath) * src/qemu/qemu_process.c: (Error out if the shared disk's cdbfilter conflicts with others) --- src/conf/domain_conf.c | 13 +++++++++++++ src/conf/domain_conf.h | 2 ++ src/libvirt_private.syms | 1 + src/qemu/qemu_process.c | 43 +++++++++++++++++++++++++++++++++++++------ 4 files changed, 53 insertions(+), 6 deletions(-) diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 880cb6e..811143c 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -3055,6 +3055,19 @@ virDomainDiskFindControllerModel(virDomainDefPtr def, return model; } +virDomainDiskDefPtr +virDomainDiskFindByPath(virDomainDefPtr def, + const char *path) +{ + int i; + + for (i = 0; i < def->ndisks; i++) + if (STREQ(def->disks[i]->src, path)) + return def->disks[i]; + + return NULL; +} + int virDomainDiskDefAssignAddress(virCapsPtr caps, virDomainDiskDefPtr def) { diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index 47a16c8..5838a7b 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -1886,6 +1886,8 @@ void virDomainDiskHostDefFree(virDomainDiskHostDefPtr def); int virDomainDiskFindControllerModel(virDomainDefPtr def, virDomainDiskDefPtr disk, int controllerType); +virDomainDiskDefPtr virDomainDiskFindByPath(virDomainDefPtr def, + const char *path); void virDomainControllerDefFree(virDomainControllerDefPtr def); void virDomainFSDefFree(virDomainFSDefPtr def); void virDomainActualNetDefFree(virDomainActualNetDefPtr def); diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index c756130..37187db 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -343,6 +343,7 @@ virDomainDiskDefFree; virDomainDiskDeviceTypeToString; virDomainDiskErrorPolicyTypeFromString; virDomainDiskErrorPolicyTypeToString; +virDomainDiskFindByPath; virDomainDiskFindControllerModel; virDomainDiskGeometryTransTypeFromString; virDomainDiskGeometryTransTypeToString; diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 7ac83b5..691ceea 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3704,12 +3704,43 @@ int qemuProcessStart(virConnectPtr conn, if (disk->rawio == 1) virCommandAllowCap(cmd, CAP_SYS_RAWIO); - /* Add to qemud_driver->sharedDisks list if the disk is shared */ - if (disk->shared && - (qemuSharedDiskListAdd(driver->sharedDisks, - disk->src, - vm->def->name) < 0)) { - goto cleanup; + if (disk->shared) { + /* Error out if the cdbfilter setting is different with what + * other domain(s) uses. + */ + qemuSharedDiskPtr entry = NULL; + + if ((entry = qemuSharedDiskListFind(driver->sharedDisks, + disk->src, + NULL, + NULL))) { + for (i = 0; i < entry->ndomains; i++) { + virDomainObjPtr domobj = NULL; + virDomainDiskDefPtr diskdef = NULL; + + if (!(domobj = virDomainFindByName(&driver->domains, + entry->domains[i]))) + goto cleanup; + + if (!(diskdef = virDomainDiskFindByPath(domobj->def, + disk->src))) + goto cleanup; + + if (diskdef->cdbfilter != disk->cdbfilter) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("cdbfilter of shared disk '%s' " + "conflicts with other active " + "domains"), disk->src); + goto cleanup; + } + } + } + + /* Add to qemud_driver->sharedDisks list if the disk is shared */ + if (qemuSharedDiskListAdd(driver->sharedDisks, + disk->src, + vm->def->name) < 0) + goto cleanup; } if (!disk->cdbfilter) -- 1.7.7.6
participants (1)
-
Osier Yang