If a domain has an NVMe disk configured, then we need to allow it
on devices CGroup so that qemu can access it. There is one caveat
though - if an NVMe disk is read only we need CGroup to allow
write too. This is because when opening the device, qemu does
couple of ioctl()-s which are considered as write.
Signed-off-by: Michal Privoznik <mprivozn(a)redhat.com>
---
src/qemu/qemu_cgroup.c | 101 +++++++++++++++++++++++++++++------------
1 file changed, 72 insertions(+), 29 deletions(-)
diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c
index 9684bf3662..b9fb0ebca2 100644
--- a/src/qemu/qemu_cgroup.c
+++ b/src/qemu/qemu_cgroup.c
@@ -119,10 +119,30 @@ qemuSetupImageCgroupInternal(virDomainObjPtr vm,
virStorageSourcePtr src,
bool forceReadonly)
{
- if (!src->path || !virStorageSourceIsLocalStorage(src)) {
- VIR_DEBUG("Not updating cgroups for disk path '%s', type: %s",
- NULLSTR(src->path), virStorageTypeToString(src->type));
- return 0;
+ VIR_AUTOFREE(char *) path = NULL;
+ bool readonly = src->readonly || forceReadonly;
+
+ if (src->type == VIR_STORAGE_TYPE_NVME) {
+ /* Even though disk is R/O we can't make it so in
+ * CGroups. QEMU will try to do some ioctl()-s over the
+ * device and such operations are considered R/W by the
+ * kernel */
+ readonly = false;
+
+ if (!(path =
virPCIDeviceAddressGetIOMMUGroupDev(&src->nvme->pciAddr)))
+ return -1;
+
+ if (qemuSetupImagePathCgroup(vm, QEMU_DEV_VFIO, false) < 0)
+ return -1;
+ } else {
+ if (!src->path || !virStorageSourceIsLocalStorage(src)) {
+ VIR_DEBUG("Not updating cgroups for disk path '%s', type:
%s",
+ NULLSTR(src->path), virStorageTypeToString(src->type));
+ return 0;
+ }
+
+ if (VIR_STRDUP(path, src->path) < 0)
+ return -1;
}
if (virStoragePRDefIsManaged(src->pr) &&
@@ -130,7 +150,7 @@ qemuSetupImageCgroupInternal(virDomainObjPtr vm,
qemuSetupImagePathCgroup(vm, QEMU_DEVICE_MAPPER_CONTROL_PATH, false) < 0)
return -1;
- return qemuSetupImagePathCgroup(vm, src->path, src->readonly ||
forceReadonly);
+ return qemuSetupImagePathCgroup(vm, path, readonly);
}
@@ -147,7 +167,10 @@ qemuTeardownImageCgroup(virDomainObjPtr vm,
virStorageSourcePtr src)
{
qemuDomainObjPrivatePtr priv = vm->privateData;
+ VIR_AUTOFREE(char *path) = NULL;
int perms = VIR_CGROUP_DEVICE_RWM;
+ bool hasPR = false;
+ bool hasNVMe = false;
size_t i;
int ret;
@@ -155,41 +178,61 @@ qemuTeardownImageCgroup(virDomainObjPtr vm,
VIR_CGROUP_CONTROLLER_DEVICES))
return 0;
- if (!src->path || !virStorageSourceIsLocalStorage(src)) {
- VIR_DEBUG("Not updating cgroups for disk path '%s', type: %s",
- NULLSTR(src->path), virStorageTypeToString(src->type));
- return 0;
+ for (i = 0; i < vm->def->ndisks; i++) {
+ virStorageSourcePtr diskSrc = vm->def->disks[i]->src;
+
+ if (src == diskSrc)
+ continue;
+
+ if (virStoragePRDefIsManaged(diskSrc->pr))
+ hasPR = true;
+
+ if (virStorageSourceChainHasNVMe(diskSrc))
+ hasNVMe = true;
}
- if (virFileExists(QEMU_DEVICE_MAPPER_CONTROL_PATH)) {
- for (i = 0; i < vm->def->ndisks; i++) {
- virStorageSourcePtr diskSrc = vm->def->disks[i]->src;
+ if (src->type == VIR_STORAGE_TYPE_NVME) {
+ if (!(path =
virPCIDeviceAddressGetIOMMUGroupDev(&src->nvme->pciAddr)))
+ return -1;
- if (src == diskSrc)
- continue;
-
- if (virStoragePRDefIsManaged(diskSrc->pr))
- break;
- }
-
- if (i == vm->def->ndisks) {
- VIR_DEBUG("Disabling device mapper control");
- ret = virCgroupDenyDevicePath(priv->cgroup,
- QEMU_DEVICE_MAPPER_CONTROL_PATH,
- perms, true);
+ if (!hasNVMe &&
+ !qemuDomainNeedsVFIO(vm->def)) {
+ ret = virCgroupDenyDevicePath(priv->cgroup, QEMU_DEV_VFIO, perms, true);
virDomainAuditCgroupPath(vm, priv->cgroup, "deny",
- QEMU_DEVICE_MAPPER_CONTROL_PATH,
+ QEMU_DEV_VFIO,
virCgroupGetDevicePermsString(perms), ret);
if (ret < 0)
- return ret;
+ return -1;
}
+ } else {
+ if (!src->path || !virStorageSourceIsLocalStorage(src)) {
+ VIR_DEBUG("Not updating cgroups for disk path '%s', type:
%s",
+ NULLSTR(src->path), virStorageTypeToString(src->type));
+ return 0;
+ }
+
+ if (VIR_STRDUP(path, src->path) < 0)
+ return -1;
+ }
+
+ if (!hasPR &&
+ virFileExists(QEMU_DEVICE_MAPPER_CONTROL_PATH)) {
+ VIR_DEBUG("Disabling device mapper control");
+ ret = virCgroupDenyDevicePath(priv->cgroup,
+ QEMU_DEVICE_MAPPER_CONTROL_PATH,
+ perms, true);
+ virDomainAuditCgroupPath(vm, priv->cgroup, "deny",
+ QEMU_DEVICE_MAPPER_CONTROL_PATH,
+ virCgroupGetDevicePermsString(perms), ret);
+ if (ret < 0)
+ return ret;
}
- VIR_DEBUG("Deny path %s", src->path);
+ VIR_DEBUG("Deny path %s", path);
- ret = virCgroupDenyDevicePath(priv->cgroup, src->path, perms, true);
+ ret = virCgroupDenyDevicePath(priv->cgroup, path, perms, true);
- virDomainAuditCgroupPath(vm, priv->cgroup, "deny", src->path,
+ virDomainAuditCgroupPath(vm, priv->cgroup, "deny", path,
virCgroupGetDevicePermsString(perms), ret);
/* If you're looking for a counter part to
--
2.21.0