[libvirt] [RFC PATCH 0/2] Add new mdev type for aggregated resources

Current mdev device create interface depends on fixed mdev type, which get uuid from user to create instance of mdev device. If user wants to use customized number of resource for mdev device, then only can create new mdev type for that which may not be flexible. To allow to create user defined resources for mdev, this RFC trys to extend mdev create interface by adding new "instances=xxx" parameter following uuid, for target mdev type if aggregation is supported, it can create new mdev device which contains resources combined by number of instances, e.g echo "<uuid>,instances=10" > create VM manager e.g libvirt can check mdev type with "aggregation" attribute which can support this setting. And new sysfs attribute "instances" is created for each mdev device to show allocated number. Default number of 1 or no "instances" file can be used for compatibility check. This RFC trys to create new KVMGT type with minimal vGPU resources which can be combined with "instances=x" setting to allocate for user wanted resources. Zhenyu Wang (2): vfio/mdev: Add new instances parameters for mdev create drm/i915/gvt: Add new aggregation type drivers/gpu/drm/i915/gvt/gvt.c | 26 ++++++++++++--- drivers/gpu/drm/i915/gvt/gvt.h | 14 +++++--- drivers/gpu/drm/i915/gvt/kvmgt.c | 9 +++-- drivers/gpu/drm/i915/gvt/vgpu.c | 56 ++++++++++++++++++++++++++++---- drivers/s390/cio/vfio_ccw_ops.c | 3 +- drivers/vfio/mdev/mdev_core.c | 11 ++++--- drivers/vfio/mdev/mdev_private.h | 6 +++- drivers/vfio/mdev/mdev_sysfs.c | 42 ++++++++++++++++++++---- include/linux/mdev.h | 3 +- samples/vfio-mdev/mbochs.c | 3 +- samples/vfio-mdev/mdpy.c | 3 +- samples/vfio-mdev/mtty.c | 3 +- 12 files changed, 141 insertions(+), 38 deletions(-) -- 2.18.0.rc2

For special mdev type which can aggregate instances for mdev device, this extends mdev create interface by allowing extra "instances=xxx" parameter, which is passed to mdev device model to be able to create arbitrary bundled number of instances for target mdev device. For created mdev device, also create new sysfs attr "instances" to show. For compatibility, current default instances is 1. Cc: Alex Williamson <alex.williamson@redhat.com> Cc: Kirti Wankhede <kwankhede@nvidia.com> Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com> --- drivers/gpu/drm/i915/gvt/kvmgt.c | 3 ++- drivers/s390/cio/vfio_ccw_ops.c | 3 ++- drivers/vfio/mdev/mdev_core.c | 11 ++++++--- drivers/vfio/mdev/mdev_private.h | 6 ++++- drivers/vfio/mdev/mdev_sysfs.c | 42 +++++++++++++++++++++++++++----- include/linux/mdev.h | 3 ++- samples/vfio-mdev/mbochs.c | 3 ++- samples/vfio-mdev/mdpy.c | 3 ++- samples/vfio-mdev/mtty.c | 3 ++- 9 files changed, 60 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index df4e4a07db3d..4a543e23b9a0 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -444,7 +444,8 @@ static void kvmgt_put_vfio_device(void *vgpu) vfio_device_put(((struct intel_vgpu *)vgpu)->vdev.vfio_device); } -static int intel_vgpu_create(struct kobject *kobj, struct mdev_device *mdev) +static int intel_vgpu_create(struct kobject *kobj, struct mdev_device *mdev, + unsigned int instances) { struct intel_vgpu *vgpu = NULL; struct intel_vgpu_type *type; diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 41eeb57d68a3..4c37826ec9d0 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -106,7 +106,8 @@ static struct attribute_group *mdev_type_groups[] = { NULL, }; -static int vfio_ccw_mdev_create(struct kobject *kobj, struct mdev_device *mdev) +static int vfio_ccw_mdev_create(struct kobject *kobj, struct mdev_device *mdev, + unsigned int 
instances) { struct vfio_ccw_private *private = dev_get_drvdata(mdev_parent_dev(mdev)); diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index 0212f0ee8aea..98b0a78e6832 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -104,12 +104,13 @@ static inline void mdev_put_parent(struct mdev_parent *parent) } static int mdev_device_create_ops(struct kobject *kobj, - struct mdev_device *mdev) + struct mdev_device *mdev, + unsigned int instances) { struct mdev_parent *parent = mdev->parent; int ret; - ret = parent->ops->create(kobj, mdev); + ret = parent->ops->create(kobj, mdev, instances); if (ret) return ret; @@ -276,7 +277,8 @@ static void mdev_device_release(struct device *dev) kfree(mdev); } -int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid) +int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid, + unsigned int instances) { int ret; struct mdev_device *mdev, *tmp; @@ -316,6 +318,7 @@ int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid) mdev->dev.bus = &mdev_bus_type; mdev->dev.release = mdev_device_release; dev_set_name(&mdev->dev, "%pUl", uuid.b); + mdev->type_instances = instances; ret = device_register(&mdev->dev); if (ret) { @@ -323,7 +326,7 @@ int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid) goto mdev_fail; } - ret = mdev_device_create_ops(kobj, mdev); + ret = mdev_device_create_ops(kobj, mdev, instances); if (ret) goto create_fail; diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h index b5819b7d7ef7..aa0b4b64c503 100644 --- a/drivers/vfio/mdev/mdev_private.h +++ b/drivers/vfio/mdev/mdev_private.h @@ -33,6 +33,7 @@ struct mdev_device { struct kref ref; struct list_head next; struct kobject *type_kobj; + unsigned int type_instances; bool active; }; @@ -52,13 +53,16 @@ struct mdev_type { #define to_mdev_type(_kobj) \ container_of(_kobj, struct mdev_type, kobj) +#define 
MDEV_CREATE_OPT_LEN 32 + int parent_create_sysfs_files(struct mdev_parent *parent); void parent_remove_sysfs_files(struct mdev_parent *parent); int mdev_create_sysfs_files(struct device *dev, struct mdev_type *type); void mdev_remove_sysfs_files(struct device *dev, struct mdev_type *type); -int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid); +int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid, + unsigned int instances); int mdev_device_remove(struct device *dev, bool force_remove); #endif /* MDEV_PRIVATE_H */ diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index 249472f05509..5a417a20e774 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -54,11 +54,15 @@ static const struct sysfs_ops mdev_type_sysfs_ops = { static ssize_t create_store(struct kobject *kobj, struct device *dev, const char *buf, size_t count) { - char *str; + char *str, *opt = NULL; uuid_le uuid; int ret; + unsigned int instances = 1; - if ((count < UUID_STRING_LEN) || (count > UUID_STRING_LEN + 1)) + if (count < UUID_STRING_LEN) + return -EINVAL; + + if (count > UUID_STRING_LEN + 1 + MDEV_CREATE_OPT_LEN) return -EINVAL; str = kstrndup(buf, count, GFP_KERNEL); @@ -66,13 +70,31 @@ static ssize_t create_store(struct kobject *kobj, struct device *dev, return -ENOMEM; ret = uuid_le_to_bin(str, &uuid); - kfree(str); - if (ret) + if (ret) { + kfree(str); return ret; + } - ret = mdev_device_create(kobj, dev, uuid); - if (ret) + if (count > UUID_STRING_LEN + 1) { + opt = str + UUID_STRING_LEN; + if (*opt++ != ',' || + strncmp(opt, "instances=", 10)) { + kfree(str); + return -EINVAL; + } + opt += 10; + if (kstrtouint(opt, 10, &instances)) { + kfree(str); + return -EINVAL; + } + } + + ret = mdev_device_create(kobj, dev, uuid, instances); + if (ret) { + kfree(str); return ret; + } + kfree(str); return count; } @@ -246,10 +268,18 @@ static ssize_t remove_store(struct device *dev, struct 
device_attribute *attr, return count; } +static ssize_t instances_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct mdev_device *mdev = to_mdev_device(dev); + return sprintf(buf, "%u\n", mdev->type_instances); +} + static DEVICE_ATTR_WO(remove); +static DEVICE_ATTR_RO(instances); static const struct attribute *mdev_device_attrs[] = { &dev_attr_remove.attr, + &dev_attr_instances.attr, NULL, }; diff --git a/include/linux/mdev.h b/include/linux/mdev.h index b6e048e1045f..12d42aa30f81 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -70,7 +70,8 @@ struct mdev_parent_ops { const struct attribute_group **mdev_attr_groups; struct attribute_group **supported_type_groups; - int (*create)(struct kobject *kobj, struct mdev_device *mdev); + int (*create)(struct kobject *kobj, struct mdev_device *mdev, + unsigned int instances); int (*remove)(struct mdev_device *mdev); int (*open)(struct mdev_device *mdev); void (*release)(struct mdev_device *mdev); diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 2960e26c6ea4..8b2a9e98c587 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -432,7 +432,8 @@ static int mbochs_reset(struct mdev_device *mdev) return 0; } -static int mbochs_create(struct kobject *kobj, struct mdev_device *mdev) +static int mbochs_create(struct kobject *kobj, struct mdev_device *mdev, + unsigned int instances) { const struct mbochs_type *type = mbochs_find_type(kobj); struct device *dev = mdev_dev(mdev); diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index 96e7969c473a..0ec48ea5225a 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -226,7 +226,8 @@ static int mdpy_reset(struct mdev_device *mdev) return 0; } -static int mdpy_create(struct kobject *kobj, struct mdev_device *mdev) +static int mdpy_create(struct kobject *kobj, struct mdev_device *mdev, + unsigned int instances) { const struct mdpy_type *type = mdpy_find_type(kobj); 
struct device *dev = mdev_dev(mdev); diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index 7abb79d8313d..ff2eaffa1155 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -727,7 +727,8 @@ static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count, return ret; } -int mtty_create(struct kobject *kobj, struct mdev_device *mdev) +int mtty_create(struct kobject *kobj, struct mdev_device *mdev, + unsigned int instances) { struct mdev_state *mdev_state; char name[MTTY_STRING_LEN]; -- 2.18.0.rc2

New aggregation type is created for KVMGT which can be used with new mdev create "instances=xxx" parameter to combine minimal resource number for target instances, which can create user defined number of resources. User space checks whether there's "aggregation" attribute for mdev type and "instances" parameter can't be larger than "avail_instances". Cc: Alex Williamson <alex.williamson@redhat.com> Cc: Kirti Wankhede <kwankhede@nvidia.com> Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com> --- drivers/gpu/drm/i915/gvt/gvt.c | 26 ++++++++++++--- drivers/gpu/drm/i915/gvt/gvt.h | 14 +++++--- drivers/gpu/drm/i915/gvt/kvmgt.c | 6 ++-- drivers/gpu/drm/i915/gvt/vgpu.c | 56 ++++++++++++++++++++++++++++---- 4 files changed, 81 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/gvt.c b/drivers/gpu/drm/i915/gvt/gvt.c index 4e65266e7b95..ae90bd368b3f 100644 --- a/drivers/gpu/drm/i915/gvt/gvt.c +++ b/drivers/gpu/drm/i915/gvt/gvt.c @@ -57,7 +57,7 @@ static struct intel_vgpu_type *intel_gvt_find_vgpu_type(struct intel_gvt *gvt, for (i = 0; i < gvt->num_types; i++) { t = &gvt->types[i]; if (!strncmp(t->name, name + strlen(driver_name) + 1, - sizeof(t->name))) + strlen(t->name))) return t; } @@ -105,9 +105,16 @@ static ssize_t description_show(struct kobject *kobj, struct device *dev, type->weight); } +static ssize_t aggregation_show(struct kobject *kobj, struct device *dev, + char *buf) +{ + return sprintf(buf, "%s\n", "true"); +} + static MDEV_TYPE_ATTR_RO(available_instances); static MDEV_TYPE_ATTR_RO(device_api); static MDEV_TYPE_ATTR_RO(description); +static MDEV_TYPE_ATTR_RO(aggregation); static struct attribute *gvt_type_attrs[] = { &mdev_type_attr_available_instances.attr, @@ -116,14 +123,20 @@ static struct attribute *gvt_type_attrs[] = { NULL, }; +static struct attribute *gvt_type_aggregate_attrs[] = { + &mdev_type_attr_available_instances.attr, + &mdev_type_attr_device_api.attr, + &mdev_type_attr_description.attr, + &mdev_type_attr_aggregation.attr, + 
NULL, +}; + static struct attribute_group *gvt_vgpu_type_groups[] = { [0 ... NR_MAX_INTEL_VGPU_TYPES - 1] = NULL, }; -static bool intel_get_gvt_attrs(struct attribute ***type_attrs, - struct attribute_group ***intel_vgpu_type_groups) +static bool intel_get_gvt_attrs(struct attribute_group ***intel_vgpu_type_groups) { - *type_attrs = gvt_type_attrs; *intel_vgpu_type_groups = gvt_vgpu_type_groups; return true; } @@ -142,7 +155,10 @@ static bool intel_gvt_init_vgpu_type_groups(struct intel_gvt *gvt) goto unwind; group->name = type->name; - group->attrs = gvt_type_attrs; + if (type->aggregation) + group->attrs = gvt_type_aggregate_attrs; + else + group->attrs = gvt_type_attrs; gvt_vgpu_type_groups[i] = group; } diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h index de2a3a2580be..c0320b95804c 100644 --- a/drivers/gpu/drm/i915/gvt/gvt.h +++ b/drivers/gpu/drm/i915/gvt/gvt.h @@ -241,6 +241,9 @@ struct intel_vgpu { struct intel_gvt_gm { unsigned long vgpu_allocated_low_gm_size; unsigned long vgpu_allocated_high_gm_size; + unsigned long low_avail; + unsigned long high_avail; + unsigned long fence_avail; }; struct intel_gvt_fence { @@ -290,13 +293,14 @@ struct intel_gvt_firmware { #define NR_MAX_INTEL_VGPU_TYPES 20 struct intel_vgpu_type { - char name[16]; + char name[32]; unsigned int avail_instance; unsigned int low_gm_size; unsigned int high_gm_size; unsigned int fence; unsigned int weight; enum intel_vgpu_edid resolution; + bool aggregation; /* fine-grained resource type with aggregation capability */ }; struct intel_gvt { @@ -482,7 +486,8 @@ void intel_gvt_clean_vgpu_types(struct intel_gvt *gvt); struct intel_vgpu *intel_gvt_create_idle_vgpu(struct intel_gvt *gvt); void intel_gvt_destroy_idle_vgpu(struct intel_vgpu *vgpu); struct intel_vgpu *intel_gvt_create_vgpu(struct intel_gvt *gvt, - struct intel_vgpu_type *type); + struct intel_vgpu_type *type, + unsigned int); void intel_gvt_destroy_vgpu(struct intel_vgpu *vgpu); void 
intel_gvt_reset_vgpu_locked(struct intel_vgpu *vgpu, bool dmlr, unsigned int engine_mask); @@ -560,15 +565,14 @@ struct intel_gvt_ops { int (*emulate_mmio_write)(struct intel_vgpu *, u64, void *, unsigned int); struct intel_vgpu *(*vgpu_create)(struct intel_gvt *, - struct intel_vgpu_type *); + struct intel_vgpu_type *, unsigned int); void (*vgpu_destroy)(struct intel_vgpu *); void (*vgpu_reset)(struct intel_vgpu *); void (*vgpu_activate)(struct intel_vgpu *); void (*vgpu_deactivate)(struct intel_vgpu *); struct intel_vgpu_type *(*gvt_find_vgpu_type)(struct intel_gvt *gvt, const char *name); - bool (*get_gvt_attrs)(struct attribute ***type_attrs, - struct attribute_group ***intel_vgpu_type_groups); + bool (*get_gvt_attrs)(struct attribute_group ***intel_vgpu_type_groups); int (*vgpu_query_plane)(struct intel_vgpu *vgpu, void *); int (*vgpu_get_dmabuf)(struct intel_vgpu *vgpu, unsigned int); int (*write_protect_handler)(struct intel_vgpu *, u64, void *, diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 4a543e23b9a0..f02c3a17714f 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -464,7 +464,7 @@ static int intel_vgpu_create(struct kobject *kobj, struct mdev_device *mdev, goto out; } - vgpu = intel_gvt_ops->vgpu_create(gvt, type); + vgpu = intel_gvt_ops->vgpu_create(gvt, type, instances); if (IS_ERR_OR_NULL(vgpu)) { ret = vgpu == NULL ? 
-EFAULT : PTR_ERR(vgpu); gvt_err("failed to create intel vgpu: %d\n", ret); @@ -1389,12 +1389,10 @@ static struct mdev_parent_ops intel_vgpu_ops = { static int kvmgt_host_init(struct device *dev, void *gvt, const void *ops) { - struct attribute **kvm_type_attrs; struct attribute_group **kvm_vgpu_type_groups; intel_gvt_ops = ops; - if (!intel_gvt_ops->get_gvt_attrs(&kvm_type_attrs, - &kvm_vgpu_type_groups)) + if (!intel_gvt_ops->get_gvt_attrs(&kvm_vgpu_type_groups)) return -EFAULT; intel_vgpu_ops.supported_type_groups = kvm_vgpu_type_groups; diff --git a/drivers/gpu/drm/i915/gvt/vgpu.c b/drivers/gpu/drm/i915/gvt/vgpu.c index 889d10f8ee96..e4a5540401be 100644 --- a/drivers/gpu/drm/i915/gvt/vgpu.c +++ b/drivers/gpu/drm/i915/gvt/vgpu.c @@ -124,11 +124,14 @@ int intel_gvt_init_vgpu_types(struct intel_gvt *gvt) high_avail = gvt_hidden_sz(gvt) - HOST_HIGH_GM_SIZE; num_types = sizeof(vgpu_types) / sizeof(vgpu_types[0]); - gvt->types = kzalloc(num_types * sizeof(struct intel_vgpu_type), + gvt->types = kzalloc((num_types + 1) * sizeof(struct intel_vgpu_type), GFP_KERNEL); if (!gvt->types) return -ENOMEM; + gvt->gm.low_avail = low_avail; + gvt->gm.high_avail = high_avail; + gvt->gm.fence_avail = 32 - HOST_FENCE; min_low = MB_TO_BYTES(32); for (i = 0; i < num_types; ++i) { if (low_avail / vgpu_types[i].low_mm == 0) @@ -148,11 +151,11 @@ int intel_gvt_init_vgpu_types(struct intel_gvt *gvt) high_avail / vgpu_types[i].high_mm); if (IS_GEN8(gvt->dev_priv)) - sprintf(gvt->types[i].name, "GVTg_V4_%s", - vgpu_types[i].name); + snprintf(gvt->types[i].name, sizeof(gvt->types[i].name), + "GVTg_V4_%s", vgpu_types[i].name); else if (IS_GEN9(gvt->dev_priv)) - sprintf(gvt->types[i].name, "GVTg_V5_%s", - vgpu_types[i].name); + snprintf(gvt->types[i].name, sizeof(gvt->types[i].name), + "GVTg_V5_%s", vgpu_types[i].name); gvt_dbg_core("type[%d]: %s avail %u low %u high %u fence %u weight %u res %s\n", i, gvt->types[i].name, @@ -163,7 +166,32 @@ int intel_gvt_init_vgpu_types(struct intel_gvt 
*gvt) vgpu_edid_str(gvt->types[i].resolution)); } - gvt->num_types = i; + /* add aggregation type */ + gvt->types[i].low_gm_size = MB_TO_BYTES(32); + gvt->types[i].high_gm_size = MB_TO_BYTES(192); + gvt->types[i].fence = 2; + gvt->types[i].weight = 16; + gvt->types[i].resolution = GVT_EDID_1024_768; + gvt->types[i].avail_instance = min(low_avail / gvt->types[i].low_gm_size, + high_avail / gvt->types[i].high_gm_size); + gvt->types[i].avail_instance = min(gvt->types[i].avail_instance, + (32 - HOST_FENCE) / gvt->types[i].fence); + if (IS_GEN8(gvt->dev_priv)) + strcat(gvt->types[i].name, "GVTg_V4_aggregate"); + else if (IS_GEN9(gvt->dev_priv)) + strcat(gvt->types[i].name, "GVTg_V5_aggregate"); + + gvt_dbg_core("type[%d]: %s avail %u low %u high %u fence %u weight %u res %s\n", + i, gvt->types[i].name, + gvt->types[i].avail_instance, + gvt->types[i].low_gm_size, + gvt->types[i].high_gm_size, gvt->types[i].fence, + gvt->types[i].weight, + vgpu_edid_str(gvt->types[i].resolution)); + + gvt->types[i].aggregation = true; + gvt->num_types = ++i; + return 0; } @@ -443,7 +471,8 @@ static struct intel_vgpu *__intel_gvt_create_vgpu(struct intel_gvt *gvt, * pointer to intel_vgpu, error pointer if failed. 
*/ struct intel_vgpu *intel_gvt_create_vgpu(struct intel_gvt *gvt, - struct intel_vgpu_type *type) + struct intel_vgpu_type *type, + unsigned int instances) { struct intel_vgpu_creation_params param; struct intel_vgpu *vgpu; @@ -460,6 +489,19 @@ struct intel_vgpu *intel_gvt_create_vgpu(struct intel_gvt *gvt, param.low_gm_sz = BYTES_TO_MB(param.low_gm_sz); param.high_gm_sz = BYTES_TO_MB(param.high_gm_sz); + if (type->aggregation && instances > 1) { + if (instances > type->avail_instance) + return ERR_PTR(-EINVAL); + param.low_gm_sz = min(param.low_gm_sz * instances, + (u64)BYTES_TO_MB(gvt->gm.low_avail)); + param.high_gm_sz = min(param.high_gm_sz * instances, + (u64)BYTES_TO_MB(gvt->gm.high_avail)); + param.fence_sz = min(param.fence_sz * instances, + (u64)gvt->gm.fence_avail); + gvt_dbg_core("instances %d, low %lluMB, high %lluMB, fence %llu\n", + instances, param.low_gm_sz, param.high_gm_sz, param.fence_sz); + } + mutex_lock(&gvt->lock); vgpu = __intel_gvt_create_vgpu(gvt, ¶m); if (!IS_ERR(vgpu)) -- 2.18.0.rc2

On 6/20/2018 1:10 PM, Zhenyu Wang wrote:
The current mdev device create interface depends on a fixed mdev type, which gets a uuid from the user to create an instance of an mdev device. If the user wants to use a customized number of resources for an mdev device, the only option is to create a new mdev type for that, which may not be flexible.
To allow creating user-defined resources for mdev, this RFC tries to extend the mdev create interface by adding a new "instances=xxx" parameter following the uuid. For the target mdev type, if aggregation is supported, it can create a new mdev device which contains resources combined by the number of instances, e.g.
echo "<uuid>,instances=10" > create
This seems orthogonal to the way mdev types are meant to be used. The vendor driver can provide the possible types to give flexibility to the users. Secondly, not all resources defined for a particular mdev type can always be multiplied; for example, an mdev type for vGPU that supports 2 heads can't be multiplied to use with 20 heads. Thanks, Kirti
A VM manager, e.g. libvirt, can check the mdev type for an "aggregation" attribute which indicates support for this setting. And a new sysfs attribute "instances" is created for each mdev device to show the allocated number. A default number of 1, or the absence of an "instances" file, can be used for a compatibility check.
This RFC tries to create a new KVMGT type with minimal vGPU resources which can be combined with the "instances=x" setting to allocate user-wanted resources.
Zhenyu Wang (2): vfio/mdev: Add new instances parameters for mdev create drm/i915/gvt: Add new aggregation type
drivers/gpu/drm/i915/gvt/gvt.c | 26 ++++++++++++--- drivers/gpu/drm/i915/gvt/gvt.h | 14 +++++--- drivers/gpu/drm/i915/gvt/kvmgt.c | 9 +++-- drivers/gpu/drm/i915/gvt/vgpu.c | 56 ++++++++++++++++++++++++++++---- drivers/s390/cio/vfio_ccw_ops.c | 3 +- drivers/vfio/mdev/mdev_core.c | 11 ++++--- drivers/vfio/mdev/mdev_private.h | 6 +++- drivers/vfio/mdev/mdev_sysfs.c | 42 ++++++++++++++++++++---- include/linux/mdev.h | 3 +- samples/vfio-mdev/mbochs.c | 3 +- samples/vfio-mdev/mdpy.c | 3 +- samples/vfio-mdev/mtty.c | 3 +- 12 files changed, 141 insertions(+), 38 deletions(-)

On Thu, 21 Jun 2018 19:57:38 +0530 Kirti Wankhede <kwankhede@nvidia.com> wrote:
On 6/20/2018 1:10 PM, Zhenyu Wang wrote:
Current mdev device create interface depends on fixed mdev type, which get uuid from user to create instance of mdev device. If user wants to use customized number of resource for mdev device, then only can create new mdev type for that which may not be flexible.
To allow to create user defined resources for mdev, this RFC trys to extend mdev create interface by adding new "instances=xxx" parameter following uuid, for target mdev type if aggregation is supported, it can create new mdev device which contains resources combined by number of instances, e.g
echo "<uuid>,instances=10" > create
This seems orthogonal to the way mdev types are meant to be used. Vendor driver can provide the possible types to provide flexibility to the users.
I think the goal here is to define how we create a type that supports arbitrary combinations of resources where the total number of those resources may be sufficiently large that the parent driver enumerating them all is not feasible.
Secondly, not always all resources defined for a particular mdev type can be multiplied, for example, a mdev type for vGPU that supports 2 heads, that can't be multiplied to use with 20 heads.
Not all types need to define themselves this way, aiui this is an optional extension. Userspace can determine if this feature is available with the new attribute and if they're not aware of the new attribute, we operate in a backwards compatible mode where 'echo $UUID > create' consumes one instance of that type. Mdev, like vfio, is device agnostic, so while a vGPU example may have scaling issues that need to be ironed out to implement this, those don't immediately negate the value of this proposal. Thanks, Alex
VM manager e.g libvirt can check mdev type with "aggregation" attribute which can support this setting. And new sysfs attribute "instances" is created for each mdev device to show allocated number. Default number of 1 or no "instances" file can be used for compatibility check.
This RFC trys to create new KVMGT type with minimal vGPU resources which can be combined with "instances=x" setting to allocate for user wanted resources.
Zhenyu Wang (2): vfio/mdev: Add new instances parameters for mdev create drm/i915/gvt: Add new aggregation type
drivers/gpu/drm/i915/gvt/gvt.c | 26 ++++++++++++--- drivers/gpu/drm/i915/gvt/gvt.h | 14 +++++--- drivers/gpu/drm/i915/gvt/kvmgt.c | 9 +++-- drivers/gpu/drm/i915/gvt/vgpu.c | 56 ++++++++++++++++++++++++++++---- drivers/s390/cio/vfio_ccw_ops.c | 3 +- drivers/vfio/mdev/mdev_core.c | 11 ++++--- drivers/vfio/mdev/mdev_private.h | 6 +++- drivers/vfio/mdev/mdev_sysfs.c | 42 ++++++++++++++++++++---- include/linux/mdev.h | 3 +- samples/vfio-mdev/mbochs.c | 3 +- samples/vfio-mdev/mdpy.c | 3 +- samples/vfio-mdev/mtty.c | 3 +- 12 files changed, 141 insertions(+), 38 deletions(-)

On 2018.06.21 09:00:28 -0600, Alex Williamson wrote:
On Thu, 21 Jun 2018 19:57:38 +0530 Kirti Wankhede <kwankhede@nvidia.com> wrote:
On 6/20/2018 1:10 PM, Zhenyu Wang wrote:
Current mdev device create interface depends on fixed mdev type, which get uuid from user to create instance of mdev device. If user wants to use customized number of resource for mdev device, then only can create new mdev type for that which may not be flexible.
To allow to create user defined resources for mdev, this RFC trys to extend mdev create interface by adding new "instances=xxx" parameter following uuid, for target mdev type if aggregation is supported, it can create new mdev device which contains resources combined by number of instances, e.g
echo "<uuid>,instances=10" > create
This seems orthogonal to the way mdev types are meant to be used. Vendor driver can provide the possible types to provide flexibility to the users.
I think the goal here is to define how we create a type that supports arbitrary combinations of resources where the total number of those resources may be sufficiently large that the parent driver enumerating them all is not feasible.
Secondly, not always all resources defined for a particular mdev type can be multiplied, for example, a mdev type for vGPU that supports 2 heads, that can't be multiplied to use with 20 heads.
yeah, for vGPU we actually can have flexible config for memory size but currently fixed by type definition. Although like for display config, we are just sticking with 1 head config even for aggregate type.
Not all types need to define themselves this way, aiui this is an optional extension. Userspace can determine if this feature is available with the new attribute and if they're not aware of the new attribute, we operate in a backwards compatible mode where 'echo $UUID > create' consumes one instance of that type. Mdev, like vfio, is device agnostic, so while a vGPU example may have scaling issues that need to be ironed out to implement this, those don't immediately negate the value of this proposal. Thanks,
yep, backward compatibility is always ensured by this, so for a type with no aggregation attribute, the current behavior is kept and the driver should refuse "instances=" if set by the user. One thing I'm concerned about is that it looks like, without changing the mdev create ops interface, we currently can't carry that option to the driver — or should we do it with a more general parameter? thanks
VM manager e.g libvirt can check mdev type with "aggregation" attribute which can support this setting. And new sysfs attribute "instances" is created for each mdev device to show allocated number. Default number of 1 or no "instances" file can be used for compatibility check.
This RFC trys to create new KVMGT type with minimal vGPU resources which can be combined with "instances=x" setting to allocate for user wanted resources.
Zhenyu Wang (2): vfio/mdev: Add new instances parameters for mdev create drm/i915/gvt: Add new aggregation type
drivers/gpu/drm/i915/gvt/gvt.c | 26 ++++++++++++--- drivers/gpu/drm/i915/gvt/gvt.h | 14 +++++--- drivers/gpu/drm/i915/gvt/kvmgt.c | 9 +++-- drivers/gpu/drm/i915/gvt/vgpu.c | 56 ++++++++++++++++++++++++++++---- drivers/s390/cio/vfio_ccw_ops.c | 3 +- drivers/vfio/mdev/mdev_core.c | 11 ++++--- drivers/vfio/mdev/mdev_private.h | 6 +++- drivers/vfio/mdev/mdev_sysfs.c | 42 ++++++++++++++++++++---- include/linux/mdev.h | 3 +- samples/vfio-mdev/mbochs.c | 3 +- samples/vfio-mdev/mdpy.c | 3 +- samples/vfio-mdev/mtty.c | 3 +- 12 files changed, 141 insertions(+), 38 deletions(-)
-- Open Source Technology Center, Intel ltd. $gpg --keyserver wwwkeys.pgp.net --recv-keys 4D781827

On 6/22/2018 1:12 PM, Zhenyu Wang wrote:
On 2018.06.21 09:00:28 -0600, Alex Williamson wrote:
On Thu, 21 Jun 2018 19:57:38 +0530 Kirti Wankhede <kwankhede@nvidia.com> wrote:
On 6/20/2018 1:10 PM, Zhenyu Wang wrote:
Current mdev device create interface depends on fixed mdev type, which get uuid from user to create instance of mdev device. If user wants to use customized number of resource for mdev device, then only can create new mdev type for that which may not be flexible.
To allow to create user defined resources for mdev, this RFC trys to extend mdev create interface by adding new "instances=xxx" parameter following uuid, for target mdev type if aggregation is supported, it can create new mdev device which contains resources combined by number of instances, e.g
echo "<uuid>,instances=10" > create
This seems orthogonal to the way mdev types are meant to be used. Vendor driver can provide the possible types to provide flexibility to the users.
I think the goal here is to define how we create a type that supports arbitrary combinations of resources where the total number of those resources may be sufficiently large that the parent driver enumerating them all is not feasible.
Secondly, not always all resources defined for a particular mdev type can be multiplied, for example, a mdev type for vGPU that supports 2 heads, that can't be multiplied to use with 20 heads.
yeah, for vGPU we actually can have flexible config for memory size but currently fixed by type definition. Although like for display config, we are just sticking with 1 head config even for aggregate type.
An mdev type is a set of parameters. If aggregation is on one parameter, how can the user know which parameter can be aggregated?
Not all types need to define themselves this way, aiui this is an optional extension. Userspace can determine if this feature is available with the new attribute and if they're not aware of the new attribute, we operate in a backwards compatible mode where 'echo $UUID > create' consumes one instance of that type. Mdev, like vfio, is device agnostic, so while a vGPU example may have scaling issues that need to be ironed out to implement this, those don't immediately negate the value of this proposal. Thanks,
yep, backward compatibility is always ensured by this, so for a type with no aggregation attribute, the current behavior is kept and the driver should refuse "instances=" if set by the user. One thing I'm concerned about is that it looks like, without changing the mdev create ops interface, we currently can't carry that option to the driver — or should we do it with a more general parameter?
I think, if aggregation is not supported, then the vendor driver should fail 'create' with instances > 1. That means every vendor driver would need a change to handle this case. Another way could be: the structure mdev_parent_ops can have another function pointer, 'create_with_instances', which takes instances as an argument. A vendor driver which supports aggregation will have to provide this callback, and existing vendor drivers will need no change. Then in mdev core: if (instances > 1) { if (parent->ops->create_with_instances) ret = parent->ops->create_with_instances(kobj, mdev, instances); else return -EINVAL; } else ret = parent->ops->create(kobj, mdev); Thanks, Kirti
thanks
VM manager e.g libvirt can check mdev type with "aggregation" attribute which can support this setting. And new sysfs attribute "instances" is created for each mdev device to show allocated number. Default number of 1 or no "instances" file can be used for compatibility check.
This RFC tries to create a new KVMGT type with minimal vGPU resources which can be combined with the "instances=x" setting to allocate the resources the user wants.
Zhenyu Wang (2): vfio/mdev: Add new instances parameters for mdev create drm/i915/gvt: Add new aggregation type
drivers/gpu/drm/i915/gvt/gvt.c | 26 ++++++++++++--- drivers/gpu/drm/i915/gvt/gvt.h | 14 +++++--- drivers/gpu/drm/i915/gvt/kvmgt.c | 9 +++-- drivers/gpu/drm/i915/gvt/vgpu.c | 56 ++++++++++++++++++++++++++++---- drivers/s390/cio/vfio_ccw_ops.c | 3 +- drivers/vfio/mdev/mdev_core.c | 11 ++++--- drivers/vfio/mdev/mdev_private.h | 6 +++- drivers/vfio/mdev/mdev_sysfs.c | 42 ++++++++++++++++++++---- include/linux/mdev.h | 3 +- samples/vfio-mdev/mbochs.c | 3 +- samples/vfio-mdev/mdpy.c | 3 +- samples/vfio-mdev/mtty.c | 3 +- 12 files changed, 141 insertions(+), 38 deletions(-)

The current mdev device create interface depends on a fixed mdev type, which gets a uuid from the user to create an instance of a mdev device. If the user wants to use a customized number of resources for a mdev device, the only option is to create a new mdev type for that, which may not be flexible. This requirement comes not only from being able to allocate flexible resources for KVMGT, but also from Intel scalable IO virtualization, which would use vfio/mdev to be able to allocate arbitrary resources on a mdev instance. More info in [1] [2] [3]. To allow creating user-defined resources for mdev, this tries to extend the mdev create interface by adding a new "instances=xxx" parameter following the uuid; for a target mdev type, if aggregation is supported, it can create a new mdev device which contains resources combined by the number of instances, e.g echo "<uuid>,instances=10" > create VM manager e.g libvirt can check the mdev type for an "aggregation" attribute which indicates support for this setting. If no "aggregation" attribute is found for the mdev type, the previous behavior is still kept for one-instance allocation. And a new sysfs attribute "instances" is created for each mdev device to show the allocated number. This tries to create a new KVMGT type with minimal vGPU resources which can be combined with the "instances=x" setting to allocate the resources the user wants. References: [1] https://software.intel.com/en-us/download/intel-virtualization-technology-fo... [2] https://software.intel.com/en-us/download/intel-scalable-io-virtualization-t... 
[3] https://schd.ws/hosted_files/lc32018/00/LC3-SIOV-final.pdf v2: - Add new create_with_instances driver hook - Update doc for new attributes Zhenyu Wang (4): vfio/mdev: Add new instances parameter for mdev create vfio/mdev: Add mdev device instances attribute drm/i915/gvt: Add new aggregation type support Documentation/vfio-mediated-device.txt: update for aggregation attribute Documentation/vfio-mediated-device.txt | 39 +++++++++++++++--- drivers/gpu/drm/i915/gvt/gvt.c | 26 +++++++++--- drivers/gpu/drm/i915/gvt/gvt.h | 14 ++++--- drivers/gpu/drm/i915/gvt/kvmgt.c | 30 +++++++++++--- drivers/gpu/drm/i915/gvt/vgpu.c | 56 ++++++++++++++++++++++---- drivers/vfio/mdev/mdev_core.c | 19 +++++++-- drivers/vfio/mdev/mdev_private.h | 6 ++- drivers/vfio/mdev/mdev_sysfs.c | 42 ++++++++++++++++--- include/linux/mdev.h | 10 +++++ 9 files changed, 203 insertions(+), 39 deletions(-) -- 2.18.0

For special mdev type which can aggregate instances for mdev device, this extends mdev create interface by allowing extra "instances=xxx" parameter, which is passed to mdev device model to be able to create arbitrary bundled number of instances for target mdev device. v2: create new create_with_instances operator for vendor driver Cc: Kirti Wankhede <kwankhede@nvidia.com> Cc: Alex Williamson <alex.williamson@redhat.com> Cc: Kevin Tian <kevin.tian@intel.com> Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com> --- drivers/vfio/mdev/mdev_core.c | 18 +++++++++++++---- drivers/vfio/mdev/mdev_private.h | 5 ++++- drivers/vfio/mdev/mdev_sysfs.c | 34 ++++++++++++++++++++++++++------ include/linux/mdev.h | 10 ++++++++++ 4 files changed, 56 insertions(+), 11 deletions(-) diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index 0212f0ee8aea..e69db302a4f2 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -104,12 +104,16 @@ static inline void mdev_put_parent(struct mdev_parent *parent) } static int mdev_device_create_ops(struct kobject *kobj, - struct mdev_device *mdev) + struct mdev_device *mdev, + unsigned int instances) { struct mdev_parent *parent = mdev->parent; int ret; - ret = parent->ops->create(kobj, mdev); + if (instances > 1) + ret = parent->ops->create_with_instances(kobj, mdev, instances); + else + ret = parent->ops->create(kobj, mdev); if (ret) return ret; @@ -276,7 +280,8 @@ static void mdev_device_release(struct device *dev) kfree(mdev); } -int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid) +int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid, + unsigned int instances) { int ret; struct mdev_device *mdev, *tmp; @@ -287,6 +292,11 @@ int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid) if (!parent) return -EINVAL; + if (instances > 1 && !parent->ops->create_with_instances) { + ret = -EINVAL; + goto mdev_fail; + } + 
mutex_lock(&mdev_list_lock); /* Check for duplicate */ @@ -323,7 +333,7 @@ int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid) goto mdev_fail; } - ret = mdev_device_create_ops(kobj, mdev); + ret = mdev_device_create_ops(kobj, mdev, instances); if (ret) goto create_fail; diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h index b5819b7d7ef7..c9d3fe04e273 100644 --- a/drivers/vfio/mdev/mdev_private.h +++ b/drivers/vfio/mdev/mdev_private.h @@ -52,13 +52,16 @@ struct mdev_type { #define to_mdev_type(_kobj) \ container_of(_kobj, struct mdev_type, kobj) +#define MDEV_CREATE_OPT_LEN 32 + int parent_create_sysfs_files(struct mdev_parent *parent); void parent_remove_sysfs_files(struct mdev_parent *parent); int mdev_create_sysfs_files(struct device *dev, struct mdev_type *type); void mdev_remove_sysfs_files(struct device *dev, struct mdev_type *type); -int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid); +int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid, + unsigned int instances); int mdev_device_remove(struct device *dev, bool force_remove); #endif /* MDEV_PRIVATE_H */ diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index 249472f05509..a06e5b7c69d3 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -54,11 +54,15 @@ static const struct sysfs_ops mdev_type_sysfs_ops = { static ssize_t create_store(struct kobject *kobj, struct device *dev, const char *buf, size_t count) { - char *str; + char *str, *opt = NULL; uuid_le uuid; int ret; + unsigned int instances = 1; - if ((count < UUID_STRING_LEN) || (count > UUID_STRING_LEN + 1)) + if (count < UUID_STRING_LEN) + return -EINVAL; + + if (count > UUID_STRING_LEN + 1 + MDEV_CREATE_OPT_LEN) return -EINVAL; str = kstrndup(buf, count, GFP_KERNEL); @@ -66,13 +70,31 @@ static ssize_t create_store(struct kobject *kobj, struct device *dev, return -ENOMEM; ret = 
uuid_le_to_bin(str, &uuid); - kfree(str); - if (ret) + if (ret) { + kfree(str); return ret; + } - ret = mdev_device_create(kobj, dev, uuid); - if (ret) + if (count > UUID_STRING_LEN + 1) { + opt = str + UUID_STRING_LEN; + if (*opt++ != ',' || + strncmp(opt, "instances=", 10)) { + kfree(str); + return -EINVAL; + } + opt += 10; + if (kstrtouint(opt, 10, &instances)) { + kfree(str); + return -EINVAL; + } + } + + ret = mdev_device_create(kobj, dev, uuid, instances); + if (ret) { + kfree(str); return ret; + } + kfree(str); return count; } diff --git a/include/linux/mdev.h b/include/linux/mdev.h index b6e048e1045f..cfb702600f95 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -30,6 +30,13 @@ struct mdev_device; * @kobj: kobject of type for which 'create' is called. * @mdev: mdev_device structure on of mediated device * that is being created + * @create_with_instances: Allocate aggregated instances' resources in parent device's + * driver for a particular mediated device. It is optional + * if doesn't support aggregated resources. + * @kobj: kobject of type for which 'create' is called. + * @mdev: mdev_device structure on of mediated device + * that is being created + * @instances: number of instances to aggregate * Returns integer: success (0) or error (< 0) * @remove: Called to free resources in parent device's driver for a * a mediated device. It is mandatory to provide 'remove' @@ -71,6 +78,9 @@ struct mdev_parent_ops { struct attribute_group **supported_type_groups; int (*create)(struct kobject *kobj, struct mdev_device *mdev); + int (*create_with_instances)(struct kobject *kobj, + struct mdev_device *mdev, + unsigned int instances); int (*remove)(struct mdev_device *mdev); int (*open)(struct mdev_device *mdev); void (*release)(struct mdev_device *mdev); -- 2.18.0

On Fri, 20 Jul 2018 10:19:25 +0800 Zhenyu Wang <zhenyuw@linux.intel.com> wrote:
For special mdev type which can aggregate instances for mdev device, this extends mdev create interface by allowing extra "instances=xxx" parameter, which is passed to mdev device model to be able to create arbitrary bundled number of instances for target mdev device.
v2: create new create_with_instances operator for vendor driver
Cc: Kirti Wankhede <kwankhede@nvidia.com> Cc: Alex Williamson <alex.williamson@redhat.com> Cc: Kevin Tian <kevin.tian@intel.com> Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com> --- drivers/vfio/mdev/mdev_core.c | 18 +++++++++++++---- drivers/vfio/mdev/mdev_private.h | 5 ++++- drivers/vfio/mdev/mdev_sysfs.c | 34 ++++++++++++++++++++++++++------ include/linux/mdev.h | 10 ++++++++++ 4 files changed, 56 insertions(+), 11 deletions(-)
(...)
diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index 249472f05509..a06e5b7c69d3 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -54,11 +54,15 @@ static const struct sysfs_ops mdev_type_sysfs_ops = { static ssize_t create_store(struct kobject *kobj, struct device *dev, const char *buf, size_t count) { - char *str; + char *str, *opt = NULL; uuid_le uuid; int ret; + unsigned int instances = 1;
- if ((count < UUID_STRING_LEN) || (count > UUID_STRING_LEN + 1)) + if (count < UUID_STRING_LEN) + return -EINVAL; + + if (count > UUID_STRING_LEN + 1 + MDEV_CREATE_OPT_LEN)
Do you plan to have other optional parameters? If you don't, you could probably do a quick exit here if count is between UUID_STRING_LEN + 1 and UUID_STRING_LEN + 12 (for ",instances=<one digit>")?
return -EINVAL;
str = kstrndup(buf, count, GFP_KERNEL);
(...)
diff --git a/include/linux/mdev.h b/include/linux/mdev.h index b6e048e1045f..cfb702600f95 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -30,6 +30,13 @@ struct mdev_device; * @kobj: kobject of type for which 'create' is called. * @mdev: mdev_device structure on of mediated device * that is being created + * @create_with_instances: Allocate aggregated instances' resources in parent device's + * driver for a particular mediated device. It is optional + * if doesn't support aggregated resources.
"Optional if aggregated resources are not supported"
+ * @kobj: kobject of type for which 'create' is called. + * @mdev: mdev_device structure on of mediated device + * that is being created + * @instances: number of instances to aggregate * Returns integer: success (0) or error (< 0)
You need that "Returns" line for both the old and the new ops.
* @remove: Called to free resources in parent device's driver for a * a mediated device. It is mandatory to provide 'remove' @@ -71,6 +78,9 @@ struct mdev_parent_ops { struct attribute_group **supported_type_groups;
int (*create)(struct kobject *kobj, struct mdev_device *mdev); + int (*create_with_instances)(struct kobject *kobj, + struct mdev_device *mdev, + unsigned int instances); int (*remove)(struct mdev_device *mdev); int (*open)(struct mdev_device *mdev); void (*release)(struct mdev_device *mdev);

For mdev device, create new sysfs attribute "instances" to show number of instances allocated for possible aggregation type. For compatibility default or without aggregated allocation, the number is 1. Cc: Kirti Wankhede <kwankhede@nvidia.com> Cc: Alex Williamson <alex.williamson@redhat.com> Cc: Kevin Tian <kevin.tian@intel.com> Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com> --- drivers/vfio/mdev/mdev_core.c | 1 + drivers/vfio/mdev/mdev_private.h | 1 + drivers/vfio/mdev/mdev_sysfs.c | 8 ++++++++ 3 files changed, 10 insertions(+) diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index e69db302a4f2..f0478fc372d8 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -326,6 +326,7 @@ int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid, mdev->dev.bus = &mdev_bus_type; mdev->dev.release = mdev_device_release; dev_set_name(&mdev->dev, "%pUl", uuid.b); + mdev->type_instances = instances; ret = device_register(&mdev->dev); if (ret) { diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h index c9d3fe04e273..aa0b4b64c503 100644 --- a/drivers/vfio/mdev/mdev_private.h +++ b/drivers/vfio/mdev/mdev_private.h @@ -33,6 +33,7 @@ struct mdev_device { struct kref ref; struct list_head next; struct kobject *type_kobj; + unsigned int type_instances; bool active; }; diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index a06e5b7c69d3..5a417a20e774 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -268,10 +268,18 @@ static ssize_t remove_store(struct device *dev, struct device_attribute *attr, return count; } +static ssize_t instances_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct mdev_device *mdev = to_mdev_device(dev); + return sprintf(buf, "%u\n", mdev->type_instances); +} + static DEVICE_ATTR_WO(remove); +static DEVICE_ATTR_RO(instances); static const struct attribute 
*mdev_device_attrs[] = { &dev_attr_remove.attr, + &dev_attr_instances.attr, NULL, }; -- 2.18.0

New aggregation type is created for KVMGT which can be used with new mdev create "instances=xxx" parameter to combine minimal resource number for target instances, which can create user defined number of resources. For KVMGT, aggregated resource is determined by memory and fence resource allocation for target number of instances. Cc: Kirti Wankhede <kwankhede@nvidia.com> Cc: Alex Williamson <alex.williamson@redhat.com> Cc: Kevin Tian <kevin.tian@intel.com> Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com> --- drivers/gpu/drm/i915/gvt/gvt.c | 26 ++++++++++++--- drivers/gpu/drm/i915/gvt/gvt.h | 14 +++++--- drivers/gpu/drm/i915/gvt/kvmgt.c | 30 ++++++++++++++--- drivers/gpu/drm/i915/gvt/vgpu.c | 56 ++++++++++++++++++++++++++++---- 4 files changed, 104 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/gvt.c b/drivers/gpu/drm/i915/gvt/gvt.c index 712f9d14e720..21600447b029 100644 --- a/drivers/gpu/drm/i915/gvt/gvt.c +++ b/drivers/gpu/drm/i915/gvt/gvt.c @@ -57,7 +57,7 @@ static struct intel_vgpu_type *intel_gvt_find_vgpu_type(struct intel_gvt *gvt, for (i = 0; i < gvt->num_types; i++) { t = &gvt->types[i]; if (!strncmp(t->name, name + strlen(driver_name) + 1, - sizeof(t->name))) + strlen(t->name))) return t; } @@ -105,9 +105,16 @@ static ssize_t description_show(struct kobject *kobj, struct device *dev, type->weight); } +static ssize_t aggregation_show(struct kobject *kobj, struct device *dev, + char *buf) +{ + return sprintf(buf, "%s\n", "true"); +} + static MDEV_TYPE_ATTR_RO(available_instances); static MDEV_TYPE_ATTR_RO(device_api); static MDEV_TYPE_ATTR_RO(description); +static MDEV_TYPE_ATTR_RO(aggregation); static struct attribute *gvt_type_attrs[] = { &mdev_type_attr_available_instances.attr, @@ -116,14 +123,20 @@ static struct attribute *gvt_type_attrs[] = { NULL, }; +static struct attribute *gvt_type_aggregate_attrs[] = { + &mdev_type_attr_available_instances.attr, + &mdev_type_attr_device_api.attr, + &mdev_type_attr_description.attr, + 
&mdev_type_attr_aggregation.attr, + NULL, +}; + static struct attribute_group *gvt_vgpu_type_groups[] = { [0 ... NR_MAX_INTEL_VGPU_TYPES - 1] = NULL, }; -static bool intel_get_gvt_attrs(struct attribute ***type_attrs, - struct attribute_group ***intel_vgpu_type_groups) +static bool intel_get_gvt_attrs(struct attribute_group ***intel_vgpu_type_groups) { - *type_attrs = gvt_type_attrs; *intel_vgpu_type_groups = gvt_vgpu_type_groups; return true; } @@ -142,7 +155,10 @@ static bool intel_gvt_init_vgpu_type_groups(struct intel_gvt *gvt) goto unwind; group->name = type->name; - group->attrs = gvt_type_attrs; + if (type->aggregation) + group->attrs = gvt_type_aggregate_attrs; + else + group->attrs = gvt_type_attrs; gvt_vgpu_type_groups[i] = group; } diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h index 9a9671522774..8848587f638f 100644 --- a/drivers/gpu/drm/i915/gvt/gvt.h +++ b/drivers/gpu/drm/i915/gvt/gvt.h @@ -241,6 +241,9 @@ struct intel_vgpu { struct intel_gvt_gm { unsigned long vgpu_allocated_low_gm_size; unsigned long vgpu_allocated_high_gm_size; + unsigned long low_avail; + unsigned long high_avail; + unsigned long fence_avail; }; struct intel_gvt_fence { @@ -292,13 +295,14 @@ struct intel_gvt_firmware { #define NR_MAX_INTEL_VGPU_TYPES 20 struct intel_vgpu_type { - char name[16]; + char name[32]; unsigned int avail_instance; unsigned int low_gm_size; unsigned int high_gm_size; unsigned int fence; unsigned int weight; enum intel_vgpu_edid resolution; + bool aggregation; /* fine-grained resource type with aggregation capability */ }; struct intel_gvt { @@ -484,7 +488,8 @@ void intel_gvt_clean_vgpu_types(struct intel_gvt *gvt); struct intel_vgpu *intel_gvt_create_idle_vgpu(struct intel_gvt *gvt); void intel_gvt_destroy_idle_vgpu(struct intel_vgpu *vgpu); struct intel_vgpu *intel_gvt_create_vgpu(struct intel_gvt *gvt, - struct intel_vgpu_type *type); + struct intel_vgpu_type *type, + unsigned int); void intel_gvt_destroy_vgpu(struct 
intel_vgpu *vgpu); void intel_gvt_reset_vgpu_locked(struct intel_vgpu *vgpu, bool dmlr, unsigned int engine_mask); @@ -562,15 +567,14 @@ struct intel_gvt_ops { int (*emulate_mmio_write)(struct intel_vgpu *, u64, void *, unsigned int); struct intel_vgpu *(*vgpu_create)(struct intel_gvt *, - struct intel_vgpu_type *); + struct intel_vgpu_type *, unsigned int); void (*vgpu_destroy)(struct intel_vgpu *); void (*vgpu_reset)(struct intel_vgpu *); void (*vgpu_activate)(struct intel_vgpu *); void (*vgpu_deactivate)(struct intel_vgpu *); struct intel_vgpu_type *(*gvt_find_vgpu_type)(struct intel_gvt *gvt, const char *name); - bool (*get_gvt_attrs)(struct attribute ***type_attrs, - struct attribute_group ***intel_vgpu_type_groups); + bool (*get_gvt_attrs)(struct attribute_group ***intel_vgpu_type_groups); int (*vgpu_query_plane)(struct intel_vgpu *vgpu, void *); int (*vgpu_get_dmabuf)(struct intel_vgpu *vgpu, unsigned int); int (*write_protect_handler)(struct intel_vgpu *, u64, void *, diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 4d2f53ae9f0f..4f3a57b510fd 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -498,7 +498,9 @@ static void kvmgt_put_vfio_device(void *vgpu) vfio_device_put(((struct intel_vgpu *)vgpu)->vdev.vfio_device); } -static int intel_vgpu_create(struct kobject *kobj, struct mdev_device *mdev) +static int intel_vgpu_create_internal(struct kobject *kobj, + struct mdev_device *mdev, + unsigned int instances) { struct intel_vgpu *vgpu = NULL; struct intel_vgpu_type *type; @@ -517,7 +519,12 @@ static int intel_vgpu_create(struct kobject *kobj, struct mdev_device *mdev) goto out; } - vgpu = intel_gvt_ops->vgpu_create(gvt, type); + if (instances > 1 && !type->aggregation) { + ret = -EINVAL; + goto out; + } + + vgpu = intel_gvt_ops->vgpu_create(gvt, type, instances); if (IS_ERR_OR_NULL(vgpu)) { ret = vgpu == NULL ? 
-EFAULT : PTR_ERR(vgpu); gvt_err("failed to create intel vgpu: %d\n", ret); @@ -537,6 +544,20 @@ static int intel_vgpu_create(struct kobject *kobj, struct mdev_device *mdev) return ret; } +static int intel_vgpu_create(struct kobject *kobj, struct mdev_device *mdev) +{ + return intel_vgpu_create_internal(kobj, mdev, 1); +} + +static int intel_vgpu_create_with_instances(struct kobject *kobj, + struct mdev_device *mdev, + unsigned int instances) +{ + return intel_vgpu_create_internal(kobj, mdev, instances); +} + + + static int intel_vgpu_remove(struct mdev_device *mdev) { struct intel_vgpu *vgpu = mdev_get_drvdata(mdev); @@ -1430,6 +1451,7 @@ static const struct attribute_group *intel_vgpu_groups[] = { static struct mdev_parent_ops intel_vgpu_ops = { .mdev_attr_groups = intel_vgpu_groups, .create = intel_vgpu_create, + .create_with_instances = intel_vgpu_create_with_instances, .remove = intel_vgpu_remove, .open = intel_vgpu_open, @@ -1443,12 +1465,10 @@ static struct mdev_parent_ops intel_vgpu_ops = { static int kvmgt_host_init(struct device *dev, void *gvt, const void *ops) { - struct attribute **kvm_type_attrs; struct attribute_group **kvm_vgpu_type_groups; intel_gvt_ops = ops; - if (!intel_gvt_ops->get_gvt_attrs(&kvm_type_attrs, - &kvm_vgpu_type_groups)) + if (!intel_gvt_ops->get_gvt_attrs(&kvm_vgpu_type_groups)) return -EFAULT; intel_vgpu_ops.supported_type_groups = kvm_vgpu_type_groups; diff --git a/drivers/gpu/drm/i915/gvt/vgpu.c b/drivers/gpu/drm/i915/gvt/vgpu.c index f6fa916517c3..5304b983a84b 100644 --- a/drivers/gpu/drm/i915/gvt/vgpu.c +++ b/drivers/gpu/drm/i915/gvt/vgpu.c @@ -125,11 +125,14 @@ int intel_gvt_init_vgpu_types(struct intel_gvt *gvt) high_avail = gvt_hidden_sz(gvt) - HOST_HIGH_GM_SIZE; num_types = sizeof(vgpu_types) / sizeof(vgpu_types[0]); - gvt->types = kcalloc(num_types, sizeof(struct intel_vgpu_type), + gvt->types = kcalloc(num_types + 1, sizeof(struct intel_vgpu_type), GFP_KERNEL); if (!gvt->types) return -ENOMEM; + gvt->gm.low_avail = 
low_avail; + gvt->gm.high_avail = high_avail; + gvt->gm.fence_avail = 32 - HOST_FENCE; min_low = MB_TO_BYTES(32); for (i = 0; i < num_types; ++i) { if (low_avail / vgpu_types[i].low_mm == 0) @@ -149,11 +152,11 @@ int intel_gvt_init_vgpu_types(struct intel_gvt *gvt) high_avail / vgpu_types[i].high_mm); if (IS_GEN8(gvt->dev_priv)) - sprintf(gvt->types[i].name, "GVTg_V4_%s", - vgpu_types[i].name); + snprintf(gvt->types[i].name, sizeof(gvt->types[i].name), + "GVTg_V4_%s", vgpu_types[i].name); else if (IS_GEN9(gvt->dev_priv)) - sprintf(gvt->types[i].name, "GVTg_V5_%s", - vgpu_types[i].name); + snprintf(gvt->types[i].name, sizeof(gvt->types[i].name), + "GVTg_V5_%s", vgpu_types[i].name); gvt_dbg_core("type[%d]: %s avail %u low %u high %u fence %u weight %u res %s\n", i, gvt->types[i].name, @@ -164,7 +167,32 @@ int intel_gvt_init_vgpu_types(struct intel_gvt *gvt) vgpu_edid_str(gvt->types[i].resolution)); } - gvt->num_types = i; + /* add aggregation type */ + gvt->types[i].low_gm_size = MB_TO_BYTES(32); + gvt->types[i].high_gm_size = MB_TO_BYTES(192); + gvt->types[i].fence = 2; + gvt->types[i].weight = 16; + gvt->types[i].resolution = GVT_EDID_1024_768; + gvt->types[i].avail_instance = min(low_avail / gvt->types[i].low_gm_size, + high_avail / gvt->types[i].high_gm_size); + gvt->types[i].avail_instance = min(gvt->types[i].avail_instance, + (32 - HOST_FENCE) / gvt->types[i].fence); + if (IS_GEN8(gvt->dev_priv)) + strcat(gvt->types[i].name, "GVTg_V4_aggregate"); + else if (IS_GEN9(gvt->dev_priv)) + strcat(gvt->types[i].name, "GVTg_V5_aggregate"); + + gvt_dbg_core("type[%d]: %s avail %u low %u high %u fence %u weight %u res %s\n", + i, gvt->types[i].name, + gvt->types[i].avail_instance, + gvt->types[i].low_gm_size, + gvt->types[i].high_gm_size, gvt->types[i].fence, + gvt->types[i].weight, + vgpu_edid_str(gvt->types[i].resolution)); + + gvt->types[i].aggregation = true; + gvt->num_types = ++i; + return 0; } @@ -444,7 +472,8 @@ static struct intel_vgpu 
*__intel_gvt_create_vgpu(struct intel_gvt *gvt, * pointer to intel_vgpu, error pointer if failed. */ struct intel_vgpu *intel_gvt_create_vgpu(struct intel_gvt *gvt, - struct intel_vgpu_type *type) + struct intel_vgpu_type *type, + unsigned int instances) { struct intel_vgpu_creation_params param; struct intel_vgpu *vgpu; @@ -461,6 +490,19 @@ struct intel_vgpu *intel_gvt_create_vgpu(struct intel_gvt *gvt, param.low_gm_sz = BYTES_TO_MB(param.low_gm_sz); param.high_gm_sz = BYTES_TO_MB(param.high_gm_sz); + if (type->aggregation && instances > 1) { + if (instances > type->avail_instance) + return ERR_PTR(-EINVAL); + param.low_gm_sz = min(param.low_gm_sz * instances, + (u64)BYTES_TO_MB(gvt->gm.low_avail)); + param.high_gm_sz = min(param.high_gm_sz * instances, + (u64)BYTES_TO_MB(gvt->gm.high_avail)); + param.fence_sz = min(param.fence_sz * instances, + (u64)gvt->gm.fence_avail); + gvt_dbg_core("instances %d, low %lluMB, high %lluMB, fence %llu\n", + instances, param.low_gm_sz, param.high_gm_sz, param.fence_sz); + } + mutex_lock(&gvt->lock); vgpu = __intel_gvt_create_vgpu(gvt, ¶m); if (!IS_ERR(vgpu)) -- 2.18.0

Update mdev doc on new aggregration attribute and instances attribute for mdev. Cc: Kirti Wankhede <kwankhede@nvidia.com> Cc: Alex Williamson <alex.williamson@redhat.com> Cc: Kevin Tian <kevin.tian@intel.com> Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com> --- Documentation/vfio-mediated-device.txt | 39 ++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/Documentation/vfio-mediated-device.txt b/Documentation/vfio-mediated-device.txt index c3f69bcaf96e..9ec9495dcbe7 100644 --- a/Documentation/vfio-mediated-device.txt +++ b/Documentation/vfio-mediated-device.txt @@ -211,12 +211,20 @@ Directories and files under the sysfs for Each Physical Device | | |--- description | | |--- [devices] | |--- [<type-id>] - | |--- create - | |--- name - | |--- available_instances - | |--- device_api - | |--- description - | |--- [devices] + | | |--- create + | | |--- name + | | |--- available_instances + | | |--- device_api + | | |--- description + | | |--- [devices] + | |--- [<type-id>] + | | |--- create + | | |--- name + | | |--- available_instances + | | |--- device_api + | | |--- description + | | |--- <aggregation> + | | |--- [devices] * [mdev_supported_types] @@ -260,6 +268,19 @@ Directories and files under the sysfs for Each Physical Device This attribute should show brief features/description of the type. This is optional attribute. +* <aggregation> + + The description is to show feature for one instance of the type. <aggregation> + is an optional attributes to show that [<type-id>]'s instances can be + aggregated to be assigned for one mdev device. Set number of instances by + appending "instances=N" parameter for create. Instances number can't exceed + available_instances number. Without "instances=N" parameter will be default + one instance to create. 
+ +Example:: + + # echo "<uuid>,instances=N" > create + Directories and Files Under the sysfs for Each mdev Device ---------------------------------------------------------- @@ -268,6 +289,7 @@ Directories and Files Under the sysfs for Each mdev Device |- [parent phy device] |--- [$MDEV_UUID] |--- remove + |--- instances |--- mdev_type {link to its type} |--- vendor-specific-attributes [optional] @@ -281,6 +303,11 @@ Example:: # echo 1 > /sys/bus/mdev/devices/$mdev_UUID/remove +* instances + +For aggregation type show number of instances assigned for this mdev. For normal +type or default will just show one instance. + Mediated device Hot plug ------------------------ -- 2.18.0

On Fri, 20 Jul 2018 10:19:28 +0800 Zhenyu Wang <zhenyuw@linux.intel.com> wrote:
Update mdev doc on new aggregation attribute and instances attribute for mdev.
Cc: Kirti Wankhede <kwankhede@nvidia.com> Cc: Alex Williamson <alex.williamson@redhat.com> Cc: Kevin Tian <kevin.tian@intel.com> Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com> --- Documentation/vfio-mediated-device.txt | 39 ++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 6 deletions(-)
diff --git a/Documentation/vfio-mediated-device.txt b/Documentation/vfio-mediated-device.txt index c3f69bcaf96e..9ec9495dcbe7 100644 --- a/Documentation/vfio-mediated-device.txt +++ b/Documentation/vfio-mediated-device.txt @@ -211,12 +211,20 @@ Directories and files under the sysfs for Each Physical Device | | |--- description | | |--- [devices] | |--- [<type-id>] - | |--- create - | |--- name - | |--- available_instances - | |--- device_api - | |--- description - | |--- [devices] + | | |--- create + | | |--- name + | | |--- available_instances + | | |--- device_api + | | |--- description + | | |--- [devices] + | |--- [<type-id>] + | | |--- create + | | |--- name + | | |--- available_instances + | | |--- device_api + | | |--- description + | | |--- <aggregation> + | | |--- [devices]
* [mdev_supported_types]
@@ -260,6 +268,19 @@ Directories and files under the sysfs for Each Physical Device This attribute should show brief features/description of the type. This is optional attribute.
+* <aggregation> + + The description is to show feature for one instance of the type. <aggregation>
You are talking about "one instance" here. Can this be different for the same type with different physical devices?
+ is an optional attributes to show that [<type-id>]'s instances can be + aggregated to be assigned for one mdev device. Set number of instances by + appending "instances=N" parameter for create. Instances number can't exceed + available_instances number. Without "instances=N" parameter will be default + one instance to create.
Could there be a case where available_instances is n, but aggregation is only supported for a value m < n? If yes, should m be discoverable via the "aggregation" attribute?
+ +Example:: + + # echo "<uuid>,instances=N" > create + Directories and Files Under the sysfs for Each mdev Device ----------------------------------------------------------
@@ -268,6 +289,7 @@ Directories and Files Under the sysfs for Each mdev Device |- [parent phy device] |--- [$MDEV_UUID] |--- remove + |--- instances |--- mdev_type {link to its type} |--- vendor-specific-attributes [optional]
@@ -281,6 +303,11 @@ Example::
# echo 1 > /sys/bus/mdev/devices/$mdev_UUID/remove
+* instances + +For aggregation type show number of instances assigned for this mdev. For normal +type or default will just show one instance. + Mediated device Hot plug ------------------------

On 2018.07.26 17:46:40 +0200, Cornelia Huck wrote:
On Fri, 20 Jul 2018 10:19:28 +0800 Zhenyu Wang <zhenyuw@linux.intel.com> wrote:
Update mdev doc on new aggregation attribute and instances attribute for mdev.
Cc: Kirti Wankhede <kwankhede@nvidia.com> Cc: Alex Williamson <alex.williamson@redhat.com> Cc: Kevin Tian <kevin.tian@intel.com> Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com> --- Documentation/vfio-mediated-device.txt | 39 ++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 6 deletions(-)
diff --git a/Documentation/vfio-mediated-device.txt b/Documentation/vfio-mediated-device.txt index c3f69bcaf96e..9ec9495dcbe7 100644 --- a/Documentation/vfio-mediated-device.txt +++ b/Documentation/vfio-mediated-device.txt @@ -211,12 +211,20 @@ Directories and files under the sysfs for Each Physical Device | | |--- description | | |--- [devices] | |--- [<type-id>] - | |--- create - | |--- name - | |--- available_instances - | |--- device_api - | |--- description - | |--- [devices] + | | |--- create + | | |--- name + | | |--- available_instances + | | |--- device_api + | | |--- description + | | |--- [devices] + | |--- [<type-id>] + | | |--- create + | | |--- name + | | |--- available_instances + | | |--- device_api + | | |--- description + | | |--- <aggregation> + | | |--- [devices]
* [mdev_supported_types]
@@ -260,6 +268,19 @@ Directories and files under the sysfs for Each Physical Device This attribute should show brief features/description of the type. This is optional attribute.
+* <aggregation> + + The description is to show feature for one instance of the type. <aggregation>
You are talking about "one instance" here. Can this be different for the same type with different physical devices?
I would expect for normal mdev types, driver might expose like x2, x4, x8 types which split hw resource equally. But for type with aggregation feature, it can set user wanted number of instances. Sorry maybe my use of word was not clear, how about "one example of type"?
+ is an optional attribute to show that [<type-id>]'s instances can be + aggregated to be assigned to one mdev device. Set the number of instances by + appending the "instances=N" parameter to create. The instances number can't exceed + the available_instances number. Without the "instances=N" parameter, one + instance is created by default.
Could there be a case where available_instances is n, but aggregation is only supported for a value m < n? If yes, should m be discoverable via the "aggregation" attribute?
I don't think there could be a case for that. As for aggregation type, it represents minimal resource allocated for a singleton, I can't see any reason for possible m < n. Thanks.
+ +Example:: + + # echo "<uuid>,instances=N" > create + Directories and Files Under the sysfs for Each mdev Device ----------------------------------------------------------
@@ -268,6 +289,7 @@ Directories and Files Under the sysfs for Each mdev Device |- [parent phy device] |--- [$MDEV_UUID] |--- remove + |--- instances |--- mdev_type {link to its type} |--- vendor-specific-attributes [optional]
@@ -281,6 +303,11 @@ Example::
# echo 1 > /sys/bus/mdev/devices/$mdev_UUID/remove
+* instances + +For aggregation type show number of instances assigned for this mdev. For normal +type or default will just show one instance. + Mediated device Hot plug ------------------------
-- Open Source Technology Center, Intel ltd. $gpg --keyserver wwwkeys.pgp.net --recv-keys 4D781827

On Fri, 27 Jul 2018 10:16:58 +0800 Zhenyu Wang <zhenyuw@linux.intel.com> wrote:
On 2018.07.26 17:46:40 +0200, Cornelia Huck wrote:
On Fri, 20 Jul 2018 10:19:28 +0800 Zhenyu Wang <zhenyuw@linux.intel.com> wrote:
Update mdev doc on new aggregation attribute and instances attribute for mdev.
Cc: Kirti Wankhede <kwankhede@nvidia.com> Cc: Alex Williamson <alex.williamson@redhat.com> Cc: Kevin Tian <kevin.tian@intel.com> Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com> --- Documentation/vfio-mediated-device.txt | 39 ++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 6 deletions(-)
diff --git a/Documentation/vfio-mediated-device.txt b/Documentation/vfio-mediated-device.txt index c3f69bcaf96e..9ec9495dcbe7 100644 --- a/Documentation/vfio-mediated-device.txt +++ b/Documentation/vfio-mediated-device.txt @@ -211,12 +211,20 @@ Directories and files under the sysfs for Each Physical Device | | |--- description | | |--- [devices] | |--- [<type-id>] - | |--- create - | |--- name - | |--- available_instances - | |--- device_api - | |--- description - | |--- [devices] + | | |--- create + | | |--- name + | | |--- available_instances + | | |--- device_api + | | |--- description + | | |--- [devices] + | |--- [<type-id>] + | | |--- create + | | |--- name + | | |--- available_instances + | | |--- device_api + | | |--- description + | | |--- <aggregation> + | | |--- [devices]
* [mdev_supported_types]
@@ -260,6 +268,19 @@ Directories and files under the sysfs for Each Physical Device This attribute should show brief features/description of the type. This is optional attribute.
+* <aggregation> + + The description is to show feature for one instance of the type. <aggregation>
You are talking about "one instance" here. Can this be different for the same type with different physical devices?
I would expect for normal mdev types, driver might expose like x2, x4, x8 types which split hw resource equally. But for type with aggregation feature, it can set user wanted number of instances. Sorry maybe my use of word was not clear, how about "one example of type"?
+ is an optional attribute to show that [<type-id>]'s instances can be + aggregated to be assigned to one mdev device. Set the number of instances by + appending the "instances=N" parameter to create. The instances number can't exceed + the available_instances number. Without the "instances=N" parameter, one + instance is created by default.
Could there be a case where available_instances is n, but aggregation is only supported for a value m < n? If yes, should m be discoverable via the "aggregation" attribute?
I don't think there could be a case for that. As for aggregation type, it represents minimal resource allocated for a singleton, I can't see any reason for possible m < n.
Let's think about the interface beyond the immediate needs. It seems perfectly reasonable to me to think that a vendor driver might have a large number of resources available, the ability to aggregate resources beyond what's practical to enumerate with discrete types, yet have an upper bound of how many resources can be aggregated for a single instance. Thanks, Alex

On Fri, 27 Jul 2018 10:16:58 +0800 Zhenyu Wang <zhenyuw@linux.intel.com> wrote:
On 2018.07.26 17:46:40 +0200, Cornelia Huck wrote:
On Fri, 20 Jul 2018 10:19:28 +0800 Zhenyu Wang <zhenyuw@linux.intel.com> wrote:
Update mdev doc on new aggregation attribute and instances attribute for mdev.
Cc: Kirti Wankhede <kwankhede@nvidia.com> Cc: Alex Williamson <alex.williamson@redhat.com> Cc: Kevin Tian <kevin.tian@intel.com> Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com> --- Documentation/vfio-mediated-device.txt | 39 ++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 6 deletions(-)
diff --git a/Documentation/vfio-mediated-device.txt b/Documentation/vfio-mediated-device.txt index c3f69bcaf96e..9ec9495dcbe7 100644 --- a/Documentation/vfio-mediated-device.txt +++ b/Documentation/vfio-mediated-device.txt @@ -211,12 +211,20 @@ Directories and files under the sysfs for Each Physical Device | | |--- description | | |--- [devices] | |--- [<type-id>] - | |--- create - | |--- name - | |--- available_instances - | |--- device_api - | |--- description - | |--- [devices] + | | |--- create + | | |--- name + | | |--- available_instances + | | |--- device_api + | | |--- description + | | |--- [devices] + | |--- [<type-id>] + | | |--- create + | | |--- name + | | |--- available_instances + | | |--- device_api + | | |--- description + | | |--- <aggregation> + | | |--- [devices]
* [mdev_supported_types]
@@ -260,6 +268,19 @@ Directories and files under the sysfs for Each Physical Device This attribute should show brief features/description of the type. This is optional attribute.
+* <aggregation> + + The description is to show feature for one instance of the type. <aggregation>
You are talking about "one instance" here. Can this be different for the same type with different physical devices?
I would expect for normal mdev types, driver might expose like x2, x4, x8 types which split hw resource equally. But for type with aggregation feature, it can set user wanted number of instances. Sorry maybe my use of word was not clear, how about "one example of type"?
Maybe my question was confusing as well... <aggregation> is an attribute that is exposed for a particular type under a particular physical device. - If <aggregation> is always the same for that particular type, regardless of which physical device we're dealing with, let's just drop the "one instance" sentence. - If it instead depends on what physical device we're handling, I'd write something like "The contents of this attribute depend both on the type and on the particular instance."

On Fri, 20 Jul 2018 10:19:24 +0800 Zhenyu Wang <zhenyuw@linux.intel.com> wrote:
Current mdev device create interface depends on fixed mdev type, which get uuid from user to create instance of mdev device. If user wants to use customized number of resource for mdev device, then only can create new mdev type for that which may not be flexible. This requirement comes not only from to be able to allocate flexible resources for KVMGT, but also from Intel scalable IO virtualization which would use vfio/mdev to be able to allocate arbitrary resources on mdev instance. More info on [1] [2] [3].
To allow creating user-defined resources for mdev, it tries to extend the mdev create interface by adding a new "instances=xxx" parameter following the uuid; for a target mdev type, if aggregation is supported, it can create a new mdev device which contains resources combined by the number of instances, e.g
echo "<uuid>,instances=10" > create
VM manager e.g libvirt can check mdev type with "aggregation" attribute which can support this setting. If no "aggregation" attribute found for mdev type, previous behavior is still kept for one instance allocation. And new sysfs attribute "instances" is created for each mdev device to show allocated number.
This tries to create a new KVMGT type with minimal vGPU resources which can be combined with the "instances=x" setting to allocate the resources the user wants.
"instances" makes me think this is arg helps to create multiple mdev instances rather than consuming multiple instances for a single mdev. You're already exposing the "aggregation" attribute, so doesn't "aggregate" perhaps make more sense as the create option? We're asking the driver to aggregate $NUM instances into a single mdev. The mdev attribute should then perhaps also be "aggregated_instances". The next user question for the interface might be what aspect of the device gets multiplied by this aggregation? In i915 I see you're multiplying the memory sizes by the instance, but clearly the resolution doesn't change. I assume this is sort of like mdev types themselves, ie. some degree of what a type means is buried in the implementation and some degree of what some number of those types aggregated together means is impossible to describe generically. We're also going to need to add aggregation to the checklist for device compatibility for migration, for example 1) is it the same mdev_type, 1a) are the aggregation counts the same (new), 2) is the host driver compatible (TBD). The count handling in create_store(), particularly MDEV_CREATE_OPT_LEN looks a little suspicious. I think we should just be validating that the string before the ',' or the entire string if there is no comma is UUID length. Pass only that to uuid_le_to_bin(). We can then strncmp as you have for "instances=" (or "aggregate=") but then let's be sure to end the string we pass to kstrtouint(), ie. assume there could be further args. Documentation/ABI/testing/sysfs-bus-vfio-mdev also needs to be updated with this series. I'm curious what libvirt folks and Kirti think of this, it looks like it has a nice degree of backwards compatibility, both in the sysfs interface and the vendor driver interface. Thanks, Alex

On Tue, Jul 24, 2018 at 11:44:40AM -0600, Alex Williamson wrote:
On Fri, 20 Jul 2018 10:19:24 +0800 Zhenyu Wang <zhenyuw@linux.intel.com> wrote:
Current mdev device create interface depends on fixed mdev type, which get uuid from user to create instance of mdev device. If user wants to use customized number of resource for mdev device, then only can create new mdev type for that which may not be flexible. This requirement comes not only from to be able to allocate flexible resources for KVMGT, but also from Intel scalable IO virtualization which would use vfio/mdev to be able to allocate arbitrary resources on mdev instance. More info on [1] [2] [3].
To allow to create user defined resources for mdev, it trys to extend mdev create interface by adding new "instances=xxx" parameter following uuid, for target mdev type if aggregation is supported, it can create new mdev device which contains resources combined by number of instances, e.g
echo "<uuid>,instances=10" > create
VM manager e.g libvirt can check mdev type with "aggregation" attribute which can support this setting. If no "aggregation" attribute found for mdev type, previous behavior is still kept for one instance allocation. And new sysfs attribute "instances" is created for each mdev device to show allocated number.
This trys to create new KVMGT type with minimal vGPU resources which can be combined with "instances=x" setting to allocate for user wanted resources.
"instances" makes me think this is arg helps to create multiple mdev instances rather than consuming multiple instances for a single mdev. You're already exposing the "aggregation" attribute, so doesn't "aggregate" perhaps make more sense as the create option? We're asking the driver to aggregate $NUM instances into a single mdev. The mdev attribute should then perhaps also be "aggregated_instances".
The next user question for the interface might be what aspect of the device gets multiplied by this aggregation? In i915 I see you're multiplying the memory sizes by the instance, but clearly the resolution doesn't change. I assume this is sort of like mdev types themselves, ie. some degree of what a type means is buried in the implementation and some degree of what some number of those types aggregated together means is impossible to describe generically.
I don't seem to clearly see the benefit here, so I have to ask, how is this better and/or different from allowing a heterogeneous setup if one needs a more performant instance in terms of more resources? Because to me, once you're able to aggregate instances, I would assume that a simple "echo `uuid`" with a different type should succeed as well and provide me (from user's perspective) with the same results. Could you please clarify this to me, as well as what resources/parameters are going to be impacted by aggregation? ...
I'm curious what libvirt folks and Kirti think of this, it looks like it has a nice degree of backwards compatibility, both in the sysfs interface and the vendor driver interface. Thanks,
Since libvirt doesn't have an API to create mdevs yet, this doesn't pose an issue for us at the moment. I see this adds new optional sysfs attributes which we could expose within our device capabilities XML, provided it doesn't use a free form text, like the description attribute does. Erik

On Thu, 26 Jul 2018 15:50:28 +0200 Erik Skultety <eskultet@redhat.com> wrote:
On Tue, Jul 24, 2018 at 11:44:40AM -0600, Alex Williamson wrote:
On Fri, 20 Jul 2018 10:19:24 +0800 Zhenyu Wang <zhenyuw@linux.intel.com> wrote:
Current mdev device create interface depends on fixed mdev type, which get uuid from user to create instance of mdev device. If user wants to use customized number of resource for mdev device, then only can create new mdev type for that which may not be flexible. This requirement comes not only from to be able to allocate flexible resources for KVMGT, but also from Intel scalable IO virtualization which would use vfio/mdev to be able to allocate arbitrary resources on mdev instance. More info on [1] [2] [3].
To allow to create user defined resources for mdev, it trys to extend mdev create interface by adding new "instances=xxx" parameter following uuid, for target mdev type if aggregation is supported, it can create new mdev device which contains resources combined by number of instances, e.g
echo "<uuid>,instances=10" > create
VM manager e.g libvirt can check mdev type with "aggregation" attribute which can support this setting. If no "aggregation" attribute found for mdev type, previous behavior is still kept for one instance allocation. And new sysfs attribute "instances" is created for each mdev device to show allocated number.
This trys to create new KVMGT type with minimal vGPU resources which can be combined with "instances=x" setting to allocate for user wanted resources.
"instances" makes me think this is arg helps to create multiple mdev instances rather than consuming multiple instances for a single mdev. You're already exposing the "aggregation" attribute, so doesn't "aggregate" perhaps make more sense as the create option? We're asking the driver to aggregate $NUM instances into a single mdev. The mdev attribute should then perhaps also be "aggregated_instances".
The next user question for the interface might be what aspect of the device gets multiplied by this aggregation? In i915 I see you're multiplying the memory sizes by the instance, but clearly the resolution doesn't change. I assume this is sort of like mdev types themselves, ie. some degree of what a type means is buried in the implementation and some degree of what some number of those types aggregated together means is impossible to describe generically.
I don't seem to clearly see the benefit here, so I have to ask, how is this better and/or different from allowing a heterogeneous setup if one needs a more performant instance in terms of more resources? Because to me, once you're able to aggregate instances, I would assume that a simple "echo `uuid`" with a different type should succeed as well and provide me (from user's perspective) with the same results. Could you please clarify this to me, as well as what resources/parameters are going to be impacted by aggregation?
I think you're suggesting that we could simply define new mdev types to account for these higher aggregate instances, for example we can define discrete types that are 2x, 3x, 4x, etc. the resource count of a single instance. What I think we're trying to address with this proposal is what happens when the resources available are exceptionally large and they can be combined in arbitrary ways. For example if a parent device can expose 10,000 resources and the granularity with which we can create and mdev instance is 1, is it practical to create 10,000 mdev types or does it make more sense to expose a granularity and method of aggregation. Using graphics here perhaps falls a little short of the intention of the interface because the possible types are easily enumerable and it would be entirely practical to create discrete types for each. vGPUs also have a lot of variables, so defining which attribute of the device is multiplied by the number of instances is a little more fuzzy. Thanks, Alex

On Thu, Jul 26, 2018 at 08:29:45AM -0600, Alex Williamson wrote:
On Thu, 26 Jul 2018 15:50:28 +0200 Erik Skultety <eskultet@redhat.com> wrote:
On Tue, Jul 24, 2018 at 11:44:40AM -0600, Alex Williamson wrote:
On Fri, 20 Jul 2018 10:19:24 +0800 Zhenyu Wang <zhenyuw@linux.intel.com> wrote:
Current mdev device create interface depends on fixed mdev type, which get uuid from user to create instance of mdev device. If user wants to use customized number of resource for mdev device, then only can create new mdev type for that which may not be flexible. This requirement comes not only from to be able to allocate flexible resources for KVMGT, but also from Intel scalable IO virtualization which would use vfio/mdev to be able to allocate arbitrary resources on mdev instance. More info on [1] [2] [3].
To allow to create user defined resources for mdev, it trys to extend mdev create interface by adding new "instances=xxx" parameter following uuid, for target mdev type if aggregation is supported, it can create new mdev device which contains resources combined by number of instances, e.g
echo "<uuid>,instances=10" > create
VM manager e.g libvirt can check mdev type with "aggregation" attribute which can support this setting. If no "aggregation" attribute found for mdev type, previous behavior is still kept for one instance allocation. And new sysfs attribute "instances" is created for each mdev device to show allocated number.
This trys to create new KVMGT type with minimal vGPU resources which can be combined with "instances=x" setting to allocate for user wanted resources.
"instances" makes me think this is arg helps to create multiple mdev instances rather than consuming multiple instances for a single mdev. You're already exposing the "aggregation" attribute, so doesn't "aggregate" perhaps make more sense as the create option? We're asking the driver to aggregate $NUM instances into a single mdev. The mdev attribute should then perhaps also be "aggregated_instances".
The next user question for the interface might be what aspect of the device gets multiplied by this aggregation? In i915 I see you're multiplying the memory sizes by the instance, but clearly the resolution doesn't change. I assume this is sort of like mdev types themselves, ie. some degree of what a type means is buried in the implementation and some degree of what some number of those types aggregated together means is impossible to describe generically.
I don't seem to clearly see the benefit here, so I have to ask, how is this better and/or different from allowing a heterogeneous setup if one needs a more performant instance in terms of more resources? Because to me, once you're able to aggregate instances, I would assume that a simple "echo `uuid`" with a different type should succeed as well and provide me (from user's perspective) with the same results. Could you please clarify this to me, as well as what resources/parameters are going to be impacted by aggregation?
I think you're suggesting that we could simply define new mdev types to account for these higher aggregate instances, for example we can define discrete types that are 2x, 3x, 4x, etc. the resource count of a single instance. What I think we're trying to address with this proposal is what happens when the resources available are exceptionally large and they can be combined in arbitrary ways. For example if a parent device can expose 10,000 resources and the granularity with which we can create an mdev instance is 1, is it practical to create 10,000 mdev types or does it make more sense to expose a granularity and method of aggregation.
Okay, I got a much better picture now, thanks for the clarification. The granularity you mentioned definitely does give users more power and control (in terms of provisioning) over the devices. Erik
Using graphics here perhaps falls a little short of the intention of the interface because the possible types are easily enumerable and it would be entirely practical to create discrete types for each. vGPUs also have a lot of variables, so defining which attribute of the device is multiplied by the number of instances is a little more fuzzy. Thanks,
Alex

On Thu, 26 Jul 2018 15:50:28 +0200 Erik Skultety <eskultet@redhat.com> wrote:
On Tue, Jul 24, 2018 at 11:44:40AM -0600, Alex Williamson wrote:
On Fri, 20 Jul 2018 10:19:24 +0800 Zhenyu Wang <zhenyuw@linux.intel.com> wrote:
Current mdev device create interface depends on fixed mdev type, which get uuid from user to create instance of mdev device. If user wants to use customized number of resource for mdev device, then only can create new mdev type for that which may not be flexible. This requirement comes not only from to be able to allocate flexible resources for KVMGT, but also from Intel scalable IO virtualization which would use vfio/mdev to be able to allocate arbitrary resources on mdev instance. More info on [1] [2] [3].
To allow to create user defined resources for mdev, it trys to extend mdev create interface by adding new "instances=xxx" parameter following uuid, for target mdev type if aggregation is supported, it can create new mdev device which contains resources combined by number of instances, e.g
echo "<uuid>,instances=10" > create
VM manager e.g libvirt can check mdev type with "aggregation" attribute which can support this setting. If no "aggregation" attribute found for mdev type, previous behavior is still kept for one instance allocation. And new sysfs attribute "instances" is created for each mdev device to show allocated number.
This trys to create new KVMGT type with minimal vGPU resources which can be combined with "instances=x" setting to allocate for user wanted resources.
"instances" makes me think this is arg helps to create multiple mdev instances rather than consuming multiple instances for a single mdev. You're already exposing the "aggregation" attribute, so doesn't "aggregate" perhaps make more sense as the create option? We're asking the driver to aggregate $NUM instances into a single mdev. The mdev attribute should then perhaps also be "aggregated_instances".
I agree, that seems like a better naming scheme. (...)
I'm curious what libvirt folks and Kirti think of this, it looks like it has a nice degree of backwards compatibility, both in the sysfs interface and the vendor driver interface. Thanks,
Since libvirt doesn't have an API to create mdevs yet, this doesn't pose an issue for us at the moment. I see this adds new optional sysfs attributes which we could expose within our device capabilities XML, provided it doesn't use a free form text, like the description attribute does.
One thing I noticed is that we seem to have an optional (?) vendor-driver created "aggregation" attribute (which always prints "true" in the Intel driver). Would it be better or worse for libvirt if it contained some kind of upper boundary or so? Additionally, would it be easier if the "create" attribute always accepted ",instances=1" (calling the .create ops in that case?)

On Thu, Jul 26, 2018 at 05:30:07PM +0200, Cornelia Huck wrote:
On Thu, 26 Jul 2018 15:50:28 +0200 Erik Skultety <eskultet@redhat.com> wrote:
On Tue, Jul 24, 2018 at 11:44:40AM -0600, Alex Williamson wrote:
On Fri, 20 Jul 2018 10:19:24 +0800 Zhenyu Wang <zhenyuw@linux.intel.com> wrote:
Current mdev device create interface depends on fixed mdev type, which get uuid from user to create instance of mdev device. If user wants to use customized number of resource for mdev device, then only can create new mdev type for that which may not be flexible. This requirement comes not only from to be able to allocate flexible resources for KVMGT, but also from Intel scalable IO virtualization which would use vfio/mdev to be able to allocate arbitrary resources on mdev instance. More info on [1] [2] [3].
To allow to create user defined resources for mdev, it trys to extend mdev create interface by adding new "instances=xxx" parameter following uuid, for target mdev type if aggregation is supported, it can create new mdev device which contains resources combined by number of instances, e.g
echo "<uuid>,instances=10" > create
VM manager e.g libvirt can check mdev type with "aggregation" attribute which can support this setting. If no "aggregation" attribute found for mdev type, previous behavior is still kept for one instance allocation. And new sysfs attribute "instances" is created for each mdev device to show allocated number.
This trys to create new KVMGT type with minimal vGPU resources which can be combined with "instances=x" setting to allocate for user wanted resources.
"instances" makes me think this is arg helps to create multiple mdev instances rather than consuming multiple instances for a single mdev. You're already exposing the "aggregation" attribute, so doesn't "aggregate" perhaps make more sense as the create option? We're asking the driver to aggregate $NUM instances into a single mdev. The mdev attribute should then perhaps also be "aggregated_instances".
I agree, that seems like a better naming scheme.
(...)
I'm curious what libvirt folks and Kirti think of this, it looks like it has a nice degree of backwards compatibility, both in the sysfs interface and the vendor driver interface. Thanks,
Since libvirt doesn't have an API to create mdevs yet, this doesn't pose an issue for us at the moment. I see this adds new optional sysfs attributes which we could expose within our device capabilities XML, provided it doesn't use a free form text, like the description attribute does.
One thing I noticed is that we have seem to have an optional (?) vendor-driver created "aggregation" attribute (which always prints "true" in the Intel driver). Would it be better or worse for libvirt if it contained some kind of upper boundary or so? Additionally, would it
Can you be more specific? Although, I wouldn't argue against data that conveys some information, since I would treat the mere presence of the optional attribute as a supported feature that we can expose. Therefore, additional *structured* data which sets clear limits to a certain feature is only a plus that we can expose to the users/management layer.
be easier if the "create" attribute always accepted ",instances=1" (calling the .create ops in that case?)
Is ^this libvirt related question? If so, then by accepting such syntax you'll definitely save us a few lines of code ;). However, until we have a clear vision on how to tackle the mdev migration, I'd like to avoid proposing a libvirt "create" API until then. Erik

On Thu, 26 Jul 2018 17:43:45 +0200 Erik Skultety <eskultet@redhat.com> wrote:
On Thu, Jul 26, 2018 at 05:30:07PM +0200, Cornelia Huck wrote:
One thing I noticed is that we have seem to have an optional (?) vendor-driver created "aggregation" attribute (which always prints "true" in the Intel driver). Would it be better or worse for libvirt if it contained some kind of upper boundary or so? Additionally, would it
Can you be more specific? Although, I wouldn't argue against data that conveys some information, since I would treat the mere presence of the optional attribute as a supported feature that we can expose. Therefore, additional *structured* data which sets clear limits to a certain feature is only a plus that we can expose to the users/management layer.
My question is what would be easiest for libvirt: - "aggregation" attribute only present when driver supports aggregation (possibly containing max number of resources to be aggregated) - "aggregation" attribute always present; contains '1' if driver does not support aggregation and 'm' if driver can aggregate 'm' resources

On Thu, Jul 26, 2018 at 06:04:10PM +0200, Cornelia Huck wrote:
On Thu, 26 Jul 2018 17:43:45 +0200 Erik Skultety <eskultet@redhat.com> wrote:
On Thu, Jul 26, 2018 at 05:30:07PM +0200, Cornelia Huck wrote:
One thing I noticed is that we have seem to have an optional (?) vendor-driver created "aggregation" attribute (which always prints "true" in the Intel driver). Would it be better or worse for libvirt if it contained some kind of upper boundary or so? Additionally, would it
Can you be more specific? Although, I wouldn't argue against data that conveys some information, since I would treat the mere presence of the optional attribute as a supported feature that we can expose. Therefore, additional *structured* data which sets clear limits to a certain feature is only a plus that we can expose to the users/management layer.
My question is what would be easiest for libvirt:
- "aggregation" attribute only present when driver supports aggregation (possibly containing max number of resources to be aggregated) - "aggregation" attribute always present; contains '1' if driver does not support aggregation and 'm' if driver can aggregate 'm' resources
Both are fine from libvirt's POV, but IMHO the former makes a bit more sense and I'm in favour of that one, IOW the presence of an attribute denotes a new functionality which we can report, if it's missing, the feature is clearly lacking- I don't think we (libvirt) should be reporting the value 1 explicitly in the XML if the feature is missing, we can assume 1 as the default. Erik

On 2018.07.27 16:45:55 +0200, Erik Skultety wrote:
On Thu, Jul 26, 2018 at 06:04:10PM +0200, Cornelia Huck wrote:
On Thu, 26 Jul 2018 17:43:45 +0200 Erik Skultety <eskultet@redhat.com> wrote:
On Thu, Jul 26, 2018 at 05:30:07PM +0200, Cornelia Huck wrote:
One thing I noticed is that we have seem to have an optional (?) vendor-driver created "aggregation" attribute (which always prints "true" in the Intel driver). Would it be better or worse for libvirt if it contained some kind of upper boundary or so? Additionally, would it
Can you be more specific? Although, I wouldn't argue against data that conveys some information, since I would treat the mere presence of the optional attribute as a supported feature that we can expose. Therefore, additional *structured* data which sets clear limits to a certain feature is only a plus that we can expose to the users/management layer.
My question is what would be easiest for libvirt:
- "aggregation" attribute only present when driver supports aggregation (possibly containing max number of resources to be aggregated) - "aggregation" attribute always present; contains '1' if driver does not support aggregation and 'm' if driver can aggregate 'm' resources
Both are fine from libvirt's POV, but IMHO the former makes a bit more sense and I'm in favour of that one, IOW the presence of an attribute denotes a new functionality which we can report, if it's missing, the feature is clearly lacking- I don't think we (libvirt) should be reporting the value 1 explicitly in the XML if the feature is missing, we can assume 1 as the default.
Good I'll adhere to that, thanks! -- Open Source Technology Center, Intel ltd. $gpg --keyserver wwwkeys.pgp.net --recv-keys 4D781827

On Thu, 26 Jul 2018 17:30:07 +0200 Cornelia Huck <cohuck@redhat.com> wrote:
On Thu, 26 Jul 2018 15:50:28 +0200 Erik Skultety <eskultet@redhat.com> wrote:
On Tue, Jul 24, 2018 at 11:44:40AM -0600, Alex Williamson wrote:
On Fri, 20 Jul 2018 10:19:24 +0800 Zhenyu Wang <zhenyuw@linux.intel.com> wrote:
Current mdev device create interface depends on fixed mdev type, which gets a uuid from user to create instance of mdev device. If user wants to use customized number of resource for mdev device, then only can create new mdev type for that which may not be flexible. This requirement comes not only from the need to allocate flexible resources for KVMGT, but also from Intel scalable IO virtualization which would use vfio/mdev to be able to allocate arbitrary resources on mdev instance. More info on [1] [2] [3].
To allow creating user defined resources for mdev, it tries to extend mdev create interface by adding new "instances=xxx" parameter following uuid, for target mdev type if aggregation is supported, it can create new mdev device which contains resources combined by number of instances, e.g
echo "<uuid>,instances=10" > create
VM manager e.g libvirt can check mdev type with "aggregation" attribute which can support this setting. If no "aggregation" attribute found for mdev type, previous behavior is still kept for one instance allocation. And new sysfs attribute "instances" is created for each mdev device to show allocated number.
This tries to create new KVMGT type with minimal vGPU resources which can be combined with "instances=x" setting to allocate for user wanted resources.
"instances" makes me think this arg helps to create multiple mdev instances rather than consuming multiple instances for a single mdev. You're already exposing the "aggregation" attribute, so doesn't "aggregate" perhaps make more sense as the create option? We're asking the driver to aggregate $NUM instances into a single mdev. The mdev attribute should then perhaps also be "aggregated_instances".
I agree, that seems like a better naming scheme.
(...)
I'm curious what libvirt folks and Kirti think of this, it looks like it has a nice degree of backwards compatibility, both in the sysfs interface and the vendor driver interface. Thanks,
Since libvirt doesn't have an API to create mdevs yet, this doesn't pose an issue for us at the moment. I see this adds new optional sysfs attributes which we could expose within our device capabilities XML, provided it doesn't use a free form text, like the description attribute does.
One thing I noticed is that we seem to have an optional (?) vendor-driver created "aggregation" attribute (which always prints "true" in the Intel driver). Would it be better or worse for libvirt if it contained some kind of upper boundary or so?
Ultimately the aggregation value should be fully specified in Documentation/ABI, but doesn't the kernel generally use 'Y' or 'N' for boolean attributes in sysfs? Maybe mdev core can handle the attribute since it should exist any time the create_with_instances callback is provided.
Additionally, would it be easier if the "create" attribute always accepted ",instances=1" (calling the .create ops in that case?)
Unless I misunderstand the code or the question, I think this is exactly what happens: + if (instances > 1) + ret = parent->ops->create_with_instances(kobj, mdev, instances); + else + ret = parent->ops->create(kobj, mdev); And elsewhere: + if (instances > 1 && !parent->ops->create_with_instances) { + ret = -EINVAL; + goto mdev_fail; + } If the mdev core supplied the aggregation attribute, then the presence of the attribute could indicate to userspace whether it can always provide an instance (aggregate) count, even if limited to '1' when 'N', for that mdev type. Thanks, Alex

On Thu, 26 Jul 2018 09:51:26 -0600 Alex Williamson <alex.williamson@redhat.com> wrote:
On Thu, 26 Jul 2018 17:30:07 +0200 Cornelia Huck <cohuck@redhat.com> wrote:
On Thu, 26 Jul 2018 15:50:28 +0200 Erik Skultety <eskultet@redhat.com> wrote:
Since libvirt doesn't have an API to create mdevs yet, this doesn't pose an issue for us at the moment. I see this adds new optional sysfs attributes which we could expose within our device capabilities XML, provided it doesn't use a free form text, like the description attribute does.
One thing I noticed is that we seem to have an optional (?) vendor-driver created "aggregation" attribute (which always prints "true" in the Intel driver). Would it be better or worse for libvirt if it contained some kind of upper boundary or so?
Ultimately the aggregation value should be fully specified in Documentation/ABI, but doesn't the kernel generally use 'Y' or 'N' for boolean attributes in sysfs? Maybe mdev core can handle the attribute since it should exist any time the create_with_instances callback is provided.
It might make sense to print a number if the driver allows a number of resources to be aggregated which is not the same as available_instances (see my reply to the documentation patch).
Additionally, would it be easier if the "create" attribute always accepted ",instances=1" (calling the .create ops in that case?)
Unless I misunderstand the code or the question, I think this is exactly what happens:
Indeed, it does. Let me blame the weather ;)
If the mdev core supplied the aggregation attribute, then the presence of the attribute could indicate to userspace whether it can always provide an instance (aggregate) count, even if limited to '1' when 'N', for that mdev type. Thanks,
Alex

On 2018.07.24 11:44:40 -0600, Alex Williamson wrote:
On Fri, 20 Jul 2018 10:19:24 +0800 Zhenyu Wang <zhenyuw@linux.intel.com> wrote:
Current mdev device create interface depends on fixed mdev type, which gets a uuid from user to create instance of mdev device. If user wants to use customized number of resource for mdev device, then only can create new mdev type for that which may not be flexible. This requirement comes not only from the need to allocate flexible resources for KVMGT, but also from Intel scalable IO virtualization which would use vfio/mdev to be able to allocate arbitrary resources on mdev instance. More info on [1] [2] [3].
To allow creating user defined resources for mdev, it tries to extend mdev create interface by adding new "instances=xxx" parameter following uuid, for target mdev type if aggregation is supported, it can create new mdev device which contains resources combined by number of instances, e.g
echo "<uuid>,instances=10" > create
VM manager e.g libvirt can check mdev type with "aggregation" attribute which can support this setting. If no "aggregation" attribute found for mdev type, previous behavior is still kept for one instance allocation. And new sysfs attribute "instances" is created for each mdev device to show allocated number.
This tries to create new KVMGT type with minimal vGPU resources which can be combined with "instances=x" setting to allocate for user wanted resources.
"instances" makes me think this arg helps to create multiple mdev instances rather than consuming multiple instances for a single mdev. You're already exposing the "aggregation" attribute, so doesn't "aggregate" perhaps make more sense as the create option? We're asking the driver to aggregate $NUM instances into a single mdev. The mdev attribute should then perhaps also be "aggregated_instances".
yeah that seems better.
The next user question for the interface might be what aspect of the device gets multiplied by this aggregation? In i915 I see you're multiplying the memory sizes by the instance, but clearly the resolution doesn't change. I assume this is sort of like mdev types themselves, ie. some degree of what a type means is buried in the implementation and some degree of what some number of those types aggregated together means is impossible to describe generically.
yeah, the purpose was to increase memory resource only, but due to current free formatted 'description', can't have a meaningful expression for that, and not sure if libvirt likes to understand vendor specific behavior e.g for intel vGPU?
We're also going to need to add aggregation to the checklist for device compatibility for migration, for example 1) is it the same mdev_type, 1a) are the aggregation counts the same (new), 2) is the host driver compatible (TBD).
Right, will check with Zhi on that.
The count handling in create_store(), particularly MDEV_CREATE_OPT_LEN looks a little suspicious. I think we should just be validating that the string before the ',' or the entire string if there is no comma is UUID length. Pass only that to uuid_le_to_bin(). We can then strncmp as you have for "instances=" (or "aggregate=") but then let's be sure to end the string we pass to kstrtouint(), ie. assume there could be further args.
Original purpose was to limit the length of string to accept, but can take this way without issue I think.
Documentation/ABI/testing/sysfs-bus-vfio-mdev also needs to be updated with this series.
Ok. Thanks
I'm curious what libvirt folks and Kirti think of this, it looks like it has a nice degree of backwards compatibility, both in the sysfs interface and the vendor driver interface. Thanks,
Alex
-- Open Source Technology Center, Intel ltd. $gpg --keyserver wwwkeys.pgp.net --recv-keys 4D781827

Hi, Zhenyu, curious about the progress of this series. Is there still some open remaining or a new version coming soon? Thanks Kevin
From: Zhenyu Wang [mailto:zhenyuw@linux.intel.com] Sent: Friday, July 20, 2018 10:19 AM
Current mdev device create interface depends on fixed mdev type, which gets a uuid from user to create instance of mdev device. If user wants to use customized number of resource for mdev device, then only can create new mdev type for that which may not be flexible. This requirement comes not only from the need to allocate flexible resources for KVMGT, but also from Intel scalable IO virtualization which would use vfio/mdev to be able to allocate arbitrary resources on mdev instance. More info on [1] [2] [3].
To allow creating user defined resources for mdev, it tries to extend mdev create interface by adding new "instances=xxx" parameter following uuid, for target mdev type if aggregation is supported, it can create new mdev device which contains resources combined by number of instances, e.g
echo "<uuid>,instances=10" > create
VM manager e.g libvirt can check mdev type with "aggregation" attribute which can support this setting. If no "aggregation" attribute found for mdev type, previous behavior is still kept for one instance allocation. And new sysfs attribute "instances" is created for each mdev device to show allocated number.
This tries to create new KVMGT type with minimal vGPU resources which can be combined with "instances=x" setting to allocate for user wanted resources.
References: [1] https://software.intel.com/en-us/download/intel-virtualization- technology-for-directed-io-architecture-specification [2] https://software.intel.com/en-us/download/intel-scalable-io- virtualization-technical-specification [3] https://schd.ws/hosted_files/lc32018/00/LC3-SIOV-final.pdf
v2: - Add new create_with_instances driver hook - Update doc for new attributes
Zhenyu Wang (4): vfio/mdev: Add new instances parameter for mdev create vfio/mdev: Add mdev device instances attribute drm/i915/gvt: Add new aggregation type support Documentation/vfio-mediated-device.txt: update for aggregation attribute
Documentation/vfio-mediated-device.txt | 39 +++++++++++++++--- drivers/gpu/drm/i915/gvt/gvt.c | 26 +++++++++--- drivers/gpu/drm/i915/gvt/gvt.h | 14 ++++--- drivers/gpu/drm/i915/gvt/kvmgt.c | 30 +++++++++++--- drivers/gpu/drm/i915/gvt/vgpu.c | 56 ++++++++++++++++++++++---- drivers/vfio/mdev/mdev_core.c | 19 +++++++-- drivers/vfio/mdev/mdev_private.h | 6 ++- drivers/vfio/mdev/mdev_sysfs.c | 42 ++++++++++++++++--- include/linux/mdev.h | 10 +++++ 9 files changed, 203 insertions(+), 39 deletions(-)
-- 2.18.0

On 2018.10.08 03:19:25 +0000, Tian, Kevin wrote:
Hi, Zhenyu,
curious about the progress of this series. Is there still some open remaining or a new version coming soon?
I had new version almost ready before my vacation, planned to send it after I'm back. So will refresh this later. Sorry for any delay as was trying to close several gvt issues. Thanks. -- Open Source Technology Center, Intel ltd. $gpg --keyserver wwwkeys.pgp.net --recv-keys 4D781827

Current mdev device create interface depends on fixed mdev type, which gets a uuid from user to create instance of mdev device. If user wants to use customized number of resource for mdev device, then only can create new mdev type for that which may not be flexible. This requirement comes not only from the need to allocate flexible resources for KVMGT, but also from Intel scalable IO virtualization which would use vfio/mdev to be able to allocate arbitrary resources on mdev instance. More info on [1] [2] [3]. To allow creating user defined resources for mdev, it tries to extend mdev create interface by adding new "aggregate=xxx" parameter following UUID, for target mdev type if aggregation is supported, it can create new mdev device which contains resources combined by number of instances, e.g echo "<uuid>,aggregate=10" > create VM manager e.g libvirt can check mdev type with "aggregation" attribute which can support this setting. If no "aggregation" attribute found for mdev type, previous behavior is still kept for one instance allocation. And new sysfs attribute "aggregated_instances" is created for each mdev device to show allocated number. References: [1] https://software.intel.com/en-us/download/intel-virtualization-technology-fo... [2] https://software.intel.com/en-us/download/intel-scalable-io-virtualization-t... 
[3] https://schd.ws/hosted_files/lc32018/00/LC3-SIOV-final.pdf v2: - Add new create_with_instances driver hook - Update doc for new attributes v3: - Rename parameter and attribute names from review - Make "aggregation" attribute to show maximum resource number for aggregation - Check driver hooks for attribute create validation - Update doc and ABI spec Zhenyu Wang (6): vfio/mdev: Add new "aggregate" parameter for mdev create vfio/mdev: Add "aggregation" attribute for supported mdev type vfio/mdev: Add "aggregated_instances" attribute for supported mdev device Documentation/vfio-mediated-device.txt: Update for vfio/mdev aggregation support Documentation/ABI/testing/sysfs-bus-vfio-mdev: Update for vfio/mdev aggregation support drm/i915/gvt: Add new type with aggregation support Documentation/ABI/testing/sysfs-bus-vfio-mdev | 25 +++++++ Documentation/vfio-mediated-device.txt | 44 +++++++++-- drivers/gpu/drm/i915/gvt/gvt.h | 11 ++- drivers/gpu/drm/i915/gvt/kvmgt.c | 53 ++++++++++++- drivers/gpu/drm/i915/gvt/vgpu.c | 56 +++++++++++++- drivers/vfio/mdev/mdev_core.c | 40 +++++++++- drivers/vfio/mdev/mdev_private.h | 6 +- drivers/vfio/mdev/mdev_sysfs.c | 74 ++++++++++++++++++- include/linux/mdev.h | 19 +++++ 9 files changed, 305 insertions(+), 23 deletions(-) -- 2.19.1

For special mdev type which can aggregate instances for mdev device, this extends mdev create interface by allowing extra "aggregate=xxx" parameter, which is passed to mdev device model to be able to create bundled number of instances for target mdev device. v2: create new create_with_instances operator for vendor driver v3: - Change parameter name as "aggregate=" - Fix new interface comments. - Parameter checking for new option, pass UUID string only to parse and properly end parameter for kstrtouint() conversion. Cc: Kirti Wankhede <kwankhede@nvidia.com> Cc: Alex Williamson <alex.williamson@redhat.com> Cc: Kevin Tian <kevin.tian@intel.com> Cc: Cornelia Huck <cohuck@redhat.com> Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com> --- drivers/vfio/mdev/mdev_core.c | 21 +++++++++++++++++---- drivers/vfio/mdev/mdev_private.h | 4 +++- drivers/vfio/mdev/mdev_sysfs.c | 32 ++++++++++++++++++++++++++++---- include/linux/mdev.h | 11 +++++++++++ 4 files changed, 59 insertions(+), 9 deletions(-) diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index 0212f0ee8aea..545c52ec7618 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -104,12 +104,17 @@ static inline void mdev_put_parent(struct mdev_parent *parent) } static int mdev_device_create_ops(struct kobject *kobj, - struct mdev_device *mdev) + struct mdev_device *mdev, + unsigned int instances) { struct mdev_parent *parent = mdev->parent; int ret; - ret = parent->ops->create(kobj, mdev); + if (instances > 1) { + ret = parent->ops->create_with_instances(kobj, mdev, + instances); + } else + ret = parent->ops->create(kobj, mdev); if (ret) return ret; @@ -276,7 +281,8 @@ static void mdev_device_release(struct device *dev) kfree(mdev); } -int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid) +int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid, + unsigned int instances) { int ret; struct mdev_device *mdev, *tmp; @@ -287,6 
+293,12 @@ int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid) if (!parent) return -EINVAL; + if (instances > 1 && + !parent->ops->create_with_instances) { + ret = -EINVAL; + goto mdev_fail; + } + mutex_lock(&mdev_list_lock); /* Check for duplicate */ @@ -316,6 +328,7 @@ int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid) mdev->dev.bus = &mdev_bus_type; mdev->dev.release = mdev_device_release; dev_set_name(&mdev->dev, "%pUl", uuid.b); + mdev->type_instances = instances; ret = device_register(&mdev->dev); if (ret) { @@ -323,7 +336,7 @@ int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid) goto mdev_fail; } - ret = mdev_device_create_ops(kobj, mdev); + ret = mdev_device_create_ops(kobj, mdev, instances); if (ret) goto create_fail; diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h index b5819b7d7ef7..e90d295d3927 100644 --- a/drivers/vfio/mdev/mdev_private.h +++ b/drivers/vfio/mdev/mdev_private.h @@ -33,6 +33,7 @@ struct mdev_device { struct kref ref; struct list_head next; struct kobject *type_kobj; + unsigned int type_instances; bool active; }; @@ -58,7 +59,8 @@ void parent_remove_sysfs_files(struct mdev_parent *parent); int mdev_create_sysfs_files(struct device *dev, struct mdev_type *type); void mdev_remove_sysfs_files(struct device *dev, struct mdev_type *type); -int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid); +int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid, + unsigned int instances); int mdev_device_remove(struct device *dev, bool force_remove); #endif /* MDEV_PRIVATE_H */ diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index 249472f05509..aefed0c8891b 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -54,14 +54,21 @@ static const struct sysfs_ops mdev_type_sysfs_ops = { static ssize_t create_store(struct kobject *kobj, struct 
device *dev, const char *buf, size_t count) { - char *str; + char *str, *param, *opt = NULL; uuid_le uuid; int ret; + unsigned int instances = 1; - if ((count < UUID_STRING_LEN) || (count > UUID_STRING_LEN + 1)) + if (count < UUID_STRING_LEN) return -EINVAL; - str = kstrndup(buf, count, GFP_KERNEL); + if ((param = strnchr(buf, count, ',')) == NULL) { + if (count > UUID_STRING_LEN + 1) + return -EINVAL; + } else if (param - buf != UUID_STRING_LEN) + return -EINVAL; + + str = kstrndup(buf, UUID_STRING_LEN, GFP_KERNEL); if (!str) return -ENOMEM; @@ -70,7 +77,24 @@ static ssize_t create_store(struct kobject *kobj, struct device *dev, if (ret) return ret; - ret = mdev_device_create(kobj, dev, uuid); + if (param) { + opt = kstrndup(param + 1, count - UUID_STRING_LEN - 1, + GFP_KERNEL); + if (!opt) + return -ENOMEM; + if (strncmp(opt, "aggregate=", 10)) { + kfree(opt); + return -EINVAL; + } + opt += 10; + if (kstrtouint(opt, 10, &instances)) { + kfree(opt); + return -EINVAL; + } + kfree(opt); + } + + ret = mdev_device_create(kobj, dev, uuid, instances); if (ret) return ret; diff --git a/include/linux/mdev.h b/include/linux/mdev.h index b6e048e1045f..c12c0bfc5598 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -31,6 +31,14 @@ struct mdev_device; * @mdev: mdev_device structure on of mediated device * that is being created * Returns integer: success (0) or error (< 0) + * @create_with_instances: Allocate aggregated instances' resources in parent device's + * driver for a particular mediated device. Optional if aggregated + * resources are not supported. + * @kobj: kobject of type for which 'create' is called. + * @mdev: mdev_device structure on of mediated device + * that is being created + * @instances: number of instances to aggregate + * Returns integer: success (0) or error (< 0) * @remove: Called to free resources in parent device's driver for a * a mediated device. It is mandatory to provide 'remove' * ops. 
@@ -71,6 +79,9 @@ struct mdev_parent_ops { struct attribute_group **supported_type_groups; int (*create)(struct kobject *kobj, struct mdev_device *mdev); + int (*create_with_instances)(struct kobject *kobj, + struct mdev_device *mdev, + unsigned int instances); int (*remove)(struct mdev_device *mdev); int (*open)(struct mdev_device *mdev); void (*release)(struct mdev_device *mdev); -- 2.19.1

For supported mdev driver to create aggregated device, this creates new "aggregation" attribute for target type, which will show maximum number of instance resources that can be aggregated. Cc: Kirti Wankhede <kwankhede@nvidia.com> Cc: Alex Williamson <alex.williamson@redhat.com> Cc: Kevin Tian <kevin.tian@intel.com> Cc: Cornelia Huck <cohuck@redhat.com> Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com> --- drivers/vfio/mdev/mdev_core.c | 19 +++++++++++++++++++ drivers/vfio/mdev/mdev_private.h | 2 ++ drivers/vfio/mdev/mdev_sysfs.c | 22 ++++++++++++++++++++++ include/linux/mdev.h | 8 ++++++++ 4 files changed, 51 insertions(+) diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index 545c52ec7618..8f8bbb72e5d9 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -161,6 +161,25 @@ static int mdev_device_remove_cb(struct device *dev, void *data) return mdev_device_remove(dev, data ? *(bool *)data : true); } +int mdev_max_aggregated_instances(struct kobject *kobj, struct device *dev, + unsigned int *m) +{ + struct mdev_parent *parent; + struct mdev_type *type = to_mdev_type(kobj); + int ret; + + parent = mdev_get_parent(type->parent); + if (!parent) + return -EINVAL; + + if (parent->ops->max_aggregated_instances) { + ret = parent->ops->max_aggregated_instances(kobj, dev, m); + } else + ret = -EINVAL; + mdev_put_parent(parent); + return ret; +} + /* * mdev_register_device : Register a device * @dev: device structure representing parent device. 
diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h index e90d295d3927..f1289db75884 100644 --- a/drivers/vfio/mdev/mdev_private.h +++ b/drivers/vfio/mdev/mdev_private.h @@ -63,4 +63,6 @@ int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid, unsigned int instances); int mdev_device_remove(struct device *dev, bool force_remove); +int mdev_max_aggregated_instances(struct kobject *kobj, struct device *dev, + unsigned int *m); #endif /* MDEV_PRIVATE_H */ diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index aefed0c8891b..a329d6ab6cb9 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -103,6 +103,18 @@ static ssize_t create_store(struct kobject *kobj, struct device *dev, MDEV_TYPE_ATTR_WO(create); +static ssize_t +aggregation_show(struct kobject *kobj, struct device *dev, char *buf) +{ + unsigned int m; + + if (mdev_max_aggregated_instances(kobj, dev, &m)) + return sprintf(buf, "1\n"); + else + return sprintf(buf, "%u\n", m); +} +MDEV_TYPE_ATTR_RO(aggregation); + static void mdev_type_release(struct kobject *kobj) { struct mdev_type *type = to_mdev_type(kobj); @@ -145,6 +157,14 @@ struct mdev_type *add_mdev_supported_type(struct mdev_parent *parent, if (ret) goto attr_create_failed; + if (parent->ops->create_with_instances && + parent->ops->max_aggregated_instances) { + ret = sysfs_create_file(&type->kobj, + &mdev_type_attr_aggregation.attr); + if (ret) + goto attr_aggregate_failed; + } + type->devices_kobj = kobject_create_and_add("devices", &type->kobj); if (!type->devices_kobj) { ret = -ENOMEM; @@ -165,6 +185,8 @@ struct mdev_type *add_mdev_supported_type(struct mdev_parent *parent, attrs_failed: kobject_put(type->devices_kobj); attr_devices_failed: + sysfs_remove_file(&type->kobj, &mdev_type_attr_aggregation.attr); +attr_aggregate_failed: sysfs_remove_file(&type->kobj, &mdev_type_attr_create.attr); attr_create_failed: kobject_del(&type->kobj); diff 
--git a/include/linux/mdev.h b/include/linux/mdev.h index c12c0bfc5598..66cfdb0bf0d6 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -39,6 +39,11 @@ struct mdev_device; * that is being created * @instances: number of instances to aggregate * Returns integer: success (0) or error (< 0) + * @max_aggregated_instances: Return max number for aggregated resources + * @kobj: kobject of type + * @dev: mdev parent device for target type + * @max: return max number of instances which can aggregate + * Returns integer: success (0) or error (< 0) * @remove: Called to free resources in parent device's driver for a * a mediated device. It is mandatory to provide 'remove' * ops. @@ -82,6 +87,9 @@ struct mdev_parent_ops { int (*create_with_instances)(struct kobject *kobj, struct mdev_device *mdev, unsigned int instances); + int (*max_aggregated_instances)(struct kobject *kobj, + struct device *dev, + unsigned int *max); int (*remove)(struct mdev_device *mdev); int (*open)(struct mdev_device *mdev); void (*release)(struct mdev_device *mdev); -- 2.19.1

For mdev device created by "aggregate" parameter, this creates new mdev device attribute "aggregated_instances" to show number of aggregated instances allocated. v2: - change attribute name as "aggregated_instances" v3: - create only for aggregated allocation Cc: Kirti Wankhede <kwankhede@nvidia.com> Cc: Alex Williamson <alex.williamson@redhat.com> Cc: Kevin Tian <kevin.tian@intel.com> Cc: Cornelia Huck <cohuck@redhat.com> Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com> --- drivers/vfio/mdev/mdev_sysfs.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index a329d6ab6cb9..f03bdfbf5a42 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -292,7 +292,17 @@ static ssize_t remove_store(struct device *dev, struct device_attribute *attr, return count; } +static ssize_t +aggregated_instances_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mdev_device *mdev = to_mdev_device(dev); + return sprintf(buf, "%u\n", mdev->type_instances); +} + static DEVICE_ATTR_WO(remove); +static DEVICE_ATTR_RO(aggregated_instances); static const struct attribute *mdev_device_attrs[] = { &dev_attr_remove.attr, @@ -301,6 +311,7 @@ static const struct attribute *mdev_device_attrs[] = { int mdev_create_sysfs_files(struct device *dev, struct mdev_type *type) { + struct mdev_device *mdev = to_mdev_device(dev); int ret; ret = sysfs_create_link(type->devices_kobj, &dev->kobj, dev_name(dev)); @@ -315,8 +326,17 @@ int mdev_create_sysfs_files(struct device *dev, struct mdev_type *type) if (ret) goto create_files_failed; + if (mdev->type_instances > 1) { + ret = sysfs_create_file(&dev->kobj, + &dev_attr_aggregated_instances.attr); + if (ret) + goto create_aggregated_failed; + } + return ret; +create_aggregated_failed: + sysfs_remove_files(&dev->kobj, mdev_device_attrs); create_files_failed: sysfs_remove_link(&dev->kobj, "mdev_type"); 
type_link_failed: -- 2.19.1

Update vfio/mdev doc on new "aggregate" create parameter, new "aggregation" attribute and "aggregated_instances" attribute for mdev device. Cc: Kirti Wankhede <kwankhede@nvidia.com> Cc: Alex Williamson <alex.williamson@redhat.com> Cc: Kevin Tian <kevin.tian@intel.com> Cc: Cornelia Huck <cohuck@redhat.com> Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com> --- Documentation/vfio-mediated-device.txt | 44 ++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/Documentation/vfio-mediated-device.txt b/Documentation/vfio-mediated-device.txt index c3f69bcaf96e..cf4849a34c9f 100644 --- a/Documentation/vfio-mediated-device.txt +++ b/Documentation/vfio-mediated-device.txt @@ -211,12 +211,20 @@ Directories and files under the sysfs for Each Physical Device | | |--- description | | |--- [devices] | |--- [<type-id>] - | |--- create - | |--- name - | |--- available_instances - | |--- device_api - | |--- description - | |--- [devices] + | | |--- create + | | |--- name + | | |--- available_instances + | | |--- device_api + | | |--- description + | | |--- [devices] + | |--- [<type-id>] + | | |--- create + | | |--- name + | | |--- available_instances + | | |--- device_api + | | |--- description + | | |--- <aggregation> + | | |--- [devices] * [mdev_supported_types] @@ -260,6 +268,23 @@ Directories and files under the sysfs for Each Physical Device This attribute should show brief features/description of the type. This is optional attribute. +* <aggregation> + + <aggregation> is an optional attribute to show the max number that the + instance resources of [<type-id>] can be aggregated to be assigned + for one mdev device. No <aggregation> attribute means the driver doesn't + support aggregating instance resources for one mdev device. + <aggregation> may be less than available_instances which depends on + driver. <aggregation> number can't exceed available_instances. + + Set number of instances by appending "aggregate=N" parameter for + create attribute. 
By default without "aggregate=N" parameter it + will create one instance as normal. + +Example:: + + # echo "<uuid>,aggregate=N" > create + Directories and Files Under the sysfs for Each mdev Device ---------------------------------------------------------- @@ -268,6 +293,7 @@ Directories and Files Under the sysfs for Each mdev Device |- [parent phy device] |--- [$MDEV_UUID] |--- remove + |--- <aggregated_instances> |--- mdev_type {link to its type} |--- vendor-specific-attributes [optional] @@ -281,6 +307,12 @@ Example:: # echo 1 > /sys/bus/mdev/devices/$mdev_UUID/remove +* <aggregated_instances> (read only) + +For mdev created with aggregate parameter, this shows number of +instances assigned for this mdev. For normal type this attribute will +not exist. + Mediated device Hot plug ------------------------ -- 2.19.1

Update vfio/mdev ABI description for new aggregation attributes. Cc: Kirti Wankhede <kwankhede@nvidia.com> Cc: Alex Williamson <alex.williamson@redhat.com> Cc: Kevin Tian <kevin.tian@intel.com> Cc: Cornelia Huck <cohuck@redhat.com> Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com> --- Documentation/ABI/testing/sysfs-bus-vfio-mdev | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-bus-vfio-mdev b/Documentation/ABI/testing/sysfs-bus-vfio-mdev index 452dbe39270e..192fe06e60d0 100644 --- a/Documentation/ABI/testing/sysfs-bus-vfio-mdev +++ b/Documentation/ABI/testing/sysfs-bus-vfio-mdev @@ -85,6 +85,24 @@ Users: a particular <type-id> that can help in understanding the features provided by that type of mediated device. +What: /sys/.../mdev_supported_types/<type-id>/aggregation +Date: October 2018 +Contact: Zhenyu Wang <zhenyuw@linux.intel.com> +Description: + Reading this attribute will show the number of mdev instances + that can be aggregated to assign for one mdev device. + This is an optional attribute. If this attribute exists, it + means the driver supports aggregating target mdev type's + resources assigned for one mdev device. +Users: + Userspace applications interested in creating mediated + device with aggregated type instances. Userspace application + should check the number of aggregation instances that could + be created before creating mediated device by applying this, + e.g + # echo "83b8f4f2-509f-382f-3c1e-e6bfe0fa1001,aggregate=XX" > \ + /sys/devices/foo/mdev_supported_types/foo-1/create + What: /sys/.../<device>/<UUID>/ Date: October 2016 Contact: Kirti Wankhede <kwankhede@nvidia.com> @@ -109,3 +127,10 @@ Description: is active and the vendor driver doesn't support hot unplug. 
Example: # echo 1 > /sys/bus/mdev/devices/<UUID>/remove + +What: /sys/.../<device>/<UUID>/aggregated_instances +Date: October 2018 +Contact: Zhenyu Wang <zhenyuw@linux.intel.com> +Description: + This attribute shows the number of aggregated instances if this + mediated device was created with "aggregate" parameter. \ No newline at end of file -- 2.19.1

New aggregation type is created for KVMGT which can be used to combine minimal resource number for target instances, to create user defined number of resources. For KVMGT, aggregated resource is determined by memory and fence resource allocation for target number of instances. v2: - apply for new hooks Cc: Kirti Wankhede <kwankhede@nvidia.com> Cc: Alex Williamson <alex.williamson@redhat.com> Cc: Kevin Tian <kevin.tian@intel.com> Cc: Cornelia Huck <cohuck@redhat.com> Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com> --- drivers/gpu/drm/i915/gvt/gvt.h | 11 +++++-- drivers/gpu/drm/i915/gvt/kvmgt.c | 53 ++++++++++++++++++++++++++++-- drivers/gpu/drm/i915/gvt/vgpu.c | 56 ++++++++++++++++++++++++++++++-- 3 files changed, 112 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h index 31f6cdbe5c42..cb318079fa74 100644 --- a/drivers/gpu/drm/i915/gvt/gvt.h +++ b/drivers/gpu/drm/i915/gvt/gvt.h @@ -241,6 +241,9 @@ struct intel_vgpu { struct intel_gvt_gm { unsigned long vgpu_allocated_low_gm_size; unsigned long vgpu_allocated_high_gm_size; + unsigned long low_avail; + unsigned long high_avail; + unsigned long fence_avail; }; struct intel_gvt_fence { @@ -292,13 +295,15 @@ struct intel_gvt_firmware { #define NR_MAX_INTEL_VGPU_TYPES 20 struct intel_vgpu_type { - char name[16]; + const char *drv_name; + char name[32]; unsigned int avail_instance; unsigned int low_gm_size; unsigned int high_gm_size; unsigned int fence; unsigned int weight; enum intel_vgpu_edid resolution; + unsigned int aggregation; }; struct intel_gvt { @@ -484,7 +489,7 @@ void intel_gvt_clean_vgpu_types(struct intel_gvt *gvt); struct intel_vgpu *intel_gvt_create_idle_vgpu(struct intel_gvt *gvt); void intel_gvt_destroy_idle_vgpu(struct intel_vgpu *vgpu); struct intel_vgpu *intel_gvt_create_vgpu(struct intel_gvt *gvt, - struct intel_vgpu_type *type); + struct intel_vgpu_type *type, unsigned int); void intel_gvt_destroy_vgpu(struct intel_vgpu *vgpu); void 
intel_gvt_release_vgpu(struct intel_vgpu *vgpu); void intel_gvt_reset_vgpu_locked(struct intel_vgpu *vgpu, bool dmlr, @@ -563,7 +568,7 @@ struct intel_gvt_ops { int (*emulate_mmio_write)(struct intel_vgpu *, u64, void *, unsigned int); struct intel_vgpu *(*vgpu_create)(struct intel_gvt *, - struct intel_vgpu_type *); + struct intel_vgpu_type *, unsigned int); void (*vgpu_destroy)(struct intel_vgpu *vgpu); void (*vgpu_release)(struct intel_vgpu *vgpu); void (*vgpu_reset)(struct intel_vgpu *); diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index c1072143da1d..841ad5437c4a 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -501,7 +501,9 @@ static void kvmgt_put_vfio_device(void *vgpu) vfio_device_put(((struct intel_vgpu *)vgpu)->vdev.vfio_device); } -static int intel_vgpu_create(struct kobject *kobj, struct mdev_device *mdev) +static int intel_vgpu_create_internal(struct kobject *kobj, + struct mdev_device *mdev, + unsigned int instances) { struct intel_vgpu *vgpu = NULL; struct intel_vgpu_type *type; @@ -520,7 +522,14 @@ static int intel_vgpu_create(struct kobject *kobj, struct mdev_device *mdev) goto out; } - vgpu = intel_gvt_ops->vgpu_create(gvt, type); + if (instances > type->aggregation) { + gvt_vgpu_err("wrong aggregation specified for type %s\n", + kobject_name(kobj)); + ret = -EINVAL; + goto out; + } + + vgpu = intel_gvt_ops->vgpu_create(gvt, type, instances); if (IS_ERR_OR_NULL(vgpu)) { ret = vgpu == NULL ? 
-EFAULT : PTR_ERR(vgpu); gvt_err("failed to create intel vgpu: %d\n", ret); @@ -540,6 +549,44 @@ static int intel_vgpu_create(struct kobject *kobj, struct mdev_device *mdev) return ret; } +static int intel_vgpu_create(struct kobject *kobj, struct mdev_device *mdev) +{ + return intel_vgpu_create_internal(kobj, mdev, 1); +} + +static int intel_vgpu_create_with_instances(struct kobject *kobj, + struct mdev_device *mdev, + unsigned int instances) +{ + return intel_vgpu_create_internal(kobj, mdev, instances); +} + +static int intel_vgpu_max_aggregated_instances(struct kobject *kobj, + struct device *dev, + unsigned int *max) +{ + struct intel_vgpu_type *type; + struct intel_gvt *gvt; + int ret = 0; + + gvt = kdev_to_i915(dev)->gvt; + + type = intel_gvt_ops->gvt_find_vgpu_type(gvt, kobject_name(kobj)); + if (!type) { + gvt_err("failed to find type %s to create\n", + kobject_name(kobj)); + ret = -EINVAL; + goto out; + } + + if (type->aggregation <= 1) + *max = 1; + else + *max = type->aggregation; +out: + return ret; +} + static int intel_vgpu_remove(struct mdev_device *mdev) { struct intel_vgpu *vgpu = mdev_get_drvdata(mdev); @@ -1442,6 +1489,8 @@ static const struct attribute_group *intel_vgpu_groups[] = { static struct mdev_parent_ops intel_vgpu_ops = { .mdev_attr_groups = intel_vgpu_groups, .create = intel_vgpu_create, + .create_with_instances = intel_vgpu_create_with_instances, + .max_aggregated_instances = intel_vgpu_max_aggregated_instances, .remove = intel_vgpu_remove, .open = intel_vgpu_open, diff --git a/drivers/gpu/drm/i915/gvt/vgpu.c b/drivers/gpu/drm/i915/gvt/vgpu.c index c628be05fbfe..d36f017c740b 100644 --- a/drivers/gpu/drm/i915/gvt/vgpu.c +++ b/drivers/gpu/drm/i915/gvt/vgpu.c @@ -108,6 +108,7 @@ int intel_gvt_init_vgpu_types(struct intel_gvt *gvt) unsigned int num_types; unsigned int i, low_avail, high_avail; unsigned int min_low; + const char *driver_name = dev_driver_string(&gvt->dev_priv->drm.pdev->dev); /* vGPU type name is defined as GVTg_Vx_y which 
contains * physical GPU generation type (e.g V4 as BDW server, V5 as @@ -125,11 +126,15 @@ int intel_gvt_init_vgpu_types(struct intel_gvt *gvt) high_avail = gvt_hidden_sz(gvt) - HOST_HIGH_GM_SIZE; num_types = sizeof(vgpu_types) / sizeof(vgpu_types[0]); - gvt->types = kcalloc(num_types, sizeof(struct intel_vgpu_type), + gvt->types = kcalloc(num_types + 1, sizeof(struct intel_vgpu_type), GFP_KERNEL); if (!gvt->types) return -ENOMEM; + gvt->gm.low_avail = low_avail; + gvt->gm.high_avail = high_avail; + gvt->gm.fence_avail = 32 - HOST_FENCE; + min_low = MB_TO_BYTES(32); for (i = 0; i < num_types; ++i) { if (low_avail / vgpu_types[i].low_mm == 0) @@ -147,6 +152,7 @@ int intel_gvt_init_vgpu_types(struct intel_gvt *gvt) gvt->types[i].resolution = vgpu_types[i].edid; gvt->types[i].avail_instance = min(low_avail / vgpu_types[i].low_mm, high_avail / vgpu_types[i].high_mm); + gvt->types[i].aggregation = 1; if (IS_GEN8(gvt->dev_priv)) sprintf(gvt->types[i].name, "GVTg_V4_%s", @@ -154,6 +160,7 @@ int intel_gvt_init_vgpu_types(struct intel_gvt *gvt) else if (IS_GEN9(gvt->dev_priv)) sprintf(gvt->types[i].name, "GVTg_V5_%s", vgpu_types[i].name); + gvt->types[i].drv_name = driver_name; gvt_dbg_core("type[%d]: %s avail %u low %u high %u fence %u weight %u res %s\n", i, gvt->types[i].name, @@ -164,7 +171,32 @@ int intel_gvt_init_vgpu_types(struct intel_gvt *gvt) vgpu_edid_str(gvt->types[i].resolution)); } - gvt->num_types = i; + /* add aggregation type */ + gvt->types[i].low_gm_size = MB_TO_BYTES(32); + gvt->types[i].high_gm_size = MB_TO_BYTES(192); + gvt->types[i].fence = 2; + gvt->types[i].weight = 16; + gvt->types[i].resolution = GVT_EDID_1024_768; + gvt->types[i].avail_instance = min(low_avail / gvt->types[i].low_gm_size, + high_avail / gvt->types[i].high_gm_size); + gvt->types[i].avail_instance = min(gvt->types[i].avail_instance, + (32 - HOST_FENCE) / gvt->types[i].fence); + if (IS_GEN8(gvt->dev_priv)) + strcat(gvt->types[i].name, "GVTg_V4_aggregate"); + else if 
(IS_GEN9(gvt->dev_priv)) + strcat(gvt->types[i].name, "GVTg_V5_aggregate"); + gvt->types[i].drv_name = driver_name; + + gvt_dbg_core("type[%d]: %s avail %u low %u high %u fence %u weight %u res %s\n", + i, gvt->types[i].name, + gvt->types[i].avail_instance, + gvt->types[i].low_gm_size, + gvt->types[i].high_gm_size, gvt->types[i].fence, + gvt->types[i].weight, + vgpu_edid_str(gvt->types[i].resolution)); + + gvt->types[i].aggregation = gvt->types[i].avail_instance; + gvt->num_types = ++i; return 0; } @@ -195,6 +227,8 @@ static void intel_gvt_update_vgpu_types(struct intel_gvt *gvt) fence_min = fence_avail / gvt->types[i].fence; gvt->types[i].avail_instance = min(min(low_gm_min, high_gm_min), fence_min); + if (gvt->types[i].aggregation > 1) + gvt->types[i].aggregation = gvt->types[i].avail_instance; gvt_dbg_core("update type[%d]: %s avail %u low %u high %u fence %u\n", i, gvt->types[i].name, @@ -464,7 +498,8 @@ static struct intel_vgpu *__intel_gvt_create_vgpu(struct intel_gvt *gvt, * pointer to intel_vgpu, error pointer if failed. 
*/ struct intel_vgpu *intel_gvt_create_vgpu(struct intel_gvt *gvt, - struct intel_vgpu_type *type) + struct intel_vgpu_type *type, + unsigned int instances) { struct intel_vgpu_creation_params param; struct intel_vgpu *vgpu; @@ -481,6 +516,21 @@ struct intel_vgpu *intel_gvt_create_vgpu(struct intel_gvt *gvt, param.low_gm_sz = BYTES_TO_MB(param.low_gm_sz); param.high_gm_sz = BYTES_TO_MB(param.high_gm_sz); + if (instances > 1 && instances <= type->aggregation) { + if (instances > type->avail_instance) + return ERR_PTR(-EINVAL); + param.low_gm_sz = min(param.low_gm_sz * instances, + (u64)BYTES_TO_MB(gvt->gm.low_avail)); + param.high_gm_sz = min(param.high_gm_sz * instances, + (u64)BYTES_TO_MB(gvt->gm.high_avail)); + param.fence_sz = min(param.fence_sz * instances, + (u64)gvt->gm.fence_avail); + type->aggregation -= instances; + gvt_dbg_core("instances %d, low %lluMB, high %lluMB, fence %llu, left %u\n", + instances, param.low_gm_sz, param.high_gm_sz, param.fence_sz, + type->aggregation); + } + mutex_lock(&gvt->lock); vgpu = __intel_gvt_create_vgpu(gvt, ¶m); if (!IS_ERR(vgpu)) -- 2.19.1
participants (6)
-
Alex Williamson
-
Cornelia Huck
-
Erik Skultety
-
Kirti Wankhede
-
Tian, Kevin
-
Zhenyu Wang