diff mbox

[v7,1/4] vfio: Mediated device Core driver

Message ID 1472097235-6332-2-git-send-email-kwankhede@nvidia.com
State New
Headers show

Commit Message

Kirti Wankhede Aug. 25, 2016, 3:53 a.m. UTC
Design for Mediated Device Driver:
Main purpose of this driver is to provide a common interface for mediated
device management that can be used by different drivers of different
devices.

This module provides a generic interface to create the device, add it to
mediated bus, add device to IOMMU group and then add it to vfio group.

Below is the high Level block diagram, with Nvidia, Intel and IBM devices
as example, since these are the devices which are going to actively use
this module as of now.

 +---------------+
 |               |
 | +-----------+ |  mdev_register_driver() +--------------+
 | |           | +<------------------------+ __init()     |
 | |  mdev     | |                         |              |
 | |  bus      | +------------------------>+              |<-> VFIO user
 | |  driver   | |     probe()/remove()    | vfio_mdev.ko |    APIs
 | |           | |                         |              |
 | +-----------+ |                         +--------------+
 |               |
 |  MDEV CORE    |
 |   MODULE      |
 |   mdev.ko     |
 | +-----------+ |  mdev_register_device() +--------------+
 | |           | +<------------------------+              |
 | |           | |                         |  nvidia.ko   |<-> physical
 | |           | +------------------------>+              |    device
 | |           | |        callback         +--------------+
 | | Physical  | |
 | |  device   | |  mdev_register_device() +--------------+
 | | interface | |<------------------------+              |
 | |           | |                         |  i915.ko     |<-> physical
 | |           | +------------------------>+              |    device
 | |           | |        callback         +--------------+
 | |           | |
 | |           | |  mdev_register_device() +--------------+
 | |           | +<------------------------+              |
 | |           | |                         | ccw_device.ko|<-> physical
 | |           | +------------------------>+              |    device
 | |           | |        callback         +--------------+
 | +-----------+ |
 +---------------+

Core driver provides two types of registration interfaces:
1. Registration interface for mediated bus driver:

/**
  * struct mdev_driver - Mediated device's driver
  * @name: driver name
  * @probe: called when new device created
  * @remove:called when device removed
  * @driver:device driver structure
  *
  **/
struct mdev_driver {
         const char *name;
         int  (*probe)  (struct device *dev);
         void (*remove) (struct device *dev);
         struct device_driver    driver;
};

int  mdev_register_driver(struct mdev_driver *drv, struct module *owner);
void mdev_unregister_driver(struct mdev_driver *drv);

Mediated device's driver for mdev, vfio_mdev, uses this interface to
register with Core driver. vfio_mdev module adds mediated device to VFIO
group.

2. Physical device driver interface
This interface provides vendor driver the set APIs to manage physical
device related work in their own driver. APIs are :
- supported_config: provide supported configuration list by the vendor
		    driver
- create: to allocate basic resources in vendor driver for a mediated
	  device.
- destroy: to free resources in vendor driver when mediated device is
	   destroyed.
- reset: to free and reallocate resources in vendor driver during device
	 reset.
- set_online_status: to change online status of mediated device.
- get_online_status: to get current (online/offline) status of mediated
		     device.
- read : read emulation callback.
- write: write emulation callback.
- mmap: mmap emulation callback.
- get_irq_info: to retrieve information about mediated device's IRQ.
- set_irqs: send interrupt configuration information that VMM sets.
- get_device_info: to retrieve VFIO device related flags, number of regions
		   and number of IRQs supported.
- get_region_info: to provide region size and its flags for the mediated
		   device.

This registration interface should be used by vendor drivers to register
each physical device to mdev core driver.
Locks to serialize above callbacks are removed. If required, vendor driver
can have locks to serialize above APIs in their driver.

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Signed-off-by: Neo Jia <cjia@nvidia.com>
Change-Id: I73a5084574270b14541c529461ea2f03c292d510
Reviewed-on: http://git-master/r/1175705
Reviewed-by: Automatic_Commit_Validation_User
---
 drivers/vfio/Kconfig             |   1 +
 drivers/vfio/Makefile            |   1 +
 drivers/vfio/mdev/Kconfig        |  12 +
 drivers/vfio/mdev/Makefile       |   5 +
 drivers/vfio/mdev/mdev_core.c    | 509 +++++++++++++++++++++++++++++++++++++++
 drivers/vfio/mdev/mdev_driver.c  | 131 ++++++++++
 drivers/vfio/mdev/mdev_private.h |  36 +++
 drivers/vfio/mdev/mdev_sysfs.c   | 240 ++++++++++++++++++
 include/linux/mdev.h             | 212 ++++++++++++++++
 9 files changed, 1147 insertions(+)
 create mode 100644 drivers/vfio/mdev/Kconfig
 create mode 100644 drivers/vfio/mdev/Makefile
 create mode 100644 drivers/vfio/mdev/mdev_core.c
 create mode 100644 drivers/vfio/mdev/mdev_driver.c
 create mode 100644 drivers/vfio/mdev/mdev_private.h
 create mode 100644 drivers/vfio/mdev/mdev_sysfs.c
 create mode 100644 include/linux/mdev.h

Comments

Jike Song Sept. 8, 2016, 8:09 a.m. UTC | #1
On 08/25/2016 11:53 AM, Kirti Wankhede wrote:
> Design for Mediated Device Driver:
> Main purpose of this driver is to provide a common interface for mediated
> device management that can be used by different drivers of different
> devices.
> 
> This module provides a generic interface to create the device, add it to
> mediated bus, add device to IOMMU group and then add it to vfio group.
> 
> Below is the high Level block diagram, with Nvidia, Intel and IBM devices
> as example, since these are the devices which are going to actively use
> this module as of now.
> 
>  +---------------+
>  |               |
>  | +-----------+ |  mdev_register_driver() +--------------+
>  | |           | +<------------------------+ __init()     |
>  | |  mdev     | |                         |              |
>  | |  bus      | +------------------------>+              |<-> VFIO user
>  | |  driver   | |     probe()/remove()    | vfio_mdev.ko |    APIs
>  | |           | |                         |              |
>  | +-----------+ |                         +--------------+
>  |               |

This aimed to have only one single vfio bus driver for all mediated devices,
right?

>  |  MDEV CORE    |
>  |   MODULE      |
>  |   mdev.ko     |
>  | +-----------+ |  mdev_register_device() +--------------+
>  | |           | +<------------------------+              |
>  | |           | |                         |  nvidia.ko   |<-> physical
>  | |           | +------------------------>+              |    device
>  | |           | |        callback         +--------------+
>  | | Physical  | |
>  | |  device   | |  mdev_register_device() +--------------+
>  | | interface | |<------------------------+              |
>  | |           | |                         |  i915.ko     |<-> physical
>  | |           | +------------------------>+              |    device
>  | |           | |        callback         +--------------+
>  | |           | |
>  | |           | |  mdev_register_device() +--------------+
>  | |           | +<------------------------+              |
>  | |           | |                         | ccw_device.ko|<-> physical
>  | |           | +------------------------>+              |    device
>  | |           | |        callback         +--------------+
>  | +-----------+ |
>  +---------------+
> 
> Core driver provides two types of registration interfaces:
> 1. Registration interface for mediated bus driver:
> 
> /**
>   * struct mdev_driver - Mediated device's driver
>   * @name: driver name
>   * @probe: called when new device created
>   * @remove:called when device removed
>   * @driver:device driver structure
>   *
>   **/
> struct mdev_driver {
>          const char *name;
>          int  (*probe)  (struct device *dev);
>          void (*remove) (struct device *dev);
>          struct device_driver    driver;
> };
> 
> int  mdev_register_driver(struct mdev_driver *drv, struct module *owner);
> void mdev_unregister_driver(struct mdev_driver *drv);
> 
> Mediated device's driver for mdev, vfio_mdev, uses this interface to
> register with Core driver. vfio_mdev module adds mediated device to VFIO
> group.
> 
> 2. Physical device driver interface
> This interface provides vendor driver the set APIs to manage physical
> device related work in their own driver. APIs are :
> - supported_config: provide supported configuration list by the vendor
> 		    driver
> - create: to allocate basic resources in vendor driver for a mediated
> 	  device.
> - destroy: to free resources in vendor driver when mediated device is
> 	   destroyed.
> - reset: to free and reallocate resources in vendor driver during device
> 	 reset.
> - set_online_status: to change online status of mediated device.
> - get_online_status: to get current (online/offline) status of mediated
> 		     device.
> - read : read emulation callback.
> - write: write emulation callback.
> - mmap: mmap emulation callback.
> - get_irq_info: to retrieve information about mediated device's IRQ.
> - set_irqs: send interrupt configuration information that VMM sets.
> - get_device_info: to retrieve VFIO device related flags, number of regions
> 		   and number of IRQs supported.
> - get_region_info: to provide region size and its flags for the mediated
> 		   device.
> 
> This registration interface should be used by vendor drivers to register
> each physical device to mdev core driver.
> Locks to serialize above callbacks are removed. If required, vendor driver
> can have locks to serialize above APIs in their driver.
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Signed-off-by: Neo Jia <cjia@nvidia.com>
> Change-Id: I73a5084574270b14541c529461ea2f03c292d510
> Reviewed-on: http://git-master/r/1175705
> Reviewed-by: Automatic_Commit_Validation_User
> ---
>  drivers/vfio/Kconfig             |   1 +
>  drivers/vfio/Makefile            |   1 +
>  drivers/vfio/mdev/Kconfig        |  12 +
>  drivers/vfio/mdev/Makefile       |   5 +
>  drivers/vfio/mdev/mdev_core.c    | 509 +++++++++++++++++++++++++++++++++++++++
>  drivers/vfio/mdev/mdev_driver.c  | 131 ++++++++++
>  drivers/vfio/mdev/mdev_private.h |  36 +++
>  drivers/vfio/mdev/mdev_sysfs.c   | 240 ++++++++++++++++++
>  include/linux/mdev.h             | 212 ++++++++++++++++
>  9 files changed, 1147 insertions(+)
>  create mode 100644 drivers/vfio/mdev/Kconfig
>  create mode 100644 drivers/vfio/mdev/Makefile
>  create mode 100644 drivers/vfio/mdev/mdev_core.c
>  create mode 100644 drivers/vfio/mdev/mdev_driver.c
>  create mode 100644 drivers/vfio/mdev/mdev_private.h
>  create mode 100644 drivers/vfio/mdev/mdev_sysfs.c
>  create mode 100644 include/linux/mdev.h
> 
> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> index da6e2ce77495..23eced02aaf6 100644
> --- a/drivers/vfio/Kconfig
> +++ b/drivers/vfio/Kconfig
> @@ -48,4 +48,5 @@ menuconfig VFIO_NOIOMMU
>  
>  source "drivers/vfio/pci/Kconfig"
>  source "drivers/vfio/platform/Kconfig"
> +source "drivers/vfio/mdev/Kconfig"
>  source "virt/lib/Kconfig"
> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> index 7b8a31f63fea..4a23c13b6be4 100644
> --- a/drivers/vfio/Makefile
> +++ b/drivers/vfio/Makefile
> @@ -7,3 +7,4 @@ obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
>  obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o
>  obj-$(CONFIG_VFIO_PCI) += pci/
>  obj-$(CONFIG_VFIO_PLATFORM) += platform/
> +obj-$(CONFIG_VFIO_MDEV) += mdev/
> diff --git a/drivers/vfio/mdev/Kconfig b/drivers/vfio/mdev/Kconfig
> new file mode 100644
> index 000000000000..a34fbc66f92f
> --- /dev/null
> +++ b/drivers/vfio/mdev/Kconfig
> @@ -0,0 +1,12 @@
> +
> +config VFIO_MDEV
> +    tristate "Mediated device driver framework"
> +    depends on VFIO
> +    default n
> +    help
> +        Provides a framework to virtualize device.
> +	See Documentation/vfio-mediated-device.txt for more details.
> +
> +        If you don't know what do here, say N.
> +
> +
> diff --git a/drivers/vfio/mdev/Makefile b/drivers/vfio/mdev/Makefile
> new file mode 100644
> index 000000000000..56a75e689582
> --- /dev/null
> +++ b/drivers/vfio/mdev/Makefile
> @@ -0,0 +1,5 @@
> +
> +mdev-y := mdev_core.o mdev_sysfs.o mdev_driver.o
> +
> +obj-$(CONFIG_VFIO_MDEV) += mdev.o
> +
> diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
> new file mode 100644
> index 000000000000..9f278c7507f7
> --- /dev/null
> +++ b/drivers/vfio/mdev/mdev_core.c
> @@ -0,0 +1,509 @@
> +/*
> + * Mediated device Core Driver
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *     Author: Neo Jia <cjia@nvidia.com>
> + *	       Kirti Wankhede <kwankhede@nvidia.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/module.h>
> +#include <linux/device.h>
> +#include <linux/fs.h>
> +#include <linux/slab.h>
> +#include <linux/sched.h>
> +#include <linux/uuid.h>
> +#include <linux/vfio.h>
> +#include <linux/iommu.h>
> +#include <linux/sysfs.h>
> +#include <linux/mdev.h>
> +
> +#include "mdev_private.h"
> +
> +#define DRIVER_VERSION		"0.1"
> +#define DRIVER_AUTHOR		"NVIDIA Corporation"
> +#define DRIVER_DESC		"Mediated device Core Driver"
> +
> +static LIST_HEAD(parent_list);
> +static DEFINE_MUTEX(parent_list_lock);
> +
> +static int mdev_add_attribute_group(struct device *dev,
> +				    const struct attribute_group **groups)
> +{
> +	return sysfs_create_groups(&dev->kobj, groups);
> +}
> +
> +static void mdev_remove_attribute_group(struct device *dev,
> +					const struct attribute_group **groups)
> +{
> +	sysfs_remove_groups(&dev->kobj, groups);
> +}

These functions are not necessary. You can always specify the attribute groups
to dev->groups before registering a new device.

> +
> +/* Should be called holding parent->mdev_list_lock */
> +static struct mdev_device *__find_mdev_device(struct parent_device *parent,
> +					      uuid_le uuid)
> +{
> +	struct mdev_device *mdev;
> +
> +	list_for_each_entry(mdev, &parent->mdev_list, next) {
> +		if (uuid_le_cmp(mdev->uuid, uuid) == 0)
> +			return mdev;
> +	}
> +	return NULL;
> +}
> +
> +/* Should be called holding parent_list_lock */
> +static struct parent_device *__find_parent_device(struct device *dev)
> +{
> +	struct parent_device *parent;
> +
> +	list_for_each_entry(parent, &parent_list, next) {
> +		if (parent->dev == dev)
> +			return parent;
> +	}
> +	return NULL;
> +}
> +
> +static void mdev_release_parent(struct kref *kref)
> +{
> +	struct parent_device *parent = container_of(kref, struct parent_device,
> +						    ref);
> +	kfree(parent);
> +}
> +
> +static
> +inline struct parent_device *mdev_get_parent(struct parent_device *parent)
> +{
> +	if (parent)
> +		kref_get(&parent->ref);
> +
> +	return parent;
> +}
> +
> +static inline void mdev_put_parent(struct parent_device *parent)
> +{
> +	if (parent)
> +		kref_put(&parent->ref, mdev_release_parent);
> +}
> +
> +static struct parent_device *mdev_get_parent_from_dev(struct device *dev)
> +{
> +	struct parent_device *parent;
> +
> +	mutex_lock(&parent_list_lock);
> +	parent = mdev_get_parent(__find_parent_device(dev));
> +	mutex_unlock(&parent_list_lock);
> +
> +	return parent;
> +}

As we have demonstrated, all these refs and locks and release workqueue are not necessary,
as long as you have an independent device associated with the mdev host device
("parent" device here).

PS, "parent" is somehow a name too generic?

> +
> +static int mdev_device_create_ops(struct mdev_device *mdev, char *mdev_params)
> +{
> +	struct parent_device *parent = mdev->parent;
> +	int ret;
> +
> +	ret = parent->ops->create(mdev, mdev_params);
> +	if (ret)
> +		return ret;
> +
> +	ret = mdev_add_attribute_group(&mdev->dev,
> +					parent->ops->mdev_attr_groups);

Ditto: dev->groups.

> +	if (ret)
> +		parent->ops->destroy(mdev);
> +
> +	return ret;
> +}
> +
> +static int mdev_device_destroy_ops(struct mdev_device *mdev, bool force)
> +{
> +	struct parent_device *parent = mdev->parent;
> +	int ret = 0;
> +
> +	/*
> +	 * If vendor driver doesn't return success that means vendor
> +	 * driver doesn't support hot-unplug
> +	 */
> +	ret = parent->ops->destroy(mdev);
> +	if (ret && !force)
> +		return -EBUSY;
> +
> +	mdev_remove_attribute_group(&mdev->dev,
> +				    parent->ops->mdev_attr_groups);
> +
> +	return ret;
> +}
> +
> +static void mdev_release_device(struct kref *kref)
> +{
> +	struct mdev_device *mdev = container_of(kref, struct mdev_device, ref);
> +	struct parent_device *parent = mdev->parent;
> +
> +	list_del(&mdev->next);
> +
> +	/*
> +	 * This unlock pairs with mutex held by mdev_put_device() through
> +	 * kref_put_mutex()
> +	 */
> +	mutex_unlock(&parent->mdev_list_lock);
> +
> +	device_unregister(&mdev->dev);
> +	wake_up(&parent->release_done);
> +	mdev_put_parent(parent);
> +}
> +
> +struct mdev_device *mdev_get_device(struct mdev_device *mdev)
> +{
> +	if (mdev)
> +		kref_get(&mdev->ref);
> +	return mdev;
> +}
> +EXPORT_SYMBOL(mdev_get_device);
> +
> +void mdev_put_device(struct mdev_device *mdev)
> +{
> +	struct parent_device *parent;
> +
> +	if (!mdev)
> +		return;
> +
> +	parent = mdev->parent;
> +	kref_put_mutex(&mdev->ref, mdev_release_device,
> +		       &parent->mdev_list_lock);
> +}
> +EXPORT_SYMBOL(mdev_put_device);
> +
> +/*
> + * mdev_register_device : Register a device
> + * @dev: device structure representing parent device.
> + * @ops: Parent device operation structure to be registered.
> + *
> + * Add device to list of registered parent devices.
> + * Returns a negative value on error, otherwise 0.
> + */
> +int mdev_register_device(struct device *dev, const struct parent_ops *ops)
> +{
> +	int ret = 0;
> +	struct parent_device *parent;
> +
> +	if (!dev || !ops)
> +		return -EINVAL;
> +
> +	/* check for mandatory ops */
> +	if (!ops->create || !ops->destroy)
> +		return -EINVAL;
> +
> +	mutex_lock(&parent_list_lock);
> +
> +	/* Check for duplicate */
> +	parent = __find_parent_device(dev);
> +	if (parent) {
> +		ret = -EEXIST;
> +		goto add_dev_err;
> +	}
> +
> +	parent = kzalloc(sizeof(*parent), GFP_KERNEL);
> +	if (!parent) {
> +		ret = -ENOMEM;
> +		goto add_dev_err;
> +	}
> +
> +	kref_init(&parent->ref);
> +	list_add(&parent->next, &parent_list);
> +
> +	parent->dev = dev;
> +	parent->ops = ops;
> +	mutex_init(&parent->mdev_list_lock);
> +	INIT_LIST_HEAD(&parent->mdev_list);
> +	init_waitqueue_head(&parent->release_done);
> +	mutex_unlock(&parent_list_lock);
> +
> +	ret = parent_create_sysfs_files(dev);
> +	if (ret)
> +		goto add_sysfs_error;
> +
> +	ret = mdev_add_attribute_group(dev, ops->dev_attr_groups);

parent_create_sysfs_files and mdev_add_attribute_group are kind of doing
the same thing, do you mind to merge them into one?

> +	if (ret)
> +		goto add_group_error;
> +
> +	dev_info(dev, "MDEV: Registered\n");
> +	return 0;
> +
> +add_group_error:
> +	mdev_remove_sysfs_files(dev);
> +add_sysfs_error:
> +	mutex_lock(&parent_list_lock);
> +	list_del(&parent->next);
> +	mutex_unlock(&parent_list_lock);
> +	mdev_put_parent(parent);
> +	return ret;
> +
> +add_dev_err:
> +	mutex_unlock(&parent_list_lock);
> +	return ret;
> +}
> +EXPORT_SYMBOL(mdev_register_device);
> +
> +/*
> + * mdev_unregister_device : Unregister a parent device
> + * @dev: device structure representing parent device.
> + *
> + * Remove device from list of registered parent devices. Give a chance to free
> + * existing mediated devices for given device.
> + */
> +
> +void mdev_unregister_device(struct device *dev)
> +{
> +	struct parent_device *parent;
> +	struct mdev_device *mdev = NULL;
> +	int ret;
> +
> +	mutex_lock(&parent_list_lock);
> +	parent = __find_parent_device(dev);
> +
> +	if (!parent) {
> +		mutex_unlock(&parent_list_lock);
> +		return;
> +	}
> +	dev_info(dev, "MDEV: Unregistering\n");
> +
> +	/*
> +	 * Remove parent from the list and remove "mdev_create" and
> +	 * "mdev_destroy" sysfs files so that no new mediated device could be
> +	 * created for this parent
> +	 */
> +	list_del(&parent->next);
> +	parent_remove_sysfs_files(dev);
> +	mutex_unlock(&parent_list_lock);
> +
> +	mdev_remove_attribute_group(dev,
> +				    parent->ops->dev_attr_groups);
> +
> +	while (!list_empty(&parent->mdev_list)) {
> +		mutex_lock(&parent->mdev_list_lock);
> +		if (!list_empty(&parent->mdev_list)) {
> +			mdev = list_first_entry(&parent->mdev_list,
> +						struct mdev_device, next);
> +			mdev_device_destroy_ops(mdev, true);
> +		}
> +		mutex_unlock(&parent->mdev_list_lock);
> +
> +		if (mdev)
> +			mdev_put_device(mdev);
> +	}
> +
> +	do {
> +		ret = wait_event_interruptible_timeout(parent->release_done,
> +				list_empty(&parent->mdev_list), HZ * 10);
> +		if (ret == -ERESTARTSYS) {
> +			dev_warn(dev, "Mediated devices are in use, task"
> +				      " \"%s\" (%d) "
> +				      "blocked until all are released",
> +				      current->comm, task_pid_nr(current));
> +		}
> +	} while (ret <= 0);
> +
> +	mdev_put_parent(parent);
> +}
> +EXPORT_SYMBOL(mdev_unregister_device);
> +
> +/*
> + * Functions required for mdev_sysfs
> + */
> +static void mdev_device_release(struct device *dev)
> +{
> +	struct mdev_device *mdev = to_mdev_device(dev);
> +
> +	dev_dbg(&mdev->dev, "MDEV: destroying\n");
> +	kfree(mdev);
> +}
> +
> +int mdev_device_create(struct device *dev, uuid_le uuid, char *mdev_params)
> +{
> +	int ret;
> +	struct mdev_device *mdev;
> +	struct parent_device *parent;
> +
> +	parent = mdev_get_parent_from_dev(dev);
> +	if (!parent)
> +		return -EINVAL;
> +
> +	mutex_lock(&parent->mdev_list_lock);
> +	/* Check for duplicate */
> +	mdev = __find_mdev_device(parent, uuid);
> +	if (mdev) {
> +		ret = -EEXIST;
> +		goto create_err;
> +	}
> +
> +	mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
> +	if (!mdev) {
> +		ret = -ENOMEM;
> +		goto create_err;
> +	}
> +
> +	memcpy(&mdev->uuid, &uuid, sizeof(uuid_le));
> +	mdev->parent = parent;
> +	kref_init(&mdev->ref);
> +
> +	mdev->dev.parent  = dev;
> +	mdev->dev.bus     = &mdev_bus_type;
> +	mdev->dev.release = mdev_device_release;
> +	dev_set_name(&mdev->dev, "%pUl", uuid.b);
> +
> +	ret = device_register(&mdev->dev);
> +	if (ret) {
> +		put_device(&mdev->dev);
> +		goto create_err;
> +	}
> +
> +	ret = mdev_device_create_ops(mdev, mdev_params);
> +	if (ret)
> +		goto create_failed;
> +
> +	ret = mdev_create_sysfs_files(&mdev->dev);
> +	if (ret)
> +		goto create_sysfs_error;
> +
> +	list_add(&mdev->next, &parent->mdev_list);
> +	mutex_unlock(&parent->mdev_list_lock);
> +
> +	dev_dbg(&mdev->dev, "MDEV: created\n");
> +
> +	return ret;
> +
> +create_sysfs_error:
> +	mdev_device_destroy_ops(mdev, true);
> +
> +create_failed:
> +	device_unregister(&mdev->dev);
> +
> +create_err:
> +	mutex_unlock(&parent->mdev_list_lock);
> +	mdev_put_parent(parent);
> +	return ret;
> +}
> +
> +int mdev_device_destroy(struct device *dev, uuid_le uuid)
> +{
> +	struct mdev_device *mdev;
> +	struct parent_device *parent;
> +	int ret;
> +
> +	parent = mdev_get_parent_from_dev(dev);
> +	if (!parent)
> +		return -ENODEV;
> +
> +	mutex_lock(&parent->mdev_list_lock);
> +	mdev = __find_mdev_device(parent, uuid);
> +	if (!mdev) {
> +		ret = -EINVAL;
> +		goto destroy_err;
> +	}
> +
> +	mdev_remove_sysfs_files(&mdev->dev);
> +	ret = mdev_device_destroy_ops(mdev, false);
> +	if (ret)
> +		goto destroy_err;
> +
> +	mutex_unlock(&parent->mdev_list_lock);
> +	mdev_put_device(mdev);
> +
> +	mdev_put_parent(parent);
> +	return ret;
> +
> +destroy_err:
> +	mutex_unlock(&parent->mdev_list_lock);
> +	mdev_put_parent(parent);
> +	return ret;
> +}
> +
> +void mdev_device_supported_config(struct device *dev, char *str)
> +{
> +	struct parent_device *parent;
> +
> +	parent = mdev_get_parent_from_dev(dev);
> +
> +	if (parent) {
> +		if (parent->ops->supported_config)
> +			parent->ops->supported_config(parent->dev, str);
> +		mdev_put_parent(parent);
> +	}
> +}
> +
> +int mdev_device_set_online_status(struct device *dev, bool online)
> +{
> +	int ret = 0;
> +	struct mdev_device *mdev;
> +	struct parent_device *parent;
> +
> +	mdev = mdev_get_device(to_mdev_device(dev));
> +	if (!mdev)
> +		return -EINVAL;
> +
> +	parent = mdev->parent;
> +
> +	if (parent->ops->set_online_status)
> +		ret = parent->ops->set_online_status(mdev, online);
> +
> +	if (ret)
> +		pr_err("mdev online failed  %d\n", ret);
> +	else {
> +		if (online)
> +			kobject_uevent(&mdev->dev.kobj, KOBJ_ONLINE);
> +		else
> +			kobject_uevent(&mdev->dev.kobj, KOBJ_OFFLINE);
> +	}
> +
> +	mdev_put_device(mdev);
> +
> +	return ret;
> +}
> +
> +int mdev_device_get_online_status(struct device *dev, bool *online)
> +{
> +	int ret = 0;
> +	struct mdev_device *mdev;
> +	struct parent_device *parent;
> +
> +	mdev = mdev_get_device(to_mdev_device(dev));
> +	if (!mdev)
> +		return -EINVAL;
> +
> +	parent = mdev->parent;
> +
> +	if (parent->ops->get_online_status)
> +		ret = parent->ops->get_online_status(mdev, online);
> +
> +	mdev_put_device(mdev);
> +
> +	return ret;
> +}

The driver core has a perfect 'online' file for a device, with both
'show' and 'store' support, you don't need to write another one.

Please have a look at online_show and online_store in drivers/base/core.c.

> +
> +static int __init mdev_init(void)
> +{
> +	int ret;
> +
> +	ret = mdev_bus_register();
> +	if (ret) {
> +		pr_err("Failed to register mdev bus\n");
> +		return ret;
> +	}
> +
> +	return ret;
> +}
> +
> +static void __exit mdev_exit(void)
> +{
> +	mdev_bus_unregister();
> +}
> +
> +module_init(mdev_init)
> +module_exit(mdev_exit)
> +
> +MODULE_VERSION(DRIVER_VERSION);
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR(DRIVER_AUTHOR);
> +MODULE_DESCRIPTION(DRIVER_DESC);
> diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c
> new file mode 100644
> index 000000000000..8afc2d8e5c04
> --- /dev/null
> +++ b/drivers/vfio/mdev/mdev_driver.c
> @@ -0,0 +1,131 @@
> +/*
> + * MDEV driver
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *     Author: Neo Jia <cjia@nvidia.com>
> + *	       Kirti Wankhede <kwankhede@nvidia.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/device.h>
> +#include <linux/iommu.h>
> +#include <linux/mdev.h>
> +
> +#include "mdev_private.h"
> +
> +static int mdev_attach_iommu(struct mdev_device *mdev)
> +{
> +	int ret;
> +	struct iommu_group *group;
> +
> +	group = iommu_group_alloc();
> +	if (IS_ERR(group)) {
> +		dev_err(&mdev->dev, "MDEV: failed to allocate group!\n");
> +		return PTR_ERR(group);
> +	}
> +
> +	ret = iommu_group_add_device(group, &mdev->dev);
> +	if (ret) {
> +		dev_err(&mdev->dev, "MDEV: failed to add dev to group!\n");
> +		goto attach_fail;
> +	}
> +
> +	mdev->group = group;
> +
> +	dev_info(&mdev->dev, "MDEV: group_id = %d\n",
> +				 iommu_group_id(group));
> +attach_fail:
> +	iommu_group_put(group);
> +	return ret;
> +}
> +
> +static void mdev_detach_iommu(struct mdev_device *mdev)
> +{
> +	iommu_group_remove_device(&mdev->dev);
> +	mdev->group = NULL;
> +	dev_info(&mdev->dev, "MDEV: detaching iommu\n");
> +}
> +
> +static int mdev_probe(struct device *dev)
> +{
> +	struct mdev_driver *drv = to_mdev_driver(dev->driver);
> +	struct mdev_device *mdev = to_mdev_device(dev);
> +	int ret;
> +
> +	ret = mdev_attach_iommu(mdev);
> +	if (ret) {
> +		dev_err(dev, "Failed to attach IOMMU\n");
> +		return ret;
> +	}
> +
> +	if (drv && drv->probe)
> +		ret = drv->probe(dev);
> +
> +	if (ret)
> +		mdev_detach_iommu(mdev);
> +
> +	return ret;
> +}
> +
> +static int mdev_remove(struct device *dev)
> +{
> +	struct mdev_driver *drv = to_mdev_driver(dev->driver);
> +	struct mdev_device *mdev = to_mdev_device(dev);
> +
> +	if (drv && drv->remove)
> +		drv->remove(dev);
> +
> +	mdev_detach_iommu(mdev);
> +
> +	return 0;
> +}
> +
> +struct bus_type mdev_bus_type = {
> +	.name		= "mdev",
> +	.probe		= mdev_probe,
> +	.remove		= mdev_remove,
> +};
> +EXPORT_SYMBOL_GPL(mdev_bus_type);
> +
> +/*
> + * mdev_register_driver - register a new MDEV driver
> + * @drv: the driver to register
> + * @owner: module owner of driver to be registered
> + *
> + * Returns a negative value on error, otherwise 0.
> + */
> +int mdev_register_driver(struct mdev_driver *drv, struct module *owner)
> +{
> +	/* initialize common driver fields */
> +	drv->driver.name = drv->name;
> +	drv->driver.bus = &mdev_bus_type;
> +	drv->driver.owner = owner;
> +
> +	/* register with core */
> +	return driver_register(&drv->driver);
> +}
> +EXPORT_SYMBOL(mdev_register_driver);
> +
> +/*
> + * mdev_unregister_driver - unregister MDEV driver
> + * @drv: the driver to unregister
> + *
> + */
> +void mdev_unregister_driver(struct mdev_driver *drv)
> +{
> +	driver_unregister(&drv->driver);
> +}
> +EXPORT_SYMBOL(mdev_unregister_driver);
> +
> +int mdev_bus_register(void)
> +{
> +	return bus_register(&mdev_bus_type);
> +}
> +
> +void mdev_bus_unregister(void)
> +{
> +	bus_unregister(&mdev_bus_type);
> +}
> diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h
> new file mode 100644
> index 000000000000..07ad1b381370
> --- /dev/null
> +++ b/drivers/vfio/mdev/mdev_private.h
> @@ -0,0 +1,36 @@
> +/*
> + * Mediated device interal definitions
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *     Author: Neo Jia <cjia@nvidia.com>
> + *	       Kirti Wankhede <kwankhede@nvidia.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#ifndef MDEV_PRIVATE_H
> +#define MDEV_PRIVATE_H
> +
> +int  mdev_bus_register(void);
> +void mdev_bus_unregister(void);
> +
> +/* Function prototypes for mdev_sysfs */
> +
> +extern struct class_attribute mdev_class_attrs[];

This is useless?

> +
> +int  parent_create_sysfs_files(struct device *dev);
> +void parent_remove_sysfs_files(struct device *dev);
> +
> +int  mdev_create_sysfs_files(struct device *dev);
> +void mdev_remove_sysfs_files(struct device *dev);
> +
> +int  mdev_device_create(struct device *dev, uuid_le uuid, char *mdev_params);
> +int  mdev_device_destroy(struct device *dev, uuid_le uuid);
> +void mdev_device_supported_config(struct device *dev, char *str);
> +
> +int mdev_device_set_online_status(struct device *dev, bool online);
> +int mdev_device_get_online_status(struct device *dev, bool *online);
> +
> +#endif /* MDEV_PRIVATE_H */
> diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c
> new file mode 100644
> index 000000000000..ed55cd5d6595
> --- /dev/null
> +++ b/drivers/vfio/mdev/mdev_sysfs.c
> @@ -0,0 +1,240 @@
> +/*
> + * File attributes for Mediated devices
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *     Author: Neo Jia <cjia@nvidia.com>
> + *	       Kirti Wankhede <kwankhede@nvidia.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/sysfs.h>
> +#include <linux/ctype.h>
> +#include <linux/device.h>
> +#include <linux/slab.h>
> +#include <linux/uuid.h>
> +#include <linux/mdev.h>
> +
> +#include "mdev_private.h"
> +
> +/* Prototypes */
> +static ssize_t mdev_supported_types_show(struct device *dev,
> +					 struct device_attribute *attr,
> +					 char *buf);
> +static DEVICE_ATTR_RO(mdev_supported_types);
> +
> +static ssize_t mdev_create_store(struct device *dev,
> +				 struct device_attribute *attr,
> +				 const char *buf, size_t count);
> +static DEVICE_ATTR_WO(mdev_create);
> +
> +static ssize_t mdev_destroy_store(struct device *dev,
> +				  struct device_attribute *attr,
> +				  const char *buf, size_t count);
> +static DEVICE_ATTR_WO(mdev_destroy);
> +
> +static ssize_t online_store(struct device *dev, struct device_attribute *attr,
> +			    const char *buf, size_t count);
> +static ssize_t online_show(struct device *dev, struct device_attribute *attr,
> +			   char *buf);
> +static DEVICE_ATTR_RW(online);
> +
> +/* Static functions */
> +
> +#define SUPPORTED_TYPE_BUFFER_LENGTH	4096
> +
> +/* mdev sysfs Functions */
> +static ssize_t mdev_supported_types_show(struct device *dev,
> +					 struct device_attribute *attr,
> +					 char *buf)
> +{
> +	char *str, *ptr;
> +	ssize_t n;
> +
> +	str = kzalloc(sizeof(*str) * SUPPORTED_TYPE_BUFFER_LENGTH, GFP_KERNEL);
> +	if (!str)
> +		return -ENOMEM;
> +
> +	ptr = str;
> +	mdev_device_supported_config(dev, str);
> +
> +	n = sprintf(buf, "%s\n", str);
> +	kfree(ptr);
> +
> +	return n;
> +}
> +
> +static ssize_t mdev_create_store(struct device *dev,
> +				 struct device_attribute *attr,
> +				 const char *buf, size_t count)
> +{
> +	char *str, *pstr;
> +	char *uuid_str, *mdev_params = NULL, *params = NULL;
> +	uuid_le uuid;
> +	int ret;
> +
> +	pstr = str = kstrndup(buf, count, GFP_KERNEL);

pstr is not used.

> +
> +	if (!str)
> +		return -ENOMEM;
> +
> +	uuid_str = strsep(&str, ":");
> +	if (!uuid_str) {
> +		pr_err("mdev_create: Empty UUID string %s\n", buf);
> +		ret = -EINVAL;
> +		goto create_error;
> +	}
> +
> +	if (str)
> +		params = mdev_params = kstrdup(str, GFP_KERNEL);
> +
> +	ret = uuid_le_to_bin(uuid_str, &uuid);
> +	if (ret) {
> +		pr_err("mdev_create: UUID parse error %s\n", buf);
> +		goto create_error;
> +	}
> +
> +	ret = mdev_device_create(dev, uuid, mdev_params);
> +	if (ret)
> +		pr_err("mdev_create: Failed to create mdev device\n");
> +	else
> +		ret = count;
> +
> +create_error:
> +	kfree(params);
> +	kfree(pstr);
> +	return ret;
> +}
> +
> +static ssize_t mdev_destroy_store(struct device *dev,
> +				  struct device_attribute *attr,
> +				  const char *buf, size_t count)
> +{
> +	char *uuid_str, *str, *pstr;
> +	uuid_le uuid;
> +	int ret;
> +
> +	str = pstr = kstrndup(buf, count, GFP_KERNEL);

Ditto.

> +
> +	if (!str)
> +		return -ENOMEM;
> +
> +	uuid_str = strsep(&str, ":");
> +	if (!uuid_str) {
> +		pr_err("mdev_destroy: Empty UUID string %s\n", buf);
> +		ret = -EINVAL;
> +		goto destroy_error;
> +	}
> +
> +	ret = uuid_le_to_bin(uuid_str, &uuid);
> +	if (ret) {
> +		pr_err("mdev_destroy: UUID parse error  %s\n", buf);
> +		goto destroy_error;
> +	}
> +
> +	ret = mdev_device_destroy(dev, uuid);
> +	if (ret == 0)
> +		ret = count;
> +
> +destroy_error:
> +	kfree(pstr);
> +	return ret;
> +}
> +
> +static ssize_t online_store(struct device *dev, struct device_attribute *attr,
> +			    const char *buf, size_t count)
> +{
> +	char *str;
> +	int ret;
> +	uint32_t online_status;
> +	bool online;
> +
> +	str = kstrndup(buf, count, GFP_KERNEL);
> +	if (!str)
> +		return -ENOMEM;
> +
> +	ret = kstrtouint(str, 0, &online_status);
> +	kfree(str);
> +
> +	if (ret) {
> +		pr_err("online: parsing error %s\n", buf);
> +		return ret;
> +	}
> +
> +	online = online_status > 0 ? true : false;
> +
> +	ret = mdev_device_set_online_status(dev, online);
> +	if (ret)
> +		return ret;
> +
> +	return count;
> +}
> +
> +static ssize_t online_show(struct device *dev, struct device_attribute *attr,
> +			   char *buf)
> +{
> +	int ret;
> +	bool online = false;
> +
> +	ret = mdev_device_get_online_status(dev, &online);
> +	if (ret)
> +		return ret;
> +
> +	ret = sprintf(buf, "%d\n", online);
> +	return ret;
> +}

online_show and online_store are unnecessary, see comment on mdev_device_get_online_status.

> +
> +int parent_create_sysfs_files(struct device *dev)
> +{
> +	int ret;
> +
> +	ret = sysfs_create_file(&dev->kobj,
> +				&dev_attr_mdev_supported_types.attr);
> +	if (ret) {
> +		pr_err("Failed to create mdev_supported_types sysfs entry\n");
> +		return ret;
> +	}
> +
> +	ret = sysfs_create_file(&dev->kobj, &dev_attr_mdev_create.attr);
> +	if (ret) {
> +		pr_err("Failed to create mdev_create sysfs entry\n");
> +		goto create_sysfs_failed;
> +	}
> +
> +	ret = sysfs_create_file(&dev->kobj, &dev_attr_mdev_destroy.attr);
> +	if (ret) {
> +		pr_err("Failed to create mdev_destroy sysfs entry\n");
> +		sysfs_remove_file(&dev->kobj, &dev_attr_mdev_create.attr);
> +	} else
> +		return ret;
> +
> +create_sysfs_failed:
> +	sysfs_remove_file(&dev->kobj, &dev_attr_mdev_supported_types.attr);
> +	return ret;
> +}
> +
> +void parent_remove_sysfs_files(struct device *dev)
> +{
> +	sysfs_remove_file(&dev->kobj, &dev_attr_mdev_supported_types.attr);
> +	sysfs_remove_file(&dev->kobj, &dev_attr_mdev_create.attr);
> +	sysfs_remove_file(&dev->kobj, &dev_attr_mdev_destroy.attr);
> +}

The 2 functions above are also unnecessary: you can always group it with a single
function call of sysfs_create_files.

> +
> +int mdev_create_sysfs_files(struct device *dev)
> +{
> +	int ret;
> +
> +	ret = sysfs_create_file(&dev->kobj, &dev_attr_online.attr);
> +	if (ret)
> +		pr_err("Failed to create 'online' entry\n");
> +
> +	return ret;
> +}
> +
> +void mdev_remove_sysfs_files(struct device *dev)
> +{
> +	sysfs_remove_file(&dev->kobj, &dev_attr_online.attr);
> +}

As said above, "online" attr is unnecessary.

> +
> diff --git a/include/linux/mdev.h b/include/linux/mdev.h
> new file mode 100644
> index 000000000000..babcb7293199
> --- /dev/null
> +++ b/include/linux/mdev.h
> @@ -0,0 +1,212 @@
> +/*
> + * Mediated device definition
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *     Author: Neo Jia <cjia@nvidia.com>
> + *	       Kirti Wankhede <kwankhede@nvidia.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#ifndef MDEV_H
> +#define MDEV_H
> +
> +#include <uapi/linux/vfio.h>
> +
> +struct parent_device;
> +
> +/*
> + * Mediated device
> + */
> +
> +struct mdev_device {
> +	struct device		dev;
> +	struct parent_device	*parent;
> +	struct iommu_group	*group;
> +	uuid_le			uuid;
> +	void			*driver_data;
> +
> +	/* internal only */
> +	struct kref		ref;
> +	struct list_head	next;
> +};
> +
> +
> +/**
> + * struct parent_ops - Structure to be registered for each parent device to
> + * register the device to mdev module.
> + *
> + * @owner:		The module owner.
> + * @dev_attr_groups:	Default attributes of the parent device.
> + * @mdev_attr_groups:	Default attributes of the mediated device.
> + * @supported_config:	Called to get information about supported types.
> + *			@dev : device structure of parent device.
> + *			@config: should return string listing supported config
> + *			Returns integer: success (0) or error (< 0)
> + * @create:		Called to allocate basic resources in parent device's
> + *			driver for a particular mediated device. It is
> + *			mandatory to provide create ops.
> + *			@mdev: mdev_device structure on of mediated device
> + *			      that is being created
> + *			@mdev_params: extra parameters required by parent
> + *			device's driver.
> + *			Returns integer: success (0) or error (< 0)
> + * @destroy:		Called to free resources in parent device's driver for a
> + *			a mediated device. It is mandatory to provide destroy
> + *			ops.
> + *			@mdev: mdev_device device structure which is being
> + *			       destroyed
> + *			Returns integer: success (0) or error (< 0)
> + *			If VMM is running and destroy() is called that means the
> + *			mdev is being hotunpluged. Return error if VMM is
> + *			running and driver doesn't support mediated device
> + *			hotplug.
> + * @reset:		Called to reset mediated device.
> + *			@mdev: mdev_device device structure.
> + *			Returns integer: success (0) or error (< 0)
> + * @set_online_status:	Called to change to status of mediated device.
> + *			@mdev: mediated device.
> + *			@online: set true or false to make mdev device online or
> + *			offline.
> + *			Returns integer: success (0) or error (< 0)
> + * @get_online_status:	Called to get online/offline status of  mediated device
> + *			@mdev: mediated device.
> + *			@online: Returns status of mediated device.
> + *			Returns integer: success (0) or error (< 0)
> + * @read:		Read emulation callback
> + *			@mdev: mediated device structure
> + *			@buf: read buffer
> + *			@count: number of bytes to read
> + *			@pos: address.
> + *			Retuns number on bytes read on success or error.
> + * @write:		Write emulation callback
> + *			@mdev: mediated device structure
> + *			@buf: write buffer
> + *			@count: number of bytes to be written
> + *			@pos: address.
> + *			Retuns number on bytes written on success or error.
> + * @get_irq_info:	Called to retrieve information about mediated device IRQ
> + *			@mdev: mediated device structure
> + *			@irq_info: VFIO IRQ flags and count.
> + *			Returns integer: success (0) or error (< 0)
> + * @set_irqs:		Called to send about interrupts configuration
> + *			information that VMM sets.
> + *			@mdev: mediated device structure
> + *			@flags, index, start, count and *data : same as that of
> + *			struct vfio_irq_set of VFIO_DEVICE_SET_IRQS API.
> + * @get_device_info:	Called to get VFIO device information for a mediated
> + *			device.
> + *			@vfio_device_info: VFIO device info.
> + *			Returns integer: success (0) or error (< 0)
> + * @get_region_info:	Called to get VFIO region size and flags of mediated
> + *			device.
> + *			@mdev: mediated device structure
> + *			@region_info: output, returns size and flags of
> + *				      requested region.
> + *			@cap_type_id: returns id of capability.
> + *			@cap_type: returns pointer to capability structure
> + *			corresponding to capability id.
> + *			Returns integer: success (0) or error (< 0)
> + *
> + * Parent device that support mediated device should be registered with mdev
> + * module with parent_ops structure.
> + */
> +
> +struct parent_ops {
> +	struct module   *owner;
> +	const struct attribute_group **dev_attr_groups;
> +	const struct attribute_group **mdev_attr_groups;
> +
> +	int	(*supported_config)(struct device *dev, char *config);
> +	int     (*create)(struct mdev_device *mdev, char *mdev_params);
> +	int     (*destroy)(struct mdev_device *mdev);
> +	int     (*reset)(struct mdev_device *mdev);
> +	int     (*set_online_status)(struct mdev_device *mdev, bool online);
> +	int     (*get_online_status)(struct mdev_device *mdev, bool *online);
> +	ssize_t (*read)(struct mdev_device *mdev, char *buf, size_t count,
> +			loff_t pos);
> +	ssize_t (*write)(struct mdev_device *mdev, char *buf, size_t count,
> +			 loff_t pos);
> +	int	(*mmap)(struct mdev_device *mdev, struct vm_area_struct *vma);
> +	int	(*get_irq_info)(struct mdev_device *mdev,
> +				struct vfio_irq_info *irq_info);
> +	int     (*set_irqs)(struct mdev_device *mdev, uint32_t flags,
> +			    unsigned int index, unsigned int start,
> +			    unsigned int count, void *data);
> +	int	(*get_device_info)(struct mdev_device *mdev,
> +				   struct vfio_device_info *dev_info);
> +	int	(*get_region_info)(struct mdev_device *mdev,
> +				   struct vfio_region_info *region_info,
> +				   u16 *cap_type_id, void **cap_type);
> +};

I have a strong objection here to such low-level interfaces, the interfaces
between vfio-mdev and vendor drivers should be as thin as possible, not imposing
any limitation to vendor drivers.

I saw that validate_map_request was removed from the ops and mmap was added. 
That is pretty nice. Furthermore, if you add an ioctl here, you can also remove
get_device_info, get_irq_info, set_irqs, and get_region_info (and even "reset").
There are several benefits by doing this:

	-	Balanced interfaces.
		Like I replied in another mail, you won't have unbalanced interfaces.
		You already have read, write and mmap in the ops, why not ioctl?

	-	Scalability.
		You are intercepting vfio optional capabilities in the framework, but
		how if vfio.ko, or vfio-pci.ko add a few new capabilities in the future?

	-	Abstraction.
		Even placing common codes here can avoid code duplication, you still
		have code duplicate with vfio-pci.  Better to move common logic out of
		vfio-pci and call them from mdev vendor drivers.

	-	Maintainability.
		This is pretty obvious :)

> +
> +/*
> + * Parent Device
> + */
> +
> +struct parent_device {
> +	struct device		*dev;
> +	const struct parent_ops	*ops;
> +
> +	/* internal */
> +	struct kref		ref;
> +	struct list_head	next;
> +	struct list_head	mdev_list;
> +	struct mutex		mdev_list_lock;
> +	wait_queue_head_t	release_done;
> +};
> +
> +/**
> + * struct mdev_driver - Mediated device driver
> + * @name: driver name
> + * @probe: called when new device created
> + * @remove: called when device removed
> + * @driver: device driver structure
> + *
> + **/
> +struct mdev_driver {
> +	const char *name;
> +	int  (*probe)(struct device *dev);
> +	void (*remove)(struct device *dev);
> +	struct device_driver driver;
> +};
> +
> +static inline struct mdev_driver *to_mdev_driver(struct device_driver *drv)
> +{
> +	return drv ? container_of(drv, struct mdev_driver, driver) : NULL;
> +}
> +
> +static inline struct mdev_device *to_mdev_device(struct device *dev)
> +{
> +	return dev ? container_of(dev, struct mdev_device, dev) : NULL;
> +}

These can be macros, like pci ones.

> +
> +static inline void *mdev_get_drvdata(struct mdev_device *mdev)
> +{
> +	return mdev->driver_data;
> +}
> +
> +static inline void mdev_set_drvdata(struct mdev_device *mdev, void *data)
> +{
> +	mdev->driver_data = data;
> +}
> +
> +extern struct bus_type mdev_bus_type;
> +
> +#define dev_is_mdev(d) ((d)->bus == &mdev_bus_type)
> +
> +extern int  mdev_register_device(struct device *dev,
> +				 const struct parent_ops *ops);
> +extern void mdev_unregister_device(struct device *dev);
> +
> +extern int  mdev_register_driver(struct mdev_driver *drv, struct module *owner);
> +extern void mdev_unregister_driver(struct mdev_driver *drv);
> +
> +extern struct mdev_device *mdev_get_device(struct mdev_device *mdev);
> +extern void mdev_put_device(struct mdev_device *mdev);
> +
> +extern struct mdev_device *mdev_get_device_by_group(struct iommu_group *group);
> +
> +#endif /* MDEV_H */
> 

--
Thanks,
Jike
Neo Jia Sept. 8, 2016, 9:38 a.m. UTC | #2
On Thu, Sep 08, 2016 at 04:09:39PM +0800, Jike Song wrote:
> On 08/25/2016 11:53 AM, Kirti Wankhede wrote:
> > +
> > +/**
> > + * struct parent_ops - Structure to be registered for each parent device to
> > + * register the device to mdev module.
> > + *
> > + * @owner:		The module owner.
> > + * @dev_attr_groups:	Default attributes of the parent device.
> > + * @mdev_attr_groups:	Default attributes of the mediated device.
> > + * @supported_config:	Called to get information about supported types.
> > + *			@dev : device structure of parent device.
> > + *			@config: should return string listing supported config
> > + *			Returns integer: success (0) or error (< 0)
> > + * @create:		Called to allocate basic resources in parent device's
> > + *			driver for a particular mediated device. It is
> > + *			mandatory to provide create ops.
> > + *			@mdev: mdev_device structure on of mediated device
> > + *			      that is being created
> > + *			@mdev_params: extra parameters required by parent
> > + *			device's driver.
> > + *			Returns integer: success (0) or error (< 0)
> > + * @destroy:		Called to free resources in parent device's driver for a
> > + *			a mediated device. It is mandatory to provide destroy
> > + *			ops.
> > + *			@mdev: mdev_device device structure which is being
> > + *			       destroyed
> > + *			Returns integer: success (0) or error (< 0)
> > + *			If VMM is running and destroy() is called that means the
> > + *			mdev is being hotunpluged. Return error if VMM is
> > + *			running and driver doesn't support mediated device
> > + *			hotplug.
> > + * @reset:		Called to reset mediated device.
> > + *			@mdev: mdev_device device structure.
> > + *			Returns integer: success (0) or error (< 0)
> > + * @set_online_status:	Called to change to status of mediated device.
> > + *			@mdev: mediated device.
> > + *			@online: set true or false to make mdev device online or
> > + *			offline.
> > + *			Returns integer: success (0) or error (< 0)
> > + * @get_online_status:	Called to get online/offline status of  mediated device
> > + *			@mdev: mediated device.
> > + *			@online: Returns status of mediated device.
> > + *			Returns integer: success (0) or error (< 0)
> > + * @read:		Read emulation callback
> > + *			@mdev: mediated device structure
> > + *			@buf: read buffer
> > + *			@count: number of bytes to read
> > + *			@pos: address.
> > + *			Retuns number on bytes read on success or error.
> > + * @write:		Write emulation callback
> > + *			@mdev: mediated device structure
> > + *			@buf: write buffer
> > + *			@count: number of bytes to be written
> > + *			@pos: address.
> > + *			Retuns number on bytes written on success or error.
> > + * @get_irq_info:	Called to retrieve information about mediated device IRQ
> > + *			@mdev: mediated device structure
> > + *			@irq_info: VFIO IRQ flags and count.
> > + *			Returns integer: success (0) or error (< 0)
> > + * @set_irqs:		Called to send about interrupts configuration
> > + *			information that VMM sets.
> > + *			@mdev: mediated device structure
> > + *			@flags, index, start, count and *data : same as that of
> > + *			struct vfio_irq_set of VFIO_DEVICE_SET_IRQS API.
> > + * @get_device_info:	Called to get VFIO device information for a mediated
> > + *			device.
> > + *			@vfio_device_info: VFIO device info.
> > + *			Returns integer: success (0) or error (< 0)
> > + * @get_region_info:	Called to get VFIO region size and flags of mediated
> > + *			device.
> > + *			@mdev: mediated device structure
> > + *			@region_info: output, returns size and flags of
> > + *				      requested region.
> > + *			@cap_type_id: returns id of capability.
> > + *			@cap_type: returns pointer to capability structure
> > + *			corresponding to capability id.
> > + *			Returns integer: success (0) or error (< 0)
> > + *
> > + * Parent device that support mediated device should be registered with mdev
> > + * module with parent_ops structure.
> > + */
> > +
> > +struct parent_ops {
> > +	struct module   *owner;
> > +	const struct attribute_group **dev_attr_groups;
> > +	const struct attribute_group **mdev_attr_groups;
> > +
> > +	int	(*supported_config)(struct device *dev, char *config);
> > +	int     (*create)(struct mdev_device *mdev, char *mdev_params);
> > +	int     (*destroy)(struct mdev_device *mdev);
> > +	int     (*reset)(struct mdev_device *mdev);
> > +	int     (*set_online_status)(struct mdev_device *mdev, bool online);
> > +	int     (*get_online_status)(struct mdev_device *mdev, bool *online);
> > +	ssize_t (*read)(struct mdev_device *mdev, char *buf, size_t count,
> > +			loff_t pos);
> > +	ssize_t (*write)(struct mdev_device *mdev, char *buf, size_t count,
> > +			 loff_t pos);
> > +	int	(*mmap)(struct mdev_device *mdev, struct vm_area_struct *vma);
> > +	int	(*get_irq_info)(struct mdev_device *mdev,
> > +				struct vfio_irq_info *irq_info);
> > +	int     (*set_irqs)(struct mdev_device *mdev, uint32_t flags,
> > +			    unsigned int index, unsigned int start,
> > +			    unsigned int count, void *data);
> > +	int	(*get_device_info)(struct mdev_device *mdev,
> > +				   struct vfio_device_info *dev_info);
> > +	int	(*get_region_info)(struct mdev_device *mdev,
> > +				   struct vfio_region_info *region_info,
> > +				   u16 *cap_type_id, void **cap_type);
> > +};
> 
> I have a strong objection here to such low-level interfaces, the interfaces
> between vfio-mdev and vendor drivers should be as thin as possible, not imposing
> any limitation to vendor drivers.

Hi Jike,

Welcome! :-)

Unfortunately, this is something I definitely can't agree with you.

We would like to capture the common code as much as possible without losing
flexibilities, so each vendor driver writers won't have to duplicate them and we
have something can be maintained publicly.

If you are running into specific limitation with above callback interfaces,
please show us the scenarios and we are very happy to look into that.

> 
> I saw that validate_map_request was removed from the ops and mmap was added. 
> That is pretty nice. Furthermore, if you add an ioctl here, you can also remove
> get_device_info, get_irq_info, set_irqs, and get_region_info (and even "reset").
> There are several benefits by doing this:

The decision of moving validate_map_request is mainly because we are adding a lot of 
advanced logic which most vendor drivers don't require, since we are the only consumer 
of such logic, no need to put it in the public/shared module.

> 
> 	-	Balanced interfaces.
> 		Like I replied in another mail, you won't have unbalanced interfaces.
> 		You already have read, write and mmap in the ops, why not ioctl?

Sorry, don't think "balanced" interface is a design criteria especially when
simply pursuing the sake of "balanced or full-set" interface ends up lots
duplicated code for vendor driver writers.

> 
> 	-	Scalability.
> 		You are intercepting vfio optional capabilities in the framework, but
> 		how if vfio.ko, or vfio-pci.ko add a few new capabilities in the future?

Exactly my point about the code sharing.

If new cap is added inside vfio.ko or vfio-pci.ko, we can just add it into
vfio_mdev.ko. 

Adding the code in one place is better to duplicate in multiple vendor drivers.

> 
> 	-	Abstraction.
> 		Even placing common codes here can avoid code duplication, you still
> 		have code duplicate with vfio-pci.  Better to move common logic out of
> 		vfio-pci and call them from mdev vendor drivers.

Are you saying to avoid the code duplications between vfio-pci and vfio-mdev?

> 
> 	-	Maintainability.
> 		This is pretty obvious :)

Definitely not, the burden is moving to the vendor driver side.

Again, Jike, I really want to enable you with the mediated framework we have been
doing here. So it is probably easier for us to accommodate your need if you could
follow the interfaces we have introduced and let us know if you have any specific
issues.

Thanks,
Neo

> 
> > +
> > +/*
> > + * Parent Device
> > + */
> > +
> > +struct parent_device {
> > +	struct device		*dev;
> > +	const struct parent_ops	*ops;
> > +
> > +	/* internal */
> > +	struct kref		ref;
> > +	struct list_head	next;
> > +	struct list_head	mdev_list;
> > +	struct mutex		mdev_list_lock;
> > +	wait_queue_head_t	release_done;
> > +};
> > +
> > +/**
> > + * struct mdev_driver - Mediated device driver
> > + * @name: driver name
> > + * @probe: called when new device created
> > + * @remove: called when device removed
> > + * @driver: device driver structure
> > + *
> > + **/
> > +struct mdev_driver {
> > +	const char *name;
> > +	int  (*probe)(struct device *dev);
> > +	void (*remove)(struct device *dev);
> > +	struct device_driver driver;
> > +};
> > +
> > +static inline struct mdev_driver *to_mdev_driver(struct device_driver *drv)
> > +{
> > +	return drv ? container_of(drv, struct mdev_driver, driver) : NULL;
> > +}
> > +
> > +static inline struct mdev_device *to_mdev_device(struct device *dev)
> > +{
> > +	return dev ? container_of(dev, struct mdev_device, dev) : NULL;
> > +}
> 
> These can be macros, like pci ones.
> 
> > +
> > +static inline void *mdev_get_drvdata(struct mdev_device *mdev)
> > +{
> > +	return mdev->driver_data;
> > +}
> > +
> > +static inline void mdev_set_drvdata(struct mdev_device *mdev, void *data)
> > +{
> > +	mdev->driver_data = data;
> > +}
> > +
> > +extern struct bus_type mdev_bus_type;
> > +
> > +#define dev_is_mdev(d) ((d)->bus == &mdev_bus_type)
> > +
> > +extern int  mdev_register_device(struct device *dev,
> > +				 const struct parent_ops *ops);
> > +extern void mdev_unregister_device(struct device *dev);
> > +
> > +extern int  mdev_register_driver(struct mdev_driver *drv, struct module *owner);
> > +extern void mdev_unregister_driver(struct mdev_driver *drv);
> > +
> > +extern struct mdev_device *mdev_get_device(struct mdev_device *mdev);
> > +extern void mdev_put_device(struct mdev_device *mdev);
> > +
> > +extern struct mdev_device *mdev_get_device_by_group(struct iommu_group *group);
> > +
> > +#endif /* MDEV_H */
> > 
> 
> --
> Thanks,
> Jike
Jike Song Sept. 9, 2016, 6:26 a.m. UTC | #3
On 09/08/2016 05:38 PM, Neo Jia wrote:
> On Thu, Sep 08, 2016 at 04:09:39PM +0800, Jike Song wrote:
>> On 08/25/2016 11:53 AM, Kirti Wankhede wrote:
>>> +
>>> +/**
>>> + * struct parent_ops - Structure to be registered for each parent device to
>>> + * register the device to mdev module.
>>> + *
>>> + * @owner:		The module owner.
>>> + * @dev_attr_groups:	Default attributes of the parent device.
>>> + * @mdev_attr_groups:	Default attributes of the mediated device.
>>> + * @supported_config:	Called to get information about supported types.
>>> + *			@dev : device structure of parent device.
>>> + *			@config: should return string listing supported config
>>> + *			Returns integer: success (0) or error (< 0)
>>> + * @create:		Called to allocate basic resources in parent device's
>>> + *			driver for a particular mediated device. It is
>>> + *			mandatory to provide create ops.
>>> + *			@mdev: mdev_device structure on of mediated device
>>> + *			      that is being created
>>> + *			@mdev_params: extra parameters required by parent
>>> + *			device's driver.
>>> + *			Returns integer: success (0) or error (< 0)
>>> + * @destroy:		Called to free resources in parent device's driver for a
>>> + *			a mediated device. It is mandatory to provide destroy
>>> + *			ops.
>>> + *			@mdev: mdev_device device structure which is being
>>> + *			       destroyed
>>> + *			Returns integer: success (0) or error (< 0)
>>> + *			If VMM is running and destroy() is called that means the
>>> + *			mdev is being hotunpluged. Return error if VMM is
>>> + *			running and driver doesn't support mediated device
>>> + *			hotplug.
>>> + * @reset:		Called to reset mediated device.
>>> + *			@mdev: mdev_device device structure.
>>> + *			Returns integer: success (0) or error (< 0)
>>> + * @set_online_status:	Called to change to status of mediated device.
>>> + *			@mdev: mediated device.
>>> + *			@online: set true or false to make mdev device online or
>>> + *			offline.
>>> + *			Returns integer: success (0) or error (< 0)
>>> + * @get_online_status:	Called to get online/offline status of  mediated device
>>> + *			@mdev: mediated device.
>>> + *			@online: Returns status of mediated device.
>>> + *			Returns integer: success (0) or error (< 0)
>>> + * @read:		Read emulation callback
>>> + *			@mdev: mediated device structure
>>> + *			@buf: read buffer
>>> + *			@count: number of bytes to read
>>> + *			@pos: address.
>>> + *			Retuns number on bytes read on success or error.
>>> + * @write:		Write emulation callback
>>> + *			@mdev: mediated device structure
>>> + *			@buf: write buffer
>>> + *			@count: number of bytes to be written
>>> + *			@pos: address.
>>> + *			Retuns number on bytes written on success or error.
>>> + * @get_irq_info:	Called to retrieve information about mediated device IRQ
>>> + *			@mdev: mediated device structure
>>> + *			@irq_info: VFIO IRQ flags and count.
>>> + *			Returns integer: success (0) or error (< 0)
>>> + * @set_irqs:		Called to send about interrupts configuration
>>> + *			information that VMM sets.
>>> + *			@mdev: mediated device structure
>>> + *			@flags, index, start, count and *data : same as that of
>>> + *			struct vfio_irq_set of VFIO_DEVICE_SET_IRQS API.
>>> + * @get_device_info:	Called to get VFIO device information for a mediated
>>> + *			device.
>>> + *			@vfio_device_info: VFIO device info.
>>> + *			Returns integer: success (0) or error (< 0)
>>> + * @get_region_info:	Called to get VFIO region size and flags of mediated
>>> + *			device.
>>> + *			@mdev: mediated device structure
>>> + *			@region_info: output, returns size and flags of
>>> + *				      requested region.
>>> + *			@cap_type_id: returns id of capability.
>>> + *			@cap_type: returns pointer to capability structure
>>> + *			corresponding to capability id.
>>> + *			Returns integer: success (0) or error (< 0)
>>> + *
>>> + * Parent device that support mediated device should be registered with mdev
>>> + * module with parent_ops structure.
>>> + */
>>> +
>>> +struct parent_ops {
>>> +	struct module   *owner;
>>> +	const struct attribute_group **dev_attr_groups;
>>> +	const struct attribute_group **mdev_attr_groups;
>>> +
>>> +	int	(*supported_config)(struct device *dev, char *config);
>>> +	int     (*create)(struct mdev_device *mdev, char *mdev_params);
>>> +	int     (*destroy)(struct mdev_device *mdev);
>>> +	int     (*reset)(struct mdev_device *mdev);
>>> +	int     (*set_online_status)(struct mdev_device *mdev, bool online);
>>> +	int     (*get_online_status)(struct mdev_device *mdev, bool *online);
>>> +	ssize_t (*read)(struct mdev_device *mdev, char *buf, size_t count,
>>> +			loff_t pos);
>>> +	ssize_t (*write)(struct mdev_device *mdev, char *buf, size_t count,
>>> +			 loff_t pos);
>>> +	int	(*mmap)(struct mdev_device *mdev, struct vm_area_struct *vma);
>>> +	int	(*get_irq_info)(struct mdev_device *mdev,
>>> +				struct vfio_irq_info *irq_info);
>>> +	int     (*set_irqs)(struct mdev_device *mdev, uint32_t flags,
>>> +			    unsigned int index, unsigned int start,
>>> +			    unsigned int count, void *data);
>>> +	int	(*get_device_info)(struct mdev_device *mdev,
>>> +				   struct vfio_device_info *dev_info);
>>> +	int	(*get_region_info)(struct mdev_device *mdev,
>>> +				   struct vfio_region_info *region_info,
>>> +				   u16 *cap_type_id, void **cap_type);
>>> +};
>>
>> I have a strong objection here to such low-level interfaces, the interfaces
>> between vfio-mdev and vendor drivers should be as thin as possible, not imposing
>> any limitation to vendor drivers.
> 
> Hi Jike,
> 
> Welcome! :-)

Aha, thanks! :)

>
> Unfortunately, this is something I definitely can't agree with you.
>

Glad to see your opinion!


> We would like to capture the common code as much as possible without losing
> flexibilities, so each vendor driver writers won't have to duplicate them and we
> have something can be maintained publicly.
> 

Yeah it is good to reduce the duplications among different vendor drivers,
but what do you think about the duplication between here and other bus drivers
like vfio-pci?

> If you are running into specific limitation with above callback interfaces,
> please show us the scenarios and we are very happy to look into that.
>

Though we don't actually test it upon this series (using high-level implementations
instead), I personally don't think there is a problem. However, this doesn't
necessarily mean it's sufficient.

>>
>> I saw that validate_map_request was removed from the ops and mmap was added. 
>> That is pretty nice. Furthermore, if you add an ioctl here, you can also remove
>> get_device_info, get_irq_info, set_irqs, and get_region_info (and even "reset").
>> There are several benefits by doing this:
> 
> The decision of moving validate_map_request is mainly because we are adding a lot of 
> advanced logic which most vendor drivers don't require, since we are the only consumer 
> of such logic, no need to put it in the public/shared module.
>
>>
>> 	-	Balanced interfaces.
>> 		Like I replied in another mail, you won't have unbalanced interfaces.
>> 		You already have read, write and mmap in the ops, why not ioctl?
> 
> Sorry, don't think "balanced" interface is a design criteria especially when
> simply pursuing the sake of "balanced or full-set" interface ends up lots
> duplicated code for vendor driver writers.
> 

Please kindly have a look at my comment on patch 2/4, about how to check the
validity of "count".

>>
>> 	-	Scalability.
>> 		You are intercepting vfio optional capabilities in the framework, but
>> 		how if vfio.ko, or vfio-pci.ko add a few new capabilities in the future?
> 
> Exactly my point about the code sharing.
> 
> If new cap is added inside vfio.ko or vfio-pci.ko, we can just add it into
> vfio_mdev.ko.
> 
> Adding the code in one place is better to duplicate in multiple vendor drivers.

So after adding that, how many places will you have?

>>
>> 	-	Abstraction.
>> 		Even placing common codes here can avoid code duplication, you still
>> 		have code duplicate with vfio-pci.  Better to move common logic out of
>> 		vfio-pci and call them from mdev vendor drivers.
> 
> Are you saying to avoid the code duplications between vfio-pci and vfio-mdev?
> 

Exactly. I haven't check other bus-driver like vfio-platform, but even if only
having vfio-pci considered, there will be duplications.

>>
>> 	-	Maintainability.
>> 		This is pretty obvious :)
> 
> Definitely not, the burden is moving to the vendor driver side.
>

Moving to vendor side is not the target, as said above, this will probably cause
more abstraction and refactoring of existing vfio code.

> Again, Jike, I really want to enable you with the mediated framework we have been
> doing here. So it is probably easier for us to accommodate your need if you could
> follow the interfaces we have introduced and let us know if you have any specific
> issues.

I won't read this as that one is not welcome to comment as long as he met no
actual issue :)

--
Thanks,
Jike

>>> +
>>> +/*
>>> + * Parent Device
>>> + */
>>> +
>>> +struct parent_device {
>>> +	struct device		*dev;
>>> +	const struct parent_ops	*ops;
>>> +
>>> +	/* internal */
>>> +	struct kref		ref;
>>> +	struct list_head	next;
>>> +	struct list_head	mdev_list;
>>> +	struct mutex		mdev_list_lock;
>>> +	wait_queue_head_t	release_done;
>>> +};
>>> +
>>> +/**
>>> + * struct mdev_driver - Mediated device driver
>>> + * @name: driver name
>>> + * @probe: called when new device created
>>> + * @remove: called when device removed
>>> + * @driver: device driver structure
>>> + *
>>> + **/
>>> +struct mdev_driver {
>>> +	const char *name;
>>> +	int  (*probe)(struct device *dev);
>>> +	void (*remove)(struct device *dev);
>>> +	struct device_driver driver;
>>> +};
>>> +
>>> +static inline struct mdev_driver *to_mdev_driver(struct device_driver *drv)
>>> +{
>>> +	return drv ? container_of(drv, struct mdev_driver, driver) : NULL;
>>> +}
>>> +
>>> +static inline struct mdev_device *to_mdev_device(struct device *dev)
>>> +{
>>> +	return dev ? container_of(dev, struct mdev_device, dev) : NULL;
>>> +}
>>
>> These can be macros, like pci ones.
>>
>>> +
>>> +static inline void *mdev_get_drvdata(struct mdev_device *mdev)
>>> +{
>>> +	return mdev->driver_data;
>>> +}
>>> +
>>> +static inline void mdev_set_drvdata(struct mdev_device *mdev, void *data)
>>> +{
>>> +	mdev->driver_data = data;
>>> +}
>>> +
>>> +extern struct bus_type mdev_bus_type;
>>> +
>>> +#define dev_is_mdev(d) ((d)->bus == &mdev_bus_type)
>>> +
>>> +extern int  mdev_register_device(struct device *dev,
>>> +				 const struct parent_ops *ops);
>>> +extern void mdev_unregister_device(struct device *dev);
>>> +
>>> +extern int  mdev_register_driver(struct mdev_driver *drv, struct module *owner);
>>> +extern void mdev_unregister_driver(struct mdev_driver *drv);
>>> +
>>> +extern struct mdev_device *mdev_get_device(struct mdev_device *mdev);
>>> +extern void mdev_put_device(struct mdev_device *mdev);
>>> +
>>> +extern struct mdev_device *mdev_get_device_by_group(struct iommu_group *group);
>>> +
>>> +#endif /* MDEV_H */
>>>
>>
>> --
>> Thanks,
>> Jike
Kirti Wankhede Sept. 9, 2016, 5:48 p.m. UTC | #4
On 9/8/2016 1:39 PM, Jike Song wrote:
> On 08/25/2016 11:53 AM, Kirti Wankhede wrote:

>>  +---------------+
>>  |               |
>>  | +-----------+ |  mdev_register_driver() +--------------+
>>  | |           | +<------------------------+ __init()     |
>>  | |  mdev     | |                         |              |
>>  | |  bus      | +------------------------>+              |<-> VFIO user
>>  | |  driver   | |     probe()/remove()    | vfio_mdev.ko |    APIs
>>  | |           | |                         |              |
>>  | +-----------+ |                         +--------------+
>>  |               |
> 
> This aimed to have only one single vfio bus driver for all mediated devices,
> right?
>

Yes. That's correct.


>> +
>> +static int mdev_add_attribute_group(struct device *dev,
>> +				    const struct attribute_group **groups)
>> +{
>> +	return sysfs_create_groups(&dev->kobj, groups);
>> +}
>> +
>> +static void mdev_remove_attribute_group(struct device *dev,
>> +					const struct attribute_group **groups)
>> +{
>> +	sysfs_remove_groups(&dev->kobj, groups);
>> +}
> 
> These functions are not necessary. You can always specify the attribute groups
> to dev->groups before registering a new device.
> 

At the time of mdev device create, I specifically didn't used
dev->groups because we callback in vendor driver before that, see below
code snippet, and those attributes should only be added if create()
callback returns success.

        ret = parent->ops->create(mdev, mdev_params);
        if (ret)
                return ret;

        ret = mdev_add_attribute_group(&mdev->dev,
                                        parent->ops->mdev_attr_groups);
        if (ret)
                parent->ops->destroy(mdev);



>> +
>> +static struct parent_device *mdev_get_parent_from_dev(struct device *dev)
>> +{
>> +	struct parent_device *parent;
>> +
>> +	mutex_lock(&parent_list_lock);
>> +	parent = mdev_get_parent(__find_parent_device(dev));
>> +	mutex_unlock(&parent_list_lock);
>> +
>> +	return parent;
>> +}
> 
> As we have demonstrated, all these refs and locks and release workqueue are not necessary,
> as long as you have an independent device associated with the mdev host device
> ("parent" device here).
>

I don't think every lock will go away with that. This also changes how
mdev devices entries are created in sysfs. It adds an extra directory.


> PS, "parent" is somehow a name too generic?
>

This is the term used in Linux Kernel for such cases. See 'struct
device' in include/linux/device.h. I would prefer 'parent'.

>> +
>> +static int mdev_device_create_ops(struct mdev_device *mdev, char *mdev_params)
>> +{
>> +	struct parent_device *parent = mdev->parent;
>> +	int ret;
>> +
>> +	ret = parent->ops->create(mdev, mdev_params);
>> +	if (ret)
>> +		return ret;
>> +
>> +	ret = mdev_add_attribute_group(&mdev->dev,
>> +					parent->ops->mdev_attr_groups);
> 
> Ditto: dev->groups.
> 

See my above response for why this is indented to be so.


>> +	ret = parent_create_sysfs_files(dev);
>> +	if (ret)
>> +		goto add_sysfs_error;
>> +
>> +	ret = mdev_add_attribute_group(dev, ops->dev_attr_groups);
> 
> parent_create_sysfs_files and mdev_add_attribute_group are kind of doing
> the same thing, do you mind to merge them into one?
> 

Ok. I'll see I can do that.


>> +int mdev_device_get_online_status(struct device *dev, bool *online)
>> +{
>> +	int ret = 0;
>> +	struct mdev_device *mdev;
>> +	struct parent_device *parent;
>> +
>> +	mdev = mdev_get_device(to_mdev_device(dev));
>> +	if (!mdev)
>> +		return -EINVAL;
>> +
>> +	parent = mdev->parent;
>> +
>> +	if (parent->ops->get_online_status)
>> +		ret = parent->ops->get_online_status(mdev, online);
>> +
>> +	mdev_put_device(mdev);
>> +
>> +	return ret;
>> +}
> 
> The driver core has a perfect 'online' file for a device, with both
> 'show' and 'store' support, you don't need to write another one.
> 
> Please have a look at online_show and online_store in drivers/base/core.c.
> 

This is going to be removed as per the latest discussion.


> +
>> +extern struct class_attribute mdev_class_attrs[];
> 
> This is useless?
>

Oh, I missed to remove it. Thanks for pointing that out.


>> +static ssize_t mdev_create_store(struct device *dev,
>> +				 struct device_attribute *attr,
>> +				 const char *buf, size_t count)
>> +{
>> +	char *str, *pstr;
>> +	char *uuid_str, *mdev_params = NULL, *params = NULL;
>> +	uuid_le uuid;
>> +	int ret;
>> +
>> +	pstr = str = kstrndup(buf, count, GFP_KERNEL);
> 
> pstr is not used.
>

It is used below to free duped memory. If you see below code, 'str'
pointer gets incremented on strsep, so that can't be used to free
memory. pstr points to start of memory which we want to free while
returning from this function.

>> +
>> +	if (!str)
>> +		return -ENOMEM;
>> +
>> +	uuid_str = strsep(&str, ":");
>> +	if (!uuid_str) {
>> +		pr_err("mdev_create: Empty UUID string %s\n", buf);
>> +		ret = -EINVAL;
>> +		goto create_error;
>> +	}
>> +
>> +	if (str)
>> +		params = mdev_params = kstrdup(str, GFP_KERNEL);
>> +
>> +	ret = uuid_le_to_bin(uuid_str, &uuid);
>> +	if (ret) {
>> +		pr_err("mdev_create: UUID parse error %s\n", buf);
>> +		goto create_error;
>> +	}
>> +
>> +	ret = mdev_device_create(dev, uuid, mdev_params);
>> +	if (ret)
>> +		pr_err("mdev_create: Failed to create mdev device\n");
>> +	else
>> +		ret = count;
>> +
>> +create_error:
>> +	kfree(params);
>> +	kfree(pstr);
>> +	return ret;
>> +}
>> +
>> +static ssize_t mdev_destroy_store(struct device *dev,
>> +				  struct device_attribute *attr,
>> +				  const char *buf, size_t count)
>> +{
>> +	char *uuid_str, *str, *pstr;
>> +	uuid_le uuid;
>> +	int ret;
>> +
>> +	str = pstr = kstrndup(buf, count, GFP_KERNEL);
> 
> Ditto.
> 

Same as above.

>> +
>> +	if (!str)
>> +		return -ENOMEM;
>> +
>> +	uuid_str = strsep(&str, ":");
>> +	if (!uuid_str) {
>> +		pr_err("mdev_destroy: Empty UUID string %s\n", buf);
>> +		ret = -EINVAL;
>> +		goto destroy_error;
>> +	}
>> +
>> +	ret = uuid_le_to_bin(uuid_str, &uuid);
>> +	if (ret) {
>> +		pr_err("mdev_destroy: UUID parse error  %s\n", buf);
>> +		goto destroy_error;
>> +	}
>> +
>> +	ret = mdev_device_destroy(dev, uuid);
>> +	if (ret == 0)
>> +		ret = count;
>> +
>> +destroy_error:
>> +	kfree(pstr);
>> +	return ret;
>> +}
>> +
>> +
>> +int parent_create_sysfs_files(struct device *dev)
>> +{
>> +	int ret;
>> +
>> +	ret = sysfs_create_file(&dev->kobj,
>> +				&dev_attr_mdev_supported_types.attr);
>> +	if (ret) {
>> +		pr_err("Failed to create mdev_supported_types sysfs entry\n");
>> +		return ret;
>> +	}
>> +
>> +	ret = sysfs_create_file(&dev->kobj, &dev_attr_mdev_create.attr);
>> +	if (ret) {
>> +		pr_err("Failed to create mdev_create sysfs entry\n");
>> +		goto create_sysfs_failed;
>> +	}
>> +
>> +	ret = sysfs_create_file(&dev->kobj, &dev_attr_mdev_destroy.attr);
>> +	if (ret) {
>> +		pr_err("Failed to create mdev_destroy sysfs entry\n");
>> +		sysfs_remove_file(&dev->kobj, &dev_attr_mdev_create.attr);
>> +	} else
>> +		return ret;
>> +
>> +create_sysfs_failed:
>> +	sysfs_remove_file(&dev->kobj, &dev_attr_mdev_supported_types.attr);
>> +	return ret;
>> +}
>> +
>> +void parent_remove_sysfs_files(struct device *dev)
>> +{
>> +	sysfs_remove_file(&dev->kobj, &dev_attr_mdev_supported_types.attr);
>> +	sysfs_remove_file(&dev->kobj, &dev_attr_mdev_create.attr);
>> +	sysfs_remove_file(&dev->kobj, &dev_attr_mdev_destroy.attr);
>> +}
> 
> The 2 functions above are also unnecessary: you can always group it with a single
> function call of sysfs_create_files.
>

Ok. 'supported types' and 'create' are going to change as per discussion
going on for libvirt integration. These functions would get removed with
that change.


>> +
>> +static inline struct mdev_driver *to_mdev_driver(struct device_driver *drv)
>> +{
>> +	return drv ? container_of(drv, struct mdev_driver, driver) : NULL;
>> +}
>> +
>> +static inline struct mdev_device *to_mdev_device(struct device *dev)
>> +{
>> +	return dev ? container_of(dev, struct mdev_device, dev) : NULL;
>> +}
> 
> These can be macros, like pci ones.
>

These also checks for NULL of argument which macro doesn't.

Kirti.

>> +
>> +static inline void *mdev_get_drvdata(struct mdev_device *mdev)
>> +{
>> +	return mdev->driver_data;
>> +}
>> +
>> +static inline void mdev_set_drvdata(struct mdev_device *mdev, void *data)
>> +{
>> +	mdev->driver_data = data;
>> +}
>> +
>> +extern struct bus_type mdev_bus_type;
>> +
>> +#define dev_is_mdev(d) ((d)->bus == &mdev_bus_type)
>> +
>> +extern int  mdev_register_device(struct device *dev,
>> +				 const struct parent_ops *ops);
>> +extern void mdev_unregister_device(struct device *dev);
>> +
>> +extern int  mdev_register_driver(struct mdev_driver *drv, struct module *owner);
>> +extern void mdev_unregister_driver(struct mdev_driver *drv);
>> +
>> +extern struct mdev_device *mdev_get_device(struct mdev_device *mdev);
>> +extern void mdev_put_device(struct mdev_device *mdev);
>> +
>> +extern struct mdev_device *mdev_get_device_by_group(struct iommu_group *group);
>> +
>> +#endif /* MDEV_H */
>>
> 
> --
> Thanks,
> Jike
>
Alex Williamson Sept. 9, 2016, 6:42 p.m. UTC | #5
On Fri, 9 Sep 2016 23:18:45 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 9/8/2016 1:39 PM, Jike Song wrote:
> > On 08/25/2016 11:53 AM, Kirti Wankhede wrote:  
> 
> >>  +---------------+
> >>  |               |
> >>  | +-----------+ |  mdev_register_driver() +--------------+
> >>  | |           | +<------------------------+ __init()     |
> >>  | |  mdev     | |                         |              |
> >>  | |  bus      | +------------------------>+              |<-> VFIO user
> >>  | |  driver   | |     probe()/remove()    | vfio_mdev.ko |    APIs
> >>  | |           | |                         |              |
> >>  | +-----------+ |                         +--------------+
> >>  |               |  
> > 
> > This aimed to have only one single vfio bus driver for all mediated devices,
> > right?
> >  
> 
> Yes. That's correct.
> 
> 
> >> +
> >> +static int mdev_add_attribute_group(struct device *dev,
> >> +				    const struct attribute_group **groups)
> >> +{
> >> +	return sysfs_create_groups(&dev->kobj, groups);
> >> +}
> >> +
> >> +static void mdev_remove_attribute_group(struct device *dev,
> >> +					const struct attribute_group **groups)
> >> +{
> >> +	sysfs_remove_groups(&dev->kobj, groups);
> >> +}  
> > 
> > These functions are not necessary. You can always specify the attribute groups
> > to dev->groups before registering a new device.
> >   
> 
> At the time of mdev device create, I specifically didn't used
> dev->groups because we callback in vendor driver before that, see below
> code snippet, and those attributes should only be added if create()
> callback returns success.
> 
>         ret = parent->ops->create(mdev, mdev_params);
>         if (ret)
>                 return ret;
> 
>         ret = mdev_add_attribute_group(&mdev->dev,
>                                         parent->ops->mdev_attr_groups);
>         if (ret)
>                 parent->ops->destroy(mdev);
> 
> 
> 
> >> +
> >> +static struct parent_device *mdev_get_parent_from_dev(struct device *dev)
> >> +{
> >> +	struct parent_device *parent;
> >> +
> >> +	mutex_lock(&parent_list_lock);
> >> +	parent = mdev_get_parent(__find_parent_device(dev));
> >> +	mutex_unlock(&parent_list_lock);
> >> +
> >> +	return parent;
> >> +}  
> > 
> > As we have demonstrated, all these refs and locks and release workqueue are not necessary,
> > as long as you have an independent device associated with the mdev host device
> > ("parent" device here).
> >  
> 
> I don't think every lock will go away with that. This also changes how
> mdev devices entries are created in sysfs. It adds an extra directory.

Exposing the parent-child relationship through sysfs is a desirable
feature, so I'm not sure how this is a negative.  This part of Jike's
conversion was a big improvement, I thought.  Thanks,

Alex
Kirti Wankhede Sept. 9, 2016, 7:55 p.m. UTC | #6
On 9/10/2016 12:12 AM, Alex Williamson wrote:
> On Fri, 9 Sep 2016 23:18:45 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> On 9/8/2016 1:39 PM, Jike Song wrote:
>>> On 08/25/2016 11:53 AM, Kirti Wankhede wrote:  
>>
>>>>  +---------------+
>>>>  |               |
>>>>  | +-----------+ |  mdev_register_driver() +--------------+
>>>>  | |           | +<------------------------+ __init()     |
>>>>  | |  mdev     | |                         |              |
>>>>  | |  bus      | +------------------------>+              |<-> VFIO user
>>>>  | |  driver   | |     probe()/remove()    | vfio_mdev.ko |    APIs
>>>>  | |           | |                         |              |
>>>>  | +-----------+ |                         +--------------+
>>>>  |               |  
>>>
>>> This aimed to have only one single vfio bus driver for all mediated devices,
>>> right?
>>>  
>>
>> Yes. That's correct.
>>
>>
>>>> +
>>>> +static int mdev_add_attribute_group(struct device *dev,
>>>> +				    const struct attribute_group **groups)
>>>> +{
>>>> +	return sysfs_create_groups(&dev->kobj, groups);
>>>> +}
>>>> +
>>>> +static void mdev_remove_attribute_group(struct device *dev,
>>>> +					const struct attribute_group **groups)
>>>> +{
>>>> +	sysfs_remove_groups(&dev->kobj, groups);
>>>> +}  
>>>
>>> These functions are not necessary. You can always specify the attribute groups
>>> to dev->groups before registering a new device.
>>>   
>>
>> At the time of mdev device create, I specifically didn't used
>> dev->groups because we callback in vendor driver before that, see below
>> code snippet, and those attributes should only be added if create()
>> callback returns success.
>>
>>         ret = parent->ops->create(mdev, mdev_params);
>>         if (ret)
>>                 return ret;
>>
>>         ret = mdev_add_attribute_group(&mdev->dev,
>>                                         parent->ops->mdev_attr_groups);
>>         if (ret)
>>                 parent->ops->destroy(mdev);
>>
>>
>>
>>>> +
>>>> +static struct parent_device *mdev_get_parent_from_dev(struct device *dev)
>>>> +{
>>>> +	struct parent_device *parent;
>>>> +
>>>> +	mutex_lock(&parent_list_lock);
>>>> +	parent = mdev_get_parent(__find_parent_device(dev));
>>>> +	mutex_unlock(&parent_list_lock);
>>>> +
>>>> +	return parent;
>>>> +}  
>>>
>>> As we have demonstrated, all these refs and locks and release workqueue are not necessary,
>>> as long as you have an independent device associated with the mdev host device
>>> ("parent" device here).
>>>  
>>
>> I don't think every lock will go away with that. This also changes how
>> mdev devices entries are created in sysfs. It adds an extra directory.
> 
> Exposing the parent-child relationship through sysfs is a desirable
> feature, so I'm not sure how this is a negative.  This part of Jike's
> conversion was a big improvement, I thought.  Thanks,
> 

Jike's suggestion is to introduced a fake device over parent device i.e.
mdev-host, and then all mdev devices are children of 'mdev-host' not
children of real parent.

For example, directory structure we have now is:
/sys/bus/pci/devices/0000\:85\:00.0/<mdev_device>

mdev devices are in real parents directory.

By introducing fake device it would be:
/sys/bus/pci/devices/0000\:85\:00.0/mdev-host/<mdev_device>

mdev devices are in fake device's directory.

Lock would be still required, to handle the race conditions like
'mdev_create' is still in process and parent device is unregistered by
vendor driver/ parent device is unbind from vendor driver.

With the new changes/discussion, we believe the locking will be
simplified without having fake parent device.

With fake device suggestion, removed pointer to parent device from
mdev_device structure. When a create(struct mdev_device *mdev) callback
comes to vendor driver, how would vendor driver know for which physical
device this mdev device create call is intended to? because then
'parent' would be newly introduced fake device, not the real parent.

Thanks,
Kirti
Jike Song Sept. 12, 2016, 5:10 a.m. UTC | #7
On 09/10/2016 03:55 AM, Kirti Wankhede wrote:
> On 9/10/2016 12:12 AM, Alex Williamson wrote:
>> On Fri, 9 Sep 2016 23:18:45 +0530
>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>
>>> On 9/8/2016 1:39 PM, Jike Song wrote:
>>>> On 08/25/2016 11:53 AM, Kirti Wankhede wrote:  
>>>
>>>>>  +---------------+
>>>>>  |               |
>>>>>  | +-----------+ |  mdev_register_driver() +--------------+
>>>>>  | |           | +<------------------------+ __init()     |
>>>>>  | |  mdev     | |                         |              |
>>>>>  | |  bus      | +------------------------>+              |<-> VFIO user
>>>>>  | |  driver   | |     probe()/remove()    | vfio_mdev.ko |    APIs
>>>>>  | |           | |                         |              |
>>>>>  | +-----------+ |                         +--------------+
>>>>>  |               |  
>>>>
>>>> This aimed to have only one single vfio bus driver for all mediated devices,
>>>> right?
>>>>  
>>>
>>> Yes. That's correct.
>>>
>>>
>>>>> +
>>>>> +static int mdev_add_attribute_group(struct device *dev,
>>>>> +				    const struct attribute_group **groups)
>>>>> +{
>>>>> +	return sysfs_create_groups(&dev->kobj, groups);
>>>>> +}
>>>>> +
>>>>> +static void mdev_remove_attribute_group(struct device *dev,
>>>>> +					const struct attribute_group **groups)
>>>>> +{
>>>>> +	sysfs_remove_groups(&dev->kobj, groups);
>>>>> +}  
>>>>
>>>> These functions are not necessary. You can always specify the attribute groups
>>>> to dev->groups before registering a new device.
>>>>   
>>>
>>> At the time of mdev device create, I specifically didn't used
>>> dev->groups because we callback in vendor driver before that, see below
>>> code snippet, and those attributes should only be added if create()
>>> callback returns success.
>>>
>>>         ret = parent->ops->create(mdev, mdev_params);
>>>         if (ret)
>>>                 return ret;
>>>
>>>         ret = mdev_add_attribute_group(&mdev->dev,
>>>                                         parent->ops->mdev_attr_groups);
>>>         if (ret)
>>>                 parent->ops->destroy(mdev);
>>>
>>>
>>>
>>>>> +
>>>>> +static struct parent_device *mdev_get_parent_from_dev(struct device *dev)
>>>>> +{
>>>>> +	struct parent_device *parent;
>>>>> +
>>>>> +	mutex_lock(&parent_list_lock);
>>>>> +	parent = mdev_get_parent(__find_parent_device(dev));
>>>>> +	mutex_unlock(&parent_list_lock);
>>>>> +
>>>>> +	return parent;
>>>>> +}  
>>>>
>>>> As we have demonstrated, all these refs and locks and release workqueue are not necessary,
>>>> as long as you have an independent device associated with the mdev host device
>>>> ("parent" device here).
>>>>  
>>>
>>> I don't think every lock will go away with that. This also changes how
>>> mdev devices entries are created in sysfs. It adds an extra directory.
>>
>> Exposing the parent-child relationship through sysfs is a desirable
>> feature, so I'm not sure how this is a negative.  This part of Jike's
>> conversion was a big improvement, I thought.  Thanks,
>>
> 
> Jike's suggestion is to introduced a fake device over parent device i.e.
> mdev-host, and then all mdev devices are children of 'mdev-host' not
> children of real parent.
>

It really depends on how you define 'real parent' :)

With a physical-host-mdev hierarchy, the parent of mdev devices is the host
device, the parent of host device is the physical device. e.g.

        pdev            mdev_host       mdev_device
        dev<------------dev<------------dev
              parent          parent

        Figure 1: device hierarchy

> For example, directory structure we have now is:
> /sys/bus/pci/devices/0000\:85\:00.0/<mdev_device>
> 
> mdev devices are in real parents directory.
> 
> By introducing fake device it would be:
> /sys/bus/pci/devices/0000\:85\:00.0/mdev-host/<mdev_device>
> 
> mdev devices are in fake device's directory.
>

Yes, this is the wanted directory.

> Lock would be still required, to handle the race conditions like
> 'mdev_create' is still in process and parent device is unregistered by
> vendor driver/ parent device is unbind from vendor driver.
>

locks are provided to protect resources, would you elaborate more on
what is the exact resource you want to protect by a lock in mdev_create?

> With the new changes/discussion, we believe the locking will be
> simplified without having fake parent device.
>
> With fake device suggestion, removed pointer to parent device from
> mdev_device structure. When a create(struct mdev_device *mdev) callback
> comes to vendor driver, how would vendor driver know for which physical
> device this mdev device create call is intended to? because then
> 'parent' would be newly introduced fake device, not the real parent.

Please have a look at "Figure 1".

--
Thanks,
Jike
Kirti Wankhede Sept. 12, 2016, 7:49 a.m. UTC | #8
On 9/12/2016 10:40 AM, Jike Song wrote:
> On 09/10/2016 03:55 AM, Kirti Wankhede wrote:
>> On 9/10/2016 12:12 AM, Alex Williamson wrote:
>>> On Fri, 9 Sep 2016 23:18:45 +0530
>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>
>>>> On 9/8/2016 1:39 PM, Jike Song wrote:
>>>>> On 08/25/2016 11:53 AM, Kirti Wankhede wrote:  
>>>>
>>>>>>  +---------------+
>>>>>>  |               |
>>>>>>  | +-----------+ |  mdev_register_driver() +--------------+
>>>>>>  | |           | +<------------------------+ __init()     |
>>>>>>  | |  mdev     | |                         |              |
>>>>>>  | |  bus      | +------------------------>+              |<-> VFIO user
>>>>>>  | |  driver   | |     probe()/remove()    | vfio_mdev.ko |    APIs
>>>>>>  | |           | |                         |              |
>>>>>>  | +-----------+ |                         +--------------+
>>>>>>  |               |  
>>>>>
>>>>> This aimed to have only one single vfio bus driver for all mediated devices,
>>>>> right?
>>>>>  
>>>>
>>>> Yes. That's correct.
>>>>
>>>>
>>>>>> +
>>>>>> +static int mdev_add_attribute_group(struct device *dev,
>>>>>> +				    const struct attribute_group **groups)
>>>>>> +{
>>>>>> +	return sysfs_create_groups(&dev->kobj, groups);
>>>>>> +}
>>>>>> +
>>>>>> +static void mdev_remove_attribute_group(struct device *dev,
>>>>>> +					const struct attribute_group **groups)
>>>>>> +{
>>>>>> +	sysfs_remove_groups(&dev->kobj, groups);
>>>>>> +}  
>>>>>
>>>>> These functions are not necessary. You can always specify the attribute groups
>>>>> to dev->groups before registering a new device.
>>>>>   
>>>>
>>>> At the time of mdev device create, I specifically didn't used
>>>> dev->groups because we callback in vendor driver before that, see below
>>>> code snippet, and those attributes should only be added if create()
>>>> callback returns success.
>>>>
>>>>         ret = parent->ops->create(mdev, mdev_params);
>>>>         if (ret)
>>>>                 return ret;
>>>>
>>>>         ret = mdev_add_attribute_group(&mdev->dev,
>>>>                                         parent->ops->mdev_attr_groups);
>>>>         if (ret)
>>>>                 parent->ops->destroy(mdev);
>>>>
>>>>
>>>>
>>>>>> +
>>>>>> +static struct parent_device *mdev_get_parent_from_dev(struct device *dev)
>>>>>> +{
>>>>>> +	struct parent_device *parent;
>>>>>> +
>>>>>> +	mutex_lock(&parent_list_lock);
>>>>>> +	parent = mdev_get_parent(__find_parent_device(dev));
>>>>>> +	mutex_unlock(&parent_list_lock);
>>>>>> +
>>>>>> +	return parent;
>>>>>> +}  
>>>>>
>>>>> As we have demonstrated, all these refs and locks and release workqueue are not necessary,
>>>>> as long as you have an independent device associated with the mdev host device
>>>>> ("parent" device here).
>>>>>  
>>>>
>>>> I don't think every lock will go away with that. This also changes how
>>>> mdev devices entries are created in sysfs. It adds an extra directory.
>>>
>>> Exposing the parent-child relationship through sysfs is a desirable
>>> feature, so I'm not sure how this is a negative.  This part of Jike's
>>> conversion was a big improvement, I thought.  Thanks,
>>>
>>
>> Jike's suggestion is to introduced a fake device over parent device i.e.
>> mdev-host, and then all mdev devices are children of 'mdev-host' not
>> children of real parent.
>>
> 
> It really depends on how you define 'real parent' :)
> 
> With a physical-host-mdev hierarchy, the parent of mdev devices is the host
> device, the parent of host device is the physical device. e.g.
> 
>         pdev            mdev_host       mdev_device
>         dev<------------dev<------------dev
>               parent          parent
> 
>         Figure 1: device hierarchy
> 

Right, mdev-host device doesn't represent physical device nor any mdev
device. Then what is the need of such device?

>> For example, directory structure we have now is:
>> /sys/bus/pci/devices/0000\:85\:00.0/<mdev_device>
>>
>> mdev devices are in real parents directory.
>>
>> By introducing fake device it would be:
>> /sys/bus/pci/devices/0000\:85\:00.0/mdev-host/<mdev_device>
>>
>> mdev devices are in fake device's directory.
>>
> 
> Yes, this is the wanted directory.
> 

I don't think so.


>> Lock would be still required, to handle the race conditions like
>> 'mdev_create' is still in process and parent device is unregistered by
>> vendor driver/ parent device is unbind from vendor driver.
>>
> 
> locks are provided to protect resources, would you elaborate more on
> what is the exact resource you want to protect by a lock in mdev_create?
> 

Simple, in your suggestion mdev-host device. Fake device will go away if
vendor driver unregisters the device from mdev module, right.

Thanks,
Kirti.

>> With the new changes/discussion, we believe the locking will be
>> simplified without having fake parent device.
>>
>> With fake device suggestion, removed pointer to parent device from
>> mdev_device structure. When a create(struct mdev_device *mdev) callback
>> comes to vendor driver, how would vendor driver know for which physical
>> device this mdev device create call is intended to? because then
>> 'parent' would be newly introduced fake device, not the real parent.
> 
> Please have a look at "Figure 1".
> 
> --
> Thanks,
> Jike
>
Alex Williamson Sept. 12, 2016, 3:53 p.m. UTC | #9
On Mon, 12 Sep 2016 13:19:11 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 9/12/2016 10:40 AM, Jike Song wrote:
> > On 09/10/2016 03:55 AM, Kirti Wankhede wrote:  
> >> On 9/10/2016 12:12 AM, Alex Williamson wrote:  
> >>> On Fri, 9 Sep 2016 23:18:45 +0530
> >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>  
> >>>> On 9/8/2016 1:39 PM, Jike Song wrote:  
> >>>>> On 08/25/2016 11:53 AM, Kirti Wankhede wrote:    
> >>>>  
> >>>>>>  +---------------+
> >>>>>>  |               |
> >>>>>>  | +-----------+ |  mdev_register_driver() +--------------+
> >>>>>>  | |           | +<------------------------+ __init()     |
> >>>>>>  | |  mdev     | |                         |              |
> >>>>>>  | |  bus      | +------------------------>+              |<-> VFIO user
> >>>>>>  | |  driver   | |     probe()/remove()    | vfio_mdev.ko |    APIs
> >>>>>>  | |           | |                         |              |
> >>>>>>  | +-----------+ |                         +--------------+
> >>>>>>  |               |    
> >>>>>
> >>>>> This aimed to have only one single vfio bus driver for all mediated devices,
> >>>>> right?
> >>>>>    
> >>>>
> >>>> Yes. That's correct.
> >>>>
> >>>>  
> >>>>>> +
> >>>>>> +static int mdev_add_attribute_group(struct device *dev,
> >>>>>> +				    const struct attribute_group **groups)
> >>>>>> +{
> >>>>>> +	return sysfs_create_groups(&dev->kobj, groups);
> >>>>>> +}
> >>>>>> +
> >>>>>> +static void mdev_remove_attribute_group(struct device *dev,
> >>>>>> +					const struct attribute_group **groups)
> >>>>>> +{
> >>>>>> +	sysfs_remove_groups(&dev->kobj, groups);
> >>>>>> +}    
> >>>>>
> >>>>> These functions are not necessary. You can always specify the attribute groups
> >>>>> to dev->groups before registering a new device.
> >>>>>     
> >>>>
> >>>> At the time of mdev device create, I specifically didn't used
> >>>> dev->groups because we callback in vendor driver before that, see below
> >>>> code snippet, and those attributes should only be added if create()
> >>>> callback returns success.
> >>>>
> >>>>         ret = parent->ops->create(mdev, mdev_params);
> >>>>         if (ret)
> >>>>                 return ret;
> >>>>
> >>>>         ret = mdev_add_attribute_group(&mdev->dev,
> >>>>                                         parent->ops->mdev_attr_groups);
> >>>>         if (ret)
> >>>>                 parent->ops->destroy(mdev);
> >>>>
> >>>>
> >>>>  
> >>>>>> +
> >>>>>> +static struct parent_device *mdev_get_parent_from_dev(struct device *dev)
> >>>>>> +{
> >>>>>> +	struct parent_device *parent;
> >>>>>> +
> >>>>>> +	mutex_lock(&parent_list_lock);
> >>>>>> +	parent = mdev_get_parent(__find_parent_device(dev));
> >>>>>> +	mutex_unlock(&parent_list_lock);
> >>>>>> +
> >>>>>> +	return parent;
> >>>>>> +}    
> >>>>>
> >>>>> As we have demonstrated, all these refs and locks and release workqueue are not necessary,
> >>>>> as long as you have an independent device associated with the mdev host device
> >>>>> ("parent" device here).
> >>>>>    
> >>>>
> >>>> I don't think every lock will go away with that. This also changes how
> >>>> mdev devices entries are created in sysfs. It adds an extra directory.  
> >>>
> >>> Exposing the parent-child relationship through sysfs is a desirable
> >>> feature, so I'm not sure how this is a negative.  This part of Jike's
> >>> conversion was a big improvement, I thought.  Thanks,
> >>>  
> >>
> >> Jike's suggestion is to introduced a fake device over parent device i.e.
> >> mdev-host, and then all mdev devices are children of 'mdev-host' not
> >> children of real parent.
> >>  
> > 
> > It really depends on how you define 'real parent' :)
> > 
> > With a physical-host-mdev hierarchy, the parent of mdev devices is the host
> > device, the parent of host device is the physical device. e.g.
> > 
> >         pdev            mdev_host       mdev_device
> >         dev<------------dev<------------dev
> >               parent          parent
> > 
> >         Figure 1: device hierarchy
> >   
> 
> Right, mdev-host device doesn't represent physical device nor any mdev
> device. Then what is the need of such device?

Is there anything implicitly wrong with using a device node to host the
mdev child devices?  Is the argument against it only that it's
unnecessary?  Can we make use of the device-core parent/child
dependencies as Jike has done w/o that extra node?
 
> >> For example, directory structure we have now is:
> >> /sys/bus/pci/devices/0000\:85\:00.0/<mdev_device>
> >>
> >> mdev devices are in real parents directory.
> >>
> >> By introducing fake device it would be:
> >> /sys/bus/pci/devices/0000\:85\:00.0/mdev-host/<mdev_device>
> >>
> >> mdev devices are in fake device's directory.
> >>  
> > 
> > Yes, this is the wanted directory.
> >   
> 
> I don't think so.

Why?

> >> Lock would be still required, to handle the race conditions like
> >> 'mdev_create' is still in process and parent device is unregistered by
> >> vendor driver/ parent device is unbind from vendor driver.
> >>  
> > 
> > locks are provided to protect resources, would you elaborate more on
> > what is the exact resource you want to protect by a lock in mdev_create?
> >   
> 
> Simple, in your suggestion mdev-host device. Fake device will go away if
> vendor driver unregisters the device from mdev module, right.

I don't follow the reply here, but aiui there's ordering implicit in
the device core that Jike is trying to take advantage of that
simplifies the mdev layer significantly.  In the case of an
mdev_create, the device core needs to take a reference to the parent
object, the mdev-host I'd guess in Jike's version, the created mdev
device would also have a reference to that object, so the physical host
device could not be removed so long as there are outstanding
references.  It's just a matter of managing references and acquiring
and releasing objects.  Thanks,

Alex
Jike Song Sept. 19, 2016, 7:08 a.m. UTC | #10
On 09/12/2016 11:53 PM, Alex Williamson wrote:
> On Mon, 12 Sep 2016 13:19:11 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> On 9/12/2016 10:40 AM, Jike Song wrote:
>>> On 09/10/2016 03:55 AM, Kirti Wankhede wrote:  
>>>> On 9/10/2016 12:12 AM, Alex Williamson wrote:  
>>>>> On Fri, 9 Sep 2016 23:18:45 +0530
>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>>>  
>>>>>> On 9/8/2016 1:39 PM, Jike Song wrote:  
>>>>>>> On 08/25/2016 11:53 AM, Kirti Wankhede wrote:    
>>>>>>  
>>>>>>>>  +---------------+
>>>>>>>>  |               |
>>>>>>>>  | +-----------+ |  mdev_register_driver() +--------------+
>>>>>>>>  | |           | +<------------------------+ __init()     |
>>>>>>>>  | |  mdev     | |                         |              |
>>>>>>>>  | |  bus      | +------------------------>+              |<-> VFIO user
>>>>>>>>  | |  driver   | |     probe()/remove()    | vfio_mdev.ko |    APIs
>>>>>>>>  | |           | |                         |              |
>>>>>>>>  | +-----------+ |                         +--------------+
>>>>>>>>  |               |    
>>>>>>>
>>>>>>> This aimed to have only one single vfio bus driver for all mediated devices,
>>>>>>> right?
>>>>>>>    
>>>>>>
>>>>>> Yes. That's correct.
>>>>>>
>>>>>>  
>>>>>>>> +
>>>>>>>> +static int mdev_add_attribute_group(struct device *dev,
>>>>>>>> +				    const struct attribute_group **groups)
>>>>>>>> +{
>>>>>>>> +	return sysfs_create_groups(&dev->kobj, groups);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static void mdev_remove_attribute_group(struct device *dev,
>>>>>>>> +					const struct attribute_group **groups)
>>>>>>>> +{
>>>>>>>> +	sysfs_remove_groups(&dev->kobj, groups);
>>>>>>>> +}    
>>>>>>>
>>>>>>> These functions are not necessary. You can always specify the attribute groups
>>>>>>> to dev->groups before registering a new device.
>>>>>>>     
>>>>>>
>>>>>> At the time of mdev device create, I specifically didn't used
>>>>>> dev->groups because we callback in vendor driver before that, see below
>>>>>> code snippet, and those attributes should only be added if create()
>>>>>> callback returns success.
>>>>>>
>>>>>>         ret = parent->ops->create(mdev, mdev_params);
>>>>>>         if (ret)
>>>>>>                 return ret;
>>>>>>
>>>>>>         ret = mdev_add_attribute_group(&mdev->dev,
>>>>>>                                         parent->ops->mdev_attr_groups);
>>>>>>         if (ret)
>>>>>>                 parent->ops->destroy(mdev);
>>>>>>
>>>>>>
>>>>>>  
>>>>>>>> +
>>>>>>>> +static struct parent_device *mdev_get_parent_from_dev(struct device *dev)
>>>>>>>> +{
>>>>>>>> +	struct parent_device *parent;
>>>>>>>> +
>>>>>>>> +	mutex_lock(&parent_list_lock);
>>>>>>>> +	parent = mdev_get_parent(__find_parent_device(dev));
>>>>>>>> +	mutex_unlock(&parent_list_lock);
>>>>>>>> +
>>>>>>>> +	return parent;
>>>>>>>> +}    
>>>>>>>
>>>>>>> As we have demonstrated, all these refs and locks and release workqueue are not necessary,
>>>>>>> as long as you have an independent device associated with the mdev host device
>>>>>>> ("parent" device here).
>>>>>>>    
>>>>>>
>>>>>> I don't think every lock will go away with that. This also changes how
>>>>>> mdev devices entries are created in sysfs. It adds an extra directory.  
>>>>>
>>>>> Exposing the parent-child relationship through sysfs is a desirable
>>>>> feature, so I'm not sure how this is a negative.  This part of Jike's
>>>>> conversion was a big improvement, I thought.  Thanks,
>>>>>  
>>>>
>>>> Jike's suggestion is to introduced a fake device over parent device i.e.
>>>> mdev-host, and then all mdev devices are children of 'mdev-host' not
>>>> children of real parent.
>>>>  
>>>
>>> It really depends on how you define 'real parent' :)
>>>
>>> With a physical-host-mdev hierarchy, the parent of mdev devices is the host
>>> device, the parent of host device is the physical device. e.g.
>>>
>>>         pdev            mdev_host       mdev_device
>>>         dev<------------dev<------------dev
>>>               parent          parent
>>>
>>>         Figure 1: device hierarchy
>>>   
>>
>> Right, mdev-host device doesn't represent physical device nor any mdev
>> device. Then what is the need of such device?
> 
> Is there anything implicitly wrong with using a device node to host the
> mdev child devices?  Is the argument against it only that it's
> unnecessary?  Can we make use of the device-core parent/child
> dependencies as Jike has done w/o that extra node?
>  
>>>> For example, directory structure we have now is:
>>>> /sys/bus/pci/devices/0000\:85\:00.0/<mdev_device>
>>>>
>>>> mdev devices are in real parents directory.
>>>>
>>>> By introducing fake device it would be:
>>>> /sys/bus/pci/devices/0000\:85\:00.0/mdev-host/<mdev_device>
>>>>
>>>> mdev devices are in fake device's directory.
>>>>  
>>>
>>> Yes, this is the wanted directory.
>>>   
>>
>> I don't think so.
> 
> Why?
> 
>>>> Lock would be still required, to handle the race conditions like
>>>> 'mdev_create' is still in process and parent device is unregistered by
>>>> vendor driver/ parent device is unbind from vendor driver.
>>>>  
>>>
>>> locks are provided to protect resources, would you elaborate more on
>>> what is the exact resource you want to protect by a lock in mdev_create?
>>>   
>>
>> Simple, in your suggestion mdev-host device. Fake device will go away if
>> vendor driver unregisters the device from mdev module, right.
> 
> I don't follow the reply here, but aiui there's ordering implicit in
> the device core that Jike is trying to take advantage of that
> simplifies the mdev layer significantly.  In the case of an
> mdev_create, the device core needs to take a reference to the parent
> object, the mdev-host I'd guess in Jike's version, the created mdev
> device would also have a reference to that object, so the physical host
> device could not be removed so long as there are outstanding
> references.  It's just a matter of managing references and acquiring
> and releasing objects.  Thanks,

Hi Kirti/Neo,

Any thought on this part? Could we have more discussion in case that
further concern raised?


--
Thanks,
Jike
Kirti Wankhede Sept. 19, 2016, 5:29 p.m. UTC | #11
On 9/12/2016 9:23 PM, Alex Williamson wrote:
> On Mon, 12 Sep 2016 13:19:11 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> On 9/12/2016 10:40 AM, Jike Song wrote:
>>> On 09/10/2016 03:55 AM, Kirti Wankhede wrote:  
>>>> On 9/10/2016 12:12 AM, Alex Williamson wrote:  
>>>>> On Fri, 9 Sep 2016 23:18:45 +0530
>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>>>  
>>>>>> On 9/8/2016 1:39 PM, Jike Song wrote:  
>>>>>>> On 08/25/2016 11:53 AM, Kirti Wankhede wrote:    
>>>>>>  
>>>>>>>>  +---------------+
>>>>>>>>  |               |
>>>>>>>>  | +-----------+ |  mdev_register_driver() +--------------+
>>>>>>>>  | |           | +<------------------------+ __init()     |
>>>>>>>>  | |  mdev     | |                         |              |
>>>>>>>>  | |  bus      | +------------------------>+              |<-> VFIO user
>>>>>>>>  | |  driver   | |     probe()/remove()    | vfio_mdev.ko |    APIs
>>>>>>>>  | |           | |                         |              |
>>>>>>>>  | +-----------+ |                         +--------------+
>>>>>>>>  |               |    
>>>>>>>
>>>>>>> This aimed to have only one single vfio bus driver for all mediated devices,
>>>>>>> right?
>>>>>>>    
>>>>>>
>>>>>> Yes. That's correct.
>>>>>>
>>>>>>  
>>>>>>>> +
>>>>>>>> +static int mdev_add_attribute_group(struct device *dev,
>>>>>>>> +				    const struct attribute_group **groups)
>>>>>>>> +{
>>>>>>>> +	return sysfs_create_groups(&dev->kobj, groups);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static void mdev_remove_attribute_group(struct device *dev,
>>>>>>>> +					const struct attribute_group **groups)
>>>>>>>> +{
>>>>>>>> +	sysfs_remove_groups(&dev->kobj, groups);
>>>>>>>> +}    
>>>>>>>
>>>>>>> These functions are not necessary. You can always specify the attribute groups
>>>>>>> to dev->groups before registering a new device.
>>>>>>>     
>>>>>>
>>>>>> At the time of mdev device create, I specifically didn't used
>>>>>> dev->groups because we callback in vendor driver before that, see below
>>>>>> code snippet, and those attributes should only be added if create()
>>>>>> callback returns success.
>>>>>>
>>>>>>         ret = parent->ops->create(mdev, mdev_params);
>>>>>>         if (ret)
>>>>>>                 return ret;
>>>>>>
>>>>>>         ret = mdev_add_attribute_group(&mdev->dev,
>>>>>>                                         parent->ops->mdev_attr_groups);
>>>>>>         if (ret)
>>>>>>                 parent->ops->destroy(mdev);
>>>>>>
>>>>>>
>>>>>>  
>>>>>>>> +
>>>>>>>> +static struct parent_device *mdev_get_parent_from_dev(struct device *dev)
>>>>>>>> +{
>>>>>>>> +	struct parent_device *parent;
>>>>>>>> +
>>>>>>>> +	mutex_lock(&parent_list_lock);
>>>>>>>> +	parent = mdev_get_parent(__find_parent_device(dev));
>>>>>>>> +	mutex_unlock(&parent_list_lock);
>>>>>>>> +
>>>>>>>> +	return parent;
>>>>>>>> +}    
>>>>>>>
>>>>>>> As we have demonstrated, all these refs and locks and release workqueue are not necessary,
>>>>>>> as long as you have an independent device associated with the mdev host device
>>>>>>> ("parent" device here).
>>>>>>>    
>>>>>>
>>>>>> I don't think every lock will go away with that. This also changes how
>>>>>> mdev devices entries are created in sysfs. It adds an extra directory.  
>>>>>
>>>>> Exposing the parent-child relationship through sysfs is a desirable
>>>>> feature, so I'm not sure how this is a negative.  This part of Jike's
>>>>> conversion was a big improvement, I thought.  Thanks,
>>>>>  
>>>>
>>>> Jike's suggestion is to introduced a fake device over parent device i.e.
>>>> mdev-host, and then all mdev devices are children of 'mdev-host' not
>>>> children of real parent.
>>>>  
>>>
>>> It really depends on how you define 'real parent' :)
>>>
>>> With a physical-host-mdev hierarchy, the parent of mdev devices is the host
>>> device, the parent of host device is the physical device. e.g.
>>>
>>>         pdev            mdev_host       mdev_device
>>>         dev<------------dev<------------dev
>>>               parent          parent
>>>
>>>         Figure 1: device hierarchy
>>>   
>>
>> Right, mdev-host device doesn't represent physical device nor any mdev
>> device. Then what is the need of such device?
> 
> Is there anything implicitly wrong with using a device node to host the
> mdev child devices?  Is the argument against it only that it's
> unnecessary?  Can we make use of the device-core parent/child
> dependencies as Jike has done w/o that extra node?
>

I do feel that mdev core module would get simplified with the new sysfs
interface without having extra node.


>>>> For example, directory structure we have now is:
>>>> /sys/bus/pci/devices/0000\:85\:00.0/<mdev_device>
>>>>
>>>> mdev devices are in real parents directory.
>>>>
>>>> By introducing fake device it would be:
>>>> /sys/bus/pci/devices/0000\:85\:00.0/mdev-host/<mdev_device>
>>>>
>>>> mdev devices are in fake device's directory.
>>>>  
>>>
>>> Yes, this is the wanted directory.
>>>   
>>
>> I don't think so.
> 
> Why?
> 

This directory is not mandatory. right?

>>>> Lock would be still required, to handle the race conditions like
>>>> 'mdev_create' is still in process and parent device is unregistered by
>>>> vendor driver/ parent device is unbind from vendor driver.
>>>>  
>>>
>>> locks are provided to protect resources, would you elaborate more on
>>> what is the exact resource you want to protect by a lock in mdev_create?
>>>   
>>
>> Simple, in your suggestion mdev-host device. Fake device will go away if
>> vendor driver unregisters the device from mdev module, right.
> 
> I don't follow the reply here, but aiui there's ordering implicit in
> the device core that Jike is trying to take advantage of that
> simplifies the mdev layer significantly.  In the case of an
> mdev_create, the device core needs to take a reference to the parent
> object, the mdev-host I'd guess in Jike's version, the created mdev
> device would also have a reference to that object, so the physical host
> device could not be removed so long as there are outstanding
> references.  It's just a matter of managing references and acquiring
> and releasing objects.  Thanks,
>

I do think this could be simplified without having extra node.

> the created mdev
> device would also have a reference to that object, so the physical host
> device could not be removed so long as there are outstanding
> references.

Yes, this is also true when physical device is direct parent of mdev
device. mdev device keeps reference of parent, so physical host device
could not be removed as long as mdev devices are present. That is why
from mdev_unregister_device() a chance is given to free all child mdev
devices.

Thanks,
Kirti
Alex Williamson Sept. 19, 2016, 6:11 p.m. UTC | #12
On Mon, 19 Sep 2016 22:59:34 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 9/12/2016 9:23 PM, Alex Williamson wrote:
> > On Mon, 12 Sep 2016 13:19:11 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> On 9/12/2016 10:40 AM, Jike Song wrote:  
> >>> On 09/10/2016 03:55 AM, Kirti Wankhede wrote:    
> >>>> On 9/10/2016 12:12 AM, Alex Williamson wrote:    
> >>>>> On Fri, 9 Sep 2016 23:18:45 +0530
> >>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>>>    
> >>>>>> On 9/8/2016 1:39 PM, Jike Song wrote:    
> >>>>>>> On 08/25/2016 11:53 AM, Kirti Wankhede wrote:      
> >>>>>>    
> >>>>>>>>  +---------------+
> >>>>>>>>  |               |
> >>>>>>>>  | +-----------+ |  mdev_register_driver() +--------------+
> >>>>>>>>  | |           | +<------------------------+ __init()     |
> >>>>>>>>  | |  mdev     | |                         |              |
> >>>>>>>>  | |  bus      | +------------------------>+              |<-> VFIO user
> >>>>>>>>  | |  driver   | |     probe()/remove()    | vfio_mdev.ko |    APIs
> >>>>>>>>  | |           | |                         |              |
> >>>>>>>>  | +-----------+ |                         +--------------+
> >>>>>>>>  |               |      
> >>>>>>>
> >>>>>>> This aimed to have only one single vfio bus driver for all mediated devices,
> >>>>>>> right?
> >>>>>>>      
> >>>>>>
> >>>>>> Yes. That's correct.
> >>>>>>
> >>>>>>    
> >>>>>>>> +
> >>>>>>>> +static int mdev_add_attribute_group(struct device *dev,
> >>>>>>>> +				    const struct attribute_group **groups)
> >>>>>>>> +{
> >>>>>>>> +	return sysfs_create_groups(&dev->kobj, groups);
> >>>>>>>> +}
> >>>>>>>> +
> >>>>>>>> +static void mdev_remove_attribute_group(struct device *dev,
> >>>>>>>> +					const struct attribute_group **groups)
> >>>>>>>> +{
> >>>>>>>> +	sysfs_remove_groups(&dev->kobj, groups);
> >>>>>>>> +}      
> >>>>>>>
> >>>>>>> These functions are not necessary. You can always specify the attribute groups
> >>>>>>> to dev->groups before registering a new device.
> >>>>>>>       
> >>>>>>
> >>>>>> At the time of mdev device create, I specifically didn't used
> >>>>>> dev->groups because we callback in vendor driver before that, see below
> >>>>>> code snippet, and those attributes should only be added if create()
> >>>>>> callback returns success.
> >>>>>>
> >>>>>>         ret = parent->ops->create(mdev, mdev_params);
> >>>>>>         if (ret)
> >>>>>>                 return ret;
> >>>>>>
> >>>>>>         ret = mdev_add_attribute_group(&mdev->dev,
> >>>>>>                                         parent->ops->mdev_attr_groups);
> >>>>>>         if (ret)
> >>>>>>                 parent->ops->destroy(mdev);
> >>>>>>
> >>>>>>
> >>>>>>    
> >>>>>>>> +
> >>>>>>>> +static struct parent_device *mdev_get_parent_from_dev(struct device *dev)
> >>>>>>>> +{
> >>>>>>>> +	struct parent_device *parent;
> >>>>>>>> +
> >>>>>>>> +	mutex_lock(&parent_list_lock);
> >>>>>>>> +	parent = mdev_get_parent(__find_parent_device(dev));
> >>>>>>>> +	mutex_unlock(&parent_list_lock);
> >>>>>>>> +
> >>>>>>>> +	return parent;
> >>>>>>>> +}      
> >>>>>>>
> >>>>>>> As we have demonstrated, all these refs and locks and release workqueue are not necessary,
> >>>>>>> as long as you have an independent device associated with the mdev host device
> >>>>>>> ("parent" device here).
> >>>>>>>      
> >>>>>>
> >>>>>> I don't think every lock will go away with that. This also changes how
> >>>>>> mdev devices entries are created in sysfs. It adds an extra directory.    
> >>>>>
> >>>>> Exposing the parent-child relationship through sysfs is a desirable
> >>>>> feature, so I'm not sure how this is a negative.  This part of Jike's
> >>>>> conversion was a big improvement, I thought.  Thanks,
> >>>>>    
> >>>>
> >>>> Jike's suggestion is to introduced a fake device over parent device i.e.
> >>>> mdev-host, and then all mdev devices are children of 'mdev-host' not
> >>>> children of real parent.
> >>>>    
> >>>
> >>> It really depends on how you define 'real parent' :)
> >>>
> >>> With a physical-host-mdev hierarchy, the parent of mdev devices is the host
> >>> device, the parent of host device is the physical device. e.g.
> >>>
> >>>         pdev            mdev_host       mdev_device
> >>>         dev<------------dev<------------dev
> >>>               parent          parent
> >>>
> >>>         Figure 1: device hierarchy
> >>>     
> >>
> >> Right, mdev-host device doesn't represent physical device nor any mdev
> >> device. Then what is the need of such device?  
> > 
> > Is there anything implicitly wrong with using a device node to host the
> > mdev child devices?  Is the argument against it only that it's
> > unnecessary?  Can we make use of the device-core parent/child
> > dependencies as Jike has done w/o that extra node?
> >  
> 
> I do feel that mdev core module would get simplified with the new sysfs
> interface without having extra node.

Can you provide an example of why that is?
 
> >>>> For example, directory structure we have now is:
> >>>> /sys/bus/pci/devices/0000\:85\:00.0/<mdev_device>
> >>>>
> >>>> mdev devices are in real parents directory.
> >>>>
> >>>> By introducing fake device it would be:
> >>>> /sys/bus/pci/devices/0000\:85\:00.0/mdev-host/<mdev_device>
> >>>>
> >>>> mdev devices are in fake device's directory.
> >>>>    
> >>>
> >>> Yes, this is the wanted directory.
> >>>     
> >>
> >> I don't think so.  
> > 
> > Why?
> >   
> 
> This directory is not mandatory. right?

Clearly you've done an implementation without it, so it's not
functionally mandatory, but Jike has made significant code reduction
and simplification with it.  Those are desirable things.

> >>>> Lock would be still required, to handle the race conditions like
> >>>> 'mdev_create' is still in process and parent device is unregistered by
> >>>> vendor driver/ parent device is unbind from vendor driver.
> >>>>    
> >>>
> >>> locks are provided to protect resources, would you elaborate more on
> >>> what is the exact resource you want to protect by a lock in mdev_create?
> >>>     
> >>
> >> Simple, in your suggestion mdev-host device. Fake device will go away if
> >> vendor driver unregisters the device from mdev module, right.  
> > 
> > I don't follow the reply here, but aiui there's ordering implicit in
> > the device core that Jike is trying to take advantage of that
> > simplifies the mdev layer significantly.  In the case of an
> > mdev_create, the device core needs to take a reference to the parent
> > object, the mdev-host I'd guess in Jike's version, the created mdev
> > device would also have a reference to that object, so the physical host
> > device could not be removed so long as there are outstanding
> > references.  It's just a matter of managing references and acquiring
> > and releasing objects.  Thanks,
> >  
> 
> I do think this could be simplified without having extra node.

The simplification is really what I'm after, whether or not it includes
an extra device node is not something I'm sure I have an opinion on
yet.  Aren't we really just talking about an extra sysfs directory
node?  Doesn't it make it easier for userspace to efficiently identify
all the mdev children when they're segregated from the other attributes
and sub-nodes of a parent device?
 
> > the created mdev
> > device would also have a reference to that object, so the physical host
> > device could not be removed so long as there are outstanding
> > references.  
> 
> Yes, this is also true when physical device is direct parent of mdev
> device. mdev device keeps reference of parent, so physical host device
> could not be removed as long as mdev devices are present. That is why
> from mdev_unregister_device() a chance is given to free all child mdev
> devices.

But why aren't we using the device core do do that?  It seems like
we're getting hung up on this device node, which is more of a stylistic
and sysfs layout issue when the primary comment is to simplify the mdev
infrastructure by taking more advantage of the parent/child
dependencies of the device core.  Thanks,

Alex
Kirti Wankhede Sept. 19, 2016, 8:09 p.m. UTC | #13
On 9/19/2016 11:41 PM, Alex Williamson wrote:
> On Mon, 19 Sep 2016 22:59:34 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> On 9/12/2016 9:23 PM, Alex Williamson wrote:
>>> On Mon, 12 Sep 2016 13:19:11 +0530
>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>   
>>>> On 9/12/2016 10:40 AM, Jike Song wrote:  
>>>>> On 09/10/2016 03:55 AM, Kirti Wankhede wrote:    
>>>>>> On 9/10/2016 12:12 AM, Alex Williamson wrote:    
>>>>>>> On Fri, 9 Sep 2016 23:18:45 +0530
>>>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>>>>>    
>>>>>>>>>> +
>>>>>>>>>> +static struct parent_device *mdev_get_parent_from_dev(struct device *dev)
>>>>>>>>>> +{
>>>>>>>>>> +	struct parent_device *parent;
>>>>>>>>>> +
>>>>>>>>>> +	mutex_lock(&parent_list_lock);
>>>>>>>>>> +	parent = mdev_get_parent(__find_parent_device(dev));
>>>>>>>>>> +	mutex_unlock(&parent_list_lock);
>>>>>>>>>> +
>>>>>>>>>> +	return parent;
>>>>>>>>>> +}      
>>>>>>>>>
>>>>>>>>> As we have demonstrated, all these refs and locks and release workqueue are not necessary,
>>>>>>>>> as long as you have an independent device associated with the mdev host device
>>>>>>>>> ("parent" device here).
>>>>>>>>>      
>>>>>>>>
>>>>>>>> I don't think every lock will go away with that. This also changes how
>>>>>>>> mdev devices entries are created in sysfs. It adds an extra directory.    
>>>>>>>
>>>>>>> Exposing the parent-child relationship through sysfs is a desirable
>>>>>>> feature, so I'm not sure how this is a negative.  This part of Jike's
>>>>>>> conversion was a big improvement, I thought.  Thanks,
>>>>>>>    
>>>>>>
>>>>>> Jike's suggestion is to introduced a fake device over parent device i.e.
>>>>>> mdev-host, and then all mdev devices are children of 'mdev-host' not
>>>>>> children of real parent.
>>>>>>    
>>>>>
>>>>> It really depends on how you define 'real parent' :)
>>>>>
>>>>> With a physical-host-mdev hierarchy, the parent of mdev devices is the host
>>>>> device, the parent of host device is the physical device. e.g.
>>>>>
>>>>>         pdev            mdev_host       mdev_device
>>>>>         dev<------------dev<------------dev
>>>>>               parent          parent
>>>>>
>>>>>         Figure 1: device hierarchy
>>>>>     
>>>>
>>>> Right, mdev-host device doesn't represent physical device nor any mdev
>>>> device. Then what is the need of such device?  
>>>
>>> Is there anything implicitly wrong with using a device node to host the
>>> mdev child devices?  Is the argument against it only that it's
>>> unnecessary?  Can we make use of the device-core parent/child
>>> dependencies as Jike has done w/o that extra node?
>>>  
>>
>> I do feel that mdev core module would get simplified with the new sysfs
>> interface without having extra node.
> 
> Can you provide an example of why that is?
>  

'online' or earlier 'start'/'stop' interface is removed and would add
open() and close() callbacks. ref count from struct mdev_device and
mdev_get_device()/mdev_put_device() were added for this interface, these
would go away.
Using device-core parent/child dependencies between physical device and
mdev device, other functions would get simplified.

>>>>>> For example, directory structure we have now is:
>>>>>> /sys/bus/pci/devices/0000\:85\:00.0/<mdev_device>
>>>>>>
>>>>>> mdev devices are in real parents directory.
>>>>>>
>>>>>> By introducing fake device it would be:
>>>>>> /sys/bus/pci/devices/0000\:85\:00.0/mdev-host/<mdev_device>
>>>>>>
>>>>>> mdev devices are in fake device's directory.
>>>>>>    
>>>>>
>>>>> Yes, this is the wanted directory.
>>>>>     
>>>>
>>>> I don't think so.  
>>>
>>> Why?
>>>   
>>
>> This directory is not mandatory. right?
> 
> Clearly you've done an implementation without it, so it's not
> functionally mandatory, but Jike has made significant code reduction
> and simplification with it.  Those are desirable things.
> 
>>>>>> Lock would be still required, to handle the race conditions like
>>>>>> 'mdev_create' is still in process and parent device is unregistered by
>>>>>> vendor driver/ parent device is unbind from vendor driver.
>>>>>>    
>>>>>
>>>>> locks are provided to protect resources, would you elaborate more on
>>>>> what is the exact resource you want to protect by a lock in mdev_create?
>>>>>     
>>>>
>>>> Simple, in your suggestion mdev-host device. Fake device will go away if
>>>> vendor driver unregisters the device from mdev module, right.  
>>>
>>> I don't follow the reply here, but aiui there's ordering implicit in
>>> the device core that Jike is trying to take advantage of that
>>> simplifies the mdev layer significantly.  In the case of an
>>> mdev_create, the device core needs to take a reference to the parent
>>> object, the mdev-host I'd guess in Jike's version, the created mdev
>>> device would also have a reference to that object, so the physical host
>>> device could not be removed so long as there are outstanding
>>> references.  It's just a matter of managing references and acquiring
>>> and releasing objects.  Thanks,
>>>  
>>
>> I do think this could be simplified without having extra node.
> 
> The simplification is really what I'm after, whether or not it includes
> an extra device node is not something I'm sure I have an opinion on
> yet.  Aren't we really just talking about an extra sysfs directory
> node?

No, not just extra sysfs directory. I'm trying to keep parent/child
relationship between physical device and mdev device direct without
having extra device node.

> Doesn't it make it easier for userspace to efficiently identify
> all the mdev children when they're segregated from the other attributes
> and sub-nodes of a parent device?
>  

Physical devices are generally leaf nodes. I think its easier to find
all mdev children in its own directory.

>>> the created mdev
>>> device would also have a reference to that object, so the physical host
>>> device could not be removed so long as there are outstanding
>>> references.  
>>
>> Yes, this is also true when physical device is direct parent of mdev
>> device. mdev device keeps reference of parent, so physical host device
>> could not be removed as long as mdev devices are present. That is why
>> from mdev_unregister_device() a chance is given to free all child mdev
>> devices.
> 
> But why aren't we using the device core do do that?  It seems like
> we're getting hung up on this device node, which is more of a stylistic
> and sysfs layout issue when the primary comment is to simplify the mdev
> infrastructure by taking more advantage of the parent/child
> dependencies of the device core.  Thanks,
> 

Definitely we would use device core parent/child dependency and simplify
mdev framework without having extra device node.

Thanks,
Kirti
Alex Williamson Sept. 19, 2016, 8:59 p.m. UTC | #14
On Tue, 20 Sep 2016 01:39:19 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 9/19/2016 11:41 PM, Alex Williamson wrote:
> > On Mon, 19 Sep 2016 22:59:34 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> On 9/12/2016 9:23 PM, Alex Williamson wrote:  
> >>> On Mon, 12 Sep 2016 13:19:11 +0530
> >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>     
> >>>> On 9/12/2016 10:40 AM, Jike Song wrote:    
> >>>>> On 09/10/2016 03:55 AM, Kirti Wankhede wrote:      
> >>>>>> On 9/10/2016 12:12 AM, Alex Williamson wrote:      
> >>>>>>> On Fri, 9 Sep 2016 23:18:45 +0530
> >>>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>>>>>      
> >>>>>>>>>> +
> >>>>>>>>>> +static struct parent_device *mdev_get_parent_from_dev(struct device *dev)
> >>>>>>>>>> +{
> >>>>>>>>>> +	struct parent_device *parent;
> >>>>>>>>>> +
> >>>>>>>>>> +	mutex_lock(&parent_list_lock);
> >>>>>>>>>> +	parent = mdev_get_parent(__find_parent_device(dev));
> >>>>>>>>>> +	mutex_unlock(&parent_list_lock);
> >>>>>>>>>> +
> >>>>>>>>>> +	return parent;
> >>>>>>>>>> +}        
> >>>>>>>>>
> >>>>>>>>> As we have demonstrated, all these refs and locks and release workqueue are not necessary,
> >>>>>>>>> as long as you have an independent device associated with the mdev host device
> >>>>>>>>> ("parent" device here).
> >>>>>>>>>        
> >>>>>>>>
> >>>>>>>> I don't think every lock will go away with that. This also changes how
> >>>>>>>> mdev devices entries are created in sysfs. It adds an extra directory.      
> >>>>>>>
> >>>>>>> Exposing the parent-child relationship through sysfs is a desirable
> >>>>>>> feature, so I'm not sure how this is a negative.  This part of Jike's
> >>>>>>> conversion was a big improvement, I thought.  Thanks,
> >>>>>>>      
> >>>>>>
> >>>>>> Jike's suggestion is to introduced a fake device over parent device i.e.
> >>>>>> mdev-host, and then all mdev devices are children of 'mdev-host' not
> >>>>>> children of real parent.
> >>>>>>      
> >>>>>
> >>>>> It really depends on how you define 'real parent' :)
> >>>>>
> >>>>> With a physical-host-mdev hierarchy, the parent of mdev devices is the host
> >>>>> device, the parent of host device is the physical device. e.g.
> >>>>>
> >>>>>         pdev            mdev_host       mdev_device
> >>>>>         dev<------------dev<------------dev
> >>>>>               parent          parent
> >>>>>
> >>>>>         Figure 1: device hierarchy
> >>>>>       
> >>>>
> >>>> Right, mdev-host device doesn't represent physical device nor any mdev
> >>>> device. Then what is the need of such device?    
> >>>
> >>> Is there anything implicitly wrong with using a device node to host the
> >>> mdev child devices?  Is the argument against it only that it's
> >>> unnecessary?  Can we make use of the device-core parent/child
> >>> dependencies as Jike has done w/o that extra node?
> >>>    
> >>
> >> I do feel that mdev core module would get simplified with the new sysfs
> >> interface without having extra node.  
> > 
> > Can you provide an example of why that is?
> >    
> 
> 'online' or earlier 'start'/'stop' interface is removed and would add
> open() and close() callbacks. ref count from struct mdev_device and
> mdev_get_device()/mdev_put_device() were added for this interface, these
> would go away.
> Using device-core parent/child dependencies between physical device and
> mdev device, other functions would get simplified.

Yes, we've refined the interface over time and I'm glad that you're
incorporating the device-core simplifications, but this doesn't really
argue for or against the extra device node.
 
> >>>>>> For example, directory structure we have now is:
> >>>>>> /sys/bus/pci/devices/0000\:85\:00.0/<mdev_device>
> >>>>>>
> >>>>>> mdev devices are in real parents directory.
> >>>>>>
> >>>>>> By introducing fake device it would be:
> >>>>>> /sys/bus/pci/devices/0000\:85\:00.0/mdev-host/<mdev_device>
> >>>>>>
> >>>>>> mdev devices are in fake device's directory.
> >>>>>>      
> >>>>>
> >>>>> Yes, this is the wanted directory.
> >>>>>       
> >>>>
> >>>> I don't think so.    
> >>>
> >>> Why?
> >>>     
> >>
> >> This directory is not mandatory. right?  
> > 
> > Clearly you've done an implementation without it, so it's not
> > functionally mandatory, but Jike has made significant code reduction
> > and simplification with it.  Those are desirable things.
> >   
> >>>>>> Lock would be still required, to handle the race conditions like
> >>>>>> 'mdev_create' is still in process and parent device is unregistered by
> >>>>>> vendor driver/ parent device is unbind from vendor driver.
> >>>>>>      
> >>>>>
> >>>>> locks are provided to protect resources, would you elaborate more on
> >>>>> what is the exact resource you want to protect by a lock in mdev_create?
> >>>>>       
> >>>>
> >>>> Simple, in your suggestion mdev-host device. Fake device will go away if
> >>>> vendor driver unregisters the device from mdev module, right.    
> >>>
> >>> I don't follow the reply here, but aiui there's ordering implicit in
> >>> the device core that Jike is trying to take advantage of that
> >>> simplifies the mdev layer significantly.  In the case of an
> >>> mdev_create, the device core needs to take a reference to the parent
> >>> object, the mdev-host I'd guess in Jike's version, the created mdev
> >>> device would also have a reference to that object, so the physical host
> >>> device could not be removed so long as there are outstanding
> >>> references.  It's just a matter of managing references and acquiring
> >>> and releasing objects.  Thanks,
> >>>    
> >>
> >> I do think this could be simplified without having extra node.  
> > 
> > The simplification is really what I'm after, whether or not it includes
> > an extra device node is not something I'm sure I have an opinion on
> > yet.  Aren't we really just talking about an extra sysfs directory
> > node?  
> 
> No, not just extra sysfs directory. I'm trying to keep parent/child
> relationship between physical device and mdev device direct without
> having extra device node.

But it's just a difference of whether the mdev devices are rooted at
the parent itself or a child of the parent.  It's just a slight
organizational change, isn't it?

> > Doesn't it make it easier for userspace to efficiently identify
> > all the mdev children when they're segregated from the other attributes
> > and sub-nodes of a parent device?
> >    
> 
> Physical devices are generally leaf nodes. I think its easier to find
> all mdev children in its own directory.

So we're just going to drop a bunch of UUID links in the parent device
sysfs directory, userspace needs to follow the link to figure out what
they are?  Doesn't that seem messy?  SR-IOV VFs are prefixed with
"virtfn" in the parent directory, putting them under an mdev-host
device is just another way to implement that organization.

> >>> the created mdev
> >>> device would also have a reference to that object, so the physical host
> >>> device could not be removed so long as there are outstanding
> >>> references.    
> >>
> >> Yes, this is also true when physical device is direct parent of mdev
> >> device. mdev device keeps reference of parent, so physical host device
> >> could not be removed as long as mdev devices are present. That is why
> >> from mdev_unregister_device() a chance is given to free all child mdev
> >> devices.  
> > 
> > But why aren't we using the device core do do that?  It seems like
> > we're getting hung up on this device node, which is more of a stylistic
> > and sysfs layout issue when the primary comment is to simplify the mdev
> > infrastructure by taking more advantage of the parent/child
> > dependencies of the device core.  Thanks,
> >   
> 
> Definitely we would use device core parent/child dependency and simplify
> mdev framework without having extra device node.

That's great, I wasn't sure if you were rejecting the whole
parent/child aspect or just the extra node.  Still, I don't see the
mdev-host node as an outright bad idea, it seems like just a way to
classify a set of child devices.  In the SR-IOV case the VFs are actual
PCI devices spawned from the parent device, so making them direct
children of the PF might make sense.  Here we're going through this
mdev core interface to create mdev devices and an mdev-host device also
seems like a realistic representation of that.  In any case, it seems
like a fairly minor tweak to add or remove it, we can argue one way or
the other on the next draft.  Thanks,

Alex
diff mbox

Patch

diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index da6e2ce77495..23eced02aaf6 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -48,4 +48,5 @@  menuconfig VFIO_NOIOMMU
 
 source "drivers/vfio/pci/Kconfig"
 source "drivers/vfio/platform/Kconfig"
+source "drivers/vfio/mdev/Kconfig"
 source "virt/lib/Kconfig"
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 7b8a31f63fea..4a23c13b6be4 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -7,3 +7,4 @@  obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
 obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o
 obj-$(CONFIG_VFIO_PCI) += pci/
 obj-$(CONFIG_VFIO_PLATFORM) += platform/
+obj-$(CONFIG_VFIO_MDEV) += mdev/
diff --git a/drivers/vfio/mdev/Kconfig b/drivers/vfio/mdev/Kconfig
new file mode 100644
index 000000000000..a34fbc66f92f
--- /dev/null
+++ b/drivers/vfio/mdev/Kconfig
@@ -0,0 +1,12 @@ 
+
+config VFIO_MDEV
+    tristate "Mediated device driver framework"
+    depends on VFIO
+    default n
+    help
+        Provides a framework to virtualize device.
+	See Documentation/vfio-mediated-device.txt for more details.
+
+        If you don't know what do here, say N.
+
+
diff --git a/drivers/vfio/mdev/Makefile b/drivers/vfio/mdev/Makefile
new file mode 100644
index 000000000000..56a75e689582
--- /dev/null
+++ b/drivers/vfio/mdev/Makefile
@@ -0,0 +1,5 @@ 
+
+mdev-y := mdev_core.o mdev_sysfs.o mdev_driver.o
+
+obj-$(CONFIG_VFIO_MDEV) += mdev.o
+
diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
new file mode 100644
index 000000000000..9f278c7507f7
--- /dev/null
+++ b/drivers/vfio/mdev/mdev_core.c
@@ -0,0 +1,509 @@ 
+/*
+ * Mediated device Core Driver
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *     Author: Neo Jia <cjia@nvidia.com>
+ *	       Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/uuid.h>
+#include <linux/vfio.h>
+#include <linux/iommu.h>
+#include <linux/sysfs.h>
+#include <linux/mdev.h>
+
+#include "mdev_private.h"
+
+#define DRIVER_VERSION		"0.1"
+#define DRIVER_AUTHOR		"NVIDIA Corporation"
+#define DRIVER_DESC		"Mediated device Core Driver"
+
+static LIST_HEAD(parent_list);
+static DEFINE_MUTEX(parent_list_lock);
+
+static int mdev_add_attribute_group(struct device *dev,
+				    const struct attribute_group **groups)
+{
+	return sysfs_create_groups(&dev->kobj, groups);
+}
+
+static void mdev_remove_attribute_group(struct device *dev,
+					const struct attribute_group **groups)
+{
+	sysfs_remove_groups(&dev->kobj, groups);
+}
+
+/* Should be called holding parent->mdev_list_lock */
+static struct mdev_device *__find_mdev_device(struct parent_device *parent,
+					      uuid_le uuid)
+{
+	struct mdev_device *mdev;
+
+	list_for_each_entry(mdev, &parent->mdev_list, next) {
+		if (uuid_le_cmp(mdev->uuid, uuid) == 0)
+			return mdev;
+	}
+	return NULL;
+}
+
+/* Should be called holding parent_list_lock */
+static struct parent_device *__find_parent_device(struct device *dev)
+{
+	struct parent_device *parent;
+
+	list_for_each_entry(parent, &parent_list, next) {
+		if (parent->dev == dev)
+			return parent;
+	}
+	return NULL;
+}
+
+static void mdev_release_parent(struct kref *kref)
+{
+	struct parent_device *parent = container_of(kref, struct parent_device,
+						    ref);
+	kfree(parent);
+}
+
+static
+inline struct parent_device *mdev_get_parent(struct parent_device *parent)
+{
+	if (parent)
+		kref_get(&parent->ref);
+
+	return parent;
+}
+
+static inline void mdev_put_parent(struct parent_device *parent)
+{
+	if (parent)
+		kref_put(&parent->ref, mdev_release_parent);
+}
+
+static struct parent_device *mdev_get_parent_from_dev(struct device *dev)
+{
+	struct parent_device *parent;
+
+	mutex_lock(&parent_list_lock);
+	parent = mdev_get_parent(__find_parent_device(dev));
+	mutex_unlock(&parent_list_lock);
+
+	return parent;
+}
+
+static int mdev_device_create_ops(struct mdev_device *mdev, char *mdev_params)
+{
+	struct parent_device *parent = mdev->parent;
+	int ret;
+
+	ret = parent->ops->create(mdev, mdev_params);
+	if (ret)
+		return ret;
+
+	ret = mdev_add_attribute_group(&mdev->dev,
+					parent->ops->mdev_attr_groups);
+	if (ret)
+		parent->ops->destroy(mdev);
+
+	return ret;
+}
+
+static int mdev_device_destroy_ops(struct mdev_device *mdev, bool force)
+{
+	struct parent_device *parent = mdev->parent;
+	int ret = 0;
+
+	/*
+	 * If vendor driver doesn't return success that means vendor
+	 * driver doesn't support hot-unplug
+	 */
+	ret = parent->ops->destroy(mdev);
+	if (ret && !force)
+		return -EBUSY;
+
+	mdev_remove_attribute_group(&mdev->dev,
+				    parent->ops->mdev_attr_groups);
+
+	return ret;
+}
+
+static void mdev_release_device(struct kref *kref)
+{
+	struct mdev_device *mdev = container_of(kref, struct mdev_device, ref);
+	struct parent_device *parent = mdev->parent;
+
+	list_del(&mdev->next);
+
+	/*
+	 * This unlock pairs with mutex held by mdev_put_device() through
+	 * kref_put_mutex()
+	 */
+	mutex_unlock(&parent->mdev_list_lock);
+
+	device_unregister(&mdev->dev);
+	wake_up(&parent->release_done);
+	mdev_put_parent(parent);
+}
+
+struct mdev_device *mdev_get_device(struct mdev_device *mdev)
+{
+	if (mdev)
+		kref_get(&mdev->ref);
+	return mdev;
+}
+EXPORT_SYMBOL(mdev_get_device);
+
+void mdev_put_device(struct mdev_device *mdev)
+{
+	struct parent_device *parent;
+
+	if (!mdev)
+		return;
+
+	parent = mdev->parent;
+	kref_put_mutex(&mdev->ref, mdev_release_device,
+		       &parent->mdev_list_lock);
+}
+EXPORT_SYMBOL(mdev_put_device);
+
+/*
+ * mdev_register_device : Register a device
+ * @dev: device structure representing parent device.
+ * @ops: Parent device operation structure to be registered.
+ *
+ * Add device to list of registered parent devices.
+ * Returns a negative value on error, otherwise 0.
+ */
+int mdev_register_device(struct device *dev, const struct parent_ops *ops)
+{
+	int ret = 0;
+	struct parent_device *parent;
+
+	if (!dev || !ops)
+		return -EINVAL;
+
+	/* check for mandatory ops */
+	if (!ops->create || !ops->destroy)
+		return -EINVAL;
+
+	mutex_lock(&parent_list_lock);
+
+	/* Check for duplicate */
+	parent = __find_parent_device(dev);
+	if (parent) {
+		ret = -EEXIST;
+		goto add_dev_err;
+	}
+
+	parent = kzalloc(sizeof(*parent), GFP_KERNEL);
+	if (!parent) {
+		ret = -ENOMEM;
+		goto add_dev_err;
+	}
+
+	kref_init(&parent->ref);
+	list_add(&parent->next, &parent_list);
+
+	parent->dev = dev;
+	parent->ops = ops;
+	mutex_init(&parent->mdev_list_lock);
+	INIT_LIST_HEAD(&parent->mdev_list);
+	init_waitqueue_head(&parent->release_done);
+	mutex_unlock(&parent_list_lock);
+
+	ret = parent_create_sysfs_files(dev);
+	if (ret)
+		goto add_sysfs_error;
+
+	ret = mdev_add_attribute_group(dev, ops->dev_attr_groups);
+	if (ret)
+		goto add_group_error;
+
+	dev_info(dev, "MDEV: Registered\n");
+	return 0;
+
+add_group_error:
+	mdev_remove_sysfs_files(dev);
+add_sysfs_error:
+	mutex_lock(&parent_list_lock);
+	list_del(&parent->next);
+	mutex_unlock(&parent_list_lock);
+	mdev_put_parent(parent);
+	return ret;
+
+add_dev_err:
+	mutex_unlock(&parent_list_lock);
+	return ret;
+}
+EXPORT_SYMBOL(mdev_register_device);
+
+/*
+ * mdev_unregister_device : Unregister a parent device
+ * @dev: device structure representing parent device.
+ *
+ * Remove device from list of registered parent devices. Give a chance to free
+ * existing mediated devices for given device.
+ */
+
+void mdev_unregister_device(struct device *dev)
+{
+	struct parent_device *parent;
+	struct mdev_device *mdev = NULL;
+	int ret;
+
+	mutex_lock(&parent_list_lock);
+	parent = __find_parent_device(dev);
+
+	if (!parent) {
+		mutex_unlock(&parent_list_lock);
+		return;
+	}
+	dev_info(dev, "MDEV: Unregistering\n");
+
+	/*
+	 * Remove parent from the list and remove "mdev_create" and
+	 * "mdev_destroy" sysfs files so that no new mediated device could be
+	 * created for this parent
+	 */
+	list_del(&parent->next);
+	parent_remove_sysfs_files(dev);
+	mutex_unlock(&parent_list_lock);
+
+	mdev_remove_attribute_group(dev,
+				    parent->ops->dev_attr_groups);
+
+	while (!list_empty(&parent->mdev_list)) {
+		mutex_lock(&parent->mdev_list_lock);
+		if (!list_empty(&parent->mdev_list)) {
+			mdev = list_first_entry(&parent->mdev_list,
+						struct mdev_device, next);
+			mdev_device_destroy_ops(mdev, true);
+		}
+		mutex_unlock(&parent->mdev_list_lock);
+
+		if (mdev)
+			mdev_put_device(mdev);
+	}
+
+	do {
+		ret = wait_event_interruptible_timeout(parent->release_done,
+				list_empty(&parent->mdev_list), HZ * 10);
+		if (ret == -ERESTARTSYS) {
+			dev_warn(dev, "Mediated devices are in use, task"
+				      " \"%s\" (%d) "
+				      "blocked until all are released",
+				      current->comm, task_pid_nr(current));
+		}
+	} while (ret <= 0);
+
+	mdev_put_parent(parent);
+}
+EXPORT_SYMBOL(mdev_unregister_device);
+
+/*
+ * Functions required for mdev_sysfs
+ */
+static void mdev_device_release(struct device *dev)
+{
+	struct mdev_device *mdev = to_mdev_device(dev);
+
+	dev_dbg(&mdev->dev, "MDEV: destroying\n");
+	kfree(mdev);
+}
+
+int mdev_device_create(struct device *dev, uuid_le uuid, char *mdev_params)
+{
+	int ret;
+	struct mdev_device *mdev;
+	struct parent_device *parent;
+
+	parent = mdev_get_parent_from_dev(dev);
+	if (!parent)
+		return -EINVAL;
+
+	mutex_lock(&parent->mdev_list_lock);
+	/* Check for duplicate */
+	mdev = __find_mdev_device(parent, uuid);
+	if (mdev) {
+		ret = -EEXIST;
+		goto create_err;
+	}
+
+	mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
+	if (!mdev) {
+		ret = -ENOMEM;
+		goto create_err;
+	}
+
+	memcpy(&mdev->uuid, &uuid, sizeof(uuid_le));
+	mdev->parent = parent;
+	kref_init(&mdev->ref);
+
+	mdev->dev.parent  = dev;
+	mdev->dev.bus     = &mdev_bus_type;
+	mdev->dev.release = mdev_device_release;
+	dev_set_name(&mdev->dev, "%pUl", uuid.b);
+
+	ret = device_register(&mdev->dev);
+	if (ret) {
+		put_device(&mdev->dev);
+		goto create_err;
+	}
+
+	ret = mdev_device_create_ops(mdev, mdev_params);
+	if (ret)
+		goto create_failed;
+
+	ret = mdev_create_sysfs_files(&mdev->dev);
+	if (ret)
+		goto create_sysfs_error;
+
+	list_add(&mdev->next, &parent->mdev_list);
+	mutex_unlock(&parent->mdev_list_lock);
+
+	dev_dbg(&mdev->dev, "MDEV: created\n");
+
+	return ret;
+
+create_sysfs_error:
+	mdev_device_destroy_ops(mdev, true);
+
+create_failed:
+	device_unregister(&mdev->dev);
+
+create_err:
+	mutex_unlock(&parent->mdev_list_lock);
+	mdev_put_parent(parent);
+	return ret;
+}
+
+int mdev_device_destroy(struct device *dev, uuid_le uuid)
+{
+	struct mdev_device *mdev;
+	struct parent_device *parent;
+	int ret;
+
+	parent = mdev_get_parent_from_dev(dev);
+	if (!parent)
+		return -ENODEV;
+
+	mutex_lock(&parent->mdev_list_lock);
+	mdev = __find_mdev_device(parent, uuid);
+	if (!mdev) {
+		ret = -EINVAL;
+		goto destroy_err;
+	}
+
+	mdev_remove_sysfs_files(&mdev->dev);
+	ret = mdev_device_destroy_ops(mdev, false);
+	if (ret)
+		goto destroy_err;
+
+	mutex_unlock(&parent->mdev_list_lock);
+	mdev_put_device(mdev);
+
+	mdev_put_parent(parent);
+	return ret;
+
+destroy_err:
+	mutex_unlock(&parent->mdev_list_lock);
+	mdev_put_parent(parent);
+	return ret;
+}
+
+void mdev_device_supported_config(struct device *dev, char *str)
+{
+	struct parent_device *parent;
+
+	parent = mdev_get_parent_from_dev(dev);
+
+	if (parent) {
+		if (parent->ops->supported_config)
+			parent->ops->supported_config(parent->dev, str);
+		mdev_put_parent(parent);
+	}
+}
+
+int mdev_device_set_online_status(struct device *dev, bool online)
+{
+	int ret = 0;
+	struct mdev_device *mdev;
+	struct parent_device *parent;
+
+	mdev = mdev_get_device(to_mdev_device(dev));
+	if (!mdev)
+		return -EINVAL;
+
+	parent = mdev->parent;
+
+	if (parent->ops->set_online_status)
+		ret = parent->ops->set_online_status(mdev, online);
+
+	if (ret)
+		pr_err("mdev online failed  %d\n", ret);
+	else {
+		if (online)
+			kobject_uevent(&mdev->dev.kobj, KOBJ_ONLINE);
+		else
+			kobject_uevent(&mdev->dev.kobj, KOBJ_OFFLINE);
+	}
+
+	mdev_put_device(mdev);
+
+	return ret;
+}
+
+int mdev_device_get_online_status(struct device *dev, bool *online)
+{
+	int ret = 0;
+	struct mdev_device *mdev;
+	struct parent_device *parent;
+
+	mdev = mdev_get_device(to_mdev_device(dev));
+	if (!mdev)
+		return -EINVAL;
+
+	parent = mdev->parent;
+
+	if (parent->ops->get_online_status)
+		ret = parent->ops->get_online_status(mdev, online);
+
+	mdev_put_device(mdev);
+
+	return ret;
+}
+
+static int __init mdev_init(void)
+{
+	int ret;
+
+	ret = mdev_bus_register();
+	if (ret) {
+		pr_err("Failed to register mdev bus\n");
+		return ret;
+	}
+
+	return ret;
+}
+
+static void __exit mdev_exit(void)
+{
+	mdev_bus_unregister();
+}
+
+module_init(mdev_init)
+module_exit(mdev_exit)
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c
new file mode 100644
index 000000000000..8afc2d8e5c04
--- /dev/null
+++ b/drivers/vfio/mdev/mdev_driver.c
@@ -0,0 +1,131 @@ 
+/*
+ * MDEV driver
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *     Author: Neo Jia <cjia@nvidia.com>
+ *	       Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/device.h>
+#include <linux/iommu.h>
+#include <linux/mdev.h>
+
+#include "mdev_private.h"
+
+static int mdev_attach_iommu(struct mdev_device *mdev)
+{
+	int ret;
+	struct iommu_group *group;
+
+	group = iommu_group_alloc();
+	if (IS_ERR(group)) {
+		dev_err(&mdev->dev, "MDEV: failed to allocate group!\n");
+		return PTR_ERR(group);
+	}
+
+	ret = iommu_group_add_device(group, &mdev->dev);
+	if (ret) {
+		dev_err(&mdev->dev, "MDEV: failed to add dev to group!\n");
+		goto attach_fail;
+	}
+
+	mdev->group = group;
+
+	dev_info(&mdev->dev, "MDEV: group_id = %d\n",
+				 iommu_group_id(group));
+attach_fail:
+	iommu_group_put(group);
+	return ret;
+}
+
+static void mdev_detach_iommu(struct mdev_device *mdev)
+{
+	iommu_group_remove_device(&mdev->dev);
+	mdev->group = NULL;
+	dev_info(&mdev->dev, "MDEV: detaching iommu\n");
+}
+
+static int mdev_probe(struct device *dev)
+{
+	struct mdev_driver *drv = to_mdev_driver(dev->driver);
+	struct mdev_device *mdev = to_mdev_device(dev);
+	int ret;
+
+	ret = mdev_attach_iommu(mdev);
+	if (ret) {
+		dev_err(dev, "Failed to attach IOMMU\n");
+		return ret;
+	}
+
+	if (drv && drv->probe)
+		ret = drv->probe(dev);
+
+	if (ret)
+		mdev_detach_iommu(mdev);
+
+	return ret;
+}
+
+static int mdev_remove(struct device *dev)
+{
+	struct mdev_driver *drv = to_mdev_driver(dev->driver);
+	struct mdev_device *mdev = to_mdev_device(dev);
+
+	if (drv && drv->remove)
+		drv->remove(dev);
+
+	mdev_detach_iommu(mdev);
+
+	return 0;
+}
+
+struct bus_type mdev_bus_type = {
+	.name		= "mdev",
+	.probe		= mdev_probe,
+	.remove		= mdev_remove,
+};
+EXPORT_SYMBOL_GPL(mdev_bus_type);
+
+/*
+ * mdev_register_driver - register a new MDEV driver
+ * @drv: the driver to register
+ * @owner: module owner of driver to be registered
+ *
+ * Returns a negative value on error, otherwise 0.
+ */
+int mdev_register_driver(struct mdev_driver *drv, struct module *owner)
+{
+	/* initialize common driver fields */
+	drv->driver.name = drv->name;
+	drv->driver.bus = &mdev_bus_type;
+	drv->driver.owner = owner;
+
+	/* register with core */
+	return driver_register(&drv->driver);
+}
+EXPORT_SYMBOL(mdev_register_driver);
+
+/*
+ * mdev_unregister_driver - unregister MDEV driver
+ * @drv: the driver to unregister
+ *
+ */
+void mdev_unregister_driver(struct mdev_driver *drv)
+{
+	driver_unregister(&drv->driver);
+}
+EXPORT_SYMBOL(mdev_unregister_driver);
+
+int mdev_bus_register(void)
+{
+	return bus_register(&mdev_bus_type);
+}
+
+void mdev_bus_unregister(void)
+{
+	bus_unregister(&mdev_bus_type);
+}
diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h
new file mode 100644
index 000000000000..07ad1b381370
--- /dev/null
+++ b/drivers/vfio/mdev/mdev_private.h
@@ -0,0 +1,36 @@ 
+/*
+ * Mediated device interal definitions
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *     Author: Neo Jia <cjia@nvidia.com>
+ *	       Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef MDEV_PRIVATE_H
+#define MDEV_PRIVATE_H
+
+int  mdev_bus_register(void);
+void mdev_bus_unregister(void);
+
+/* Function prototypes for mdev_sysfs */
+
+extern struct class_attribute mdev_class_attrs[];
+
+int  parent_create_sysfs_files(struct device *dev);
+void parent_remove_sysfs_files(struct device *dev);
+
+int  mdev_create_sysfs_files(struct device *dev);
+void mdev_remove_sysfs_files(struct device *dev);
+
+int  mdev_device_create(struct device *dev, uuid_le uuid, char *mdev_params);
+int  mdev_device_destroy(struct device *dev, uuid_le uuid);
+void mdev_device_supported_config(struct device *dev, char *str);
+
+int mdev_device_set_online_status(struct device *dev, bool online);
+int mdev_device_get_online_status(struct device *dev, bool *online);
+
+#endif /* MDEV_PRIVATE_H */
diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c
new file mode 100644
index 000000000000..ed55cd5d6595
--- /dev/null
+++ b/drivers/vfio/mdev/mdev_sysfs.c
@@ -0,0 +1,240 @@ 
+/*
+ * File attributes for Mediated devices
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *     Author: Neo Jia <cjia@nvidia.com>
+ *	       Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/sysfs.h>
+#include <linux/ctype.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/uuid.h>
+#include <linux/mdev.h>
+
+#include "mdev_private.h"
+
+/* Prototypes */
+static ssize_t mdev_supported_types_show(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf);
+static DEVICE_ATTR_RO(mdev_supported_types);
+
+static ssize_t mdev_create_store(struct device *dev,
+				 struct device_attribute *attr,
+				 const char *buf, size_t count);
+static DEVICE_ATTR_WO(mdev_create);
+
+static ssize_t mdev_destroy_store(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count);
+static DEVICE_ATTR_WO(mdev_destroy);
+
+static ssize_t online_store(struct device *dev, struct device_attribute *attr,
+			    const char *buf, size_t count);
+static ssize_t online_show(struct device *dev, struct device_attribute *attr,
+			   char *buf);
+static DEVICE_ATTR_RW(online);
+
+/* Static functions */
+
+#define SUPPORTED_TYPE_BUFFER_LENGTH	4096
+
+/* mdev sysfs Functions */
+static ssize_t mdev_supported_types_show(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf)
+{
+	char *str, *ptr;
+	ssize_t n;
+
+	str = kzalloc(sizeof(*str) * SUPPORTED_TYPE_BUFFER_LENGTH, GFP_KERNEL);
+	if (!str)
+		return -ENOMEM;
+
+	ptr = str;
+	mdev_device_supported_config(dev, str);
+
+	n = sprintf(buf, "%s\n", str);
+	kfree(ptr);
+
+	return n;
+}
+
+static ssize_t mdev_create_store(struct device *dev,
+				 struct device_attribute *attr,
+				 const char *buf, size_t count)
+{
+	char *str, *pstr;
+	char *uuid_str, *mdev_params = NULL, *params = NULL;
+	uuid_le uuid;
+	int ret;
+
+	pstr = str = kstrndup(buf, count, GFP_KERNEL);
+
+	if (!str)
+		return -ENOMEM;
+
+	uuid_str = strsep(&str, ":");
+	if (!uuid_str) {
+		pr_err("mdev_create: Empty UUID string %s\n", buf);
+		ret = -EINVAL;
+		goto create_error;
+	}
+
+	if (str)
+		params = mdev_params = kstrdup(str, GFP_KERNEL);
+
+	ret = uuid_le_to_bin(uuid_str, &uuid);
+	if (ret) {
+		pr_err("mdev_create: UUID parse error %s\n", buf);
+		goto create_error;
+	}
+
+	ret = mdev_device_create(dev, uuid, mdev_params);
+	if (ret)
+		pr_err("mdev_create: Failed to create mdev device\n");
+	else
+		ret = count;
+
+create_error:
+	kfree(params);
+	kfree(pstr);
+	return ret;
+}
+
+static ssize_t mdev_destroy_store(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
+{
+	char *uuid_str, *str, *pstr;
+	uuid_le uuid;
+	int ret;
+
+	str = pstr = kstrndup(buf, count, GFP_KERNEL);
+
+	if (!str)
+		return -ENOMEM;
+
+	uuid_str = strsep(&str, ":");
+	if (!uuid_str) {
+		pr_err("mdev_destroy: Empty UUID string %s\n", buf);
+		ret = -EINVAL;
+		goto destroy_error;
+	}
+
+	ret = uuid_le_to_bin(uuid_str, &uuid);
+	if (ret) {
+		pr_err("mdev_destroy: UUID parse error  %s\n", buf);
+		goto destroy_error;
+	}
+
+	ret = mdev_device_destroy(dev, uuid);
+	if (ret == 0)
+		ret = count;
+
+destroy_error:
+	kfree(pstr);
+	return ret;
+}
+
+static ssize_t online_store(struct device *dev, struct device_attribute *attr,
+			    const char *buf, size_t count)
+{
+	char *str;
+	int ret;
+	uint32_t online_status;
+	bool online;
+
+	str = kstrndup(buf, count, GFP_KERNEL);
+	if (!str)
+		return -ENOMEM;
+
+	ret = kstrtouint(str, 0, &online_status);
+	kfree(str);
+
+	if (ret) {
+		pr_err("online: parsing error %s\n", buf);
+		return ret;
+	}
+
+	online = online_status > 0 ? true : false;
+
+	ret = mdev_device_set_online_status(dev, online);
+	if (ret)
+		return ret;
+
+	return count;
+}
+
+static ssize_t online_show(struct device *dev, struct device_attribute *attr,
+			   char *buf)
+{
+	int ret;
+	bool online = false;
+
+	ret = mdev_device_get_online_status(dev, &online);
+	if (ret)
+		return ret;
+
+	ret = sprintf(buf, "%d\n", online);
+	return ret;
+}
+
+int parent_create_sysfs_files(struct device *dev)
+{
+	int ret;
+
+	ret = sysfs_create_file(&dev->kobj,
+				&dev_attr_mdev_supported_types.attr);
+	if (ret) {
+		pr_err("Failed to create mdev_supported_types sysfs entry\n");
+		return ret;
+	}
+
+	ret = sysfs_create_file(&dev->kobj, &dev_attr_mdev_create.attr);
+	if (ret) {
+		pr_err("Failed to create mdev_create sysfs entry\n");
+		goto create_sysfs_failed;
+	}
+
+	ret = sysfs_create_file(&dev->kobj, &dev_attr_mdev_destroy.attr);
+	if (ret) {
+		pr_err("Failed to create mdev_destroy sysfs entry\n");
+		sysfs_remove_file(&dev->kobj, &dev_attr_mdev_create.attr);
+	} else
+		return ret;
+
+create_sysfs_failed:
+	sysfs_remove_file(&dev->kobj, &dev_attr_mdev_supported_types.attr);
+	return ret;
+}
+
+void parent_remove_sysfs_files(struct device *dev)
+{
+	sysfs_remove_file(&dev->kobj, &dev_attr_mdev_supported_types.attr);
+	sysfs_remove_file(&dev->kobj, &dev_attr_mdev_create.attr);
+	sysfs_remove_file(&dev->kobj, &dev_attr_mdev_destroy.attr);
+}
+
+int mdev_create_sysfs_files(struct device *dev)
+{
+	int ret;
+
+	ret = sysfs_create_file(&dev->kobj, &dev_attr_online.attr);
+	if (ret)
+		pr_err("Failed to create 'online' entry\n");
+
+	return ret;
+}
+
+void mdev_remove_sysfs_files(struct device *dev)
+{
+	sysfs_remove_file(&dev->kobj, &dev_attr_online.attr);
+}
+
diff --git a/include/linux/mdev.h b/include/linux/mdev.h
new file mode 100644
index 000000000000..babcb7293199
--- /dev/null
+++ b/include/linux/mdev.h
@@ -0,0 +1,212 @@ 
+/*
+ * Mediated device definition
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *     Author: Neo Jia <cjia@nvidia.com>
+ *	       Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef MDEV_H
+#define MDEV_H
+
+#include <uapi/linux/vfio.h>
+
+struct parent_device;
+
+/*
+ * Mediated device
+ */
+
+struct mdev_device {
+	struct device		dev;
+	struct parent_device	*parent;
+	struct iommu_group	*group;
+	uuid_le			uuid;
+	void			*driver_data;
+
+	/* internal only */
+	struct kref		ref;
+	struct list_head	next;
+};
+
+
+/**
+ * struct parent_ops - Structure to be registered for each parent device to
+ * register the device to mdev module.
+ *
+ * @owner:		The module owner.
+ * @dev_attr_groups:	Default attributes of the parent device.
+ * @mdev_attr_groups:	Default attributes of the mediated device.
+ * @supported_config:	Called to get information about supported types.
+ *			@dev : device structure of parent device.
+ *			@config: should return string listing supported config
+ *			Returns integer: success (0) or error (< 0)
+ * @create:		Called to allocate basic resources in parent device's
+ *			driver for a particular mediated device. It is
+ *			mandatory to provide create ops.
+ *			@mdev: mdev_device structure on of mediated device
+ *			      that is being created
+ *			@mdev_params: extra parameters required by parent
+ *			device's driver.
+ *			Returns integer: success (0) or error (< 0)
+ * @destroy:		Called to free resources in parent device's driver for a
+ *			a mediated device. It is mandatory to provide destroy
+ *			ops.
+ *			@mdev: mdev_device device structure which is being
+ *			       destroyed
+ *			Returns integer: success (0) or error (< 0)
+ *			If VMM is running and destroy() is called that means the
+ *			mdev is being hotunpluged. Return error if VMM is
+ *			running and driver doesn't support mediated device
+ *			hotplug.
+ * @reset:		Called to reset mediated device.
+ *			@mdev: mdev_device device structure.
+ *			Returns integer: success (0) or error (< 0)
+ * @set_online_status:	Called to change to status of mediated device.
+ *			@mdev: mediated device.
+ *			@online: set true or false to make mdev device online or
+ *			offline.
+ *			Returns integer: success (0) or error (< 0)
+ * @get_online_status:	Called to get online/offline status of  mediated device
+ *			@mdev: mediated device.
+ *			@online: Returns status of mediated device.
+ *			Returns integer: success (0) or error (< 0)
+ * @read:		Read emulation callback
+ *			@mdev: mediated device structure
+ *			@buf: read buffer
+ *			@count: number of bytes to read
+ *			@pos: address.
+ *			Retuns number on bytes read on success or error.
+ * @write:		Write emulation callback
+ *			@mdev: mediated device structure
+ *			@buf: write buffer
+ *			@count: number of bytes to be written
+ *			@pos: address.
+ *			Retuns number on bytes written on success or error.
+ * @get_irq_info:	Called to retrieve information about mediated device IRQ
+ *			@mdev: mediated device structure
+ *			@irq_info: VFIO IRQ flags and count.
+ *			Returns integer: success (0) or error (< 0)
+ * @set_irqs:		Called to send about interrupts configuration
+ *			information that VMM sets.
+ *			@mdev: mediated device structure
+ *			@flags, index, start, count and *data : same as that of
+ *			struct vfio_irq_set of VFIO_DEVICE_SET_IRQS API.
+ * @get_device_info:	Called to get VFIO device information for a mediated
+ *			device.
+ *			@vfio_device_info: VFIO device info.
+ *			Returns integer: success (0) or error (< 0)
+ * @get_region_info:	Called to get VFIO region size and flags of mediated
+ *			device.
+ *			@mdev: mediated device structure
+ *			@region_info: output, returns size and flags of
+ *				      requested region.
+ *			@cap_type_id: returns id of capability.
+ *			@cap_type: returns pointer to capability structure
+ *			corresponding to capability id.
+ *			Returns integer: success (0) or error (< 0)
+ *
+ * Parent device that support mediated device should be registered with mdev
+ * module with parent_ops structure.
+ */
+
+struct parent_ops {
+	struct module   *owner;
+	const struct attribute_group **dev_attr_groups;
+	const struct attribute_group **mdev_attr_groups;
+
+	int	(*supported_config)(struct device *dev, char *config);
+	int     (*create)(struct mdev_device *mdev, char *mdev_params);
+	int     (*destroy)(struct mdev_device *mdev);
+	int     (*reset)(struct mdev_device *mdev);
+	int     (*set_online_status)(struct mdev_device *mdev, bool online);
+	int     (*get_online_status)(struct mdev_device *mdev, bool *online);
+	ssize_t (*read)(struct mdev_device *mdev, char *buf, size_t count,
+			loff_t pos);
+	ssize_t (*write)(struct mdev_device *mdev, char *buf, size_t count,
+			 loff_t pos);
+	int	(*mmap)(struct mdev_device *mdev, struct vm_area_struct *vma);
+	int	(*get_irq_info)(struct mdev_device *mdev,
+				struct vfio_irq_info *irq_info);
+	int     (*set_irqs)(struct mdev_device *mdev, uint32_t flags,
+			    unsigned int index, unsigned int start,
+			    unsigned int count, void *data);
+	int	(*get_device_info)(struct mdev_device *mdev,
+				   struct vfio_device_info *dev_info);
+	int	(*get_region_info)(struct mdev_device *mdev,
+				   struct vfio_region_info *region_info,
+				   u16 *cap_type_id, void **cap_type);
+};
+
+/*
+ * Parent Device
+ */
+
+struct parent_device {
+	struct device		*dev;
+	const struct parent_ops	*ops;
+
+	/* internal */
+	struct kref		ref;
+	struct list_head	next;
+	struct list_head	mdev_list;
+	struct mutex		mdev_list_lock;
+	wait_queue_head_t	release_done;
+};
+
+/**
+ * struct mdev_driver - Mediated device driver
+ * @name: driver name
+ * @probe: called when new device created
+ * @remove: called when device removed
+ * @driver: device driver structure
+ *
+ **/
+struct mdev_driver {
+	const char *name;
+	int  (*probe)(struct device *dev);
+	void (*remove)(struct device *dev);
+	struct device_driver driver;
+};
+
+static inline struct mdev_driver *to_mdev_driver(struct device_driver *drv)
+{
+	return drv ? container_of(drv, struct mdev_driver, driver) : NULL;
+}
+
+static inline struct mdev_device *to_mdev_device(struct device *dev)
+{
+	return dev ? container_of(dev, struct mdev_device, dev) : NULL;
+}
+
+static inline void *mdev_get_drvdata(struct mdev_device *mdev)
+{
+	return mdev->driver_data;
+}
+
+static inline void mdev_set_drvdata(struct mdev_device *mdev, void *data)
+{
+	mdev->driver_data = data;
+}
+
+extern struct bus_type mdev_bus_type;
+
+#define dev_is_mdev(d) ((d)->bus == &mdev_bus_type)
+
+extern int  mdev_register_device(struct device *dev,
+				 const struct parent_ops *ops);
+extern void mdev_unregister_device(struct device *dev);
+
+extern int  mdev_register_driver(struct mdev_driver *drv, struct module *owner);
+extern void mdev_unregister_driver(struct mdev_driver *drv);
+
+extern struct mdev_device *mdev_get_device(struct mdev_device *mdev);
+extern void mdev_put_device(struct mdev_device *mdev);
+
+extern struct mdev_device *mdev_get_device_by_group(struct iommu_group *group);
+
+#endif /* MDEV_H */