diff mbox

[v6,1/4] vfio: Mediated device Core driver

Message ID 1470251034-1555-2-git-send-email-kwankhede@nvidia.com
State New
Headers show

Commit Message

Kirti Wankhede Aug. 3, 2016, 7:03 p.m. UTC
Design for Mediated Device Driver:
Main purpose of this driver is to provide a common interface for mediated
device management that can be used by different drivers of different
devices.

This module provides a generic interface to create the device, add it to
mediated bus, add device to IOMMU group and then add it to vfio group.

Below is the high Level block diagram, with Nvidia, Intel and IBM devices
as example, since these are the devices which are going to actively use
this module as of now.

 +---------------+
 |               |
 | +-----------+ |  mdev_register_driver() +--------------+
 | |           | +<------------------------+ __init()     |
 | |           | |                         |              |
 | |  mdev     | +------------------------>+              |<-> VFIO user
 | |  bus      | |     probe()/remove()    | vfio_mpci.ko |    APIs
 | |  driver   | |                         |              |
 | |           | |                         +--------------+
 | |           | |  mdev_register_driver() +--------------+
 | |           | +<------------------------+ __init()     |
 | |           | |                         |              |
 | |           | +------------------------>+              |<-> VFIO user
 | +-----------+ |     probe()/remove()    | vfio_mccw.ko |    APIs
 |               |                         |              |
 |  MDEV CORE    |                         +--------------+
 |   MODULE      |
 |   mdev.ko     |
 | +-----------+ |  mdev_register_device() +--------------+
 | |           | +<------------------------+              |
 | |           | |                         |  nvidia.ko   |<-> physical
 | |           | +------------------------>+              |    device
 | |           | |        callback         +--------------+
 | | Physical  | |
 | |  device   | |  mdev_register_device() +--------------+
 | | interface | |<------------------------+              |
 | |           | |                         |  i915.ko     |<-> physical
 | |           | +------------------------>+              |    device
 | |           | |        callback         +--------------+
 | |           | |
 | |           | |  mdev_register_device() +--------------+
 | |           | +<------------------------+              |
 | |           | |                         | ccw_device.ko|<-> physical
 | |           | +------------------------>+              |    device
 | |           | |        callback         +--------------+
 | +-----------+ |
 +---------------+

Core driver provides two types of registration interfaces:
1. Registration interface for mediated bus driver:

/**
  * struct mdev_driver - Mediated device's driver
  * @name: driver name
  * @probe: called when new device created
  * @remove:called when device removed
  * @match: called when new device or driver is added for this bus.
	    Return 1 if given device can be handled by given driver and
	    zero otherwise.
  * @driver:device driver structure
  *
  **/
struct mdev_driver {
         const char *name;
         int  (*probe)  (struct device *dev);
         void (*remove) (struct device *dev);
         int  (*match)(struct device *dev);
         struct device_driver    driver;
};

int  mdev_register_driver(struct mdev_driver *drv, struct module *owner);
void mdev_unregister_driver(struct mdev_driver *drv);

Mediated device's driver for mdev should use this interface to register
with Core driver. With this, mediated devices driver for such devices is
responsible to add mediated device to VFIO group.

2. Physical device driver interface
This interface provides vendor driver the set APIs to manage physical
device related work in their own driver. APIs are :
- supported_config: provide supported configuration list by the vendor
		    driver
- create: to allocate basic resources in vendor driver for a mediated
	  device.
- destroy: to free resources in vendor driver when mediated device is
	   destroyed.
- reset: to free and reallocate resources in vendor driver during reboot
- start: to initiate mediated device initialization process from vendor
	 driver
- shutdown: to teardown mediated device resources during teardown.
- read : read emulation callback.
- write: write emulation callback.
- set_irqs: send interrupt configuration information that VMM sets.
- get_region_info: to provide region size and its flags for the mediated
		   device.
- validate_map_request: to validate remap pfn request.

This registration interface should be used by vendor drivers to register
each physical device to mdev core driver.
Locks to serialize above callbacks are removed. If required, vendor driver
can have locks to serialize above APIs in their driver.

Added support to keep track of physical mappings for each mdev device.
APIs to be used by mediated device bus driver to add and delete mappings to
tracking logic:
int mdev_add_phys_mapping(struct mdev_device *mdev,
                          struct address_space *mapping,
                          unsigned long addr, unsigned long size)
void mdev_del_phys_mapping(struct mdev_device *mdev, unsigned long addr)

API to be used by vendor driver to invalidate mapping:
int mdev_device_invalidate_mapping(struct mdev_device *mdev,
                                   unsigned long addr, unsigned long size)

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Signed-off-by: Neo Jia <cjia@nvidia.com>
Change-Id: I73a5084574270b14541c529461ea2f03c292d510
---
 drivers/vfio/Kconfig             |   1 +
 drivers/vfio/Makefile            |   1 +
 drivers/vfio/mdev/Kconfig        |  12 +
 drivers/vfio/mdev/Makefile       |   5 +
 drivers/vfio/mdev/mdev_core.c    | 676 +++++++++++++++++++++++++++++++++++++++
 drivers/vfio/mdev/mdev_driver.c  | 142 ++++++++
 drivers/vfio/mdev/mdev_private.h |  33 ++
 drivers/vfio/mdev/mdev_sysfs.c   | 269 ++++++++++++++++
 include/linux/mdev.h             | 236 ++++++++++++++
 9 files changed, 1375 insertions(+)
 create mode 100644 drivers/vfio/mdev/Kconfig
 create mode 100644 drivers/vfio/mdev/Makefile
 create mode 100644 drivers/vfio/mdev/mdev_core.c
 create mode 100644 drivers/vfio/mdev/mdev_driver.c
 create mode 100644 drivers/vfio/mdev/mdev_private.h
 create mode 100644 drivers/vfio/mdev/mdev_sysfs.c
 create mode 100644 include/linux/mdev.h

Comments

Tian, Kevin Aug. 4, 2016, 7:21 a.m. UTC | #1
> From: Kirti Wankhede [mailto:kwankhede@nvidia.com]
> Sent: Thursday, August 04, 2016 3:04 AM
> 
> 
> 2. Physical device driver interface
> This interface provides vendor driver the set APIs to manage physical
> device related work in their own driver. APIs are :
> - supported_config: provide supported configuration list by the vendor
> 		    driver
> - create: to allocate basic resources in vendor driver for a mediated
> 	  device.
> - destroy: to free resources in vendor driver when mediated device is
> 	   destroyed.
> - reset: to free and reallocate resources in vendor driver during reboot

Currently I saw 'reset' callback only invoked from VFIO ioctl path. Do 
you think whether it makes sense to expose a sysfs 'reset' node too,
similar to what people see under a PCI device node?

> - start: to initiate mediated device initialization process from vendor
> 	 driver
> - shutdown: to teardown mediated device resources during teardown.

I think 'shutdown' should be 'stop' based on actual code.

Thanks
Kevin
Kirti Wankhede Aug. 5, 2016, 6:13 a.m. UTC | #2
On 8/4/2016 12:51 PM, Tian, Kevin wrote:
>> From: Kirti Wankhede [mailto:kwankhede@nvidia.com]
>> Sent: Thursday, August 04, 2016 3:04 AM
>>
>>
>> 2. Physical device driver interface
>> This interface provides vendor driver the set APIs to manage physical
>> device related work in their own driver. APIs are :
>> - supported_config: provide supported configuration list by the vendor
>> 		    driver
>> - create: to allocate basic resources in vendor driver for a mediated
>> 	  device.
>> - destroy: to free resources in vendor driver when mediated device is
>> 	   destroyed.
>> - reset: to free and reallocate resources in vendor driver during reboot
> 
> Currently I saw 'reset' callback only invoked from VFIO ioctl path. Do 
> you think whether it makes sense to expose a sysfs 'reset' node too,
> similar to what people see under a PCI device node?
> 

All vendor drivers might not support reset of mdev from sysfs. But those
who want to support can expose 'reset' node using 'mdev_attr_groups' of
'struct parent_ops'.


>> - start: to initiate mediated device initialization process from vendor
>> 	 driver
>> - shutdown: to teardown mediated device resources during teardown.
> 
> I think 'shutdown' should be 'stop' based on actual code.
>

Thanks for catching that, yes I missed to updated here.

Thanks,
Kirti

> Thanks
> Kevin
>
Alex Williamson Aug. 9, 2016, 7 p.m. UTC | #3
On Thu, 4 Aug 2016 00:33:51 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> Design for Mediated Device Driver:
> Main purpose of this driver is to provide a common interface for mediated
> device management that can be used by different drivers of different
> devices.
> 
> This module provides a generic interface to create the device, add it to
> mediated bus, add device to IOMMU group and then add it to vfio group.
> 
> Below is the high Level block diagram, with Nvidia, Intel and IBM devices
> as example, since these are the devices which are going to actively use
> this module as of now.
> 
>  +---------------+
>  |               |
>  | +-----------+ |  mdev_register_driver() +--------------+
>  | |           | +<------------------------+ __init()     |
>  | |           | |                         |              |
>  | |  mdev     | +------------------------>+              |<-> VFIO user
>  | |  bus      | |     probe()/remove()    | vfio_mpci.ko |    APIs
>  | |  driver   | |                         |              |
>  | |           | |                         +--------------+
>  | |           | |  mdev_register_driver() +--------------+
>  | |           | +<------------------------+ __init()     |
>  | |           | |                         |              |
>  | |           | +------------------------>+              |<-> VFIO user
>  | +-----------+ |     probe()/remove()    | vfio_mccw.ko |    APIs
>  |               |                         |              |
>  |  MDEV CORE    |                         +--------------+
>  |   MODULE      |
>  |   mdev.ko     |
>  | +-----------+ |  mdev_register_device() +--------------+
>  | |           | +<------------------------+              |
>  | |           | |                         |  nvidia.ko   |<-> physical
>  | |           | +------------------------>+              |    device
>  | |           | |        callback         +--------------+
>  | | Physical  | |
>  | |  device   | |  mdev_register_device() +--------------+
>  | | interface | |<------------------------+              |
>  | |           | |                         |  i915.ko     |<-> physical
>  | |           | +------------------------>+              |    device
>  | |           | |        callback         +--------------+
>  | |           | |
>  | |           | |  mdev_register_device() +--------------+
>  | |           | +<------------------------+              |
>  | |           | |                         | ccw_device.ko|<-> physical
>  | |           | +------------------------>+              |    device
>  | |           | |        callback         +--------------+
>  | +-----------+ |
>  +---------------+
> 
> Core driver provides two types of registration interfaces:
> 1. Registration interface for mediated bus driver:
> 
> /**
>   * struct mdev_driver - Mediated device's driver
>   * @name: driver name
>   * @probe: called when new device created
>   * @remove:called when device removed
>   * @match: called when new device or driver is added for this bus.
> 	    Return 1 if given device can be handled by given driver and
> 	    zero otherwise.
>   * @driver:device driver structure
>   *
>   **/
> struct mdev_driver {
>          const char *name;
>          int  (*probe)  (struct device *dev);
>          void (*remove) (struct device *dev);
>          int  (*match)(struct device *dev);
>          struct device_driver    driver;
> };
> 
> int  mdev_register_driver(struct mdev_driver *drv, struct module *owner);
> void mdev_unregister_driver(struct mdev_driver *drv);
> 
> Mediated device's driver for mdev should use this interface to register
> with Core driver. With this, mediated devices driver for such devices is
> responsible to add mediated device to VFIO group.
> 
> 2. Physical device driver interface
> This interface provides vendor driver the set APIs to manage physical
> device related work in their own driver. APIs are :
> - supported_config: provide supported configuration list by the vendor
> 		    driver
> - create: to allocate basic resources in vendor driver for a mediated
> 	  device.
> - destroy: to free resources in vendor driver when mediated device is
> 	   destroyed.
> - reset: to free and reallocate resources in vendor driver during reboot
> - start: to initiate mediated device initialization process from vendor
> 	 driver
> - shutdown: to teardown mediated device resources during teardown.
> - read : read emulation callback.
> - write: write emulation callback.
> - set_irqs: send interrupt configuration information that VMM sets.
> - get_region_info: to provide region size and its flags for the mediated
> 		   device.
> - validate_map_request: to validate remap pfn request.
> 
> This registration interface should be used by vendor drivers to register
> each physical device to mdev core driver.
> Locks to serialize above callbacks are removed. If required, vendor driver
> can have locks to serialize above APIs in their driver.
> 
> Added support to keep track of physical mappings for each mdev device.
> APIs to be used by mediated device bus driver to add and delete mappings to
> tracking logic:
> int mdev_add_phys_mapping(struct mdev_device *mdev,
>                           struct address_space *mapping,
>                           unsigned long addr, unsigned long size)
> void mdev_del_phys_mapping(struct mdev_device *mdev, unsigned long addr)
> 
> API to be used by vendor driver to invalidate mapping:
> int mdev_device_invalidate_mapping(struct mdev_device *mdev,
>                                    unsigned long addr, unsigned long size)
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Signed-off-by: Neo Jia <cjia@nvidia.com>
> Change-Id: I73a5084574270b14541c529461ea2f03c292d510
> ---
>  drivers/vfio/Kconfig             |   1 +
>  drivers/vfio/Makefile            |   1 +
>  drivers/vfio/mdev/Kconfig        |  12 +
>  drivers/vfio/mdev/Makefile       |   5 +
>  drivers/vfio/mdev/mdev_core.c    | 676 +++++++++++++++++++++++++++++++++++++++
>  drivers/vfio/mdev/mdev_driver.c  | 142 ++++++++
>  drivers/vfio/mdev/mdev_private.h |  33 ++
>  drivers/vfio/mdev/mdev_sysfs.c   | 269 ++++++++++++++++
>  include/linux/mdev.h             | 236 ++++++++++++++
>  9 files changed, 1375 insertions(+)
>  create mode 100644 drivers/vfio/mdev/Kconfig
>  create mode 100644 drivers/vfio/mdev/Makefile
>  create mode 100644 drivers/vfio/mdev/mdev_core.c
>  create mode 100644 drivers/vfio/mdev/mdev_driver.c
>  create mode 100644 drivers/vfio/mdev/mdev_private.h
>  create mode 100644 drivers/vfio/mdev/mdev_sysfs.c
>  create mode 100644 include/linux/mdev.h
> 
> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> index da6e2ce77495..23eced02aaf6 100644
> --- a/drivers/vfio/Kconfig
> +++ b/drivers/vfio/Kconfig
> @@ -48,4 +48,5 @@ menuconfig VFIO_NOIOMMU
>  
>  source "drivers/vfio/pci/Kconfig"
>  source "drivers/vfio/platform/Kconfig"
> +source "drivers/vfio/mdev/Kconfig"
>  source "virt/lib/Kconfig"
> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> index 7b8a31f63fea..4a23c13b6be4 100644
> --- a/drivers/vfio/Makefile
> +++ b/drivers/vfio/Makefile
> @@ -7,3 +7,4 @@ obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
>  obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o
>  obj-$(CONFIG_VFIO_PCI) += pci/
>  obj-$(CONFIG_VFIO_PLATFORM) += platform/
> +obj-$(CONFIG_VFIO_MDEV) += mdev/
> diff --git a/drivers/vfio/mdev/Kconfig b/drivers/vfio/mdev/Kconfig
> new file mode 100644
> index 000000000000..a34fbc66f92f
> --- /dev/null
> +++ b/drivers/vfio/mdev/Kconfig
> @@ -0,0 +1,12 @@
> +
> +config VFIO_MDEV
> +    tristate "Mediated device driver framework"
> +    depends on VFIO
> +    default n
> +    help
> +        Provides a framework to virtualize device.
> +	See Documentation/vfio-mediated-device.txt for more details.
> +
> +        If you don't know what do here, say N.
> +
> +
> diff --git a/drivers/vfio/mdev/Makefile b/drivers/vfio/mdev/Makefile
> new file mode 100644
> index 000000000000..56a75e689582
> --- /dev/null
> +++ b/drivers/vfio/mdev/Makefile
> @@ -0,0 +1,5 @@
> +
> +mdev-y := mdev_core.o mdev_sysfs.o mdev_driver.o
> +
> +obj-$(CONFIG_VFIO_MDEV) += mdev.o
> +
> diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
> new file mode 100644
> index 000000000000..90ff073abfce
> --- /dev/null
> +++ b/drivers/vfio/mdev/mdev_core.c
> @@ -0,0 +1,676 @@
> +/*
> + * Mediated device Core Driver
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *     Author: Neo Jia <cjia@nvidia.com>
> + *	       Kirti Wankhede <kwankhede@nvidia.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/module.h>
> +#include <linux/device.h>
> +#include <linux/fs.h>
> +#include <linux/slab.h>
> +#include <linux/sched.h>
> +#include <linux/uuid.h>
> +#include <linux/vfio.h>
> +#include <linux/iommu.h>
> +#include <linux/sysfs.h>
> +#include <linux/mdev.h>
> +
> +#include "mdev_private.h"
> +
> +#define DRIVER_VERSION		"0.1"
> +#define DRIVER_AUTHOR		"NVIDIA Corporation"
> +#define DRIVER_DESC		"Mediated device Core Driver"
> +
> +#define MDEV_CLASS_NAME		"mdev"
> +
> +static LIST_HEAD(parent_list);
> +static DEFINE_MUTEX(parent_list_lock);
> +
> +static int mdev_add_attribute_group(struct device *dev,
> +				    const struct attribute_group **groups)
> +{
> +	return sysfs_create_groups(&dev->kobj, groups);
> +}
> +
> +static void mdev_remove_attribute_group(struct device *dev,
> +					const struct attribute_group **groups)
> +{
> +	sysfs_remove_groups(&dev->kobj, groups);
> +}
> +
> +/* Should be called holding parent->mdev_list_lock */

I often like to prepend "__" onto the name of functions like this to
signal a special calling convention.

> +static struct mdev_device *find_mdev_device(struct parent_device *parent,
> +					    uuid_le uuid, int instance)
> +{
> +	struct mdev_device *mdev;
> +
> +	list_for_each_entry(mdev, &parent->mdev_list, next) {
> +		if ((uuid_le_cmp(mdev->uuid, uuid) == 0) &&
> +		    (mdev->instance == instance))
> +			return mdev;
> +	}
> +	return NULL;
> +}
> +
> +/* Should be called holding parent_list_lock */
> +static struct parent_device *find_parent_device(struct device *dev)
> +{
> +	struct parent_device *parent;
> +
> +	list_for_each_entry(parent, &parent_list, next) {
> +		if (parent->dev == dev)
> +			return parent;
> +	}
> +	return NULL;
> +}
> +
> +static void mdev_release_parent(struct kref *kref)
> +{
> +	struct parent_device *parent = container_of(kref, struct parent_device,
> +						    ref);
> +	kfree(parent);
> +}
> +
> +static
> +inline struct parent_device *mdev_get_parent(struct parent_device *parent)
> +{
> +	if (parent)
> +		kref_get(&parent->ref);
> +
> +	return parent;
> +}
> +
> +static inline void mdev_put_parent(struct parent_device *parent)
> +{
> +	if (parent)
> +		kref_put(&parent->ref, mdev_release_parent);
> +}
> +
> +static struct parent_device *mdev_get_parent_by_dev(struct device *dev)
> +{
> +	struct parent_device *parent = NULL, *p;
> +
> +	mutex_lock(&parent_list_lock);
> +	list_for_each_entry(p, &parent_list, next) {
> +		if (p->dev == dev) {
> +			parent = mdev_get_parent(p);
> +			break;
> +		}
> +	}
> +	mutex_unlock(&parent_list_lock);
> +	return parent;

Use what you've created:

{
	struct parent_device *parent;

	mutex_lock(&parent_list_lock);
	parent = mdev_get_parent(find_parent_device(dev));
	mutex_unlock(&parent_list_lock);

	return parent;
}

> +}
> +
> +static int mdev_device_create_ops(struct mdev_device *mdev, char *mdev_params)
> +{
> +	struct parent_device *parent = mdev->parent;
> +	int ret;
> +
> +	ret = parent->ops->create(mdev, mdev_params);
> +	if (ret)
> +		return ret;
> +
> +	ret = mdev_add_attribute_group(&mdev->dev,
> +					parent->ops->mdev_attr_groups);
> +	if (ret)
> +		parent->ops->destroy(mdev);
> +
> +	return ret;
> +}
> +
> +static int mdev_device_destroy_ops(struct mdev_device *mdev, bool force)
> +{
> +	struct parent_device *parent = mdev->parent;
> +	int ret = 0;
> +
> +	/*
> +	 * If vendor driver doesn't return success that means vendor
> +	 * driver doesn't support hot-unplug
> +	 */
> +	ret = parent->ops->destroy(mdev);
> +	if (ret && !force)
> +		return -EBUSY;

This still seems troublesome, I'm not sure why we don't just require
hot-unplug support.  Without it, we seem to have a limbo state where a
device exists, but not fully.

> +
> +	mdev_remove_attribute_group(&mdev->dev,
> +				    parent->ops->mdev_attr_groups);
> +
> +	return ret;
> +}
> +
> +static void mdev_release_device(struct kref *kref)
> +{
> +	struct mdev_device *mdev = container_of(kref, struct mdev_device, ref);
> +	struct parent_device *parent = mdev->parent;
> +
> +	list_del(&mdev->next);
> +	mutex_unlock(&parent->mdev_list_lock);

Maybe worthy of a short comment to more obviously match this unlock to
the kref_put_mutex() below.

> +
> +	device_unregister(&mdev->dev);
> +	wake_up(&parent->release_done);
> +	mdev_put_parent(parent);
> +}
> +
> +struct mdev_device *mdev_get_device(struct mdev_device *mdev)
> +{
> +	kref_get(&mdev->ref);
> +	return mdev;
> +}
> +EXPORT_SYMBOL(mdev_get_device);

Is the intention here that the caller already has a reference to mdev
and wants to get another?  Or I see the cases where use locking
to get this reference.  There's potential to misuse this, if not
outright abuse it, which worries me.  A reference cannot be
spontaneously generated, it needs to be sourced from somewhere.

> +
> +void mdev_put_device(struct mdev_device *mdev)
> +{
> +	struct parent_device *parent = mdev->parent;
> +
> +	kref_put_mutex(&mdev->ref, mdev_release_device,
> +		       &parent->mdev_list_lock);
> +}
> +EXPORT_SYMBOL(mdev_put_device);
> +
> +/*
> + * Find first mediated device from given uuid and increment refcount of
> + * mediated device. Caller should call mdev_put_device() when the use of
> + * mdev_device is done.
> + */
> +static struct mdev_device *mdev_get_first_device_by_uuid(uuid_le uuid)
> +{
> +	struct mdev_device *mdev = NULL, *p;
> +	struct parent_device *parent;
> +
> +	mutex_lock(&parent_list_lock);
> +	list_for_each_entry(parent, &parent_list, next) {
> +		mutex_lock(&parent->mdev_list_lock);
> +		list_for_each_entry(p, &parent->mdev_list, next) {
> +			if (uuid_le_cmp(p->uuid, uuid) == 0) {
> +				mdev = mdev_get_device(p);
> +				break;
> +			}
> +		}
> +		mutex_unlock(&parent->mdev_list_lock);
> +
> +		if (mdev)
> +			break;
> +	}
> +	mutex_unlock(&parent_list_lock);
> +	return mdev;
> +}

This is used later by mdev_device_start() and mdev_device_stop() to get
the parent_device so it can call the start and stop ops callbacks
respectively.  That seems to imply that all of instances for a given
uuid come from the same parent_device.  Where is that enforced?  I'm
still having a hard time buying into the uuid+instance plan when it
seems like each mdev_device should have an actual unique uuid.
Userspace tools can figure out which uuids to start for a given user, I
don't see much value in collecting them to instances within a uuid.

> +
> +/*
> + * Find mediated device from given iommu_group and increment refcount of
> + * mediated device. Caller should call mdev_put_device() when the use of
> + * mdev_device is done.
> + */
> +struct mdev_device *mdev_get_device_by_group(struct iommu_group *group)
> +{
> +	struct mdev_device *mdev = NULL, *p;
> +	struct parent_device *parent;
> +
> +	mutex_lock(&parent_list_lock);
> +	list_for_each_entry(parent, &parent_list, next) {
> +		mutex_lock(&parent->mdev_list_lock);
> +		list_for_each_entry(p, &parent->mdev_list, next) {
> +			if (!p->group)
> +				continue;
> +
> +			if (iommu_group_id(p->group) == iommu_group_id(group)) {
> +				mdev = mdev_get_device(p);
> +				break;
> +			}
> +		}
> +		mutex_unlock(&parent->mdev_list_lock);
> +
> +		if (mdev)
> +			break;
> +	}
> +	mutex_unlock(&parent_list_lock);
> +	return mdev;
> +}
> +EXPORT_SYMBOL(mdev_get_device_by_group);
> +
> +/*
> + * mdev_register_device : Register a device
> + * @dev: device structure representing parent device.
> + * @ops: Parent device operation structure to be registered.
> + *
> + * Add device to list of registered parent devices.
> + * Returns a negative value on error, otherwise 0.
> + */
> +int mdev_register_device(struct device *dev, const struct parent_ops *ops)
> +{
> +	int ret = 0;
> +	struct parent_device *parent;
> +
> +	if (!dev || !ops)
> +		return -EINVAL;
> +
> +	/* check for mandatory ops */
> +	if (!ops->create || !ops->destroy)
> +		return -EINVAL;
> +
> +	mutex_lock(&parent_list_lock);
> +
> +	/* Check for duplicate */
> +	parent = find_parent_device(dev);
> +	if (parent) {
> +		ret = -EEXIST;
> +		goto add_dev_err;
> +	}
> +
> +	parent = kzalloc(sizeof(*parent), GFP_KERNEL);
> +	if (!parent) {
> +		ret = -ENOMEM;
> +		goto add_dev_err;
> +	}
> +
> +	kref_init(&parent->ref);
> +	list_add(&parent->next, &parent_list);
> +
> +	parent->dev = dev;
> +	parent->ops = ops;
> +	mutex_init(&parent->mdev_list_lock);
> +	INIT_LIST_HEAD(&parent->mdev_list);
> +	init_waitqueue_head(&parent->release_done);
> +	mutex_unlock(&parent_list_lock);
> +
> +	ret = mdev_create_sysfs_files(dev);
> +	if (ret)
> +		goto add_sysfs_error;
> +
> +	ret = mdev_add_attribute_group(dev, ops->dev_attr_groups);
> +	if (ret)
> +		goto add_group_error;
> +
> +	dev_info(dev, "MDEV: Registered\n");
> +	return 0;
> +
> +add_group_error:
> +	mdev_remove_sysfs_files(dev);
> +add_sysfs_error:
> +	mutex_lock(&parent_list_lock);
> +	list_del(&parent->next);
> +	mutex_unlock(&parent_list_lock);
> +	mdev_put_parent(parent);
> +	return ret;
> +
> +add_dev_err:
> +	mutex_unlock(&parent_list_lock);
> +	return ret;
> +}
> +EXPORT_SYMBOL(mdev_register_device);
> +
> +/*
> + * mdev_unregister_device : Unregister a parent device
> + * @dev: device structure representing parent device.
> + *
> + * Remove device from list of registered parent devices. Give a chance to free
> + * existing mediated devices for given device.
> + */
> +
> +void mdev_unregister_device(struct device *dev)
> +{
> +	struct parent_device *parent;
> +	struct mdev_device *mdev, *n;

Above *p was used for a temp pointer.  

> +	int ret;
> +
> +	mutex_lock(&parent_list_lock);
> +	parent = find_parent_device(dev);
> +
> +	if (!parent) {
> +		mutex_unlock(&parent_list_lock);
> +		return;
> +	}
> +	dev_info(dev, "MDEV: Unregistering\n");
> +
> +	/*
> +	 * Remove parent from the list and remove create and destroy sysfs

Quoting "create" and "destroy" would make this a bit more readable:

	Remove parent from the list and remove "create" and "destroy"
	sysfs...

Took me a couple reads to figure out "remove create" wasn't a typo.

> +	 * files so that no new mediated device could be created for this parent
> +	 */
> +	list_del(&parent->next);
> +	mdev_remove_sysfs_files(dev);
> +	mutex_unlock(&parent_list_lock);
> +
> +	mdev_remove_attribute_group(dev,
> +				    parent->ops->dev_attr_groups);
> +

Why do we need to remove sysfs files under the parent_list_lock?

> +	mutex_lock(&parent->mdev_list_lock);
> +	list_for_each_entry_safe(mdev, n, &parent->mdev_list, next) {
> +		mdev_device_destroy_ops(mdev, true);
> +		mutex_unlock(&parent->mdev_list_lock);
> +		mdev_put_device(mdev);
> +		mutex_lock(&parent->mdev_list_lock);

*cringe*  Any time we need to release the list lock inside the
traversal makes me nervous.  What about using list_first_entry() since
I don't think using list_for_each_entry_safe() really makes it safe
from concurrent operations on the list once we drop that lock.

> +	}
> +	mutex_unlock(&parent->mdev_list_lock);
> +
> +	do {
> +		ret = wait_event_interruptible_timeout(parent->release_done,
> +				list_empty(&parent->mdev_list), HZ * 10);
> +		if (ret == -ERESTARTSYS) {
> +			dev_warn(dev, "Mediated devices are in use, task"
> +				      " \"%s\" (%d) "
> +				      "blocked until all are released",
> +				      current->comm, task_pid_nr(current));
> +		}
> +	} while (ret <= 0);
> +
> +	mdev_put_parent(parent);
> +}
> +EXPORT_SYMBOL(mdev_unregister_device);
> +
> +/*
> + * Functions required for mdev_sysfs
> + */
> +static void mdev_device_release(struct device *dev)
> +{
> +	struct mdev_device *mdev = to_mdev_device(dev);
> +
> +	dev_dbg(&mdev->dev, "MDEV: destroying\n");
> +	kfree(mdev);
> +}
> +
> +int mdev_device_create(struct device *dev, uuid_le uuid, uint32_t instance,
> +		       char *mdev_params)
> +{
> +	int ret;
> +	struct mdev_device *mdev;
> +	struct parent_device *parent;
> +
> +	parent = mdev_get_parent_by_dev(dev);
> +	if (!parent)
> +		return -EINVAL;
> +
> +	mutex_lock(&parent->mdev_list_lock);
> +	/* Check for duplicate */
> +	mdev = find_mdev_device(parent, uuid, instance);
> +	if (mdev) {
> +		ret = -EEXIST;
> +		goto create_err;
> +	}
> +
> +	mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
> +	if (!mdev) {
> +		ret = -ENOMEM;
> +		goto create_err;
> +	}
> +
> +	memcpy(&mdev->uuid, &uuid, sizeof(uuid_le));
> +	mdev->instance = instance;
> +	mdev->parent = parent;
> +	kref_init(&mdev->ref);
> +
> +	mdev->dev.parent  = dev;
> +	mdev->dev.bus     = &mdev_bus_type;
> +	mdev->dev.release = mdev_device_release;
> +	dev_set_name(&mdev->dev, "%pUl-%d", uuid.b, instance);
> +
> +	ret = device_register(&mdev->dev);
> +	if (ret) {
> +		put_device(&mdev->dev);
> +		goto create_err;
> +	}
> +
> +	ret = mdev_device_create_ops(mdev, mdev_params);
> +	if (ret)
> +		goto create_failed;
> +
> +	list_add(&mdev->next, &parent->mdev_list);
> +	mutex_unlock(&parent->mdev_list_lock);
> +
> +	dev_dbg(&mdev->dev, "MDEV: created\n");
> +
> +	return ret;
> +
> +create_failed:
> +	device_unregister(&mdev->dev);
> +
> +create_err:
> +	mutex_unlock(&parent->mdev_list_lock);
> +	mdev_put_parent(parent);
> +	return ret;
> +}
> +
> +int mdev_device_destroy(struct device *dev, uuid_le uuid, uint32_t instance)
> +{
> +	struct mdev_device *mdev;
> +	struct parent_device *parent;
> +	int ret;
> +
> +	parent = mdev_get_parent_by_dev(dev);
> +	if (!parent)
> +		return -EINVAL;
> +
> +	mutex_lock(&parent->mdev_list_lock);
> +	mdev = find_mdev_device(parent, uuid, instance);
> +	if (!mdev) {
> +		ret = -EINVAL;

-ENODEV?

> +		goto destroy_err;
> +	}
> +
> +	ret = mdev_device_destroy_ops(mdev, false);
> +	if (ret)
> +		goto destroy_err;
> +
> +	mutex_unlock(&parent->mdev_list_lock);
> +	mdev_put_device(mdev);
> +
> +	mdev_put_parent(parent);
> +	return ret;
> +
> +destroy_err:
> +	mutex_unlock(&parent->mdev_list_lock);
> +	mdev_put_parent(parent);
> +	return ret;
> +}
> +
> +int mdev_device_invalidate_mapping(struct mdev_device *mdev,
> +				   unsigned long addr, unsigned long size)
> +{
> +	int ret = -EINVAL;
> +	struct mdev_phys_mapping *phys_mappings;
> +	struct addr_desc *addr_desc;
> +
> +	if (!mdev || !mdev->phys_mappings.mapping)
> +		return ret;
> +
> +	phys_mappings = &mdev->phys_mappings;
> +
> +	mutex_lock(&phys_mappings->addr_desc_list_lock);
> +
> +	list_for_each_entry(addr_desc, &phys_mappings->addr_desc_list, next) {
> +
> +		if ((addr > addr_desc->start) &&
> +		    (addr + size < addr_desc->start + addr_desc->size)) {

This looks incomplete, minimally I think these should be >= and <=, but
that still only covers fully enclosed invalidation ranges.  Do we need
to support partial invalidations?

> +			unmap_mapping_range(phys_mappings->mapping,
> +					    addr, size, 0);
> +			ret = 0;
> +			goto unlock_exit;

If partial overlaps can occur, we'll need an exhaustive search.

> +		}
> +	}
> +
> +unlock_exit:
> +	mutex_unlock(&phys_mappings->addr_desc_list_lock);
> +	return ret;
> +}
> +EXPORT_SYMBOL(mdev_device_invalidate_mapping);
> +
> +/* Sanity check for the physical mapping list for mediated device */
> +
> +int mdev_add_phys_mapping(struct mdev_device *mdev,
> +			  struct address_space *mapping,
> +			  unsigned long addr, unsigned long size)
> +{
> +	struct mdev_phys_mapping *phys_mappings;
> +	struct addr_desc *addr_desc, *new_addr_desc;
> +	int ret = 0;
> +
> +	if (!mdev)
> +		return -EINVAL;
> +
> +	phys_mappings = &mdev->phys_mappings;
> +	if (phys_mappings->mapping && (mapping != phys_mappings->mapping))
> +		return -EINVAL;
> +
> +	if (!phys_mappings->mapping) {
> +		phys_mappings->mapping = mapping;
> +		mutex_init(&phys_mappings->addr_desc_list_lock);
> +		INIT_LIST_HEAD(&phys_mappings->addr_desc_list);
> +	}

This looks racy, should we be acquiring the mutex earlier?

> +
> +	mutex_lock(&phys_mappings->addr_desc_list_lock);
> +
> +	list_for_each_entry(addr_desc, &phys_mappings->addr_desc_list, next) {
> +		if ((addr + size < addr_desc->start) ||
> +		    (addr_desc->start + addr_desc->size) < addr)

<= on both, I think

> +			continue;
> +		else {
> +			/* should be no overlap */
> +			ret = -EINVAL;
> +			goto mapping_exit;
> +		}
> +	}
> +
> +	/* add the new entry to the list */
> +	new_addr_desc = kzalloc(sizeof(*new_addr_desc), GFP_KERNEL);
> +
> +	if (!new_addr_desc) {
> +		ret = -ENOMEM;
> +		goto mapping_exit;
> +	}
> +
> +	new_addr_desc->start = addr;
> +	new_addr_desc->size = size;
> +	list_add(&new_addr_desc->next, &phys_mappings->addr_desc_list);
> +
> +mapping_exit:
> +	mutex_unlock(&phys_mappings->addr_desc_list_lock);
> +	return ret;
> +}
> +EXPORT_SYMBOL(mdev_add_phys_mapping);
> +
> +void mdev_del_phys_mapping(struct mdev_device *mdev, unsigned long addr)
> +{
> +	struct mdev_phys_mapping *phys_mappings;
> +	struct addr_desc *addr_desc;
> +
> +	if (!mdev)
> +		return;
> +
> +	phys_mappings = &mdev->phys_mappings;
> +
> +	mutex_lock(&phys_mappings->addr_desc_list_lock);
> +	list_for_each_entry(addr_desc, &phys_mappings->addr_desc_list, next) {
> +		if (addr_desc->start == addr) {
> +			list_del(&addr_desc->next);
> +			kfree(addr_desc);
> +			break;
> +		}
> +	}
> +	mutex_unlock(&phys_mappings->addr_desc_list_lock);
> +}
> +EXPORT_SYMBOL(mdev_del_phys_mapping);
> +
> +void mdev_device_supported_config(struct device *dev, char *str)
> +{
> +	struct parent_device *parent;
> +
> +	parent = mdev_get_parent_by_dev(dev);
> +
> +	if (parent) {
> +		if (parent->ops->supported_config)
> +			parent->ops->supported_config(parent->dev, str);
> +		mdev_put_parent(parent);
> +	}
> +}
> +
> +int mdev_device_start(uuid_le uuid)
> +{
> +	int ret = 0;
> +	struct mdev_device *mdev;
> +	struct parent_device *parent;
> +
> +	mdev = mdev_get_first_device_by_uuid(uuid);
> +	if (!mdev)
> +		return -EINVAL;
> +
> +	parent = mdev->parent;
> +
> +	if (parent->ops->start)
> +		ret = parent->ops->start(mdev->uuid);

Assumes uuids do not span parent_devices?

> +
> +	if (ret)
> +		pr_err("mdev_start failed  %d\n", ret);
> +	else
> +		kobject_uevent(&mdev->dev.kobj, KOBJ_ONLINE);
> +
> +	mdev_put_device(mdev);
> +
> +	return ret;
> +}
> +
> +int mdev_device_stop(uuid_le uuid)
> +{
> +	int ret = 0;
> +	struct mdev_device *mdev;
> +	struct parent_device *parent;
> +
> +	mdev = mdev_get_first_device_by_uuid(uuid);
> +	if (!mdev)
> +		return -EINVAL;
> +
> +	parent = mdev->parent;
> +
> +	if (parent->ops->stop)
> +		ret = parent->ops->stop(mdev->uuid);
> +
> +	if (ret)
> +		pr_err("mdev stop failed %d\n", ret);
> +	else
> +		kobject_uevent(&mdev->dev.kobj, KOBJ_OFFLINE);
> +
> +	mdev_put_device(mdev);
> +	return ret;
> +}
> +
> +static struct class mdev_class = {
> +	.name		= MDEV_CLASS_NAME,
> +	.owner		= THIS_MODULE,
> +	.class_attrs	= mdev_class_attrs,
> +};
> +
> +static int __init mdev_init(void)
> +{
> +	int ret;
> +
> +	ret = class_register(&mdev_class);
> +	if (ret) {
> +		pr_err("Failed to register mdev class\n");
> +		return ret;
> +	}
> +
> +	ret = mdev_bus_register();
> +	if (ret) {
> +		pr_err("Failed to register mdev bus\n");
> +		class_unregister(&mdev_class);
> +		return ret;
> +	}
> +
> +	return ret;
> +}
> +
> +static void __exit mdev_exit(void)
> +{
> +	mdev_bus_unregister();
> +	class_unregister(&mdev_class);
> +}
> +
> +module_init(mdev_init)
> +module_exit(mdev_exit)
> +
> +MODULE_VERSION(DRIVER_VERSION);
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR(DRIVER_AUTHOR);
> +MODULE_DESCRIPTION(DRIVER_DESC);
> diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c
> new file mode 100644
> index 000000000000..00680bd06224
> --- /dev/null
> +++ b/drivers/vfio/mdev/mdev_driver.c
> @@ -0,0 +1,142 @@
> +/*
> + * MDEV driver
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *     Author: Neo Jia <cjia@nvidia.com>
> + *	       Kirti Wankhede <kwankhede@nvidia.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/device.h>
> +#include <linux/iommu.h>
> +#include <linux/mdev.h>
> +
> +#include "mdev_private.h"
> +
> +static int mdev_attach_iommu(struct mdev_device *mdev)
> +{
> +	int ret;
> +	struct iommu_group *group;
> +
> +	group = iommu_group_alloc();
> +	if (IS_ERR(group)) {
> +		dev_err(&mdev->dev, "MDEV: failed to allocate group!\n");
> +		return PTR_ERR(group);
> +	}
> +
> +	ret = iommu_group_add_device(group, &mdev->dev);
> +	if (ret) {
> +		dev_err(&mdev->dev, "MDEV: failed to add dev to group!\n");
> +		goto attach_fail;
> +	}
> +
> +	mdev->group = group;
> +
> +	dev_info(&mdev->dev, "MDEV: group_id = %d\n",
> +				 iommu_group_id(group));
> +attach_fail:
> +	iommu_group_put(group);
> +	return ret;
> +}
> +
> +static void mdev_detach_iommu(struct mdev_device *mdev)
> +{
> +	iommu_group_remove_device(&mdev->dev);
> +	mdev->group = NULL;
> +	dev_info(&mdev->dev, "MDEV: detaching iommu\n");
> +}
> +
> +static int mdev_probe(struct device *dev)
> +{
> +	struct mdev_driver *drv = to_mdev_driver(dev->driver);
> +	struct mdev_device *mdev = to_mdev_device(dev);
> +	int ret;
> +
> +	ret = mdev_attach_iommu(mdev);
> +	if (ret) {
> +		dev_err(dev, "Failed to attach IOMMU\n");
> +		return ret;
> +	}
> +
> +	if (drv && drv->probe)
> +		ret = drv->probe(dev);
> +
> +	if (ret)
> +		mdev_detach_iommu(mdev);
> +
> +	return ret;
> +}
> +
> +static int mdev_remove(struct device *dev)
> +{
> +	struct mdev_driver *drv = to_mdev_driver(dev->driver);
> +	struct mdev_device *mdev = to_mdev_device(dev);
> +
> +	if (drv && drv->remove)
> +		drv->remove(dev);
> +
> +	mdev_detach_iommu(mdev);
> +
> +	return 0;
> +}
> +
> +static int mdev_match(struct device *dev, struct device_driver *driver)
> +{
> +	struct mdev_driver *drv = to_mdev_driver(driver);
> +
> +	if (drv && drv->match)
> +		return drv->match(dev);
> +
> +	return 0;
> +}
> +
> +struct bus_type mdev_bus_type = {
> +	.name		= "mdev",
> +	.match		= mdev_match,
> +	.probe		= mdev_probe,
> +	.remove		= mdev_remove,
> +};
> +EXPORT_SYMBOL_GPL(mdev_bus_type);
> +
> +/*
> + * mdev_register_driver - register a new MDEV driver
> + * @drv: the driver to register
> + * @owner: module owner of driver to be registered
> + *
> + * Returns a negative value on error, otherwise 0.
> + */
> +int mdev_register_driver(struct mdev_driver *drv, struct module *owner)
> +{
> +	/* initialize common driver fields */
> +	drv->driver.name = drv->name;
> +	drv->driver.bus = &mdev_bus_type;
> +	drv->driver.owner = owner;
> +
> +	/* register with core */
> +	return driver_register(&drv->driver);
> +}
> +EXPORT_SYMBOL(mdev_register_driver);
> +
> +/*
> + * mdev_unregister_driver - unregister MDEV driver
> + * @drv: the driver to unregister
> + *
> + */
> +void mdev_unregister_driver(struct mdev_driver *drv)
> +{
> +	driver_unregister(&drv->driver);
> +}
> +EXPORT_SYMBOL(mdev_unregister_driver);
> +
> +int mdev_bus_register(void)
> +{
> +	return bus_register(&mdev_bus_type);
> +}
> +
> +void mdev_bus_unregister(void)
> +{
> +	bus_unregister(&mdev_bus_type);
> +}
> diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h
> new file mode 100644
> index 000000000000..ee2db61a8091
> --- /dev/null
> +++ b/drivers/vfio/mdev/mdev_private.h
> @@ -0,0 +1,33 @@
> +/*
> + * Mediated device interal definitions
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *     Author: Neo Jia <cjia@nvidia.com>
> + *	       Kirti Wankhede <kwankhede@nvidia.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#ifndef MDEV_PRIVATE_H
> +#define MDEV_PRIVATE_H
> +
> +int  mdev_bus_register(void);
> +void mdev_bus_unregister(void);
> +
> +/* Function prototypes for mdev_sysfs */
> +
> +extern struct class_attribute mdev_class_attrs[];
> +
> +int  mdev_create_sysfs_files(struct device *dev);
> +void mdev_remove_sysfs_files(struct device *dev);
> +
> +int  mdev_device_create(struct device *dev, uuid_le uuid, uint32_t instance,
> +			char *mdev_params);
> +int  mdev_device_destroy(struct device *dev, uuid_le uuid, uint32_t instance);
> +void mdev_device_supported_config(struct device *dev, char *str);
> +int  mdev_device_start(uuid_le uuid);
> +int  mdev_device_stop(uuid_le uuid);
> +
> +#endif /* MDEV_PRIVATE_H */
> diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c
> new file mode 100644
> index 000000000000..e0457e68cf78
> --- /dev/null
> +++ b/drivers/vfio/mdev/mdev_sysfs.c
> @@ -0,0 +1,269 @@
> +/*
> + * File attributes for Mediated devices
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *     Author: Neo Jia <cjia@nvidia.com>
> + *	       Kirti Wankhede <kwankhede@nvidia.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/sysfs.h>
> +#include <linux/ctype.h>
> +#include <linux/device.h>
> +#include <linux/slab.h>
> +#include <linux/uuid.h>
> +#include <linux/mdev.h>
> +
> +#include "mdev_private.h"
> +
> +/* Prototypes */
> +static ssize_t mdev_supported_types_show(struct device *dev,
> +					 struct device_attribute *attr,
> +					 char *buf);
> +static DEVICE_ATTR_RO(mdev_supported_types);
> +
> +static ssize_t mdev_create_store(struct device *dev,
> +				 struct device_attribute *attr,
> +				 const char *buf, size_t count);
> +static DEVICE_ATTR_WO(mdev_create);
> +
> +static ssize_t mdev_destroy_store(struct device *dev,
> +				  struct device_attribute *attr,
> +				  const char *buf, size_t count);
> +static DEVICE_ATTR_WO(mdev_destroy);
> +
> +/* Static functions */
> +
> +
> +#define SUPPORTED_TYPE_BUFFER_LENGTH	4096
> +
> +/* mdev sysfs Functions */
> +static ssize_t mdev_supported_types_show(struct device *dev,
> +					 struct device_attribute *attr,
> +					 char *buf)
> +{
> +	char *str, *ptr;
> +	ssize_t n;
> +
> +	str = kzalloc(sizeof(*str) * SUPPORTED_TYPE_BUFFER_LENGTH, GFP_KERNEL);
> +	if (!str)
> +		return -ENOMEM;
> +
> +	ptr = str;
> +	mdev_device_supported_config(dev, str);
> +
> +	n = sprintf(buf, "%s\n", str);
> +	kfree(ptr);
> +
> +	return n;
> +}
> +
> +static ssize_t mdev_create_store(struct device *dev,
> +				 struct device_attribute *attr,
> +				 const char *buf, size_t count)
> +{
> +	char *str, *pstr;
> +	char *uuid_str, *instance_str, *mdev_params = NULL, *params = NULL;
> +	uuid_le uuid;
> +	uint32_t instance;
> +	int ret;
> +
> +	pstr = str = kstrndup(buf, count, GFP_KERNEL);
> +
> +	if (!str)
> +		return -ENOMEM;
> +
> +	uuid_str = strsep(&str, ":");
> +	if (!uuid_str) {
> +		pr_err("mdev_create: Empty UUID string %s\n", buf);
> +		ret = -EINVAL;
> +		goto create_error;
> +	}
> +
> +	if (!str) {
> +		pr_err("mdev_create: mdev instance not present %s\n", buf);
> +		ret = -EINVAL;
> +		goto create_error;
> +	}
> +
> +	instance_str = strsep(&str, ":");
> +	if (!instance_str) {
> +		pr_err("mdev_create: Empty instance string %s\n", buf);
> +		ret = -EINVAL;
> +		goto create_error;
> +	}
> +
> +	ret = kstrtouint(instance_str, 0, &instance);
> +	if (ret) {
> +		pr_err("mdev_create: mdev instance parsing error %s\n", buf);
> +		goto create_error;
> +	}
> +
> +	if (str)
> +		params = mdev_params = kstrdup(str, GFP_KERNEL);
> +
> +	ret = uuid_le_to_bin(uuid_str, &uuid);
> +	if (ret) {
> +		pr_err("mdev_create: UUID parse error %s\n", buf);
> +		goto create_error;
> +	}
> +
> +	ret = mdev_device_create(dev, uuid, instance, mdev_params);
> +	if (ret)
> +		pr_err("mdev_create: Failed to create mdev device\n");
> +	else
> +		ret = count;
> +
> +create_error:
> +	kfree(params);
> +	kfree(pstr);
> +	return ret;
> +}
> +
> +static ssize_t mdev_destroy_store(struct device *dev,
> +				  struct device_attribute *attr,
> +				  const char *buf, size_t count)
> +{

I wonder if we should just have a "remove" file in sysfs under the
device.

> +	char *uuid_str, *str, *pstr;
> +	uuid_le uuid;
> +	unsigned int instance;
> +	int ret;
> +
> +	str = pstr = kstrndup(buf, count, GFP_KERNEL);
> +
> +	if (!str)
> +		return -ENOMEM;
> +
> +	uuid_str = strsep(&str, ":");
> +	if (!uuid_str) {
> +		pr_err("mdev_destroy: Empty UUID string %s\n", buf);
> +		ret = -EINVAL;
> +		goto destroy_error;
> +	}
> +
> +	if (str == NULL) {
> +		pr_err("mdev_destroy: instance not specified %s\n", buf);
> +		ret = -EINVAL;
> +		goto destroy_error;
> +	}
> +
> +	ret = kstrtouint(str, 0, &instance);
> +	if (ret) {
> +		pr_err("mdev_destroy: instance parsing error %s\n", buf);
> +		goto destroy_error;
> +	}
> +
> +	ret = uuid_le_to_bin(uuid_str, &uuid);
> +	if (ret) {
> +		pr_err("mdev_destroy: UUID parse error  %s\n", buf);
> +		goto destroy_error;
> +	}
> +
> +	ret = mdev_device_destroy(dev, uuid, instance);
> +	if (ret == 0)
> +		ret = count;
> +
> +destroy_error:
> +	kfree(pstr);
> +	return ret;
> +}
> +
> +ssize_t mdev_start_store(struct class *class, struct class_attribute *attr,
> +			 const char *buf, size_t count)
> +{
> +	char *uuid_str, *ptr;
> +	uuid_le uuid;
> +	int ret;
> +
> +	ptr = uuid_str = kstrndup(buf, count, GFP_KERNEL);
> +
> +	if (!uuid_str)
> +		return -ENOMEM;
> +
> +	ret = uuid_le_to_bin(uuid_str, &uuid);
> +	if (ret) {
> +		pr_err("mdev_start: UUID parse error  %s\n", buf);
> +		goto start_error;
> +	}
> +
> +	ret = mdev_device_start(uuid);
> +	if (ret == 0)
> +		ret = count;
> +
> +start_error:
> +	kfree(ptr);
> +	return ret;
> +}
> +
> +ssize_t mdev_stop_store(struct class *class, struct class_attribute *attr,
> +			    const char *buf, size_t count)
> +{
> +	char *uuid_str, *ptr;
> +	uuid_le uuid;
> +	int ret;
> +
> +	ptr = uuid_str = kstrndup(buf, count, GFP_KERNEL);
> +
> +	if (!uuid_str)
> +		return -ENOMEM;
> +
> +	ret = uuid_le_to_bin(uuid_str, &uuid);
> +	if (ret) {
> +		pr_err("mdev_stop: UUID parse error %s\n", buf);
> +		goto stop_error;
> +	}
> +
> +	ret = mdev_device_stop(uuid);
> +	if (ret == 0)
> +		ret = count;
> +
> +stop_error:
> +	kfree(ptr);
> +	return ret;
> +
> +}
> +
> +struct class_attribute mdev_class_attrs[] = {
> +	__ATTR_WO(mdev_start),
> +	__ATTR_WO(mdev_stop),
> +	__ATTR_NULL
> +};
> +
> +int mdev_create_sysfs_files(struct device *dev)
> +{
> +	int ret;
> +
> +	ret = sysfs_create_file(&dev->kobj,
> +				&dev_attr_mdev_supported_types.attr);
> +	if (ret) {
> +		pr_err("Failed to create mdev_supported_types sysfs entry\n");
> +		return ret;
> +	}
> +
> +	ret = sysfs_create_file(&dev->kobj, &dev_attr_mdev_create.attr);
> +	if (ret) {
> +		pr_err("Failed to create mdev_create sysfs entry\n");
> +		goto create_sysfs_failed;
> +	}
> +
> +	ret = sysfs_create_file(&dev->kobj, &dev_attr_mdev_destroy.attr);
> +	if (ret) {
> +		pr_err("Failed to create mdev_destroy sysfs entry\n");
> +		sysfs_remove_file(&dev->kobj, &dev_attr_mdev_create.attr);
> +	} else
> +		return ret;
> +
> +create_sysfs_failed:
> +	sysfs_remove_file(&dev->kobj, &dev_attr_mdev_supported_types.attr);
> +	return ret;
> +}
> +
> +void mdev_remove_sysfs_files(struct device *dev)
> +{
> +	sysfs_remove_file(&dev->kobj, &dev_attr_mdev_supported_types.attr);
> +	sysfs_remove_file(&dev->kobj, &dev_attr_mdev_create.attr);
> +	sysfs_remove_file(&dev->kobj, &dev_attr_mdev_destroy.attr);
> +}
> diff --git a/include/linux/mdev.h b/include/linux/mdev.h
> new file mode 100644
> index 000000000000..0b41f301a9b7
> --- /dev/null
> +++ b/include/linux/mdev.h
> @@ -0,0 +1,236 @@
> +/*
> + * Mediated device definition
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *     Author: Neo Jia <cjia@nvidia.com>
> + *	       Kirti Wankhede <kwankhede@nvidia.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#ifndef MDEV_H
> +#define MDEV_H
> +
> +#include <uapi/linux/vfio.h>
> +
> +struct parent_device;
> +
> +/*
> + * Mediated device
> + */
> +
> +struct addr_desc {
> +	unsigned long start;
> +	unsigned long size;
> +	struct list_head next;
> +};
> +
> +struct mdev_phys_mapping {
> +	struct address_space *mapping;
> +	struct list_head addr_desc_list;
> +	struct mutex addr_desc_list_lock;
> +};
> +
> +struct mdev_device {
> +	struct device		dev;
> +	struct parent_device	*parent;
> +	struct iommu_group	*group;
> +	uuid_le			uuid;
> +	uint32_t		instance;
> +	void			*driver_data;
> +
> +	/* internal only */
> +	struct kref		ref;
> +	struct list_head	next;
> +
> +	struct mdev_phys_mapping phys_mappings;
> +};
> +
> +
> +/**
> + * struct parent_ops - Structure to be registered for each parent device to
> + * register the device to mdev module.
> + *
> + * @owner:		The module owner.
> + * @dev_attr_groups:	Default attributes of the parent device.
> + * @mdev_attr_groups:	Default attributes of the mediated device.
> + * @supported_config:	Called to get information about supported types.
> + *			@dev : device structure of parent device.
> + *			@config: should return string listing supported config
> + *			Returns integer: success (0) or error (< 0)
> + * @create:		Called to allocate basic resources in parent device's
> + *			driver for a particular mediated device. It is
> + *			mandatory to provide create ops.
> + *			@mdev: mdev_device structure on of mediated device
> + *			      that is being created
> + *			@mdev_params: extra parameters required by parent
> + *			device's driver.
> + *			Returns integer: success (0) or error (< 0)
> + * @destroy:		Called to free resources in parent device's driver for a
> + *			a mediated device instance. It is mandatory to provide
> + *			destroy ops.
> + *			@mdev: mdev_device device structure which is being
> + *			       destroyed
> + *			Returns integer: success (0) or error (< 0)
> + *			If VMM is running and destroy() is called that means the
> + *			mdev is being hotunpluged. Return error if VMM is
> + *			running and driver doesn't support mediated device
> + *			hotplug.
> + * @reset:		Called to reset mediated device.
> + *			@mdev: mdev_device device structure
> + *			Returns integer: success (0) or error (< 0)
> + * @start:		Called to initiate mediated device initialization
> + *			process in parent device's driver before VMM starts.
> + *			@uuid: UUID
> + *			Returns integer: success (0) or error (< 0)
> + * @stop:		Called to teardown mediated device related resources
> + *			@uuid: UUID
> + *			Returns integer: success (0) or error (< 0)
> + * @read:		Read emulation callback
> + *			@mdev: mediated device structure
> + *			@buf: read buffer
> + *			@count: number of bytes to read
> + *			@pos: address.
> + *			Retuns number on bytes read on success or error.
> + * @write:		Write emulation callback
> + *			@mdev: mediated device structure
> + *			@buf: write buffer
> + *			@count: number of bytes to be written
> + *			@pos: address.
> + *			Retuns number on bytes written on success or error.
> + * @set_irqs:		Called to send about interrupts configuration
> + *			information that VMM sets.
> + *			@mdev: mediated device structure
> + *			@flags, index, start, count and *data : same as that of
> + *			struct vfio_irq_set of VFIO_DEVICE_SET_IRQS API.
> + * @get_region_info:	Called to get VFIO region size and flags of mediated
> + *			device.
> + *			@mdev: mediated device structure
> + *			@region_index: VFIO region index
> + *			@region_info: output, returns size and flags of
> + *				      requested region.
> + *			Returns integer: success (0) or error (< 0)
> + * @validate_map_request: Validate remap pfn request
> + *			@mdev: mediated device structure
> + *			@pos: address
> + *			@virtaddr: target user address to start at. Vendor
> + *				   driver can change if required.
> + *			@pfn: parent address of kernel memory, vendor driver
> + *			      can change if required.
> + *			@size: size of map area, vendor driver can change the
> + *			       size of map area if desired.
> + *			@prot: page protection flags for this mapping, vendor
> + *			       driver can change, if required.
> + *			Returns integer: success (0) or error (< 0)
> + *
> + * Parent device that support mediated device should be registered with mdev
> + * module with parent_ops structure.
> + */
> +
> +struct parent_ops {
> +	struct module   *owner;
> +	const struct attribute_group **dev_attr_groups;
> +	const struct attribute_group **mdev_attr_groups;
> +
> +	int	(*supported_config)(struct device *dev, char *config);
> +	int     (*create)(struct mdev_device *mdev, char *mdev_params);
> +	int     (*destroy)(struct mdev_device *mdev);
> +	int     (*reset)(struct mdev_device *mdev);
> +	int     (*start)(uuid_le uuid);
> +	int     (*stop)(uuid_le uuid);
> +	ssize_t (*read)(struct mdev_device *mdev, char *buf, size_t count,
> +			loff_t pos);
> +	ssize_t (*write)(struct mdev_device *mdev, char *buf, size_t count,
> +			 loff_t pos);
> +	int     (*set_irqs)(struct mdev_device *mdev, uint32_t flags,
> +			    unsigned int index, unsigned int start,
> +			    unsigned int count, void *data);
> +	int	(*get_region_info)(struct mdev_device *mdev, int region_index,
> +				   struct vfio_region_info *region_info);
> +	int	(*validate_map_request)(struct mdev_device *mdev, loff_t pos,
> +					u64 *virtaddr, unsigned long *pfn,
> +					unsigned long *size, pgprot_t *prot);
> +};
> +
> +/*
> + * Parent Device
> + */
> +
> +struct parent_device {
> +	struct device		*dev;
> +	const struct parent_ops	*ops;
> +
> +	/* internal */
> +	struct kref		ref;
> +	struct list_head	next;
> +	struct list_head	mdev_list;
> +	struct mutex		mdev_list_lock;
> +	wait_queue_head_t	release_done;
> +};
> +
> +/**
> + * struct mdev_driver - Mediated device driver
> + * @name: driver name
> + * @probe: called when new device created
> + * @remove: called when device removed
> + * @match: called when new device or driver is added for this bus. Return 1 if
> + *	   given device can be handled by given driver and zero otherwise.
> + * @driver: device driver structure
> + *
> + **/
> +struct mdev_driver {
> +	const char *name;
> +	int  (*probe)(struct device *dev);
> +	void (*remove)(struct device *dev);
> +	int  (*match)(struct device *dev);
> +	struct device_driver driver;
> +};
> +
> +static inline struct mdev_driver *to_mdev_driver(struct device_driver *drv)
> +{
> +	return drv ? container_of(drv, struct mdev_driver, driver) : NULL;
> +}
> +
> +static inline struct mdev_device *to_mdev_device(struct device *dev)
> +{
> +	return dev ? container_of(dev, struct mdev_device, dev) : NULL;
> +}
> +
> +static inline void *mdev_get_drvdata(struct mdev_device *mdev)
> +{
> +	return mdev->driver_data;
> +}
> +
> +static inline void mdev_set_drvdata(struct mdev_device *mdev, void *data)
> +{
> +	mdev->driver_data = data;
> +}
> +
> +extern struct bus_type mdev_bus_type;
> +
> +#define dev_is_mdev(d) ((d)->bus == &mdev_bus_type)
> +
> +extern int  mdev_register_device(struct device *dev,
> +				 const struct parent_ops *ops);
> +extern void mdev_unregister_device(struct device *dev);
> +
> +extern int  mdev_register_driver(struct mdev_driver *drv, struct module *owner);
> +extern void mdev_unregister_driver(struct mdev_driver *drv);
> +
> +extern struct mdev_device *mdev_get_device(struct mdev_device *mdev);
> +extern void mdev_put_device(struct mdev_device *mdev);
> +
> +extern struct mdev_device *mdev_get_device_by_group(struct iommu_group *group);
> +
> +extern int mdev_device_invalidate_mapping(struct mdev_device *mdev,
> +					unsigned long addr, unsigned long size);
> +
> +extern int mdev_add_phys_mapping(struct mdev_device *mdev,
> +				 struct address_space *mapping,
> +				 unsigned long addr, unsigned long size);
> +
> +
> +extern void mdev_del_phys_mapping(struct mdev_device *mdev, unsigned long addr);
> +#endif /* MDEV_H */
Kirti Wankhede Aug. 12, 2016, 6:44 p.m. UTC | #4
On 8/10/2016 12:30 AM, Alex Williamson wrote:
> On Thu, 4 Aug 2016 00:33:51 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
> This is used later by mdev_device_start() and mdev_device_stop() to get
> the parent_device so it can call the start and stop ops callbacks
> respectively.  That seems to imply that all of instances for a given
> uuid come from the same parent_device.  Where is that enforced?  I'm
> still having a hard time buying into the uuid+instance plan when it
> seems like each mdev_device should have an actual unique uuid.
> Userspace tools can figure out which uuids to start for a given user, I
> don't see much value in collecting them to instances within a uuid.
> 

Initially we started discussion with VM_UUID+instance suggestion, where
instance was introduced to support multiple devices in a VM.
'mdev_create' creates device and 'mdev_start' is to commit resources of
all instances of similar devices assigned to VM.

For example, to create 2 devices:
# echo "$UUID:0:params" > /sys/devices/../mdev_create
# echo "$UUID:1:params" > /sys/devices/../mdev_create

"$UUID-0" and "$UUID-1" devices are created.

Commit resources for above devices with single 'mdev_start':
# echo "$UUID" > /sys/class/mdev/mdev_start

Considering $UUID to be a unique UUID of a device, we don't need
'instance', so 'mdev_create' would look like:

# echo "$UUID1:params" > /sys/devices/../mdev_create
# echo "$UUID2:params" > /sys/devices/../mdev_create

where $UUID1 and $UUID2 would be mdev device's unique UUID and 'params'
would be vendor specific parameters.

Device nodes would be created as "$UUID1" and "$UUID"

Then 'mdev_start' would be:
# echo "$UUID1, $UUID2" > /sys/class/mdev/mdev_start

Similarly 'mdev_stop' and 'mdev_destroy' would be:

# echo "$UUID1, $UUID2" > /sys/class/mdev/mdev_stop

and

# echo "$UUID1" > /sys/devices/../mdev_destroy
# echo "$UUID2" > /sys/devices/../mdev_destroy

Does this seems reasonable?

Thanks,
Kirti
Alex Williamson Aug. 12, 2016, 9:16 p.m. UTC | #5
On Sat, 13 Aug 2016 00:14:39 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 8/10/2016 12:30 AM, Alex Williamson wrote:
> > On Thu, 4 Aug 2016 00:33:51 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > 
> > This is used later by mdev_device_start() and mdev_device_stop() to get
> > the parent_device so it can call the start and stop ops callbacks
> > respectively.  That seems to imply that all of instances for a given
> > uuid come from the same parent_device.  Where is that enforced?  I'm
> > still having a hard time buying into the uuid+instance plan when it
> > seems like each mdev_device should have an actual unique uuid.
> > Userspace tools can figure out which uuids to start for a given user, I
> > don't see much value in collecting them to instances within a uuid.
> >   
> 
> Initially we started discussion with VM_UUID+instance suggestion, where
> instance was introduced to support multiple devices in a VM.

The instance number was never required in order to support multiple
devices in a VM, IIRC this UUID+instance scheme was to appease NVIDIA
management tools which wanted to re-use the VM UUID by creating vGPU
devices with that same UUID and therefore associate udev events to a
given VM.  Only then does an instance number become necessary since the
UUID needs to be static for a vGPUs within a VM.  This has always felt
like a very dodgy solution when we should probably just be querying
libvirt to give us a device to VM association.

> 'mdev_create' creates device and 'mdev_start' is to commit resources of
> all instances of similar devices assigned to VM.
> 
> For example, to create 2 devices:
> # echo "$UUID:0:params" > /sys/devices/../mdev_create
> # echo "$UUID:1:params" > /sys/devices/../mdev_create
> 
> "$UUID-0" and "$UUID-1" devices are created.
> 
> Commit resources for above devices with single 'mdev_start':
> # echo "$UUID" > /sys/class/mdev/mdev_start
> 
> Considering $UUID to be a unique UUID of a device, we don't need
> 'instance', so 'mdev_create' would look like:
> 
> # echo "$UUID1:params" > /sys/devices/../mdev_create
> # echo "$UUID2:params" > /sys/devices/../mdev_create
> 
> where $UUID1 and $UUID2 would be mdev device's unique UUID and 'params'
> would be vendor specific parameters.
> 
> Device nodes would be created as "$UUID1" and "$UUID"
> 
> Then 'mdev_start' would be:
> # echo "$UUID1, $UUID2" > /sys/class/mdev/mdev_start
> 
> Similarly 'mdev_stop' and 'mdev_destroy' would be:
> 
> # echo "$UUID1, $UUID2" > /sys/class/mdev/mdev_stop

I'm not sure a comma separated list makes sense here, for both
simplicity in the kernel and more fine grained error reporting, we
probably want to start/stop them individually.  Actually, why is it
that we can't use the mediated device being opened and released to
automatically signal to the backend vendor driver to commit and release
resources? I don't fully understand why userspace needs this interface.

> and
> 
> # echo "$UUID1" > /sys/devices/../mdev_destroy
> # echo "$UUID2" > /sys/devices/../mdev_destroy
> 
> Does this seems reasonable?

I've been hoping we could drop the instance numbers and create actual
unique UUIDs per mediated device for a while ;)  Thanks,

Alex
Kirti Wankhede Aug. 13, 2016, 12:37 a.m. UTC | #6
On 8/13/2016 2:46 AM, Alex Williamson wrote:
> On Sat, 13 Aug 2016 00:14:39 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> On 8/10/2016 12:30 AM, Alex Williamson wrote:
>>> On Thu, 4 Aug 2016 00:33:51 +0530
>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>
>>> This is used later by mdev_device_start() and mdev_device_stop() to get
>>> the parent_device so it can call the start and stop ops callbacks
>>> respectively.  That seems to imply that all of instances for a given
>>> uuid come from the same parent_device.  Where is that enforced?  I'm
>>> still having a hard time buying into the uuid+instance plan when it
>>> seems like each mdev_device should have an actual unique uuid.
>>> Userspace tools can figure out which uuids to start for a given user, I
>>> don't see much value in collecting them to instances within a uuid.
>>>   
>>
>> Initially we started discussion with VM_UUID+instance suggestion, where
>> instance was introduced to support multiple devices in a VM.
> 
> The instance number was never required in order to support multiple
> devices in a VM, IIRC this UUID+instance scheme was to appease NVIDIA
> management tools which wanted to re-use the VM UUID by creating vGPU
> devices with that same UUID and therefore associate udev events to a
> given VM.  Only then does an instance number become necessary since the
> UUID needs to be static for a vGPUs within a VM.  This has always felt
> like a very dodgy solution when we should probably just be querying
> libvirt to give us a device to VM association.
> 
>> 'mdev_create' creates device and 'mdev_start' is to commit resources of
>> all instances of similar devices assigned to VM.
>>
>> For example, to create 2 devices:
>> # echo "$UUID:0:params" > /sys/devices/../mdev_create
>> # echo "$UUID:1:params" > /sys/devices/../mdev_create
>>
>> "$UUID-0" and "$UUID-1" devices are created.
>>
>> Commit resources for above devices with single 'mdev_start':
>> # echo "$UUID" > /sys/class/mdev/mdev_start
>>
>> Considering $UUID to be a unique UUID of a device, we don't need
>> 'instance', so 'mdev_create' would look like:
>>
>> # echo "$UUID1:params" > /sys/devices/../mdev_create
>> # echo "$UUID2:params" > /sys/devices/../mdev_create
>>
>> where $UUID1 and $UUID2 would be mdev device's unique UUID and 'params'
>> would be vendor specific parameters.
>>
>> Device nodes would be created as "$UUID1" and "$UUID"
>>
>> Then 'mdev_start' would be:
>> # echo "$UUID1, $UUID2" > /sys/class/mdev/mdev_start
>>
>> Similarly 'mdev_stop' and 'mdev_destroy' would be:
>>
>> # echo "$UUID1, $UUID2" > /sys/class/mdev/mdev_stop
> 
> I'm not sure a comma separated list makes sense here, for both
> simplicity in the kernel and more fine grained error reporting, we
> probably want to start/stop them individually.  Actually, why is it
> that we can't use the mediated device being opened and released to
> automatically signal to the backend vendor driver to commit and release
> resources? I don't fully understand why userspace needs this interface.
> 

For NVIDIA vGPU solution we need to know all devices assigned to a VM in
one shot to commit resources of all vGPUs assigned to a VM along with
some common resources.

For start callback, I can pass on the list of UUIDs as is to vendor
driver. Let vendor driver decide whether to iterate for each device and
commit resources or do it in one shot.

Thanks,
Kirti

>> and
>>
>> # echo "$UUID1" > /sys/devices/../mdev_destroy
>> # echo "$UUID2" > /sys/devices/../mdev_destroy
>>
>> Does this seems reasonable?
> 
> I've been hoping we could drop the instance numbers and create actual
> unique UUIDs per mediated device for a while ;)  Thanks,
> 
> Alex
>
Tian, Kevin Aug. 15, 2016, 9:15 a.m. UTC | #7
> From: Kirti Wankhede [mailto:kwankhede@nvidia.com]
> Sent: Friday, August 05, 2016 2:13 PM
> 
> On 8/4/2016 12:51 PM, Tian, Kevin wrote:
> >> From: Kirti Wankhede [mailto:kwankhede@nvidia.com]
> >> Sent: Thursday, August 04, 2016 3:04 AM
> >>
> >>
> >> 2. Physical device driver interface
> >> This interface provides vendor driver the set APIs to manage physical
> >> device related work in their own driver. APIs are :
> >> - supported_config: provide supported configuration list by the vendor
> >> 		    driver
> >> - create: to allocate basic resources in vendor driver for a mediated
> >> 	  device.
> >> - destroy: to free resources in vendor driver when mediated device is
> >> 	   destroyed.
> >> - reset: to free and reallocate resources in vendor driver during reboot
> >
> > Currently I saw 'reset' callback only invoked from VFIO ioctl path. Do
> > you think whether it makes sense to expose a sysfs 'reset' node too,
> > similar to what people see under a PCI device node?
> >
> 
> All vendor drivers might not support reset of mdev from sysfs. But those
> who want to support can expose 'reset' node using 'mdev_attr_groups' of
> 'struct parent_ops'.
> 

Yes, this way it works. Just wonder whether it makes sense to expose reset
sysfs node by default if a reset callback is provided by vendor driver. :-)

Thanks
Kevin
Tian, Kevin Aug. 15, 2016, 9:38 a.m. UTC | #8
> From: Kirti Wankhede [mailto:kwankhede@nvidia.com]
> Sent: Saturday, August 13, 2016 8:37 AM
> 
> 
> 
> On 8/13/2016 2:46 AM, Alex Williamson wrote:
> > On Sat, 13 Aug 2016 00:14:39 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >
> >> On 8/10/2016 12:30 AM, Alex Williamson wrote:
> >>> On Thu, 4 Aug 2016 00:33:51 +0530
> >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>
> >>> This is used later by mdev_device_start() and mdev_device_stop() to get
> >>> the parent_device so it can call the start and stop ops callbacks
> >>> respectively.  That seems to imply that all of instances for a given
> >>> uuid come from the same parent_device.  Where is that enforced?  I'm
> >>> still having a hard time buying into the uuid+instance plan when it
> >>> seems like each mdev_device should have an actual unique uuid.
> >>> Userspace tools can figure out which uuids to start for a given user, I
> >>> don't see much value in collecting them to instances within a uuid.
> >>>
> >>
> >> Initially we started discussion with VM_UUID+instance suggestion, where
> >> instance was introduced to support multiple devices in a VM.
> >
> > The instance number was never required in order to support multiple
> > devices in a VM, IIRC this UUID+instance scheme was to appease NVIDIA
> > management tools which wanted to re-use the VM UUID by creating vGPU
> > devices with that same UUID and therefore associate udev events to a
> > given VM.  Only then does an instance number become necessary since the
> > UUID needs to be static for a vGPUs within a VM.  This has always felt
> > like a very dodgy solution when we should probably just be querying
> > libvirt to give us a device to VM association.

Agree with Alex here. We'd better not assume that UUID will be a VM_UUID
for mdev in the basic design. It's bound to NVIDIA management stack too tightly.

I'm OK to give enough flexibility for various upper level management stacks,
e.g. instead of $UUID+INDEX style, would $UUID+STRING provide a better
option where either UUID or STRING could be optional? Upper management 
stack can choose its own policy to identify a mdev:

a) $UUID only, so each mdev is allocated with a unique UUID
b) STRING only, which could be an index (0, 1, 2, ...), or any combination 
(vgpu0, vgpu1, etc.)
c) $UUID+STRING, where UUID could be a VM UUID, and STRING could be
a numeric index

> >
> >> 'mdev_create' creates device and 'mdev_start' is to commit resources of
> >> all instances of similar devices assigned to VM.
> >>
> >> For example, to create 2 devices:
> >> # echo "$UUID:0:params" > /sys/devices/../mdev_create
> >> # echo "$UUID:1:params" > /sys/devices/../mdev_create
> >>
> >> "$UUID-0" and "$UUID-1" devices are created.
> >>
> >> Commit resources for above devices with single 'mdev_start':
> >> # echo "$UUID" > /sys/class/mdev/mdev_start
> >>
> >> Considering $UUID to be a unique UUID of a device, we don't need
> >> 'instance', so 'mdev_create' would look like:
> >>
> >> # echo "$UUID1:params" > /sys/devices/../mdev_create
> >> # echo "$UUID2:params" > /sys/devices/../mdev_create
> >>
> >> where $UUID1 and $UUID2 would be mdev device's unique UUID and 'params'
> >> would be vendor specific parameters.
> >>
> >> Device nodes would be created as "$UUID1" and "$UUID"
> >>
> >> Then 'mdev_start' would be:
> >> # echo "$UUID1, $UUID2" > /sys/class/mdev/mdev_start
> >>
> >> Similarly 'mdev_stop' and 'mdev_destroy' would be:
> >>
> >> # echo "$UUID1, $UUID2" > /sys/class/mdev/mdev_stop
> >
> > I'm not sure a comma separated list makes sense here, for both
> > simplicity in the kernel and more fine grained error reporting, we
> > probably want to start/stop them individually.  Actually, why is it
> > that we can't use the mediated device being opened and released to
> > automatically signal to the backend vendor driver to commit and release
> > resources? I don't fully understand why userspace needs this interface.

There is a meaningful use of start/stop interface, as required in live
migration support. Such interface allows vendor driver to quiescent 
mdev activity on source device before mdev hardware state is snapshot,
and then resume mdev activity on dest device after its state is recovered.
Intel has implemented experimental live migration support in KVMGT (soon
to release), based on above two interfaces (plus another two to get/set
mdev state).

> >
> 
> For NVIDIA vGPU solution we need to know all devices assigned to a VM in
> one shot to commit resources of all vGPUs assigned to a VM along with
> some common resources.

Kirti, can you elaborate the background about above one-shot commit
requirement? It's hard to understand such a requirement. 

As I relied in another mail, I really hope start/stop become a per-mdev
attribute instead of global one, e.g.:

echo "0/1" > /sys/class/mdev/12345678-1234-1234-1234-123456789abc/start

In many scenario the user space client may only want to talk to mdev
instance directly, w/o need to contact its parent device. Still take
live migration for example, I don't think Qemu wants to know parent
device of assigned mdev instances.

Thanks
Kevin
Alex Williamson Aug. 15, 2016, 3:59 p.m. UTC | #9
On Mon, 15 Aug 2016 09:38:52 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:

> > From: Kirti Wankhede [mailto:kwankhede@nvidia.com]
> > Sent: Saturday, August 13, 2016 8:37 AM
> > 
> > 
> > 
> > On 8/13/2016 2:46 AM, Alex Williamson wrote:  
> > > On Sat, 13 Aug 2016 00:14:39 +0530
> > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > >  
> > >> On 8/10/2016 12:30 AM, Alex Williamson wrote:  
> > >>> On Thu, 4 Aug 2016 00:33:51 +0530
> > >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > >>>
> > >>> This is used later by mdev_device_start() and mdev_device_stop() to get
> > >>> the parent_device so it can call the start and stop ops callbacks
> > >>> respectively.  That seems to imply that all of instances for a given
> > >>> uuid come from the same parent_device.  Where is that enforced?  I'm
> > >>> still having a hard time buying into the uuid+instance plan when it
> > >>> seems like each mdev_device should have an actual unique uuid.
> > >>> Userspace tools can figure out which uuids to start for a given user, I
> > >>> don't see much value in collecting them to instances within a uuid.
> > >>>  
> > >>
> > >> Initially we started discussion with VM_UUID+instance suggestion, where
> > >> instance was introduced to support multiple devices in a VM.  
> > >
> > > The instance number was never required in order to support multiple
> > > devices in a VM, IIRC this UUID+instance scheme was to appease NVIDIA
> > > management tools which wanted to re-use the VM UUID by creating vGPU
> > > devices with that same UUID and therefore associate udev events to a
> > > given VM.  Only then does an instance number become necessary since the
> > > UUID needs to be static for a vGPUs within a VM.  This has always felt
> > > like a very dodgy solution when we should probably just be querying
> > > libvirt to give us a device to VM association.  
> 
> Agree with Alex here. We'd better not assume that UUID will be a VM_UUID
> for mdev in the basic design. It's bound to NVIDIA management stack too tightly.
> 
> I'm OK to give enough flexibility for various upper level management stacks,
> e.g. instead of $UUID+INDEX style, would $UUID+STRING provide a better
> option where either UUID or STRING could be optional? Upper management 
> stack can choose its own policy to identify a mdev:
> 
> a) $UUID only, so each mdev is allocated with a unique UUID
> b) STRING only, which could be an index (0, 1, 2, ...), or any combination 
> (vgpu0, vgpu1, etc.)
> c) $UUID+STRING, where UUID could be a VM UUID, and STRING could be
> a numeric index
> 
> > >  
> > >> 'mdev_create' creates device and 'mdev_start' is to commit resources of
> > >> all instances of similar devices assigned to VM.
> > >>
> > >> For example, to create 2 devices:
> > >> # echo "$UUID:0:params" > /sys/devices/../mdev_create
> > >> # echo "$UUID:1:params" > /sys/devices/../mdev_create
> > >>
> > >> "$UUID-0" and "$UUID-1" devices are created.
> > >>
> > >> Commit resources for above devices with single 'mdev_start':
> > >> # echo "$UUID" > /sys/class/mdev/mdev_start
> > >>
> > >> Considering $UUID to be a unique UUID of a device, we don't need
> > >> 'instance', so 'mdev_create' would look like:
> > >>
> > >> # echo "$UUID1:params" > /sys/devices/../mdev_create
> > >> # echo "$UUID2:params" > /sys/devices/../mdev_create
> > >>
> > >> where $UUID1 and $UUID2 would be mdev device's unique UUID and 'params'
> > >> would be vendor specific parameters.
> > >>
> > >> Device nodes would be created as "$UUID1" and "$UUID"
> > >>
> > >> Then 'mdev_start' would be:
> > >> # echo "$UUID1, $UUID2" > /sys/class/mdev/mdev_start
> > >>
> > >> Similarly 'mdev_stop' and 'mdev_destroy' would be:
> > >>
> > >> # echo "$UUID1, $UUID2" > /sys/class/mdev/mdev_stop  
> > >
> > > I'm not sure a comma separated list makes sense here, for both
> > > simplicity in the kernel and more fine grained error reporting, we
> > > probably want to start/stop them individually.  Actually, why is it
> > > that we can't use the mediated device being opened and released to
> > > automatically signal to the backend vendor driver to commit and release
> > > resources? I don't fully understand why userspace needs this interface.  
> 
> There is a meaningful use of start/stop interface, as required in live
> migration support. Such interface allows vendor driver to quiescent 
> mdev activity on source device before mdev hardware state is snapshot,
> and then resume mdev activity on dest device after its state is recovered.
> Intel has implemented experimental live migration support in KVMGT (soon
> to release), based on above two interfaces (plus another two to get/set
> mdev state).

Ok, that's actually an interesting use case for start/stop.

> > 
> > For NVIDIA vGPU solution we need to know all devices assigned to a VM in
> > one shot to commit resources of all vGPUs assigned to a VM along with
> > some common resources.  
> 
> Kirti, can you elaborate the background about above one-shot commit
> requirement? It's hard to understand such a requirement. 

Agree, I know NVIDIA isn't planning to support hotplug initially, but
this seems like we're precluding hotplug from the design.  I don't
understand what's driving this one-shot requirement.

> As I relied in another mail, I really hope start/stop become a per-mdev
> attribute instead of global one, e.g.:
> 
> echo "0/1" > /sys/class/mdev/12345678-1234-1234-1234-123456789abc/start
> 
> In many scenario the user space client may only want to talk to mdev
> instance directly, w/o need to contact its parent device. Still take
> live migration for example, I don't think Qemu wants to know parent
> device of assigned mdev instances.

Yep, QEMU won't know the parent device, only libvirt level tools
managing the creation and destruction of the mdev device would know
that.  Perhaps in addition to migration uses we could even use
start/stop for basic power management, device D3 state in the guest
could translate to a stop command to remove that vGPU from scheduling
while still retaining most of the state and resource allocations.
Thanks,

Alex
Neo Jia Aug. 15, 2016, 7:59 p.m. UTC | #10
On Mon, Aug 15, 2016 at 09:38:52AM +0000, Tian, Kevin wrote:
> > From: Kirti Wankhede [mailto:kwankhede@nvidia.com]
> > Sent: Saturday, August 13, 2016 8:37 AM
> > 
> > 
> > 
> > On 8/13/2016 2:46 AM, Alex Williamson wrote:
> > > On Sat, 13 Aug 2016 00:14:39 +0530
> > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > >
> > >> On 8/10/2016 12:30 AM, Alex Williamson wrote:
> > >>> On Thu, 4 Aug 2016 00:33:51 +0530
> > >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > >>>
> > >>> This is used later by mdev_device_start() and mdev_device_stop() to get
> > >>> the parent_device so it can call the start and stop ops callbacks
> > >>> respectively.  That seems to imply that all of instances for a given
> > >>> uuid come from the same parent_device.  Where is that enforced?  I'm
> > >>> still having a hard time buying into the uuid+instance plan when it
> > >>> seems like each mdev_device should have an actual unique uuid.
> > >>> Userspace tools can figure out which uuids to start for a given user, I
> > >>> don't see much value in collecting them to instances within a uuid.
> > >>>
> > >>
> > >> Initially we started discussion with VM_UUID+instance suggestion, where
> > >> instance was introduced to support multiple devices in a VM.
> > >
> > > The instance number was never required in order to support multiple
> > > devices in a VM, IIRC this UUID+instance scheme was to appease NVIDIA
> > > management tools which wanted to re-use the VM UUID by creating vGPU
> > > devices with that same UUID and therefore associate udev events to a
> > > given VM.  Only then does an instance number become necessary since the
> > > UUID needs to be static for a vGPUs within a VM.  This has always felt
> > > like a very dodgy solution when we should probably just be querying
> > > libvirt to give us a device to VM association.
> 
> Agree with Alex here. We'd better not assume that UUID will be a VM_UUID
> for mdev in the basic design. It's bound to NVIDIA management stack too tightly.
> 
> I'm OK to give enough flexibility for various upper level management stacks,
> e.g. instead of $UUID+INDEX style, would $UUID+STRING provide a better
> option where either UUID or STRING could be optional? Upper management 
> stack can choose its own policy to identify a mdev:
> 
> a) $UUID only, so each mdev is allocated with a unique UUID
> b) STRING only, which could be an index (0, 1, 2, ...), or any combination 
> (vgpu0, vgpu1, etc.)
> c) $UUID+STRING, where UUID could be a VM UUID, and STRING could be
> a numeric index
> 
> > >
> > >> 'mdev_create' creates device and 'mdev_start' is to commit resources of
> > >> all instances of similar devices assigned to VM.
> > >>
> > >> For example, to create 2 devices:
> > >> # echo "$UUID:0:params" > /sys/devices/../mdev_create
> > >> # echo "$UUID:1:params" > /sys/devices/../mdev_create
> > >>
> > >> "$UUID-0" and "$UUID-1" devices are created.
> > >>
> > >> Commit resources for above devices with single 'mdev_start':
> > >> # echo "$UUID" > /sys/class/mdev/mdev_start
> > >>
> > >> Considering $UUID to be a unique UUID of a device, we don't need
> > >> 'instance', so 'mdev_create' would look like:
> > >>
> > >> # echo "$UUID1:params" > /sys/devices/../mdev_create
> > >> # echo "$UUID2:params" > /sys/devices/../mdev_create
> > >>
> > >> where $UUID1 and $UUID2 would be mdev device's unique UUID and 'params'
> > >> would be vendor specific parameters.
> > >>
> > >> Device nodes would be created as "$UUID1" and "$UUID"
> > >>
> > >> Then 'mdev_start' would be:
> > >> # echo "$UUID1, $UUID2" > /sys/class/mdev/mdev_start
> > >>
> > >> Similarly 'mdev_stop' and 'mdev_destroy' would be:
> > >>
> > >> # echo "$UUID1, $UUID2" > /sys/class/mdev/mdev_stop
> > >
> > > I'm not sure a comma separated list makes sense here, for both
> > > simplicity in the kernel and more fine grained error reporting, we
> > > probably want to start/stop them individually.  Actually, why is it
> > > that we can't use the mediated device being opened and released to
> > > automatically signal to the backend vendor driver to commit and release
> > > resources? I don't fully understand why userspace needs this interface.
> 
> There is a meaningful use of start/stop interface, as required in live
> migration support. Such interface allows vendor driver to quiescent 
> mdev activity on source device before mdev hardware state is snapshot,
> and then resume mdev activity on dest device after its state is recovered.
> Intel has implemented experimental live migration support in KVMGT (soon
> to release), based on above two interfaces (plus another two to get/set
> mdev state).
> 
> > >
> > 
> > For NVIDIA vGPU solution we need to know all devices assigned to a VM in
> > one shot to commit resources of all vGPUs assigned to a VM along with
> > some common resources.
> 
> Kirti, can you elaborate the background about above one-shot commit
> requirement? It's hard to understand such a requirement. 
> 
> As I relied in another mail, I really hope start/stop become a per-mdev
> attribute instead of global one, e.g.:
> 
> echo "0/1" > /sys/class/mdev/12345678-1234-1234-1234-123456789abc/start
> 
> In many scenario the user space client may only want to talk to mdev
> instance directly, w/o need to contact its parent device. Still take
> live migration for example, I don't think Qemu wants to know parent
> device of assigned mdev instances.

Hi Kevin,

Having a global /sys/class/mdev/mdev_start doesn't require anybody to know
parent device. you can just do 

echo "mdev_UUID" > /sys/class/mdev/mdev_start

or 

echo "mdev_UUID" > /sys/class/mdev/mdev_stop

without knowing the parent device.

Thanks,
Neo

> 
> Thanks
> Kevin
Neo Jia Aug. 15, 2016, 10:09 p.m. UTC | #11
On Mon, Aug 15, 2016 at 09:59:26AM -0600, Alex Williamson wrote:
> On Mon, 15 Aug 2016 09:38:52 +0000
> "Tian, Kevin" <kevin.tian@intel.com> wrote:
> 
> > > From: Kirti Wankhede [mailto:kwankhede@nvidia.com]
> > > Sent: Saturday, August 13, 2016 8:37 AM
> > > 
> > > 
> > > 
> > > On 8/13/2016 2:46 AM, Alex Williamson wrote:  
> > > > On Sat, 13 Aug 2016 00:14:39 +0530
> > > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > >  
> > > >> On 8/10/2016 12:30 AM, Alex Williamson wrote:  
> > > >>> On Thu, 4 Aug 2016 00:33:51 +0530
> > > >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > >>>
> > > >>> This is used later by mdev_device_start() and mdev_device_stop() to get
> > > >>> the parent_device so it can call the start and stop ops callbacks
> > > >>> respectively.  That seems to imply that all of instances for a given
> > > >>> uuid come from the same parent_device.  Where is that enforced?  I'm
> > > >>> still having a hard time buying into the uuid+instance plan when it
> > > >>> seems like each mdev_device should have an actual unique uuid.
> > > >>> Userspace tools can figure out which uuids to start for a given user, I
> > > >>> don't see much value in collecting them to instances within a uuid.
> > > >>>  
> > > >>
> > > >> Initially we started discussion with VM_UUID+instance suggestion, where
> > > >> instance was introduced to support multiple devices in a VM.  
> > > >
> > > > The instance number was never required in order to support multiple
> > > > devices in a VM, IIRC this UUID+instance scheme was to appease NVIDIA
> > > > management tools which wanted to re-use the VM UUID by creating vGPU
> > > > devices with that same UUID and therefore associate udev events to a
> > > > given VM.  Only then does an instance number become necessary since the
> > > > UUID needs to be static for a vGPUs within a VM.  This has always felt
> > > > like a very dodgy solution when we should probably just be querying
> > > > libvirt to give us a device to VM association.  
> > 
> > Agree with Alex here. We'd better not assume that UUID will be a VM_UUID
> > for mdev in the basic design. It's bound to NVIDIA management stack too tightly.
> > 
> > I'm OK to give enough flexibility for various upper level management stacks,
> > e.g. instead of $UUID+INDEX style, would $UUID+STRING provide a better
> > option where either UUID or STRING could be optional? Upper management 
> > stack can choose its own policy to identify a mdev:
> > 
> > a) $UUID only, so each mdev is allocated with a unique UUID
> > b) STRING only, which could be an index (0, 1, 2, ...), or any combination 
> > (vgpu0, vgpu1, etc.)
> > c) $UUID+STRING, where UUID could be a VM UUID, and STRING could be
> > a numeric index
> > 
> > > >  
> > > >> 'mdev_create' creates device and 'mdev_start' is to commit resources of
> > > >> all instances of similar devices assigned to VM.
> > > >>
> > > >> For example, to create 2 devices:
> > > >> # echo "$UUID:0:params" > /sys/devices/../mdev_create
> > > >> # echo "$UUID:1:params" > /sys/devices/../mdev_create
> > > >>
> > > >> "$UUID-0" and "$UUID-1" devices are created.
> > > >>
> > > >> Commit resources for above devices with single 'mdev_start':
> > > >> # echo "$UUID" > /sys/class/mdev/mdev_start
> > > >>
> > > >> Considering $UUID to be a unique UUID of a device, we don't need
> > > >> 'instance', so 'mdev_create' would look like:
> > > >>
> > > >> # echo "$UUID1:params" > /sys/devices/../mdev_create
> > > >> # echo "$UUID2:params" > /sys/devices/../mdev_create
> > > >>
> > > >> where $UUID1 and $UUID2 would be mdev device's unique UUID and 'params'
> > > >> would be vendor specific parameters.
> > > >>
> > > >> Device nodes would be created as "$UUID1" and "$UUID"
> > > >>
> > > >> Then 'mdev_start' would be:
> > > >> # echo "$UUID1, $UUID2" > /sys/class/mdev/mdev_start
> > > >>
> > > >> Similarly 'mdev_stop' and 'mdev_destroy' would be:
> > > >>
> > > >> # echo "$UUID1, $UUID2" > /sys/class/mdev/mdev_stop  
> > > >
> > > > I'm not sure a comma separated list makes sense here, for both
> > > > simplicity in the kernel and more fine grained error reporting, we
> > > > probably want to start/stop them individually.  Actually, why is it
> > > > that we can't use the mediated device being opened and released to
> > > > automatically signal to the backend vendor driver to commit and release
> > > > resources? I don't fully understand why userspace needs this interface.  
> > 
> > There is a meaningful use of start/stop interface, as required in live
> > migration support. Such interface allows vendor driver to quiescent 
> > mdev activity on source device before mdev hardware state is snapshot,
> > and then resume mdev activity on dest device after its state is recovered.
> > Intel has implemented experimental live migration support in KVMGT (soon
> > to release), based on above two interfaces (plus another two to get/set
> > mdev state).
> 
> Ok, that's actually an interesting use case for start/stop.
> 
> > > 
> > > For NVIDIA vGPU solution we need to know all devices assigned to a VM in
> > > one shot to commit resources of all vGPUs assigned to a VM along with
> > > some common resources.  
> > 
> > Kirti, can you elaborate the background about above one-shot commit
> > requirement? It's hard to understand such a requirement. 
> 
> Agree, I know NVIDIA isn't planning to support hotplug initially, but
> this seems like we're precluding hotplug from the design.  I don't
> understand what's driving this one-shot requirement.

Hi Alex,

The requirement here is based on how our internal vGPU device model designed and
with this we are able to pre-allocate resources required for multiple virtual
devices within same domain.

And I don't think this syntax will stop us from supporting hotplug at all.

For example, you can always create a virtual mdev and then do

echo "mdev_UUID" > /sys/class/mdev/mdev_start

then use QEMU monitor to add the device for hotplug.

> 
> > As I relied in another mail, I really hope start/stop become a per-mdev
> > attribute instead of global one, e.g.:
> > 
> > echo "0/1" > /sys/class/mdev/12345678-1234-1234-1234-123456789abc/start
> > 
> > In many scenario the user space client may only want to talk to mdev
> > instance directly, w/o need to contact its parent device. Still take
> > live migration for example, I don't think Qemu wants to know parent
> > device of assigned mdev instances.
> 
> Yep, QEMU won't know the parent device, only libvirt level tools
> managing the creation and destruction of the mdev device would know
> that.  Perhaps in addition to migration uses we could even use
> start/stop for basic power management, device D3 state in the guest
> could translate to a stop command to remove that vGPU from scheduling
> while still retaining most of the state and resource allocations.

Just recap what I have replied to Kevin on his previous email, the current
mdev_start and mdev_stop doesn't require any knowledge of parent device.

Thanks,
Neo

> Thanks,
> 
> Alex
Alex Williamson Aug. 15, 2016, 10:47 p.m. UTC | #12
On Mon, 15 Aug 2016 12:59:08 -0700
Neo Jia <cjia@nvidia.com> wrote:

> On Mon, Aug 15, 2016 at 09:38:52AM +0000, Tian, Kevin wrote:
> > > From: Kirti Wankhede [mailto:kwankhede@nvidia.com]
> > > Sent: Saturday, August 13, 2016 8:37 AM
> > > 
> > > 
> > > 
> > > On 8/13/2016 2:46 AM, Alex Williamson wrote:  
> > > > On Sat, 13 Aug 2016 00:14:39 +0530
> > > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > >  
> > > >> On 8/10/2016 12:30 AM, Alex Williamson wrote:  
> > > >>> On Thu, 4 Aug 2016 00:33:51 +0530
> > > >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > >>>
> > > >>> This is used later by mdev_device_start() and mdev_device_stop() to get
> > > >>> the parent_device so it can call the start and stop ops callbacks
> > > >>> respectively.  That seems to imply that all of instances for a given
> > > >>> uuid come from the same parent_device.  Where is that enforced?  I'm
> > > >>> still having a hard time buying into the uuid+instance plan when it
> > > >>> seems like each mdev_device should have an actual unique uuid.
> > > >>> Userspace tools can figure out which uuids to start for a given user, I
> > > >>> don't see much value in collecting them to instances within a uuid.
> > > >>>  
> > > >>
> > > >> Initially we started discussion with VM_UUID+instance suggestion, where
> > > >> instance was introduced to support multiple devices in a VM.  
> > > >
> > > > The instance number was never required in order to support multiple
> > > > devices in a VM, IIRC this UUID+instance scheme was to appease NVIDIA
> > > > management tools which wanted to re-use the VM UUID by creating vGPU
> > > > devices with that same UUID and therefore associate udev events to a
> > > > given VM.  Only then does an instance number become necessary since the
> > > > UUID needs to be static for a vGPUs within a VM.  This has always felt
> > > > like a very dodgy solution when we should probably just be querying
> > > > libvirt to give us a device to VM association.  
> > 
> > Agree with Alex here. We'd better not assume that UUID will be a VM_UUID
> > for mdev in the basic design. It's bound to NVIDIA management stack too tightly.
> > 
> > I'm OK to give enough flexibility for various upper level management stacks,
> > e.g. instead of $UUID+INDEX style, would $UUID+STRING provide a better
> > option where either UUID or STRING could be optional? Upper management 
> > stack can choose its own policy to identify a mdev:
> > 
> > a) $UUID only, so each mdev is allocated with a unique UUID
> > b) STRING only, which could be an index (0, 1, 2, ...), or any combination 
> > (vgpu0, vgpu1, etc.)
> > c) $UUID+STRING, where UUID could be a VM UUID, and STRING could be
> > a numeric index
> >   
> > > >  
> > > >> 'mdev_create' creates device and 'mdev_start' is to commit resources of
> > > >> all instances of similar devices assigned to VM.
> > > >>
> > > >> For example, to create 2 devices:
> > > >> # echo "$UUID:0:params" > /sys/devices/../mdev_create
> > > >> # echo "$UUID:1:params" > /sys/devices/../mdev_create
> > > >>
> > > >> "$UUID-0" and "$UUID-1" devices are created.
> > > >>
> > > >> Commit resources for above devices with single 'mdev_start':
> > > >> # echo "$UUID" > /sys/class/mdev/mdev_start
> > > >>
> > > >> Considering $UUID to be a unique UUID of a device, we don't need
> > > >> 'instance', so 'mdev_create' would look like:
> > > >>
> > > >> # echo "$UUID1:params" > /sys/devices/../mdev_create
> > > >> # echo "$UUID2:params" > /sys/devices/../mdev_create
> > > >>
> > > >> where $UUID1 and $UUID2 would be mdev device's unique UUID and 'params'
> > > >> would be vendor specific parameters.
> > > >>
> > > >> Device nodes would be created as "$UUID1" and "$UUID"
> > > >>
> > > >> Then 'mdev_start' would be:
> > > >> # echo "$UUID1, $UUID2" > /sys/class/mdev/mdev_start
> > > >>
> > > >> Similarly 'mdev_stop' and 'mdev_destroy' would be:
> > > >>
> > > >> # echo "$UUID1, $UUID2" > /sys/class/mdev/mdev_stop  
> > > >
> > > > I'm not sure a comma separated list makes sense here, for both
> > > > simplicity in the kernel and more fine grained error reporting, we
> > > > probably want to start/stop them individually.  Actually, why is it
> > > > that we can't use the mediated device being opened and released to
> > > > automatically signal to the backend vendor driver to commit and release
> > > > resources? I don't fully understand why userspace needs this interface.  
> > 
> > There is a meaningful use of start/stop interface, as required in live
> > migration support. Such interface allows vendor driver to quiescent 
> > mdev activity on source device before mdev hardware state is snapshot,
> > and then resume mdev activity on dest device after its state is recovered.
> > Intel has implemented experimental live migration support in KVMGT (soon
> > to release), based on above two interfaces (plus another two to get/set
> > mdev state).
> >   
> > > >  
> > > 
> > > For NVIDIA vGPU solution we need to know all devices assigned to a VM in
> > > one shot to commit resources of all vGPUs assigned to a VM along with
> > > some common resources.  
> > 
> > Kirti, can you elaborate the background about above one-shot commit
> > requirement? It's hard to understand such a requirement. 
> > 
> > As I relied in another mail, I really hope start/stop become a per-mdev
> > attribute instead of global one, e.g.:
> > 
> > echo "0/1" > /sys/class/mdev/12345678-1234-1234-1234-123456789abc/start
> > 
> > In many scenario the user space client may only want to talk to mdev
> > instance directly, w/o need to contact its parent device. Still take
> > live migration for example, I don't think Qemu wants to know parent
> > device of assigned mdev instances.  
> 
> Hi Kevin,
> 
> Having a global /sys/class/mdev/mdev_start doesn't require anybody to know
> parent device. you can just do 
> 
> echo "mdev_UUID" > /sys/class/mdev/mdev_start
> 
> or 
> 
> echo "mdev_UUID" > /sys/class/mdev/mdev_stop
> 
> without knowing the parent device.

That doesn't give an individual user the ability to stop and start
their devices though, because in order for a user to have write
permissions there, they get permission to DoS other users by pumping
arbitrary UUIDs into those files.  By placing start/stop per mdev, we
have mdev level granularity of granting start/stop privileges.  Really
though, do we want QEMU fumbling around through sysfs or do we want an
interface through the vfio API to perform start/stop?  Thanks,

Alex
Alex Williamson Aug. 15, 2016, 10:52 p.m. UTC | #13
On Mon, 15 Aug 2016 15:09:30 -0700
Neo Jia <cjia@nvidia.com> wrote:

> On Mon, Aug 15, 2016 at 09:59:26AM -0600, Alex Williamson wrote:
> > On Mon, 15 Aug 2016 09:38:52 +0000
> > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> >   
> > > > From: Kirti Wankhede [mailto:kwankhede@nvidia.com]
> > > > Sent: Saturday, August 13, 2016 8:37 AM
> > > > 
> > > > 
> > > > 
> > > > On 8/13/2016 2:46 AM, Alex Williamson wrote:    
> > > > > On Sat, 13 Aug 2016 00:14:39 +0530
> > > > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > > >    
> > > > >> On 8/10/2016 12:30 AM, Alex Williamson wrote:    
> > > > >>> On Thu, 4 Aug 2016 00:33:51 +0530
> > > > >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > > >>>
> > > > >>> This is used later by mdev_device_start() and mdev_device_stop() to get
> > > > >>> the parent_device so it can call the start and stop ops callbacks
> > > > >>> respectively.  That seems to imply that all of instances for a given
> > > > >>> uuid come from the same parent_device.  Where is that enforced?  I'm
> > > > >>> still having a hard time buying into the uuid+instance plan when it
> > > > >>> seems like each mdev_device should have an actual unique uuid.
> > > > >>> Userspace tools can figure out which uuids to start for a given user, I
> > > > >>> don't see much value in collecting them to instances within a uuid.
> > > > >>>    
> > > > >>
> > > > >> Initially we started discussion with VM_UUID+instance suggestion, where
> > > > >> instance was introduced to support multiple devices in a VM.    
> > > > >
> > > > > The instance number was never required in order to support multiple
> > > > > devices in a VM, IIRC this UUID+instance scheme was to appease NVIDIA
> > > > > management tools which wanted to re-use the VM UUID by creating vGPU
> > > > > devices with that same UUID and therefore associate udev events to a
> > > > > given VM.  Only then does an instance number become necessary since the
> > > > > UUID needs to be static for a vGPUs within a VM.  This has always felt
> > > > > like a very dodgy solution when we should probably just be querying
> > > > > libvirt to give us a device to VM association.    
> > > 
> > > Agree with Alex here. We'd better not assume that UUID will be a VM_UUID
> > > for mdev in the basic design. It's bound to NVIDIA management stack too tightly.
> > > 
> > > I'm OK to give enough flexibility for various upper level management stacks,
> > > e.g. instead of $UUID+INDEX style, would $UUID+STRING provide a better
> > > option where either UUID or STRING could be optional? Upper management 
> > > stack can choose its own policy to identify a mdev:
> > > 
> > > a) $UUID only, so each mdev is allocated with a unique UUID
> > > b) STRING only, which could be an index (0, 1, 2, ...), or any combination 
> > > (vgpu0, vgpu1, etc.)
> > > c) $UUID+STRING, where UUID could be a VM UUID, and STRING could be
> > > a numeric index
> > >   
> > > > >    
> > > > >> 'mdev_create' creates device and 'mdev_start' is to commit resources of
> > > > >> all instances of similar devices assigned to VM.
> > > > >>
> > > > >> For example, to create 2 devices:
> > > > >> # echo "$UUID:0:params" > /sys/devices/../mdev_create
> > > > >> # echo "$UUID:1:params" > /sys/devices/../mdev_create
> > > > >>
> > > > >> "$UUID-0" and "$UUID-1" devices are created.
> > > > >>
> > > > >> Commit resources for above devices with single 'mdev_start':
> > > > >> # echo "$UUID" > /sys/class/mdev/mdev_start
> > > > >>
> > > > >> Considering $UUID to be a unique UUID of a device, we don't need
> > > > >> 'instance', so 'mdev_create' would look like:
> > > > >>
> > > > >> # echo "$UUID1:params" > /sys/devices/../mdev_create
> > > > >> # echo "$UUID2:params" > /sys/devices/../mdev_create
> > > > >>
> > > > >> where $UUID1 and $UUID2 would be mdev device's unique UUID and 'params'
> > > > >> would be vendor specific parameters.
> > > > >>
> > > > >> Device nodes would be created as "$UUID1" and "$UUID"
> > > > >>
> > > > >> Then 'mdev_start' would be:
> > > > >> # echo "$UUID1, $UUID2" > /sys/class/mdev/mdev_start
> > > > >>
> > > > >> Similarly 'mdev_stop' and 'mdev_destroy' would be:
> > > > >>
> > > > >> # echo "$UUID1, $UUID2" > /sys/class/mdev/mdev_stop    
> > > > >
> > > > > I'm not sure a comma separated list makes sense here, for both
> > > > > simplicity in the kernel and more fine grained error reporting, we
> > > > > probably want to start/stop them individually.  Actually, why is it
> > > > > that we can't use the mediated device being opened and released to
> > > > > automatically signal to the backend vendor driver to commit and release
> > > > > resources? I don't fully understand why userspace needs this interface.    
> > > 
> > > There is a meaningful use of start/stop interface, as required in live
> > > migration support. Such interface allows vendor driver to quiescent 
> > > mdev activity on source device before mdev hardware state is snapshot,
> > > and then resume mdev activity on dest device after its state is recovered.
> > > Intel has implemented experimental live migration support in KVMGT (soon
> > > to release), based on above two interfaces (plus another two to get/set
> > > mdev state).  
> > 
> > Ok, that's actually an interesting use case for start/stop.
> >   
> > > > 
> > > > For NVIDIA vGPU solution we need to know all devices assigned to a VM in
> > > > one shot to commit resources of all vGPUs assigned to a VM along with
> > > > some common resources.    
> > > 
> > > Kirti, can you elaborate the background about above one-shot commit
> > > requirement? It's hard to understand such a requirement.   
> > 
> > Agree, I know NVIDIA isn't planning to support hotplug initially, but
> > this seems like we're precluding hotplug from the design.  I don't
> > understand what's driving this one-shot requirement.  
> 
> Hi Alex,
> 
> The requirement here is based on how our internal vGPU device model designed and
> with this we are able to pre-allocate resources required for multiple virtual
> devices within same domain.
> 
> And I don't think this syntax will stop us from supporting hotplug at all.
> 
> For example, you can always create a virtual mdev and then do
> 
> echo "mdev_UUID" > /sys/class/mdev/mdev_start
> 
> then use QEMU monitor to add the device for hotplug.

Hi Neo,

I'm still not understanding the advantage you get from the "one-shot"
approach then if we can always add more mdevs by starting them later.
Are the hotplug mdevs somehow less capable than the initial set of
mdevs added in a single shot?  If the initial set is allocated
from the "same domain", does that give them some sort of hardware
locality/resource benefit?  Thanks,

Alex
Neo Jia Aug. 15, 2016, 11:23 p.m. UTC | #14
On Mon, Aug 15, 2016 at 04:52:39PM -0600, Alex Williamson wrote:
> On Mon, 15 Aug 2016 15:09:30 -0700
> Neo Jia <cjia@nvidia.com> wrote:
> 
> > On Mon, Aug 15, 2016 at 09:59:26AM -0600, Alex Williamson wrote:
> > > On Mon, 15 Aug 2016 09:38:52 +0000
> > > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> > >   
> > > > > From: Kirti Wankhede [mailto:kwankhede@nvidia.com]
> > > > > Sent: Saturday, August 13, 2016 8:37 AM
> > > > > 
> > > > > 
> > > > > 
> > > > > On 8/13/2016 2:46 AM, Alex Williamson wrote:    
> > > > > > On Sat, 13 Aug 2016 00:14:39 +0530
> > > > > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > > > >    
> > > > > >> On 8/10/2016 12:30 AM, Alex Williamson wrote:    
> > > > > >>> On Thu, 4 Aug 2016 00:33:51 +0530
> > > > > >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > > > >>>
> > > > > >>> This is used later by mdev_device_start() and mdev_device_stop() to get
> > > > > >>> the parent_device so it can call the start and stop ops callbacks
> > > > > >>> respectively.  That seems to imply that all of instances for a given
> > > > > >>> uuid come from the same parent_device.  Where is that enforced?  I'm
> > > > > >>> still having a hard time buying into the uuid+instance plan when it
> > > > > >>> seems like each mdev_device should have an actual unique uuid.
> > > > > >>> Userspace tools can figure out which uuids to start for a given user, I
> > > > > >>> don't see much value in collecting them to instances within a uuid.
> > > > > >>>    
> > > > > >>
> > > > > >> Initially we started discussion with VM_UUID+instance suggestion, where
> > > > > >> instance was introduced to support multiple devices in a VM.    
> > > > > >
> > > > > > The instance number was never required in order to support multiple
> > > > > > devices in a VM, IIRC this UUID+instance scheme was to appease NVIDIA
> > > > > > management tools which wanted to re-use the VM UUID by creating vGPU
> > > > > > devices with that same UUID and therefore associate udev events to a
> > > > > > given VM.  Only then does an instance number become necessary since the
> > > > > > UUID needs to be static for a vGPUs within a VM.  This has always felt
> > > > > > like a very dodgy solution when we should probably just be querying
> > > > > > libvirt to give us a device to VM association.    
> > > > 
> > > > Agree with Alex here. We'd better not assume that UUID will be a VM_UUID
> > > > for mdev in the basic design. It's bound to NVIDIA management stack too tightly.
> > > > 
> > > > I'm OK to give enough flexibility for various upper level management stacks,
> > > > e.g. instead of $UUID+INDEX style, would $UUID+STRING provide a better
> > > > option where either UUID or STRING could be optional? Upper management 
> > > > stack can choose its own policy to identify a mdev:
> > > > 
> > > > a) $UUID only, so each mdev is allocated with a unique UUID
> > > > b) STRING only, which could be an index (0, 1, 2, ...), or any combination 
> > > > (vgpu0, vgpu1, etc.)
> > > > c) $UUID+STRING, where UUID could be a VM UUID, and STRING could be
> > > > a numeric index
> > > >   
> > > > > >    
> > > > > >> 'mdev_create' creates device and 'mdev_start' is to commit resources of
> > > > > >> all instances of similar devices assigned to VM.
> > > > > >>
> > > > > >> For example, to create 2 devices:
> > > > > >> # echo "$UUID:0:params" > /sys/devices/../mdev_create
> > > > > >> # echo "$UUID:1:params" > /sys/devices/../mdev_create
> > > > > >>
> > > > > >> "$UUID-0" and "$UUID-1" devices are created.
> > > > > >>
> > > > > >> Commit resources for above devices with single 'mdev_start':
> > > > > >> # echo "$UUID" > /sys/class/mdev/mdev_start
> > > > > >>
> > > > > >> Considering $UUID to be a unique UUID of a device, we don't need
> > > > > >> 'instance', so 'mdev_create' would look like:
> > > > > >>
> > > > > >> # echo "$UUID1:params" > /sys/devices/../mdev_create
> > > > > >> # echo "$UUID2:params" > /sys/devices/../mdev_create
> > > > > >>
> > > > > >> where $UUID1 and $UUID2 would be mdev device's unique UUID and 'params'
> > > > > >> would be vendor specific parameters.
> > > > > >>
> > > > > >> Device nodes would be created as "$UUID1" and "$UUID"
> > > > > >>
> > > > > >> Then 'mdev_start' would be:
> > > > > >> # echo "$UUID1, $UUID2" > /sys/class/mdev/mdev_start
> > > > > >>
> > > > > >> Similarly 'mdev_stop' and 'mdev_destroy' would be:
> > > > > >>
> > > > > >> # echo "$UUID1, $UUID2" > /sys/class/mdev/mdev_stop    
> > > > > >
> > > > > > I'm not sure a comma separated list makes sense here, for both
> > > > > > simplicity in the kernel and more fine grained error reporting, we
> > > > > > probably want to start/stop them individually.  Actually, why is it
> > > > > > that we can't use the mediated device being opened and released to
> > > > > > automatically signal to the backend vendor driver to commit and release
> > > > > > resources? I don't fully understand why userspace needs this interface.    
> > > > 
> > > > There is a meaningful use of start/stop interface, as required in live
> > > > migration support. Such interface allows vendor driver to quiescent 
> > > > mdev activity on source device before mdev hardware state is snapshot,
> > > > and then resume mdev activity on dest device after its state is recovered.
> > > > Intel has implemented experimental live migration support in KVMGT (soon
> > > > to release), based on above two interfaces (plus another two to get/set
> > > > mdev state).  
> > > 
> > > Ok, that's actually an interesting use case for start/stop.
> > >   
> > > > > 
> > > > > For NVIDIA vGPU solution we need to know all devices assigned to a VM in
> > > > > one shot to commit resources of all vGPUs assigned to a VM along with
> > > > > some common resources.    
> > > > 
> > > > Kirti, can you elaborate the background about above one-shot commit
> > > > requirement? It's hard to understand such a requirement.   
> > > 
> > > Agree, I know NVIDIA isn't planning to support hotplug initially, but
> > > this seems like we're precluding hotplug from the design.  I don't
> > > understand what's driving this one-shot requirement.  
> > 
> > Hi Alex,
> > 
> > The requirement here is based on how our internal vGPU device model designed and
> > with this we are able to pre-allocate resources required for multiple virtual
> > devices within same domain.
> > 
> > And I don't think this syntax will stop us from supporting hotplug at all.
> > 
> > For example, you can always create a virtual mdev and then do
> > 
> > echo "mdev_UUID" > /sys/class/mdev/mdev_start
> > 
> > then use QEMU monitor to add the device for hotplug.
> 
> Hi Neo,
> 
> I'm still not understanding the advantage you get from the "one-shot"
> approach then if we can always add more mdevs by starting them later.
> Are the hotplug mdevs somehow less capable than the initial set of
> mdevs added in a single shot?  If the initial set is allocated
> from the "same domain", does that give them some sort of hardware
> locality/resource benefit?  Thanks,

Hi Alex,

At least we will not able to guarantee some special hardware resource for the
hotplug devices.

So from our point of view, we also have dedicated internal SW entity to manage all
virtual devices for each "domain/virtual machine", and such SW entity will be created 
at virtual device start time.

This is why we need to do this in one-shot to support multiple virtual device
per VM case.

Thanks,
Neo

> 
> Alex
Neo Jia Aug. 15, 2016, 11:54 p.m. UTC | #15
On Mon, Aug 15, 2016 at 04:47:41PM -0600, Alex Williamson wrote:
> On Mon, 15 Aug 2016 12:59:08 -0700
> Neo Jia <cjia@nvidia.com> wrote:
> 
> > On Mon, Aug 15, 2016 at 09:38:52AM +0000, Tian, Kevin wrote:
> > > > From: Kirti Wankhede [mailto:kwankhede@nvidia.com]
> > > > Sent: Saturday, August 13, 2016 8:37 AM
> > > > 
> > > > 
> > > > 
> > > > On 8/13/2016 2:46 AM, Alex Williamson wrote:  
> > > > > On Sat, 13 Aug 2016 00:14:39 +0530
> > > > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > > >  
> > > > >> On 8/10/2016 12:30 AM, Alex Williamson wrote:  
> > > > >>> On Thu, 4 Aug 2016 00:33:51 +0530
> > > > >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > > >>>
> > > > >>> This is used later by mdev_device_start() and mdev_device_stop() to get
> > > > >>> the parent_device so it can call the start and stop ops callbacks
> > > > >>> respectively.  That seems to imply that all of instances for a given
> > > > >>> uuid come from the same parent_device.  Where is that enforced?  I'm
> > > > >>> still having a hard time buying into the uuid+instance plan when it
> > > > >>> seems like each mdev_device should have an actual unique uuid.
> > > > >>> Userspace tools can figure out which uuids to start for a given user, I
> > > > >>> don't see much value in collecting them to instances within a uuid.
> > > > >>>  
> > > > >>
> > > > >> Initially we started discussion with VM_UUID+instance suggestion, where
> > > > >> instance was introduced to support multiple devices in a VM.  
> > > > >
> > > > > The instance number was never required in order to support multiple
> > > > > devices in a VM, IIRC this UUID+instance scheme was to appease NVIDIA
> > > > > management tools which wanted to re-use the VM UUID by creating vGPU
> > > > > devices with that same UUID and therefore associate udev events to a
> > > > > given VM.  Only then does an instance number become necessary since the
> > > > > UUID needs to be static for a vGPUs within a VM.  This has always felt
> > > > > like a very dodgy solution when we should probably just be querying
> > > > > libvirt to give us a device to VM association.  
> > > 
> > > Agree with Alex here. We'd better not assume that UUID will be a VM_UUID
> > > for mdev in the basic design. It's bound to NVIDIA management stack too tightly.
> > > 
> > > I'm OK to give enough flexibility for various upper level management stacks,
> > > e.g. instead of $UUID+INDEX style, would $UUID+STRING provide a better
> > > option where either UUID or STRING could be optional? Upper management 
> > > stack can choose its own policy to identify a mdev:
> > > 
> > > a) $UUID only, so each mdev is allocated with a unique UUID
> > > b) STRING only, which could be an index (0, 1, 2, ...), or any combination 
> > > (vgpu0, vgpu1, etc.)
> > > c) $UUID+STRING, where UUID could be a VM UUID, and STRING could be
> > > a numeric index
> > >   
> > > > >  
> > > > >> 'mdev_create' creates device and 'mdev_start' is to commit resources of
> > > > >> all instances of similar devices assigned to VM.
> > > > >>
> > > > >> For example, to create 2 devices:
> > > > >> # echo "$UUID:0:params" > /sys/devices/../mdev_create
> > > > >> # echo "$UUID:1:params" > /sys/devices/../mdev_create
> > > > >>
> > > > >> "$UUID-0" and "$UUID-1" devices are created.
> > > > >>
> > > > >> Commit resources for above devices with single 'mdev_start':
> > > > >> # echo "$UUID" > /sys/class/mdev/mdev_start
> > > > >>
> > > > >> Considering $UUID to be a unique UUID of a device, we don't need
> > > > >> 'instance', so 'mdev_create' would look like:
> > > > >>
> > > > >> # echo "$UUID1:params" > /sys/devices/../mdev_create
> > > > >> # echo "$UUID2:params" > /sys/devices/../mdev_create
> > > > >>
> > > > >> where $UUID1 and $UUID2 would be mdev device's unique UUID and 'params'
> > > > >> would be vendor specific parameters.
> > > > >>
> > > > >> Device nodes would be created as "$UUID1" and "$UUID"
> > > > >>
> > > > >> Then 'mdev_start' would be:
> > > > >> # echo "$UUID1, $UUID2" > /sys/class/mdev/mdev_start
> > > > >>
> > > > >> Similarly 'mdev_stop' and 'mdev_destroy' would be:
> > > > >>
> > > > >> # echo "$UUID1, $UUID2" > /sys/class/mdev/mdev_stop  
> > > > >
> > > > > I'm not sure a comma separated list makes sense here, for both
> > > > > simplicity in the kernel and more fine grained error reporting, we
> > > > > probably want to start/stop them individually.  Actually, why is it
> > > > > that we can't use the mediated device being opened and released to
> > > > > automatically signal to the backend vendor driver to commit and release
> > > > > resources? I don't fully understand why userspace needs this interface.  
> > > 
> > > There is a meaningful use of start/stop interface, as required in live
> > > migration support. Such interface allows vendor driver to quiescent 
> > > mdev activity on source device before mdev hardware state is snapshot,
> > > and then resume mdev activity on dest device after its state is recovered.
> > > Intel has implemented experimental live migration support in KVMGT (soon
> > > to release), based on above two interfaces (plus another two to get/set
> > > mdev state).
> > >   
> > > > >  
> > > > 
> > > > For NVIDIA vGPU solution we need to know all devices assigned to a VM in
> > > > one shot to commit resources of all vGPUs assigned to a VM along with
> > > > some common resources.  
> > > 
> > > Kirti, can you elaborate the background about above one-shot commit
> > > requirement? It's hard to understand such a requirement. 
> > > 
> > > As I relied in another mail, I really hope start/stop become a per-mdev
> > > attribute instead of global one, e.g.:
> > > 
> > > echo "0/1" > /sys/class/mdev/12345678-1234-1234-1234-123456789abc/start
> > > 
> > > In many scenario the user space client may only want to talk to mdev
> > > instance directly, w/o need to contact its parent device. Still take
> > > live migration for example, I don't think Qemu wants to know parent
> > > device of assigned mdev instances.  
> > 
> > Hi Kevin,
> > 
> > Having a global /sys/class/mdev/mdev_start doesn't require anybody to know
> > parent device. you can just do 
> > 
> > echo "mdev_UUID" > /sys/class/mdev/mdev_start
> > 
> > or 
> > 
> > echo "mdev_UUID" > /sys/class/mdev/mdev_stop
> > 
> > without knowing the parent device.
> 
> That doesn't give an individual user the ability to stop and start
> their devices though, because in order for a user to have write
> permissions there, they get permission to DoS other users by pumping
> arbitrary UUIDs into those files.  By placing start/stop per mdev, we
> have mdev level granularity of granting start/stop privileges.  Really
> though, do we want QEMU fumbling around through sysfs or do we want an
> interface through the vfio API to perform start/stop?  Thanks,

Hi Alex,

With the current sysfs proposal, I don't think QEMU needs to do anything
regards to manage virtual device lifecycle. It is part of the upper layer
management stack who is responsible to create virtual device and get
ready for consumers like QEMU.

In terms of the VFIO API, I assume we will basically loop through all virtual
devices inside QEMU vfio/pci.c for example, and call the start or stop
individually, right?

If that is the case, it doesn't change much from the current design about how to
handle the "one-short" start requirement, and we are adding more dependency with
VFIO API.

So, I think we probably should just focus on the sysfs and make sure such
interface work for us.

Thanks,
Neo

> 
> Alex
Tian, Kevin Aug. 16, 2016, 12:18 a.m. UTC | #16
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Tuesday, August 16, 2016 6:48 AM
> 
> On Mon, 15 Aug 2016 12:59:08 -0700
> Neo Jia <cjia@nvidia.com> wrote:
> 
> > On Mon, Aug 15, 2016 at 09:38:52AM +0000, Tian, Kevin wrote:
> > > > From: Kirti Wankhede [mailto:kwankhede@nvidia.com]
> > > > Sent: Saturday, August 13, 2016 8:37 AM
> > > >
> > > >
> > > >
> > > > On 8/13/2016 2:46 AM, Alex Williamson wrote:
> > > > > On Sat, 13 Aug 2016 00:14:39 +0530
> > > > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > > >
> > > > >> On 8/10/2016 12:30 AM, Alex Williamson wrote:
> > > > >>> On Thu, 4 Aug 2016 00:33:51 +0530
> > > > >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > > >>>
> > > > >>> This is used later by mdev_device_start() and mdev_device_stop() to get
> > > > >>> the parent_device so it can call the start and stop ops callbacks
> > > > >>> respectively.  That seems to imply that all of instances for a given
> > > > >>> uuid come from the same parent_device.  Where is that enforced?  I'm
> > > > >>> still having a hard time buying into the uuid+instance plan when it
> > > > >>> seems like each mdev_device should have an actual unique uuid.
> > > > >>> Userspace tools can figure out which uuids to start for a given user, I
> > > > >>> don't see much value in collecting them to instances within a uuid.
> > > > >>>
> > > > >>
> > > > >> Initially we started discussion with VM_UUID+instance suggestion, where
> > > > >> instance was introduced to support multiple devices in a VM.
> > > > >
> > > > > The instance number was never required in order to support multiple
> > > > > devices in a VM, IIRC this UUID+instance scheme was to appease NVIDIA
> > > > > management tools which wanted to re-use the VM UUID by creating vGPU
> > > > > devices with that same UUID and therefore associate udev events to a
> > > > > given VM.  Only then does an instance number become necessary since the
> > > > > UUID needs to be static for a vGPUs within a VM.  This has always felt
> > > > > like a very dodgy solution when we should probably just be querying
> > > > > libvirt to give us a device to VM association.
> > >
> > > Agree with Alex here. We'd better not assume that UUID will be a VM_UUID
> > > for mdev in the basic design. It's bound to NVIDIA management stack too tightly.
> > >
> > > I'm OK to give enough flexibility for various upper level management stacks,
> > > e.g. instead of $UUID+INDEX style, would $UUID+STRING provide a better
> > > option where either UUID or STRING could be optional? Upper management
> > > stack can choose its own policy to identify a mdev:
> > >
> > > a) $UUID only, so each mdev is allocated with a unique UUID
> > > b) STRING only, which could be an index (0, 1, 2, ...), or any combination
> > > (vgpu0, vgpu1, etc.)
> > > c) $UUID+STRING, where UUID could be a VM UUID, and STRING could be
> > > a numeric index
> > >
> > > > >
> > > > >> 'mdev_create' creates device and 'mdev_start' is to commit resources of
> > > > >> all instances of similar devices assigned to VM.
> > > > >>
> > > > >> For example, to create 2 devices:
> > > > >> # echo "$UUID:0:params" > /sys/devices/../mdev_create
> > > > >> # echo "$UUID:1:params" > /sys/devices/../mdev_create
> > > > >>
> > > > >> "$UUID-0" and "$UUID-1" devices are created.
> > > > >>
> > > > >> Commit resources for above devices with single 'mdev_start':
> > > > >> # echo "$UUID" > /sys/class/mdev/mdev_start
> > > > >>
> > > > >> Considering $UUID to be a unique UUID of a device, we don't need
> > > > >> 'instance', so 'mdev_create' would look like:
> > > > >>
> > > > >> # echo "$UUID1:params" > /sys/devices/../mdev_create
> > > > >> # echo "$UUID2:params" > /sys/devices/../mdev_create
> > > > >>
> > > > >> where $UUID1 and $UUID2 would be mdev device's unique UUID and 'params'
> > > > >> would be vendor specific parameters.
> > > > >>
> > > > >> Device nodes would be created as "$UUID1" and "$UUID"
> > > > >>
> > > > >> Then 'mdev_start' would be:
> > > > >> # echo "$UUID1, $UUID2" > /sys/class/mdev/mdev_start
> > > > >>
> > > > >> Similarly 'mdev_stop' and 'mdev_destroy' would be:
> > > > >>
> > > > >> # echo "$UUID1, $UUID2" > /sys/class/mdev/mdev_stop
> > > > >
> > > > > I'm not sure a comma separated list makes sense here, for both
> > > > > simplicity in the kernel and more fine grained error reporting, we
> > > > > probably want to start/stop them individually.  Actually, why is it
> > > > > that we can't use the mediated device being opened and released to
> > > > > automatically signal to the backend vendor driver to commit and release
> > > > > resources? I don't fully understand why userspace needs this interface.
> > >
> > > There is a meaningful use of start/stop interface, as required in live
> > > migration support. Such interface allows vendor driver to quiescent
> > > mdev activity on source device before mdev hardware state is snapshot,
> > > and then resume mdev activity on dest device after its state is recovered.
> > > Intel has implemented experimental live migration support in KVMGT (soon
> > > to release), based on above two interfaces (plus another two to get/set
> > > mdev state).
> > >
> > > > >
> > > >
> > > > For NVIDIA vGPU solution we need to know all devices assigned to a VM in
> > > > one shot to commit resources of all vGPUs assigned to a VM along with
> > > > some common resources.
> > >
> > > Kirti, can you elaborate the background about above one-shot commit
> > > requirement? It's hard to understand such a requirement.
> > >
> > > As I relied in another mail, I really hope start/stop become a per-mdev
> > > attribute instead of global one, e.g.:
> > >
> > > echo "0/1" > /sys/class/mdev/12345678-1234-1234-1234-123456789abc/start
> > >
> > > In many scenario the user space client may only want to talk to mdev
> > > instance directly, w/o need to contact its parent device. Still take
> > > live migration for example, I don't think Qemu wants to know parent
> > > device of assigned mdev instances.
> >
> > Hi Kevin,
> >
> > Having a global /sys/class/mdev/mdev_start doesn't require anybody to know
> > parent device. you can just do
> >
> > echo "mdev_UUID" > /sys/class/mdev/mdev_start
> >
> > or
> >
> > echo "mdev_UUID" > /sys/class/mdev/mdev_stop
> >
> > without knowing the parent device.
> 
> That doesn't give an individual user the ability to stop and start
> their devices though, because in order for a user to have write
> permissions there, they get permission to DoS other users by pumping
> arbitrary UUIDs into those files.  By placing start/stop per mdev, we
> have mdev level granularity of granting start/stop privileges.  Really

Agree.

> though, do we want QEMU fumbling around through sysfs or do we want an
> interface through the vfio API to perform start/stop?  Thanks,
> 

Either way looks OK. If we think mdev as a vfio feature, seems vfio API
can be a good fit to carry those mdev attributes. On the other hand, if
we think mdev as a standalone component (vfio is just one kernel 
implementation), using sysfs might be more generic (e.g. for some reason
mdev is split from vfio in the future?) similar to other physical devices. 
Another limitation with vfio API is that vendor driver cannot deliver 
additional capability w/o updating kernel vfio driver...

Thanks
Kevin
Tian, Kevin Aug. 16, 2016, 12:30 a.m. UTC | #17
> From: Neo Jia [mailto:cjia@nvidia.com]
> Sent: Tuesday, August 16, 2016 3:59 AM
> 
> On Mon, Aug 15, 2016 at 09:38:52AM +0000, Tian, Kevin wrote:
> > > From: Kirti Wankhede [mailto:kwankhede@nvidia.com]
> > > Sent: Saturday, August 13, 2016 8:37 AM
> > >
> > >
> > >
> > > On 8/13/2016 2:46 AM, Alex Williamson wrote:
> > > > On Sat, 13 Aug 2016 00:14:39 +0530
> > > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > >
> > > >> On 8/10/2016 12:30 AM, Alex Williamson wrote:
> > > >>> On Thu, 4 Aug 2016 00:33:51 +0530
> > > >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > >>>
> > > >>> This is used later by mdev_device_start() and mdev_device_stop() to get
> > > >>> the parent_device so it can call the start and stop ops callbacks
> > > >>> respectively.  That seems to imply that all of instances for a given
> > > >>> uuid come from the same parent_device.  Where is that enforced?  I'm
> > > >>> still having a hard time buying into the uuid+instance plan when it
> > > >>> seems like each mdev_device should have an actual unique uuid.
> > > >>> Userspace tools can figure out which uuids to start for a given user, I
> > > >>> don't see much value in collecting them to instances within a uuid.
> > > >>>
> > > >>
> > > >> Initially we started discussion with VM_UUID+instance suggestion, where
> > > >> instance was introduced to support multiple devices in a VM.
> > > >
> > > > The instance number was never required in order to support multiple
> > > > devices in a VM, IIRC this UUID+instance scheme was to appease NVIDIA
> > > > management tools which wanted to re-use the VM UUID by creating vGPU
> > > > devices with that same UUID and therefore associate udev events to a
> > > > given VM.  Only then does an instance number become necessary since the
> > > > UUID needs to be static for a vGPUs within a VM.  This has always felt
> > > > like a very dodgy solution when we should probably just be querying
> > > > libvirt to give us a device to VM association.
> >
> > Agree with Alex here. We'd better not assume that UUID will be a VM_UUID
> > for mdev in the basic design. It's bound to NVIDIA management stack too tightly.
> >
> > I'm OK to give enough flexibility for various upper level management stacks,
> > e.g. instead of $UUID+INDEX style, would $UUID+STRING provide a better
> > option where either UUID or STRING could be optional? Upper management
> > stack can choose its own policy to identify a mdev:
> >
> > a) $UUID only, so each mdev is allocated with a unique UUID
> > b) STRING only, which could be an index (0, 1, 2, ...), or any combination
> > (vgpu0, vgpu1, etc.)
> > c) $UUID+STRING, where UUID could be a VM UUID, and STRING could be
> > a numeric index
> >
> > > >
> > > >> 'mdev_create' creates device and 'mdev_start' is to commit resources of
> > > >> all instances of similar devices assigned to VM.
> > > >>
> > > >> For example, to create 2 devices:
> > > >> # echo "$UUID:0:params" > /sys/devices/../mdev_create
> > > >> # echo "$UUID:1:params" > /sys/devices/../mdev_create
> > > >>
> > > >> "$UUID-0" and "$UUID-1" devices are created.
> > > >>
> > > >> Commit resources for above devices with single 'mdev_start':
> > > >> # echo "$UUID" > /sys/class/mdev/mdev_start
> > > >>
> > > >> Considering $UUID to be a unique UUID of a device, we don't need
> > > >> 'instance', so 'mdev_create' would look like:
> > > >>
> > > >> # echo "$UUID1:params" > /sys/devices/../mdev_create
> > > >> # echo "$UUID2:params" > /sys/devices/../mdev_create
> > > >>
> > > >> where $UUID1 and $UUID2 would be mdev device's unique UUID and 'params'
> > > >> would be vendor specific parameters.
> > > >>
> > > >> Device nodes would be created as "$UUID1" and "$UUID"
> > > >>
> > > >> Then 'mdev_start' would be:
> > > >> # echo "$UUID1, $UUID2" > /sys/class/mdev/mdev_start
> > > >>
> > > >> Similarly 'mdev_stop' and 'mdev_destroy' would be:
> > > >>
> > > >> # echo "$UUID1, $UUID2" > /sys/class/mdev/mdev_stop
> > > >
> > > > I'm not sure a comma separated list makes sense here, for both
> > > > simplicity in the kernel and more fine grained error reporting, we
> > > > probably want to start/stop them individually.  Actually, why is it
> > > > that we can't use the mediated device being opened and released to
> > > > automatically signal to the backend vendor driver to commit and release
> > > > resources? I don't fully understand why userspace needs this interface.
> >
> > There is a meaningful use of start/stop interface, as required in live
> > migration support. Such interface allows vendor driver to quiescent
> > mdev activity on source device before mdev hardware state is snapshot,
> > and then resume mdev activity on dest device after its state is recovered.
> > Intel has implemented experimental live migration support in KVMGT (soon
> > to release), based on above two interfaces (plus another two to get/set
> > mdev state).
> >
> > > >
> > >
> > > For NVIDIA vGPU solution we need to know all devices assigned to a VM in
> > > one shot to commit resources of all vGPUs assigned to a VM along with
> > > some common resources.
> >
> > Kirti, can you elaborate the background about above one-shot commit
> > requirement? It's hard to understand such a requirement.
> >
> > As I relied in another mail, I really hope start/stop become a per-mdev
> > attribute instead of global one, e.g.:
> >
> > echo "0/1" > /sys/class/mdev/12345678-1234-1234-1234-123456789abc/start
> >
> > In many scenario the user space client may only want to talk to mdev
> > instance directly, w/o need to contact its parent device. Still take
> > live migration for example, I don't think Qemu wants to know parent
> > device of assigned mdev instances.
> 
> Hi Kevin,
> 
> Having a global /sys/class/mdev/mdev_start doesn't require anybody to know
> parent device. you can just do
> 
> echo "mdev_UUID" > /sys/class/mdev/mdev_start
> 
> or
> 
> echo "mdev_UUID" > /sys/class/mdev/mdev_stop
> 
> without knowing the parent device.
> 

You can look at some existing sysfs example, e.g.:

echo "0/1" > /sys/bus/cpu/devices/cpu1/online

You may also argue why not using a global style:

echo "cpu1" > /sys/bus/cpu/devices/cpu_online
echo "cpu1" > /sys/bus/cpu/devices/cpu_offline

There are many similar examples...

Thanks
Kevin
Tian, Kevin Aug. 16, 2016, 12:49 a.m. UTC | #18
> From: Neo Jia [mailto:cjia@nvidia.com]
> Sent: Tuesday, August 16, 2016 7:24 AM
> 
> > > >
> > > > > >
> > > > > > For NVIDIA vGPU solution we need to know all devices assigned to a VM in
> > > > > > one shot to commit resources of all vGPUs assigned to a VM along with
> > > > > > some common resources.
> > > > >
> > > > > Kirti, can you elaborate the background about above one-shot commit
> > > > > requirement? It's hard to understand such a requirement.
> > > >
> > > > Agree, I know NVIDIA isn't planning to support hotplug initially, but
> > > > this seems like we're precluding hotplug from the design.  I don't
> > > > understand what's driving this one-shot requirement.
> > >
> > > Hi Alex,
> > >
> > > The requirement here is based on how our internal vGPU device model designed and
> > > with this we are able to pre-allocate resources required for multiple virtual
> > > devices within same domain.
> > >
> > > And I don't think this syntax will stop us from supporting hotplug at all.
> > >
> > > For example, you can always create a virtual mdev and then do
> > >
> > > echo "mdev_UUID" > /sys/class/mdev/mdev_start
> > >
> > > then use QEMU monitor to add the device for hotplug.
> >
> > Hi Neo,
> >
> > I'm still not understanding the advantage you get from the "one-shot"
> > approach then if we can always add more mdevs by starting them later.
> > Are the hotplug mdevs somehow less capable than the initial set of
> > mdevs added in a single shot?  If the initial set is allocated
> > from the "same domain", does that give them some sort of hardware
> > locality/resource benefit?  Thanks,
> 
> Hi Alex,
> 
> At least we will not able to guarantee some special hardware resource for the
> hotplug devices.
> 
> So from our point of view, we also have dedicated internal SW entity to manage all
> virtual devices for each "domain/virtual machine", and such SW entity will be created
> at virtual device start time.
> 
> This is why we need to do this in one-shot to support multiple virtual device
> per VM case.
> 

Is pre-allocation of special hardware resource done one-time for all mdev instances?
Can it be done one-by-one as long as mdev is started early before VM is launched?

If such one-shot requirement is really required, it would be cleaner to me to
introduce a mdev group concept, so mdev instances with one-short start 
requirements can be put under a mdev group. Then you can do one-shot start
by:

echo "0/1" > /sys/class/mdev/group/0/start

Thanks
Kevin
Neo Jia Aug. 16, 2016, 3:45 a.m. UTC | #19
On Tue, Aug 16, 2016 at 12:30:25AM +0000, Tian, Kevin wrote:
> > From: Neo Jia [mailto:cjia@nvidia.com]
> > Sent: Tuesday, August 16, 2016 3:59 AM

> > > >
> > > > For NVIDIA vGPU solution we need to know all devices assigned to a VM in
> > > > one shot to commit resources of all vGPUs assigned to a VM along with
> > > > some common resources.
> > >
> > > Kirti, can you elaborate the background about above one-shot commit
> > > requirement? It's hard to understand such a requirement.
> > >
> > > As I relied in another mail, I really hope start/stop become a per-mdev
> > > attribute instead of global one, e.g.:
> > >
> > > echo "0/1" > /sys/class/mdev/12345678-1234-1234-1234-123456789abc/start
> > >
> > > In many scenario the user space client may only want to talk to mdev
> > > instance directly, w/o need to contact its parent device. Still take
> > > live migration for example, I don't think Qemu wants to know parent
> > > device of assigned mdev instances.
> > 
> > Hi Kevin,
> > 
> > Having a global /sys/class/mdev/mdev_start doesn't require anybody to know
> > parent device. you can just do
> > 
> > echo "mdev_UUID" > /sys/class/mdev/mdev_start
> > 
> > or
> > 
> > echo "mdev_UUID" > /sys/class/mdev/mdev_stop
> > 
> > without knowing the parent device.
> > 
> 
> You can look at some existing sysfs example, e.g.:
> 
> echo "0/1" > /sys/bus/cpu/devices/cpu1/online
> 
> You may also argue why not using a global style:
> 
> echo "cpu1" > /sys/bus/cpu/devices/cpu_online
> echo "cpu1" > /sys/bus/cpu/devices/cpu_offline
> 
> There are many similar examples...

Hi Kevin,

My response above is to your question about using the global sysfs entry as you
don't want to have the global path because

"I don't think Qemu wants to know parent device of assigned mdev instances.".

So I just want to confirm with you that (in case you miss):

    /sys/class/mdev/mdev_start | mdev_stop 

doesn't require the knowledge of parent device.

Thanks,
Neo

> 
> Thanks
> Kevin
Tian, Kevin Aug. 16, 2016, 3:50 a.m. UTC | #20
> From: Neo Jia [mailto:cjia@nvidia.com]
> Sent: Tuesday, August 16, 2016 11:46 AM
> 
> On Tue, Aug 16, 2016 at 12:30:25AM +0000, Tian, Kevin wrote:
> > > From: Neo Jia [mailto:cjia@nvidia.com]
> > > Sent: Tuesday, August 16, 2016 3:59 AM
> 
> > > > >
> > > > > For NVIDIA vGPU solution we need to know all devices assigned to a VM in
> > > > > one shot to commit resources of all vGPUs assigned to a VM along with
> > > > > some common resources.
> > > >
> > > > Kirti, can you elaborate the background about above one-shot commit
> > > > requirement? It's hard to understand such a requirement.
> > > >
> > > > As I relied in another mail, I really hope start/stop become a per-mdev
> > > > attribute instead of global one, e.g.:
> > > >
> > > > echo "0/1" > /sys/class/mdev/12345678-1234-1234-1234-123456789abc/start
> > > >
> > > > In many scenario the user space client may only want to talk to mdev
> > > > instance directly, w/o need to contact its parent device. Still take
> > > > live migration for example, I don't think Qemu wants to know parent
> > > > device of assigned mdev instances.
> > >
> > > Hi Kevin,
> > >
> > > Having a global /sys/class/mdev/mdev_start doesn't require anybody to know
> > > parent device. you can just do
> > >
> > > echo "mdev_UUID" > /sys/class/mdev/mdev_start
> > >
> > > or
> > >
> > > echo "mdev_UUID" > /sys/class/mdev/mdev_stop
> > >
> > > without knowing the parent device.
> > >
> >
> > You can look at some existing sysfs example, e.g.:
> >
> > echo "0/1" > /sys/bus/cpu/devices/cpu1/online
> >
> > You may also argue why not using a global style:
> >
> > echo "cpu1" > /sys/bus/cpu/devices/cpu_online
> > echo "cpu1" > /sys/bus/cpu/devices/cpu_offline
> >
> > There are many similar examples...
> 
> Hi Kevin,
> 
> My response above is to your question about using the global sysfs entry as you
> don't want to have the global path because
> 
> "I don't think Qemu wants to know parent device of assigned mdev instances.".
> 
> So I just want to confirm with you that (in case you miss):
> 
>     /sys/class/mdev/mdev_start | mdev_stop
> 
> doesn't require the knowledge of parent device.
> 

Qemu is just one example, where your explanation of parent device
makes sense but still it's not good for Qemu to populate /sys/class/mdev
directly. Qemu is passed with the actual sysfs path of assigned mdev
instance, so any mdev attributes touched by Qemu should be put under 
that node (e.g. start/stop for live migration usage as I explained earlier).

Thanks
Kevin
Neo Jia Aug. 16, 2016, 4:16 a.m. UTC | #21
On Tue, Aug 16, 2016 at 03:50:44AM +0000, Tian, Kevin wrote:
> > From: Neo Jia [mailto:cjia@nvidia.com]
> > Sent: Tuesday, August 16, 2016 11:46 AM
> > 
> > On Tue, Aug 16, 2016 at 12:30:25AM +0000, Tian, Kevin wrote:
> > > > From: Neo Jia [mailto:cjia@nvidia.com]
> > > > Sent: Tuesday, August 16, 2016 3:59 AM
> > 
> > > > > >
> > > > > > For NVIDIA vGPU solution we need to know all devices assigned to a VM in
> > > > > > one shot to commit resources of all vGPUs assigned to a VM along with
> > > > > > some common resources.
> > > > >
> > > > > Kirti, can you elaborate the background about above one-shot commit
> > > > > requirement? It's hard to understand such a requirement.
> > > > >
> > > > > As I relied in another mail, I really hope start/stop become a per-mdev
> > > > > attribute instead of global one, e.g.:
> > > > >
> > > > > echo "0/1" > /sys/class/mdev/12345678-1234-1234-1234-123456789abc/start
> > > > >
> > > > > In many scenario the user space client may only want to talk to mdev
> > > > > instance directly, w/o need to contact its parent device. Still take
> > > > > live migration for example, I don't think Qemu wants to know parent
> > > > > device of assigned mdev instances.
> > > >
> > > > Hi Kevin,
> > > >
> > > > Having a global /sys/class/mdev/mdev_start doesn't require anybody to know
> > > > parent device. you can just do
> > > >
> > > > echo "mdev_UUID" > /sys/class/mdev/mdev_start
> > > >
> > > > or
> > > >
> > > > echo "mdev_UUID" > /sys/class/mdev/mdev_stop
> > > >
> > > > without knowing the parent device.
> > > >
> > >
> > > You can look at some existing sysfs example, e.g.:
> > >
> > > echo "0/1" > /sys/bus/cpu/devices/cpu1/online
> > >
> > > You may also argue why not using a global style:
> > >
> > > echo "cpu1" > /sys/bus/cpu/devices/cpu_online
> > > echo "cpu1" > /sys/bus/cpu/devices/cpu_offline
> > >
> > > There are many similar examples...
> > 
> > Hi Kevin,
> > 
> > My response above is to your question about using the global sysfs entry as you
> > don't want to have the global path because
> > 
> > "I don't think Qemu wants to know parent device of assigned mdev instances.".
> > 
> > So I just want to confirm with you that (in case you miss):
> > 
> >     /sys/class/mdev/mdev_start | mdev_stop
> > 
> > doesn't require the knowledge of parent device.
> > 
> 
> Qemu is just one example, where your explanation of parent device
> makes sense but still it's not good for Qemu to populate /sys/class/mdev
> directly. Qemu is passed with the actual sysfs path of assigned mdev
> instance, so any mdev attributes touched by Qemu should be put under 
> that node (e.g. start/stop for live migration usage as I explained earlier).

Exactly, qemu is passed with the actual sysfs path.

So, QEMU doesn't touch the file /sys/class/mdev/mdev_start | mdev_stop at all.

QEMU will take the sysfs path as input:

 -device vfio-pci,sysfsdev=/sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0,id=vgpu0

As you are saying in live migration, QEMU needs to access "start" and "stop".  Could you 
please share more details, such as how QEMU access the "start" and "stop" sysfs,
when and what triggers that?

Thanks,
Neo

> 

> Thanks
> Kevin
Tian, Kevin Aug. 16, 2016, 4:52 a.m. UTC | #22
> From: Neo Jia [mailto:cjia@nvidia.com]
> Sent: Tuesday, August 16, 2016 12:17 PM
> 
> On Tue, Aug 16, 2016 at 03:50:44AM +0000, Tian, Kevin wrote:
> > > From: Neo Jia [mailto:cjia@nvidia.com]
> > > Sent: Tuesday, August 16, 2016 11:46 AM
> > >
> > > On Tue, Aug 16, 2016 at 12:30:25AM +0000, Tian, Kevin wrote:
> > > > > From: Neo Jia [mailto:cjia@nvidia.com]
> > > > > Sent: Tuesday, August 16, 2016 3:59 AM
> > >
> > > > > > >
> > > > > > > For NVIDIA vGPU solution we need to know all devices assigned to a VM in
> > > > > > > one shot to commit resources of all vGPUs assigned to a VM along with
> > > > > > > some common resources.
> > > > > >
> > > > > > Kirti, can you elaborate the background about above one-shot commit
> > > > > > requirement? It's hard to understand such a requirement.
> > > > > >
> > > > > > As I relied in another mail, I really hope start/stop become a per-mdev
> > > > > > attribute instead of global one, e.g.:
> > > > > >
> > > > > > echo "0/1" >
> /sys/class/mdev/12345678-1234-1234-1234-123456789abc/start
> > > > > >
> > > > > > In many scenario the user space client may only want to talk to mdev
> > > > > > instance directly, w/o need to contact its parent device. Still take
> > > > > > live migration for example, I don't think Qemu wants to know parent
> > > > > > device of assigned mdev instances.
> > > > >
> > > > > Hi Kevin,
> > > > >
> > > > > Having a global /sys/class/mdev/mdev_start doesn't require anybody to know
> > > > > parent device. you can just do
> > > > >
> > > > > echo "mdev_UUID" > /sys/class/mdev/mdev_start
> > > > >
> > > > > or
> > > > >
> > > > > echo "mdev_UUID" > /sys/class/mdev/mdev_stop
> > > > >
> > > > > without knowing the parent device.
> > > > >
> > > >
> > > > You can look at some existing sysfs example, e.g.:
> > > >
> > > > echo "0/1" > /sys/bus/cpu/devices/cpu1/online
> > > >
> > > > You may also argue why not using a global style:
> > > >
> > > > echo "cpu1" > /sys/bus/cpu/devices/cpu_online
> > > > echo "cpu1" > /sys/bus/cpu/devices/cpu_offline
> > > >
> > > > There are many similar examples...
> > >
> > > Hi Kevin,
> > >
> > > My response above is to your question about using the global sysfs entry as you
> > > don't want to have the global path because
> > >
> > > "I don't think Qemu wants to know parent device of assigned mdev instances.".
> > >
> > > So I just want to confirm with you that (in case you miss):
> > >
> > >     /sys/class/mdev/mdev_start | mdev_stop
> > >
> > > doesn't require the knowledge of parent device.
> > >
> >
> > Qemu is just one example, where your explanation of parent device
> > makes sense but still it's not good for Qemu to populate /sys/class/mdev
> > directly. Qemu is passed with the actual sysfs path of assigned mdev
> > instance, so any mdev attributes touched by Qemu should be put under
> > that node (e.g. start/stop for live migration usage as I explained earlier).
> 
> Exactly, qemu is passed with the actual sysfs path.
> 
> So, QEMU doesn't touch the file /sys/class/mdev/mdev_start | mdev_stop at all.
> 
> QEMU will take the sysfs path as input:
> 
>  -device
> vfio-pci,sysfsdev=/sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0,i
> d=vgpu0

no need of passing "id=vgpu0" here. If necessary you can put id as an attribute 
under sysfs mdev node:

/sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0/id

> 
> As you are saying in live migration, QEMU needs to access "start" and "stop".  Could you
> please share more details, such as how QEMU access the "start" and "stop" sysfs,
> when and what triggers that?
> 

A conceptual flow as below:

1. Quiescent mdev activity on the parent device (e.g. stop scheduling, wait for
in-flight DMA completed, etc.)

echo "0" > /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0/start

2. Save mdev state:

cat /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0/state > xxx

3. xxx will be part of the final VM image and copied to a new machine

4. Allocate/prepare mdev on the new machine for this VM

5. Restore mdev state:

cat xxx > /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0/state
(might be a different path name)

6. start mdev on the new parent device:

echo "1" > /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0/start

Thanks
Kevin
Neo Jia Aug. 16, 2016, 5:43 a.m. UTC | #23
On Tue, Aug 16, 2016 at 04:52:30AM +0000, Tian, Kevin wrote:
> > From: Neo Jia [mailto:cjia@nvidia.com]
> > Sent: Tuesday, August 16, 2016 12:17 PM
> > 
> > On Tue, Aug 16, 2016 at 03:50:44AM +0000, Tian, Kevin wrote:
> > > > From: Neo Jia [mailto:cjia@nvidia.com]
> > > > Sent: Tuesday, August 16, 2016 11:46 AM
> > > >
> > > > On Tue, Aug 16, 2016 at 12:30:25AM +0000, Tian, Kevin wrote:
> > > > > > From: Neo Jia [mailto:cjia@nvidia.com]
> > > > > > Sent: Tuesday, August 16, 2016 3:59 AM
> > > >
> > > > > > > >
> > > > > > > > For NVIDIA vGPU solution we need to know all devices assigned to a VM in
> > > > > > > > one shot to commit resources of all vGPUs assigned to a VM along with
> > > > > > > > some common resources.
> > > > > > >
> > > > > > > Kirti, can you elaborate the background about above one-shot commit
> > > > > > > requirement? It's hard to understand such a requirement.
> > > > > > >
> > > > > > > As I relied in another mail, I really hope start/stop become a per-mdev
> > > > > > > attribute instead of global one, e.g.:
> > > > > > >
> > > > > > > echo "0/1" >
> > /sys/class/mdev/12345678-1234-1234-1234-123456789abc/start
> > > > > > >
> > > > > > > In many scenario the user space client may only want to talk to mdev
> > > > > > > instance directly, w/o need to contact its parent device. Still take
> > > > > > > live migration for example, I don't think Qemu wants to know parent
> > > > > > > device of assigned mdev instances.
> > > > > >
> > > > > > Hi Kevin,
> > > > > >
> > > > > > Having a global /sys/class/mdev/mdev_start doesn't require anybody to know
> > > > > > parent device. you can just do
> > > > > >
> > > > > > echo "mdev_UUID" > /sys/class/mdev/mdev_start
> > > > > >
> > > > > > or
> > > > > >
> > > > > > echo "mdev_UUID" > /sys/class/mdev/mdev_stop
> > > > > >
> > > > > > without knowing the parent device.
> > > > > >
> > > > >
> > > > > You can look at some existing sysfs example, e.g.:
> > > > >
> > > > > echo "0/1" > /sys/bus/cpu/devices/cpu1/online
> > > > >
> > > > > You may also argue why not using a global style:
> > > > >
> > > > > echo "cpu1" > /sys/bus/cpu/devices/cpu_online
> > > > > echo "cpu1" > /sys/bus/cpu/devices/cpu_offline
> > > > >
> > > > > There are many similar examples...
> > > >
> > > > Hi Kevin,
> > > >
> > > > My response above is to your question about using the global sysfs entry as you
> > > > don't want to have the global path because
> > > >
> > > > "I don't think Qemu wants to know parent device of assigned mdev instances.".
> > > >
> > > > So I just want to confirm with you that (in case you miss):
> > > >
> > > >     /sys/class/mdev/mdev_start | mdev_stop
> > > >
> > > > doesn't require the knowledge of parent device.
> > > >
> > >
> > > Qemu is just one example, where your explanation of parent device
> > > makes sense but still it's not good for Qemu to populate /sys/class/mdev
> > > directly. Qemu is passed with the actual sysfs path of assigned mdev
> > > instance, so any mdev attributes touched by Qemu should be put under
> > > that node (e.g. start/stop for live migration usage as I explained earlier).
> > 
> > Exactly, qemu is passed with the actual sysfs path.
> > 
> > So, QEMU doesn't touch the file /sys/class/mdev/mdev_start | mdev_stop at all.
> > 
> > QEMU will take the sysfs path as input:
> > 
> >  -device
> > vfio-pci,sysfsdev=/sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0,i
> > d=vgpu0
> 
> no need of passing "id=vgpu0" here. If necessary you can put id as an attribute 
> under sysfs mdev node:
> 
> /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0/id

I think we have moved away from the device index based on Alex's comment, so the
device path will be:

 /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818

> 
> > 
> > As you are saying in live migration, QEMU needs to access "start" and "stop".  Could you
> > please share more details, such as how QEMU access the "start" and "stop" sysfs,
> > when and what triggers that?
> > 
> 
> A conceptual flow as below:
> 
> 1. Quiescent mdev activity on the parent device (e.g. stop scheduling, wait for
> in-flight DMA completed, etc.)
> 
> echo "0" > /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0/start
> 
> 2. Save mdev state:
> 
> cat /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0/state > xxx
> 
> 3. xxx will be part of the final VM image and copied to a new machine
> 
> 4. Allocate/prepare mdev on the new machine for this VM
> 
> 5. Restore mdev state:
> 
> cat xxx > /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0/state
> (might be a different path name)
> 
> 6. start mdev on the new parent device:
> 
> echo "1" > /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0/start

Thanks for the sequence, so based on above live migration, the access of "start/stop"
are from other user space program not QEMU process.

(Just to be clear, I am not saying that I won't consider your suggestion of 
accommodating the "start/stop" file from global to mdev node, but I do want to point 
out that keeping them inside global shouldn't impact your live migration sequence 
above.)

Thanks,
Neo

> 
> Thanks
> Kevin
Tian, Kevin Aug. 16, 2016, 5:58 a.m. UTC | #24
> From: Neo Jia [mailto:cjia@nvidia.com]
> Sent: Tuesday, August 16, 2016 1:44 PM
> 
> On Tue, Aug 16, 2016 at 04:52:30AM +0000, Tian, Kevin wrote:
> > > From: Neo Jia [mailto:cjia@nvidia.com]
> > > Sent: Tuesday, August 16, 2016 12:17 PM
> > >
> > > On Tue, Aug 16, 2016 at 03:50:44AM +0000, Tian, Kevin wrote:
> > > > > From: Neo Jia [mailto:cjia@nvidia.com]
> > > > > Sent: Tuesday, August 16, 2016 11:46 AM
> > > > >
> > > > > On Tue, Aug 16, 2016 at 12:30:25AM +0000, Tian, Kevin wrote:
> > > > > > > From: Neo Jia [mailto:cjia@nvidia.com]
> > > > > > > Sent: Tuesday, August 16, 2016 3:59 AM
> > > > >
> > > > > > > > >
> > > > > > > > > For NVIDIA vGPU solution we need to know all devices assigned to a VM
> in
> > > > > > > > > one shot to commit resources of all vGPUs assigned to a VM along with
> > > > > > > > > some common resources.
> > > > > > > >
> > > > > > > > Kirti, can you elaborate the background about above one-shot commit
> > > > > > > > requirement? It's hard to understand such a requirement.
> > > > > > > >
> > > > > > > > As I relied in another mail, I really hope start/stop become a per-mdev
> > > > > > > > attribute instead of global one, e.g.:
> > > > > > > >
> > > > > > > > echo "0/1" >
> > > /sys/class/mdev/12345678-1234-1234-1234-123456789abc/start
> > > > > > > >
> > > > > > > > In many scenario the user space client may only want to talk to mdev
> > > > > > > > instance directly, w/o need to contact its parent device. Still take
> > > > > > > > live migration for example, I don't think Qemu wants to know parent
> > > > > > > > device of assigned mdev instances.
> > > > > > >
> > > > > > > Hi Kevin,
> > > > > > >
> > > > > > > Having a global /sys/class/mdev/mdev_start doesn't require anybody to
> know
> > > > > > > parent device. you can just do
> > > > > > >
> > > > > > > echo "mdev_UUID" > /sys/class/mdev/mdev_start
> > > > > > >
> > > > > > > or
> > > > > > >
> > > > > > > echo "mdev_UUID" > /sys/class/mdev/mdev_stop
> > > > > > >
> > > > > > > without knowing the parent device.
> > > > > > >
> > > > > >
> > > > > > You can look at some existing sysfs example, e.g.:
> > > > > >
> > > > > > echo "0/1" > /sys/bus/cpu/devices/cpu1/online
> > > > > >
> > > > > > You may also argue why not using a global style:
> > > > > >
> > > > > > echo "cpu1" > /sys/bus/cpu/devices/cpu_online
> > > > > > echo "cpu1" > /sys/bus/cpu/devices/cpu_offline
> > > > > >
> > > > > > There are many similar examples...
> > > > >
> > > > > Hi Kevin,
> > > > >
> > > > > My response above is to your question about using the global sysfs entry as you
> > > > > don't want to have the global path because
> > > > >
> > > > > "I don't think Qemu wants to know parent device of assigned mdev instances.".
> > > > >
> > > > > So I just want to confirm with you that (in case you miss):
> > > > >
> > > > >     /sys/class/mdev/mdev_start | mdev_stop
> > > > >
> > > > > doesn't require the knowledge of parent device.
> > > > >
> > > >
> > > > Qemu is just one example, where your explanation of parent device
> > > > makes sense but still it's not good for Qemu to populate /sys/class/mdev
> > > > directly. Qemu is passed with the actual sysfs path of assigned mdev
> > > > instance, so any mdev attributes touched by Qemu should be put under
> > > > that node (e.g. start/stop for live migration usage as I explained earlier).
> > >
> > > Exactly, qemu is passed with the actual sysfs path.
> > >
> > > So, QEMU doesn't touch the file /sys/class/mdev/mdev_start | mdev_stop at all.
> > >
> > > QEMU will take the sysfs path as input:
> > >
> > >  -device
> > >
> vfio-pci,sysfsdev=/sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0,i
> > > d=vgpu0
> >
> > no need of passing "id=vgpu0" here. If necessary you can put id as an attribute
> > under sysfs mdev node:
> >
> > /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0/id
> 
> I think we have moved away from the device index based on Alex's comment, so the
> device path will be:
> 
>  /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818

pass /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818
as parameter, and then Qemu can access 'id' under that path. You
don't need to pass a separate 'id' field. That's my point.


> 
> >
> > >
> > > As you are saying in live migration, QEMU needs to access "start" and "stop".  Could
> you
> > > please share more details, such as how QEMU access the "start" and "stop" sysfs,
> > > when and what triggers that?
> > >
> >
> > A conceptual flow as below:
> >
> > 1. Quiescent mdev activity on the parent device (e.g. stop scheduling, wait for
> > in-flight DMA completed, etc.)
> >
> > echo "0" > /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0/start
> >
> > 2. Save mdev state:
> >
> > cat /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0/state > xxx
> >
> > 3. xxx will be part of the final VM image and copied to a new machine
> >
> > 4. Allocate/prepare mdev on the new machine for this VM
> >
> > 5. Restore mdev state:
> >
> > cat xxx > /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0/state
> > (might be a different path name)
> >
> > 6. start mdev on the new parent device:
> >
> > echo "1" > /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0/start
> 
> Thanks for the sequence, so based on above live migration, the access of "start/stop"
> are from other user space program not QEMU process.
> 
> (Just to be clear, I am not saying that I won't consider your suggestion of
> accommodating the "start/stop" file from global to mdev node, but I do want to point
> out that keeping them inside global shouldn't impact your live migration sequence
> above.)
> 

come on... I just use above bash command to show the step. Qemu itself definitely
needs to open the sysfs file descriptor and then read/write the fd... If we use VFIO
API as example, it might be more obvious. :-)

Thanks
Kevin
Neo Jia Aug. 16, 2016, 6:13 a.m. UTC | #25
On Tue, Aug 16, 2016 at 05:58:54AM +0000, Tian, Kevin wrote:
> > From: Neo Jia [mailto:cjia@nvidia.com]
> > Sent: Tuesday, August 16, 2016 1:44 PM
> > 
> > On Tue, Aug 16, 2016 at 04:52:30AM +0000, Tian, Kevin wrote:
> > > > From: Neo Jia [mailto:cjia@nvidia.com]
> > > > Sent: Tuesday, August 16, 2016 12:17 PM
> > > >
> > > > On Tue, Aug 16, 2016 at 03:50:44AM +0000, Tian, Kevin wrote:
> > > > > > From: Neo Jia [mailto:cjia@nvidia.com]
> > > > > > Sent: Tuesday, August 16, 2016 11:46 AM
> > > > > >
> > > > > > On Tue, Aug 16, 2016 at 12:30:25AM +0000, Tian, Kevin wrote:
> > > > > > > > From: Neo Jia [mailto:cjia@nvidia.com]
> > > > > > > > Sent: Tuesday, August 16, 2016 3:59 AM
> > > > > >
> > > > > > > > > >
> > > > > > > > > > For NVIDIA vGPU solution we need to know all devices assigned to a VM
> > in
> > > > > > > > > > one shot to commit resources of all vGPUs assigned to a VM along with
> > > > > > > > > > some common resources.
> > > > > > > > >
> > > > > > > > > Kirti, can you elaborate the background about above one-shot commit
> > > > > > > > > requirement? It's hard to understand such a requirement.
> > > > > > > > >
> > > > > > > > > As I relied in another mail, I really hope start/stop become a per-mdev
> > > > > > > > > attribute instead of global one, e.g.:
> > > > > > > > >
> > > > > > > > > echo "0/1" >
> > > > /sys/class/mdev/12345678-1234-1234-1234-123456789abc/start
> > > > > > > > >
> > > > > > > > > In many scenario the user space client may only want to talk to mdev
> > > > > > > > > instance directly, w/o need to contact its parent device. Still take
> > > > > > > > > live migration for example, I don't think Qemu wants to know parent
> > > > > > > > > device of assigned mdev instances.
> > > > > > > >
> > > > > > > > Hi Kevin,
> > > > > > > >
> > > > > > > > Having a global /sys/class/mdev/mdev_start doesn't require anybody to
> > know
> > > > > > > > parent device. you can just do
> > > > > > > >
> > > > > > > > echo "mdev_UUID" > /sys/class/mdev/mdev_start
> > > > > > > >
> > > > > > > > or
> > > > > > > >
> > > > > > > > echo "mdev_UUID" > /sys/class/mdev/mdev_stop
> > > > > > > >
> > > > > > > > without knowing the parent device.
> > > > > > > >
> > > > > > >
> > > > > > > You can look at some existing sysfs example, e.g.:
> > > > > > >
> > > > > > > echo "0/1" > /sys/bus/cpu/devices/cpu1/online
> > > > > > >
> > > > > > > You may also argue why not using a global style:
> > > > > > >
> > > > > > > echo "cpu1" > /sys/bus/cpu/devices/cpu_online
> > > > > > > echo "cpu1" > /sys/bus/cpu/devices/cpu_offline
> > > > > > >
> > > > > > > There are many similar examples...
> > > > > >
> > > > > > Hi Kevin,
> > > > > >
> > > > > > My response above is to your question about using the global sysfs entry as you
> > > > > > don't want to have the global path because
> > > > > >
> > > > > > "I don't think Qemu wants to know parent device of assigned mdev instances.".
> > > > > >
> > > > > > So I just want to confirm with you that (in case you miss):
> > > > > >
> > > > > >     /sys/class/mdev/mdev_start | mdev_stop
> > > > > >
> > > > > > doesn't require the knowledge of parent device.
> > > > > >
> > > > >
> > > > > Qemu is just one example, where your explanation of parent device
> > > > > makes sense but still it's not good for Qemu to populate /sys/class/mdev
> > > > > directly. Qemu is passed with the actual sysfs path of assigned mdev
> > > > > instance, so any mdev attributes touched by Qemu should be put under
> > > > > that node (e.g. start/stop for live migration usage as I explained earlier).
> > > >
> > > > Exactly, qemu is passed with the actual sysfs path.
> > > >
> > > > So, QEMU doesn't touch the file /sys/class/mdev/mdev_start | mdev_stop at all.
> > > >
> > > > QEMU will take the sysfs path as input:
> > > >
> > > >  -device
> > > >
> > vfio-pci,sysfsdev=/sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0,i
> > > > d=vgpu0
> > >
> > > no need of passing "id=vgpu0" here. If necessary you can put id as an attribute
> > > under sysfs mdev node:
> > >
> > > /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0/id
> > 
> > I think we have moved away from the device index based on Alex's comment, so the
> > device path will be:
> > 
> >  /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818
> 
> pass /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818
> as parameter, and then Qemu can access 'id' under that path. You
> don't need to pass a separate 'id' field. That's my point.
> 
> 
> > 
> > >
> > > >
> > > > As you are saying in live migration, QEMU needs to access "start" and "stop".  Could
> > you
> > > > please share more details, such as how QEMU access the "start" and "stop" sysfs,
> > > > when and what triggers that?
> > > >
> > >
> > > A conceptual flow as below:
> > >
> > > 1. Quiescent mdev activity on the parent device (e.g. stop scheduling, wait for
> > > in-flight DMA completed, etc.)
> > >
> > > echo "0" > /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0/start
> > >
> > > 2. Save mdev state:
> > >
> > > cat /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0/state > xxx
> > >
> > > 3. xxx will be part of the final VM image and copied to a new machine
> > >
> > > 4. Allocate/prepare mdev on the new machine for this VM
> > >
> > > 5. Restore mdev state:
> > >
> > > cat xxx > /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0/state
> > > (might be a different path name)
> > >
> > > 6. start mdev on the new parent device:
> > >
> > > echo "1" > /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0/start
> > 
> > Thanks for the sequence, so based on above live migration, the access of "start/stop"
> > are from other user space program not QEMU process.
> > 
> > (Just to be clear, I am not saying that I won't consider your suggestion of
> > accommodating the "start/stop" file from global to mdev node, but I do want to point
> > out that keeping them inside global shouldn't impact your live migration sequence
> > above.)
> > 
> 
> come on... I just use above bash command to show the step. Qemu itself definitely
> needs to open the sysfs file descriptor and then read/write the fd... If we use VFIO
> API as example, it might be more obvious. :-)

Hi Kevin,

I am not just picking on this example.

The only fd that QEMU will access is the VFIO device fd (other than those
container/IOMMU type1 stuff), and there is no changes required for QEMU.

Thanks,
Neo

> 
> Thanks
> Kevin
Alex Williamson Aug. 16, 2016, 12:49 p.m. UTC | #26
On Tue, 16 Aug 2016 04:52:30 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:

> > From: Neo Jia [mailto:cjia@nvidia.com]
> > Sent: Tuesday, August 16, 2016 12:17 PM
> > 
> > On Tue, Aug 16, 2016 at 03:50:44AM +0000, Tian, Kevin wrote:  
> > > > From: Neo Jia [mailto:cjia@nvidia.com]
> > > > Sent: Tuesday, August 16, 2016 11:46 AM
> > > >
> > > > On Tue, Aug 16, 2016 at 12:30:25AM +0000, Tian, Kevin wrote:  
> > > > > > From: Neo Jia [mailto:cjia@nvidia.com]
> > > > > > Sent: Tuesday, August 16, 2016 3:59 AM  
> > > >  
> > > > > > > >
> > > > > > > > For NVIDIA vGPU solution we need to know all devices assigned to a VM in
> > > > > > > > one shot to commit resources of all vGPUs assigned to a VM along with
> > > > > > > > some common resources.  
> > > > > > >
> > > > > > > Kirti, can you elaborate the background about above one-shot commit
> > > > > > > requirement? It's hard to understand such a requirement.
> > > > > > >
> > > > > > > As I relied in another mail, I really hope start/stop become a per-mdev
> > > > > > > attribute instead of global one, e.g.:
> > > > > > >
> > > > > > > echo "0/1" >  
> > /sys/class/mdev/12345678-1234-1234-1234-123456789abc/start  
> > > > > > >
> > > > > > > In many scenario the user space client may only want to talk to mdev
> > > > > > > instance directly, w/o need to contact its parent device. Still take
> > > > > > > live migration for example, I don't think Qemu wants to know parent
> > > > > > > device of assigned mdev instances.  
> > > > > >
> > > > > > Hi Kevin,
> > > > > >
> > > > > > Having a global /sys/class/mdev/mdev_start doesn't require anybody to know
> > > > > > parent device. you can just do
> > > > > >
> > > > > > echo "mdev_UUID" > /sys/class/mdev/mdev_start
> > > > > >
> > > > > > or
> > > > > >
> > > > > > echo "mdev_UUID" > /sys/class/mdev/mdev_stop
> > > > > >
> > > > > > without knowing the parent device.
> > > > > >  
> > > > >
> > > > > You can look at some existing sysfs example, e.g.:
> > > > >
> > > > > echo "0/1" > /sys/bus/cpu/devices/cpu1/online
> > > > >
> > > > > You may also argue why not using a global style:
> > > > >
> > > > > echo "cpu1" > /sys/bus/cpu/devices/cpu_online
> > > > > echo "cpu1" > /sys/bus/cpu/devices/cpu_offline
> > > > >
> > > > > There are many similar examples...  
> > > >
> > > > Hi Kevin,
> > > >
> > > > My response above is to your question about using the global sysfs entry as you
> > > > don't want to have the global path because
> > > >
> > > > "I don't think Qemu wants to know parent device of assigned mdev instances.".
> > > >
> > > > So I just want to confirm with you that (in case you miss):
> > > >
> > > >     /sys/class/mdev/mdev_start | mdev_stop
> > > >
> > > > doesn't require the knowledge of parent device.
> > > >  
> > >
> > > Qemu is just one example, where your explanation of parent device
> > > makes sense but still it's not good for Qemu to populate /sys/class/mdev
> > > directly. Qemu is passed with the actual sysfs path of assigned mdev
> > > instance, so any mdev attributes touched by Qemu should be put under
> > > that node (e.g. start/stop for live migration usage as I explained earlier).  
> > 
> > Exactly, qemu is passed with the actual sysfs path.
> > 
> > So, QEMU doesn't touch the file /sys/class/mdev/mdev_start | mdev_stop at all.
> > 
> > QEMU will take the sysfs path as input:
> > 
> >  -device
> > vfio-pci,sysfsdev=/sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0,i
> > d=vgpu0  
> 
> no need of passing "id=vgpu0" here. If necessary you can put id as an attribute 
> under sysfs mdev node:
> 
> /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0/id

QEMU needs an id parameter for devices, libvirt gives devices arbitrary
names, typically hostdev# for assigned devices.  This id is used to
reference the device for hmp/qmp commands.  This is not something the
mdev infrastructure should define.  Thanks,

Alex
Neo Jia Aug. 16, 2016, 8:30 p.m. UTC | #27
On Mon, Aug 15, 2016 at 04:47:41PM -0600, Alex Williamson wrote:
> On Mon, 15 Aug 2016 12:59:08 -0700
> Neo Jia <cjia@nvidia.com> wrote:
> 
> > > > >
> > > > > I'm not sure a comma separated list makes sense here, for both
> > > > > simplicity in the kernel and more fine grained error reporting, we
> > > > > probably want to start/stop them individually.  Actually, why is it
> > > > > that we can't use the mediated device being opened and released to
> > > > > automatically signal to the backend vendor driver to commit and release
> > > > > resources? I don't fully understand why userspace needs this interface.  
> > > 
> 
> That doesn't give an individual user the ability to stop and start
> their devices though, because in order for a user to have write
> permissions there, they get permission to DoS other users by pumping
> arbitrary UUIDs into those files.  By placing start/stop per mdev, we
> have mdev level granularity of granting start/stop privileges.  Really
> though, do we want QEMU fumbling around through sysfs or do we want an
> interface through the vfio API to perform start/stop?  Thanks,

Hi Alex,

I think those two suggests make sense, so we will move the "start/stop"
under mdev sysfs. 

This will be incorporated in our next v7 patch and by doing that, it will make
the locking scheme easier.

Thanks,
Neo

> 
> Alex
Alex Williamson Aug. 16, 2016, 8:51 p.m. UTC | #28
On Tue, 16 Aug 2016 13:30:06 -0700
Neo Jia <cjia@nvidia.com> wrote:

> On Mon, Aug 15, 2016 at 04:47:41PM -0600, Alex Williamson wrote:
> > On Mon, 15 Aug 2016 12:59:08 -0700
> > Neo Jia <cjia@nvidia.com> wrote:
> >   
> > > > > >
> > > > > > I'm not sure a comma separated list makes sense here, for both
> > > > > > simplicity in the kernel and more fine grained error reporting, we
> > > > > > probably want to start/stop them individually.  Actually, why is it
> > > > > > that we can't use the mediated device being opened and released to
> > > > > > automatically signal to the backend vendor driver to commit and release
> > > > > > resources? I don't fully understand why userspace needs this interface.    
> > > >   
> > 
> > That doesn't give an individual user the ability to stop and start
> > their devices though, because in order for a user to have write
> > permissions there, they get permission to DoS other users by pumping
> > arbitrary UUIDs into those files.  By placing start/stop per mdev, we
> > have mdev level granularity of granting start/stop privileges.  Really
> > though, do we want QEMU fumbling around through sysfs or do we want an
> > interface through the vfio API to perform start/stop?  Thanks,  
> 
> Hi Alex,
> 
> I think those two suggests make sense, so we will move the "start/stop"
> under mdev sysfs. 
> 
> This will be incorporated in our next v7 patch and by doing that, it will make
> the locking scheme easier.

Thanks Neo.  Also note that the semantics change when we move to per
device control.  It would be redundant to 'echo $UUID' into a start
file which only controls a single device.  So that means we probably
just want an 'echo 1'.  But if we can 'echo 1' then we can also 'echo
0', so we can reduce this to a single sysfs file.  Sysfs already has a
common interface for starting and stopping devices, the "online" file.
So I think we should probably move in that direction.  Additionally, an
"online" file should support a _show() function, so if we have an Intel
vGPU that perhaps does not need start/stop support, online could report
"1" after create to show that it's already online, possibly even
generate an error trying to change the online state.  Thanks,

Alex
Alex Williamson Aug. 16, 2016, 9:03 p.m. UTC | #29
On Mon, 15 Aug 2016 23:13:20 -0700
Neo Jia <cjia@nvidia.com> wrote:

> On Tue, Aug 16, 2016 at 05:58:54AM +0000, Tian, Kevin wrote:
> > > From: Neo Jia [mailto:cjia@nvidia.com]
> > > Sent: Tuesday, August 16, 2016 1:44 PM
> > > 
> > > On Tue, Aug 16, 2016 at 04:52:30AM +0000, Tian, Kevin wrote:  
> > > > > From: Neo Jia [mailto:cjia@nvidia.com]
> > > > > Sent: Tuesday, August 16, 2016 12:17 PM
> > > > >
> > > > > On Tue, Aug 16, 2016 at 03:50:44AM +0000, Tian, Kevin wrote:  
> > > > > > > From: Neo Jia [mailto:cjia@nvidia.com]
> > > > > > > Sent: Tuesday, August 16, 2016 11:46 AM
> > > > > > >
> > > > > > > On Tue, Aug 16, 2016 at 12:30:25AM +0000, Tian, Kevin wrote:  
> > > > > > > > > From: Neo Jia [mailto:cjia@nvidia.com]
> > > > > > > > > Sent: Tuesday, August 16, 2016 3:59 AM  
> > > > > > >  
> > > > > > > > > > >
> > > > > > > > > > > For NVIDIA vGPU solution we need to know all devices assigned to a VM  
> > > in  
> > > > > > > > > > > one shot to commit resources of all vGPUs assigned to a VM along with
> > > > > > > > > > > some common resources.  
> > > > > > > > > >
> > > > > > > > > > Kirti, can you elaborate the background about above one-shot commit
> > > > > > > > > > requirement? It's hard to understand such a requirement.
> > > > > > > > > >
> > > > > > > > > > As I relied in another mail, I really hope start/stop become a per-mdev
> > > > > > > > > > attribute instead of global one, e.g.:
> > > > > > > > > >
> > > > > > > > > > echo "0/1" >  
> > > > > /sys/class/mdev/12345678-1234-1234-1234-123456789abc/start  
> > > > > > > > > >
> > > > > > > > > > In many scenario the user space client may only want to talk to mdev
> > > > > > > > > > instance directly, w/o need to contact its parent device. Still take
> > > > > > > > > > live migration for example, I don't think Qemu wants to know parent
> > > > > > > > > > device of assigned mdev instances.  
> > > > > > > > >
> > > > > > > > > Hi Kevin,
> > > > > > > > >
> > > > > > > > > Having a global /sys/class/mdev/mdev_start doesn't require anybody to  
> > > know  
> > > > > > > > > parent device. you can just do
> > > > > > > > >
> > > > > > > > > echo "mdev_UUID" > /sys/class/mdev/mdev_start
> > > > > > > > >
> > > > > > > > > or
> > > > > > > > >
> > > > > > > > > echo "mdev_UUID" > /sys/class/mdev/mdev_stop
> > > > > > > > >
> > > > > > > > > without knowing the parent device.
> > > > > > > > >  
> > > > > > > >
> > > > > > > > You can look at some existing sysfs example, e.g.:
> > > > > > > >
> > > > > > > > echo "0/1" > /sys/bus/cpu/devices/cpu1/online
> > > > > > > >
> > > > > > > > You may also argue why not using a global style:
> > > > > > > >
> > > > > > > > echo "cpu1" > /sys/bus/cpu/devices/cpu_online
> > > > > > > > echo "cpu1" > /sys/bus/cpu/devices/cpu_offline
> > > > > > > >
> > > > > > > > There are many similar examples...  
> > > > > > >
> > > > > > > Hi Kevin,
> > > > > > >
> > > > > > > My response above is to your question about using the global sysfs entry as you
> > > > > > > don't want to have the global path because
> > > > > > >
> > > > > > > "I don't think Qemu wants to know parent device of assigned mdev instances.".
> > > > > > >
> > > > > > > So I just want to confirm with you that (in case you miss):
> > > > > > >
> > > > > > >     /sys/class/mdev/mdev_start | mdev_stop
> > > > > > >
> > > > > > > doesn't require the knowledge of parent device.
> > > > > > >  
> > > > > >
> > > > > > Qemu is just one example, where your explanation of parent device
> > > > > > makes sense but still it's not good for Qemu to populate /sys/class/mdev
> > > > > > directly. Qemu is passed with the actual sysfs path of assigned mdev
> > > > > > instance, so any mdev attributes touched by Qemu should be put under
> > > > > > that node (e.g. start/stop for live migration usage as I explained earlier).  
> > > > >
> > > > > Exactly, qemu is passed with the actual sysfs path.
> > > > >
> > > > > So, QEMU doesn't touch the file /sys/class/mdev/mdev_start | mdev_stop at all.
> > > > >
> > > > > QEMU will take the sysfs path as input:
> > > > >
> > > > >  -device
> > > > >  
> > > vfio-pci,sysfsdev=/sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0,i  
> > > > > d=vgpu0  
> > > >
> > > > no need of passing "id=vgpu0" here. If necessary you can put id as an attribute
> > > > under sysfs mdev node:
> > > >
> > > > /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0/id  
> > > 
> > > I think we have moved away from the device index based on Alex's comment, so the
> > > device path will be:
> > > 
> > >  /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818  
> > 
> > pass /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818
> > as parameter, and then Qemu can access 'id' under that path. You
> > don't need to pass a separate 'id' field. That's my point.

As noted in previous reply, id is a QEMU device parameter, it's not a
property of the mdev device.  I'd also caution against adding arbitrary
sysfs files with the expectation that QEMU will be able to manipulate
them.  I believe one of the benefits of vfio vs legacy KVM device
assignment is that vfio is self contained through the vfio API.  Each
time we expect QEMU to interact with the system via sysfs, that's a new
file that libvirt needs to add access to and a new attack vector where
we need to worry about security.  I still think it's the right idea
from a sysfs perspective to move to per device files, or an "online"
file to replace both, but let's be a little more strategic how we
expect QEMU to interact with the device.

> > > > > As you are saying in live migration, QEMU needs to access "start" and "stop".  Could  
> > > you  
> > > > > please share more details, such as how QEMU access the "start" and "stop" sysfs,
> > > > > when and what triggers that?
> > > > >  
> > > >
> > > > A conceptual flow as below:
> > > >
> > > > 1. Quiescent mdev activity on the parent device (e.g. stop scheduling, wait for
> > > > in-flight DMA completed, etc.)
> > > >
> > > > echo "0" > /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0/start
> > > >
> > > > 2. Save mdev state:
> > > >
> > > > cat /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0/state > xxx
> > > >
> > > > 3. xxx will be part of the final VM image and copied to a new machine
> > > >
> > > > 4. Allocate/prepare mdev on the new machine for this VM
> > > >
> > > > 5. Restore mdev state:
> > > >
> > > > cat xxx > /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0/state
> > > > (might be a different path name)
> > > >
> > > > 6. start mdev on the new parent device:
> > > >
> > > > echo "1" > /sys/bus/mdev/devices/c0b26072-dd1b-4340-84fe-bf338c510818-0/start  

For example, another solution to this would be a device specific vfio
region dedicated to starting and stopping the device an manipulating
state.  A simplistic API might be that the first dword of this region
is reserved for control of the device, the ability to pause and resume
the device, and reads and writes to the remainder of the region collect
and restore device state, respectively.  We'd need to spend more time
thinking about such an API, but something like this could be added
between the kernel and QEMU as a feature that doesn't change the
existing API.  Thanks,

Alex
Neo Jia Aug. 16, 2016, 9:17 p.m. UTC | #30
On Tue, Aug 16, 2016 at 02:51:03PM -0600, Alex Williamson wrote:
> On Tue, 16 Aug 2016 13:30:06 -0700
> Neo Jia <cjia@nvidia.com> wrote:
> 
> > On Mon, Aug 15, 2016 at 04:47:41PM -0600, Alex Williamson wrote:
> > > On Mon, 15 Aug 2016 12:59:08 -0700
> > > Neo Jia <cjia@nvidia.com> wrote:
> > >   
> > > > > > >
> > > > > > > I'm not sure a comma separated list makes sense here, for both
> > > > > > > simplicity in the kernel and more fine grained error reporting, we
> > > > > > > probably want to start/stop them individually.  Actually, why is it
> > > > > > > that we can't use the mediated device being opened and released to
> > > > > > > automatically signal to the backend vendor driver to commit and release
> > > > > > > resources? I don't fully understand why userspace needs this interface.    
> > > > >   
> > > 
> > > That doesn't give an individual user the ability to stop and start
> > > their devices though, because in order for a user to have write
> > > permissions there, they get permission to DoS other users by pumping
> > > arbitrary UUIDs into those files.  By placing start/stop per mdev, we
> > > have mdev level granularity of granting start/stop privileges.  Really
> > > though, do we want QEMU fumbling around through sysfs or do we want an
> > > interface through the vfio API to perform start/stop?  Thanks,  
> > 
> > Hi Alex,
> > 
> > I think those two suggests make sense, so we will move the "start/stop"
> > under mdev sysfs. 
> > 
> > This will be incorporated in our next v7 patch and by doing that, it will make
> > the locking scheme easier.
> 
> Thanks Neo.  Also note that the semantics change when we move to per
> device control.  It would be redundant to 'echo $UUID' into a start
> file which only controls a single device.  So that means we probably
> just want an 'echo 1'.  But if we can 'echo 1' then we can also 'echo
> 0', so we can reduce this to a single sysfs file.  Sysfs already has a
> common interface for starting and stopping devices, the "online" file.
> So I think we should probably move in that direction.  Additionally, an
> "online" file should support a _show() function, so if we have an Intel
> vGPU that perhaps does not need start/stop support, online could report
> "1" after create to show that it's already online, possibly even
> generate an error trying to change the online state.  Thanks,

Agree. We will adopt the similar syntax and support _show() function.

Thanks,
Neo

> 
> Alex
diff mbox

Patch

diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index da6e2ce77495..23eced02aaf6 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -48,4 +48,5 @@  menuconfig VFIO_NOIOMMU
 
 source "drivers/vfio/pci/Kconfig"
 source "drivers/vfio/platform/Kconfig"
+source "drivers/vfio/mdev/Kconfig"
 source "virt/lib/Kconfig"
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 7b8a31f63fea..4a23c13b6be4 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -7,3 +7,4 @@  obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
 obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o
 obj-$(CONFIG_VFIO_PCI) += pci/
 obj-$(CONFIG_VFIO_PLATFORM) += platform/
+obj-$(CONFIG_VFIO_MDEV) += mdev/
diff --git a/drivers/vfio/mdev/Kconfig b/drivers/vfio/mdev/Kconfig
new file mode 100644
index 000000000000..a34fbc66f92f
--- /dev/null
+++ b/drivers/vfio/mdev/Kconfig
@@ -0,0 +1,12 @@ 
+
+config VFIO_MDEV
+    tristate "Mediated device driver framework"
+    depends on VFIO
+    default n
+    help
+        Provides a framework to virtualize device.
+	See Documentation/vfio-mediated-device.txt for more details.
+
+        If you don't know what do here, say N.
+
+
diff --git a/drivers/vfio/mdev/Makefile b/drivers/vfio/mdev/Makefile
new file mode 100644
index 000000000000..56a75e689582
--- /dev/null
+++ b/drivers/vfio/mdev/Makefile
@@ -0,0 +1,5 @@ 
+
+mdev-y := mdev_core.o mdev_sysfs.o mdev_driver.o
+
+obj-$(CONFIG_VFIO_MDEV) += mdev.o
+
diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
new file mode 100644
index 000000000000..90ff073abfce
--- /dev/null
+++ b/drivers/vfio/mdev/mdev_core.c
@@ -0,0 +1,676 @@ 
+/*
+ * Mediated device Core Driver
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *     Author: Neo Jia <cjia@nvidia.com>
+ *	       Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/uuid.h>
+#include <linux/vfio.h>
+#include <linux/iommu.h>
+#include <linux/sysfs.h>
+#include <linux/mdev.h>
+
+#include "mdev_private.h"
+
+#define DRIVER_VERSION		"0.1"
+#define DRIVER_AUTHOR		"NVIDIA Corporation"
+#define DRIVER_DESC		"Mediated device Core Driver"
+
+#define MDEV_CLASS_NAME		"mdev"
+
+static LIST_HEAD(parent_list);
+static DEFINE_MUTEX(parent_list_lock);
+
+static int mdev_add_attribute_group(struct device *dev,
+				    const struct attribute_group **groups)
+{
+	return sysfs_create_groups(&dev->kobj, groups);
+}
+
+static void mdev_remove_attribute_group(struct device *dev,
+					const struct attribute_group **groups)
+{
+	sysfs_remove_groups(&dev->kobj, groups);
+}
+
+/* Should be called holding parent->mdev_list_lock */
+static struct mdev_device *find_mdev_device(struct parent_device *parent,
+					    uuid_le uuid, int instance)
+{
+	struct mdev_device *mdev;
+
+	list_for_each_entry(mdev, &parent->mdev_list, next) {
+		if ((uuid_le_cmp(mdev->uuid, uuid) == 0) &&
+		    (mdev->instance == instance))
+			return mdev;
+	}
+	return NULL;
+}
+
+/* Should be called holding parent_list_lock */
+static struct parent_device *find_parent_device(struct device *dev)
+{
+	struct parent_device *parent;
+
+	list_for_each_entry(parent, &parent_list, next) {
+		if (parent->dev == dev)
+			return parent;
+	}
+	return NULL;
+}
+
+static void mdev_release_parent(struct kref *kref)
+{
+	struct parent_device *parent = container_of(kref, struct parent_device,
+						    ref);
+	kfree(parent);
+}
+
+static
+inline struct parent_device *mdev_get_parent(struct parent_device *parent)
+{
+	if (parent)
+		kref_get(&parent->ref);
+
+	return parent;
+}
+
+static inline void mdev_put_parent(struct parent_device *parent)
+{
+	if (parent)
+		kref_put(&parent->ref, mdev_release_parent);
+}
+
+static struct parent_device *mdev_get_parent_by_dev(struct device *dev)
+{
+	struct parent_device *parent = NULL, *p;
+
+	mutex_lock(&parent_list_lock);
+	list_for_each_entry(p, &parent_list, next) {
+		if (p->dev == dev) {
+			parent = mdev_get_parent(p);
+			break;
+		}
+	}
+	mutex_unlock(&parent_list_lock);
+	return parent;
+}
+
+static int mdev_device_create_ops(struct mdev_device *mdev, char *mdev_params)
+{
+	struct parent_device *parent = mdev->parent;
+	int ret;
+
+	ret = parent->ops->create(mdev, mdev_params);
+	if (ret)
+		return ret;
+
+	ret = mdev_add_attribute_group(&mdev->dev,
+					parent->ops->mdev_attr_groups);
+	if (ret)
+		parent->ops->destroy(mdev);
+
+	return ret;
+}
+
+static int mdev_device_destroy_ops(struct mdev_device *mdev, bool force)
+{
+	struct parent_device *parent = mdev->parent;
+	int ret = 0;
+
+	/*
+	 * If vendor driver doesn't return success that means vendor
+	 * driver doesn't support hot-unplug
+	 */
+	ret = parent->ops->destroy(mdev);
+	if (ret && !force)
+		return -EBUSY;
+
+	mdev_remove_attribute_group(&mdev->dev,
+				    parent->ops->mdev_attr_groups);
+
+	return ret;
+}
+
+static void mdev_release_device(struct kref *kref)
+{
+	struct mdev_device *mdev = container_of(kref, struct mdev_device, ref);
+	struct parent_device *parent = mdev->parent;
+
+	list_del(&mdev->next);
+	mutex_unlock(&parent->mdev_list_lock);
+
+	device_unregister(&mdev->dev);
+	wake_up(&parent->release_done);
+	mdev_put_parent(parent);
+}
+
+struct mdev_device *mdev_get_device(struct mdev_device *mdev)
+{
+	kref_get(&mdev->ref);
+	return mdev;
+}
+EXPORT_SYMBOL(mdev_get_device);
+
+void mdev_put_device(struct mdev_device *mdev)
+{
+	struct parent_device *parent = mdev->parent;
+
+	kref_put_mutex(&mdev->ref, mdev_release_device,
+		       &parent->mdev_list_lock);
+}
+EXPORT_SYMBOL(mdev_put_device);
+
+/*
+ * Find first mediated device from given uuid and increment refcount of
+ * mediated device. Caller should call mdev_put_device() when the use of
+ * mdev_device is done.
+ */
+static struct mdev_device *mdev_get_first_device_by_uuid(uuid_le uuid)
+{
+	struct mdev_device *mdev = NULL, *p;
+	struct parent_device *parent;
+
+	mutex_lock(&parent_list_lock);
+	list_for_each_entry(parent, &parent_list, next) {
+		mutex_lock(&parent->mdev_list_lock);
+		list_for_each_entry(p, &parent->mdev_list, next) {
+			if (uuid_le_cmp(p->uuid, uuid) == 0) {
+				mdev = mdev_get_device(p);
+				break;
+			}
+		}
+		mutex_unlock(&parent->mdev_list_lock);
+
+		if (mdev)
+			break;
+	}
+	mutex_unlock(&parent_list_lock);
+	return mdev;
+}
+
+/*
+ * Find mediated device from given iommu_group and increment refcount of
+ * mediated device. Caller should call mdev_put_device() when the use of
+ * mdev_device is done.
+ */
+struct mdev_device *mdev_get_device_by_group(struct iommu_group *group)
+{
+	struct mdev_device *mdev = NULL, *p;
+	struct parent_device *parent;
+
+	mutex_lock(&parent_list_lock);
+	list_for_each_entry(parent, &parent_list, next) {
+		mutex_lock(&parent->mdev_list_lock);
+		list_for_each_entry(p, &parent->mdev_list, next) {
+			if (!p->group)
+				continue;
+
+			if (iommu_group_id(p->group) == iommu_group_id(group)) {
+				mdev = mdev_get_device(p);
+				break;
+			}
+		}
+		mutex_unlock(&parent->mdev_list_lock);
+
+		if (mdev)
+			break;
+	}
+	mutex_unlock(&parent_list_lock);
+	return mdev;
+}
+EXPORT_SYMBOL(mdev_get_device_by_group);
+
+/*
+ * mdev_register_device : Register a device
+ * @dev: device structure representing parent device.
+ * @ops: Parent device operation structure to be registered.
+ *
+ * Add device to list of registered parent devices.
+ * Returns a negative value on error, otherwise 0.
+ */
+int mdev_register_device(struct device *dev, const struct parent_ops *ops)
+{
+	int ret = 0;
+	struct parent_device *parent;
+
+	if (!dev || !ops)
+		return -EINVAL;
+
+	/* check for mandatory ops */
+	if (!ops->create || !ops->destroy)
+		return -EINVAL;
+
+	mutex_lock(&parent_list_lock);
+
+	/* Check for duplicate */
+	parent = find_parent_device(dev);
+	if (parent) {
+		ret = -EEXIST;
+		goto add_dev_err;
+	}
+
+	parent = kzalloc(sizeof(*parent), GFP_KERNEL);
+	if (!parent) {
+		ret = -ENOMEM;
+		goto add_dev_err;
+	}
+
+	kref_init(&parent->ref);
+	list_add(&parent->next, &parent_list);
+
+	parent->dev = dev;
+	parent->ops = ops;
+	mutex_init(&parent->mdev_list_lock);
+	INIT_LIST_HEAD(&parent->mdev_list);
+	init_waitqueue_head(&parent->release_done);
+	mutex_unlock(&parent_list_lock);
+
+	ret = mdev_create_sysfs_files(dev);
+	if (ret)
+		goto add_sysfs_error;
+
+	ret = mdev_add_attribute_group(dev, ops->dev_attr_groups);
+	if (ret)
+		goto add_group_error;
+
+	dev_info(dev, "MDEV: Registered\n");
+	return 0;
+
+add_group_error:
+	mdev_remove_sysfs_files(dev);
+add_sysfs_error:
+	mutex_lock(&parent_list_lock);
+	list_del(&parent->next);
+	mutex_unlock(&parent_list_lock);
+	mdev_put_parent(parent);
+	return ret;
+
+add_dev_err:
+	mutex_unlock(&parent_list_lock);
+	return ret;
+}
+EXPORT_SYMBOL(mdev_register_device);
+
+/*
+ * mdev_unregister_device : Unregister a parent device
+ * @dev: device structure representing parent device.
+ *
+ * Remove device from list of registered parent devices. Give a chance to free
+ * existing mediated devices for given device.
+ */
+
+void mdev_unregister_device(struct device *dev)
+{
+	struct parent_device *parent;
+	struct mdev_device *mdev, *n;
+	int ret;
+
+	mutex_lock(&parent_list_lock);
+	parent = find_parent_device(dev);
+
+	if (!parent) {
+		mutex_unlock(&parent_list_lock);
+		return;
+	}
+	dev_info(dev, "MDEV: Unregistering\n");
+
+	/*
+	 * Remove parent from the list and remove create and destroy sysfs
+	 * files so that no new mediated device could be created for this parent
+	 */
+	list_del(&parent->next);
+	mdev_remove_sysfs_files(dev);
+	mutex_unlock(&parent_list_lock);
+
+	mdev_remove_attribute_group(dev,
+				    parent->ops->dev_attr_groups);
+
+	mutex_lock(&parent->mdev_list_lock);
+	list_for_each_entry_safe(mdev, n, &parent->mdev_list, next) {
+		mdev_device_destroy_ops(mdev, true);
+		mutex_unlock(&parent->mdev_list_lock);
+		mdev_put_device(mdev);
+		mutex_lock(&parent->mdev_list_lock);
+	}
+	mutex_unlock(&parent->mdev_list_lock);
+
+	do {
+		ret = wait_event_interruptible_timeout(parent->release_done,
+				list_empty(&parent->mdev_list), HZ * 10);
+		if (ret == -ERESTARTSYS) {
+			dev_warn(dev, "Mediated devices are in use, task"
+				      " \"%s\" (%d) "
+				      "blocked until all are released",
+				      current->comm, task_pid_nr(current));
+		}
+	} while (ret <= 0);
+
+	mdev_put_parent(parent);
+}
+EXPORT_SYMBOL(mdev_unregister_device);
+
+/*
+ * Functions required for mdev_sysfs
+ */
+static void mdev_device_release(struct device *dev)
+{
+	struct mdev_device *mdev = to_mdev_device(dev);
+
+	dev_dbg(&mdev->dev, "MDEV: destroying\n");
+	kfree(mdev);
+}
+
+int mdev_device_create(struct device *dev, uuid_le uuid, uint32_t instance,
+		       char *mdev_params)
+{
+	int ret;
+	struct mdev_device *mdev;
+	struct parent_device *parent;
+
+	parent = mdev_get_parent_by_dev(dev);
+	if (!parent)
+		return -EINVAL;
+
+	mutex_lock(&parent->mdev_list_lock);
+	/* Check for duplicate */
+	mdev = find_mdev_device(parent, uuid, instance);
+	if (mdev) {
+		ret = -EEXIST;
+		goto create_err;
+	}
+
+	mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
+	if (!mdev) {
+		ret = -ENOMEM;
+		goto create_err;
+	}
+
+	memcpy(&mdev->uuid, &uuid, sizeof(uuid_le));
+	mdev->instance = instance;
+	mdev->parent = parent;
+	kref_init(&mdev->ref);
+
+	mdev->dev.parent  = dev;
+	mdev->dev.bus     = &mdev_bus_type;
+	mdev->dev.release = mdev_device_release;
+	dev_set_name(&mdev->dev, "%pUl-%d", uuid.b, instance);
+
+	ret = device_register(&mdev->dev);
+	if (ret) {
+		put_device(&mdev->dev);
+		goto create_err;
+	}
+
+	ret = mdev_device_create_ops(mdev, mdev_params);
+	if (ret)
+		goto create_failed;
+
+	list_add(&mdev->next, &parent->mdev_list);
+	mutex_unlock(&parent->mdev_list_lock);
+
+	dev_dbg(&mdev->dev, "MDEV: created\n");
+
+	return ret;
+
+create_failed:
+	device_unregister(&mdev->dev);
+
+create_err:
+	mutex_unlock(&parent->mdev_list_lock);
+	mdev_put_parent(parent);
+	return ret;
+}
+
+int mdev_device_destroy(struct device *dev, uuid_le uuid, uint32_t instance)
+{
+	struct mdev_device *mdev;
+	struct parent_device *parent;
+	int ret;
+
+	parent = mdev_get_parent_by_dev(dev);
+	if (!parent)
+		return -EINVAL;
+
+	mutex_lock(&parent->mdev_list_lock);
+	mdev = find_mdev_device(parent, uuid, instance);
+	if (!mdev) {
+		ret = -EINVAL;
+		goto destroy_err;
+	}
+
+	ret = mdev_device_destroy_ops(mdev, false);
+	if (ret)
+		goto destroy_err;
+
+	mutex_unlock(&parent->mdev_list_lock);
+	mdev_put_device(mdev);
+
+	mdev_put_parent(parent);
+	return ret;
+
+destroy_err:
+	mutex_unlock(&parent->mdev_list_lock);
+	mdev_put_parent(parent);
+	return ret;
+}
+
+int mdev_device_invalidate_mapping(struct mdev_device *mdev,
+				   unsigned long addr, unsigned long size)
+{
+	int ret = -EINVAL;
+	struct mdev_phys_mapping *phys_mappings;
+	struct addr_desc *addr_desc;
+
+	if (!mdev || !mdev->phys_mappings.mapping)
+		return ret;
+
+	phys_mappings = &mdev->phys_mappings;
+
+	mutex_lock(&phys_mappings->addr_desc_list_lock);
+
+	list_for_each_entry(addr_desc, &phys_mappings->addr_desc_list, next) {
+
+		if ((addr > addr_desc->start) &&
+		    (addr + size < addr_desc->start + addr_desc->size)) {
+			unmap_mapping_range(phys_mappings->mapping,
+					    addr, size, 0);
+			ret = 0;
+			goto unlock_exit;
+		}
+	}
+
+unlock_exit:
+	mutex_unlock(&phys_mappings->addr_desc_list_lock);
+	return ret;
+}
+EXPORT_SYMBOL(mdev_device_invalidate_mapping);
+
+/* Sanity check for the physical mapping list for mediated device */
+
+int mdev_add_phys_mapping(struct mdev_device *mdev,
+			  struct address_space *mapping,
+			  unsigned long addr, unsigned long size)
+{
+	struct mdev_phys_mapping *phys_mappings;
+	struct addr_desc *addr_desc, *new_addr_desc;
+	int ret = 0;
+
+	if (!mdev)
+		return -EINVAL;
+
+	phys_mappings = &mdev->phys_mappings;
+	if (phys_mappings->mapping && (mapping != phys_mappings->mapping))
+		return -EINVAL;
+
+	if (!phys_mappings->mapping) {
+		phys_mappings->mapping = mapping;
+		mutex_init(&phys_mappings->addr_desc_list_lock);
+		INIT_LIST_HEAD(&phys_mappings->addr_desc_list);
+	}
+
+	mutex_lock(&phys_mappings->addr_desc_list_lock);
+
+	list_for_each_entry(addr_desc, &phys_mappings->addr_desc_list, next) {
+		if ((addr + size < addr_desc->start) ||
+		    (addr_desc->start + addr_desc->size) < addr)
+			continue;
+		else {
+			/* should be no overlap */
+			ret = -EINVAL;
+			goto mapping_exit;
+		}
+	}
+
+	/* add the new entry to the list */
+	new_addr_desc = kzalloc(sizeof(*new_addr_desc), GFP_KERNEL);
+
+	if (!new_addr_desc) {
+		ret = -ENOMEM;
+		goto mapping_exit;
+	}
+
+	new_addr_desc->start = addr;
+	new_addr_desc->size = size;
+	list_add(&new_addr_desc->next, &phys_mappings->addr_desc_list);
+
+mapping_exit:
+	mutex_unlock(&phys_mappings->addr_desc_list_lock);
+	return ret;
+}
+EXPORT_SYMBOL(mdev_add_phys_mapping);
+
+void mdev_del_phys_mapping(struct mdev_device *mdev, unsigned long addr)
+{
+	struct mdev_phys_mapping *phys_mappings;
+	struct addr_desc *addr_desc;
+
+	if (!mdev)
+		return;
+
+	phys_mappings = &mdev->phys_mappings;
+
+	mutex_lock(&phys_mappings->addr_desc_list_lock);
+	list_for_each_entry(addr_desc, &phys_mappings->addr_desc_list, next) {
+		if (addr_desc->start == addr) {
+			list_del(&addr_desc->next);
+			kfree(addr_desc);
+			break;
+		}
+	}
+	mutex_unlock(&phys_mappings->addr_desc_list_lock);
+}
+EXPORT_SYMBOL(mdev_del_phys_mapping);
+
+void mdev_device_supported_config(struct device *dev, char *str)
+{
+	struct parent_device *parent;
+
+	parent = mdev_get_parent_by_dev(dev);
+
+	if (parent) {
+		if (parent->ops->supported_config)
+			parent->ops->supported_config(parent->dev, str);
+		mdev_put_parent(parent);
+	}
+}
+
+int mdev_device_start(uuid_le uuid)
+{
+	int ret = 0;
+	struct mdev_device *mdev;
+	struct parent_device *parent;
+
+	mdev = mdev_get_first_device_by_uuid(uuid);
+	if (!mdev)
+		return -EINVAL;
+
+	parent = mdev->parent;
+
+	if (parent->ops->start)
+		ret = parent->ops->start(mdev->uuid);
+
+	if (ret)
+		pr_err("mdev_start failed  %d\n", ret);
+	else
+		kobject_uevent(&mdev->dev.kobj, KOBJ_ONLINE);
+
+	mdev_put_device(mdev);
+
+	return ret;
+}
+
+int mdev_device_stop(uuid_le uuid)
+{
+	int ret = 0;
+	struct mdev_device *mdev;
+	struct parent_device *parent;
+
+	mdev = mdev_get_first_device_by_uuid(uuid);
+	if (!mdev)
+		return -EINVAL;
+
+	parent = mdev->parent;
+
+	if (parent->ops->stop)
+		ret = parent->ops->stop(mdev->uuid);
+
+	if (ret)
+		pr_err("mdev stop failed %d\n", ret);
+	else
+		kobject_uevent(&mdev->dev.kobj, KOBJ_OFFLINE);
+
+	mdev_put_device(mdev);
+	return ret;
+}
+
+static struct class mdev_class = {
+	.name		= MDEV_CLASS_NAME,
+	.owner		= THIS_MODULE,
+	.class_attrs	= mdev_class_attrs,
+};
+
+static int __init mdev_init(void)
+{
+	int ret;
+
+	ret = class_register(&mdev_class);
+	if (ret) {
+		pr_err("Failed to register mdev class\n");
+		return ret;
+	}
+
+	ret = mdev_bus_register();
+	if (ret) {
+		pr_err("Failed to register mdev bus\n");
+		class_unregister(&mdev_class);
+		return ret;
+	}
+
+	return ret;
+}
+
+static void __exit mdev_exit(void)
+{
+	mdev_bus_unregister();
+	class_unregister(&mdev_class);
+}
+
+module_init(mdev_init)
+module_exit(mdev_exit)
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c
new file mode 100644
index 000000000000..00680bd06224
--- /dev/null
+++ b/drivers/vfio/mdev/mdev_driver.c
@@ -0,0 +1,142 @@ 
+/*
+ * MDEV driver
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *     Author: Neo Jia <cjia@nvidia.com>
+ *	       Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/device.h>
+#include <linux/iommu.h>
+#include <linux/mdev.h>
+
+#include "mdev_private.h"
+
+static int mdev_attach_iommu(struct mdev_device *mdev)
+{
+	int ret;
+	struct iommu_group *group;
+
+	group = iommu_group_alloc();
+	if (IS_ERR(group)) {
+		dev_err(&mdev->dev, "MDEV: failed to allocate group!\n");
+		return PTR_ERR(group);
+	}
+
+	ret = iommu_group_add_device(group, &mdev->dev);
+	if (ret) {
+		dev_err(&mdev->dev, "MDEV: failed to add dev to group!\n");
+		goto attach_fail;
+	}
+
+	mdev->group = group;
+
+	dev_info(&mdev->dev, "MDEV: group_id = %d\n",
+				 iommu_group_id(group));
+attach_fail:
+	iommu_group_put(group);
+	return ret;
+}
+
+static void mdev_detach_iommu(struct mdev_device *mdev)
+{
+	iommu_group_remove_device(&mdev->dev);
+	mdev->group = NULL;
+	dev_info(&mdev->dev, "MDEV: detaching iommu\n");
+}
+
+static int mdev_probe(struct device *dev)
+{
+	struct mdev_driver *drv = to_mdev_driver(dev->driver);
+	struct mdev_device *mdev = to_mdev_device(dev);
+	int ret;
+
+	ret = mdev_attach_iommu(mdev);
+	if (ret) {
+		dev_err(dev, "Failed to attach IOMMU\n");
+		return ret;
+	}
+
+	if (drv && drv->probe)
+		ret = drv->probe(dev);
+
+	if (ret)
+		mdev_detach_iommu(mdev);
+
+	return ret;
+}
+
+static int mdev_remove(struct device *dev)
+{
+	struct mdev_driver *drv = to_mdev_driver(dev->driver);
+	struct mdev_device *mdev = to_mdev_device(dev);
+
+	if (drv && drv->remove)
+		drv->remove(dev);
+
+	mdev_detach_iommu(mdev);
+
+	return 0;
+}
+
+static int mdev_match(struct device *dev, struct device_driver *driver)
+{
+	struct mdev_driver *drv = to_mdev_driver(driver);
+
+	if (drv && drv->match)
+		return drv->match(dev);
+
+	return 0;
+}
+
+struct bus_type mdev_bus_type = {
+	.name		= "mdev",
+	.match		= mdev_match,
+	.probe		= mdev_probe,
+	.remove		= mdev_remove,
+};
+EXPORT_SYMBOL_GPL(mdev_bus_type);
+
+/*
+ * mdev_register_driver - register a new MDEV driver
+ * @drv: the driver to register
+ * @owner: module owner of driver to be registered
+ *
+ * Returns a negative value on error, otherwise 0.
+ */
+int mdev_register_driver(struct mdev_driver *drv, struct module *owner)
+{
+	/* initialize common driver fields */
+	drv->driver.name = drv->name;
+	drv->driver.bus = &mdev_bus_type;
+	drv->driver.owner = owner;
+
+	/* register with core */
+	return driver_register(&drv->driver);
+}
+EXPORT_SYMBOL(mdev_register_driver);
+
+/*
+ * mdev_unregister_driver - unregister MDEV driver
+ * @drv: the driver to unregister
+ *
+ */
+void mdev_unregister_driver(struct mdev_driver *drv)
+{
+	driver_unregister(&drv->driver);
+}
+EXPORT_SYMBOL(mdev_unregister_driver);
+
+int mdev_bus_register(void)
+{
+	return bus_register(&mdev_bus_type);
+}
+
+void mdev_bus_unregister(void)
+{
+	bus_unregister(&mdev_bus_type);
+}
diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h
new file mode 100644
index 000000000000..ee2db61a8091
--- /dev/null
+++ b/drivers/vfio/mdev/mdev_private.h
@@ -0,0 +1,33 @@ 
+/*
+ * Mediated device interal definitions
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *     Author: Neo Jia <cjia@nvidia.com>
+ *	       Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef MDEV_PRIVATE_H
+#define MDEV_PRIVATE_H
+
+int  mdev_bus_register(void);
+void mdev_bus_unregister(void);
+
+/* Function prototypes for mdev_sysfs */
+
+extern struct class_attribute mdev_class_attrs[];
+
+int  mdev_create_sysfs_files(struct device *dev);
+void mdev_remove_sysfs_files(struct device *dev);
+
+int  mdev_device_create(struct device *dev, uuid_le uuid, uint32_t instance,
+			char *mdev_params);
+int  mdev_device_destroy(struct device *dev, uuid_le uuid, uint32_t instance);
+void mdev_device_supported_config(struct device *dev, char *str);
+int  mdev_device_start(uuid_le uuid);
+int  mdev_device_stop(uuid_le uuid);
+
+#endif /* MDEV_PRIVATE_H */
diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c
new file mode 100644
index 000000000000..e0457e68cf78
--- /dev/null
+++ b/drivers/vfio/mdev/mdev_sysfs.c
@@ -0,0 +1,269 @@ 
+/*
+ * File attributes for Mediated devices
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *     Author: Neo Jia <cjia@nvidia.com>
+ *	       Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/sysfs.h>
+#include <linux/ctype.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/uuid.h>
+#include <linux/mdev.h>
+
+#include "mdev_private.h"
+
+/* Prototypes */
+static ssize_t mdev_supported_types_show(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf);
+static DEVICE_ATTR_RO(mdev_supported_types);
+
+static ssize_t mdev_create_store(struct device *dev,
+				 struct device_attribute *attr,
+				 const char *buf, size_t count);
+static DEVICE_ATTR_WO(mdev_create);
+
+static ssize_t mdev_destroy_store(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count);
+static DEVICE_ATTR_WO(mdev_destroy);
+
+/* Static functions */
+
+
+#define SUPPORTED_TYPE_BUFFER_LENGTH	4096
+
+/* mdev sysfs Functions */
+static ssize_t mdev_supported_types_show(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf)
+{
+	char *str, *ptr;
+	ssize_t n;
+
+	str = kzalloc(sizeof(*str) * SUPPORTED_TYPE_BUFFER_LENGTH, GFP_KERNEL);
+	if (!str)
+		return -ENOMEM;
+
+	ptr = str;
+	mdev_device_supported_config(dev, str);
+
+	n = sprintf(buf, "%s\n", str);
+	kfree(ptr);
+
+	return n;
+}
+
+static ssize_t mdev_create_store(struct device *dev,
+				 struct device_attribute *attr,
+				 const char *buf, size_t count)
+{
+	char *str, *pstr;
+	char *uuid_str, *instance_str, *mdev_params = NULL, *params = NULL;
+	uuid_le uuid;
+	uint32_t instance;
+	int ret;
+
+	pstr = str = kstrndup(buf, count, GFP_KERNEL);
+
+	if (!str)
+		return -ENOMEM;
+
+	uuid_str = strsep(&str, ":");
+	if (!uuid_str) {
+		pr_err("mdev_create: Empty UUID string %s\n", buf);
+		ret = -EINVAL;
+		goto create_error;
+	}
+
+	if (!str) {
+		pr_err("mdev_create: mdev instance not present %s\n", buf);
+		ret = -EINVAL;
+		goto create_error;
+	}
+
+	instance_str = strsep(&str, ":");
+	if (!instance_str) {
+		pr_err("mdev_create: Empty instance string %s\n", buf);
+		ret = -EINVAL;
+		goto create_error;
+	}
+
+	ret = kstrtouint(instance_str, 0, &instance);
+	if (ret) {
+		pr_err("mdev_create: mdev instance parsing error %s\n", buf);
+		goto create_error;
+	}
+
+	if (str)
+		params = mdev_params = kstrdup(str, GFP_KERNEL);
+
+	ret = uuid_le_to_bin(uuid_str, &uuid);
+	if (ret) {
+		pr_err("mdev_create: UUID parse error %s\n", buf);
+		goto create_error;
+	}
+
+	ret = mdev_device_create(dev, uuid, instance, mdev_params);
+	if (ret)
+		pr_err("mdev_create: Failed to create mdev device\n");
+	else
+		ret = count;
+
+create_error:
+	kfree(params);
+	kfree(pstr);
+	return ret;
+}
+
+static ssize_t mdev_destroy_store(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
+{
+	char *uuid_str, *str, *pstr;
+	uuid_le uuid;
+	unsigned int instance;
+	int ret;
+
+	str = pstr = kstrndup(buf, count, GFP_KERNEL);
+
+	if (!str)
+		return -ENOMEM;
+
+	uuid_str = strsep(&str, ":");
+	if (!uuid_str) {
+		pr_err("mdev_destroy: Empty UUID string %s\n", buf);
+		ret = -EINVAL;
+		goto destroy_error;
+	}
+
+	if (str == NULL) {
+		pr_err("mdev_destroy: instance not specified %s\n", buf);
+		ret = -EINVAL;
+		goto destroy_error;
+	}
+
+	ret = kstrtouint(str, 0, &instance);
+	if (ret) {
+		pr_err("mdev_destroy: instance parsing error %s\n", buf);
+		goto destroy_error;
+	}
+
+	ret = uuid_le_to_bin(uuid_str, &uuid);
+	if (ret) {
+		pr_err("mdev_destroy: UUID parse error  %s\n", buf);
+		goto destroy_error;
+	}
+
+	ret = mdev_device_destroy(dev, uuid, instance);
+	if (ret == 0)
+		ret = count;
+
+destroy_error:
+	kfree(pstr);
+	return ret;
+}
+
+ssize_t mdev_start_store(struct class *class, struct class_attribute *attr,
+			 const char *buf, size_t count)
+{
+	char *uuid_str, *ptr;
+	uuid_le uuid;
+	int ret;
+
+	ptr = uuid_str = kstrndup(buf, count, GFP_KERNEL);
+
+	if (!uuid_str)
+		return -ENOMEM;
+
+	ret = uuid_le_to_bin(uuid_str, &uuid);
+	if (ret) {
+		pr_err("mdev_start: UUID parse error  %s\n", buf);
+		goto start_error;
+	}
+
+	ret = mdev_device_start(uuid);
+	if (ret == 0)
+		ret = count;
+
+start_error:
+	kfree(ptr);
+	return ret;
+}
+
+ssize_t mdev_stop_store(struct class *class, struct class_attribute *attr,
+			    const char *buf, size_t count)
+{
+	char *uuid_str, *ptr;
+	uuid_le uuid;
+	int ret;
+
+	ptr = uuid_str = kstrndup(buf, count, GFP_KERNEL);
+
+	if (!uuid_str)
+		return -ENOMEM;
+
+	ret = uuid_le_to_bin(uuid_str, &uuid);
+	if (ret) {
+		pr_err("mdev_stop: UUID parse error %s\n", buf);
+		goto stop_error;
+	}
+
+	ret = mdev_device_stop(uuid);
+	if (ret == 0)
+		ret = count;
+
+stop_error:
+	kfree(ptr);
+	return ret;
+
+}
+
+struct class_attribute mdev_class_attrs[] = {
+	__ATTR_WO(mdev_start),
+	__ATTR_WO(mdev_stop),
+	__ATTR_NULL
+};
+
+int mdev_create_sysfs_files(struct device *dev)
+{
+	int ret;
+
+	ret = sysfs_create_file(&dev->kobj,
+				&dev_attr_mdev_supported_types.attr);
+	if (ret) {
+		pr_err("Failed to create mdev_supported_types sysfs entry\n");
+		return ret;
+	}
+
+	ret = sysfs_create_file(&dev->kobj, &dev_attr_mdev_create.attr);
+	if (ret) {
+		pr_err("Failed to create mdev_create sysfs entry\n");
+		goto create_sysfs_failed;
+	}
+
+	ret = sysfs_create_file(&dev->kobj, &dev_attr_mdev_destroy.attr);
+	if (ret) {
+		pr_err("Failed to create mdev_destroy sysfs entry\n");
+		sysfs_remove_file(&dev->kobj, &dev_attr_mdev_create.attr);
+	} else
+		return ret;
+
+create_sysfs_failed:
+	sysfs_remove_file(&dev->kobj, &dev_attr_mdev_supported_types.attr);
+	return ret;
+}
+
+void mdev_remove_sysfs_files(struct device *dev)
+{
+	sysfs_remove_file(&dev->kobj, &dev_attr_mdev_supported_types.attr);
+	sysfs_remove_file(&dev->kobj, &dev_attr_mdev_create.attr);
+	sysfs_remove_file(&dev->kobj, &dev_attr_mdev_destroy.attr);
+}
diff --git a/include/linux/mdev.h b/include/linux/mdev.h
new file mode 100644
index 000000000000..0b41f301a9b7
--- /dev/null
+++ b/include/linux/mdev.h
@@ -0,0 +1,236 @@ 
+/*
+ * Mediated device definition
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *     Author: Neo Jia <cjia@nvidia.com>
+ *	       Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef MDEV_H
+#define MDEV_H
+
+#include <uapi/linux/vfio.h>
+
+struct parent_device;
+
+/*
+ * Mediated device
+ */
+
+struct addr_desc {
+	unsigned long start;
+	unsigned long size;
+	struct list_head next;
+};
+
+struct mdev_phys_mapping {
+	struct address_space *mapping;
+	struct list_head addr_desc_list;
+	struct mutex addr_desc_list_lock;
+};
+
+struct mdev_device {
+	struct device		dev;
+	struct parent_device	*parent;
+	struct iommu_group	*group;
+	uuid_le			uuid;
+	uint32_t		instance;
+	void			*driver_data;
+
+	/* internal only */
+	struct kref		ref;
+	struct list_head	next;
+
+	struct mdev_phys_mapping phys_mappings;
+};
+
+
+/**
+ * struct parent_ops - Structure to be registered for each parent device to
+ * register the device to mdev module.
+ *
+ * @owner:		The module owner.
+ * @dev_attr_groups:	Default attributes of the parent device.
+ * @mdev_attr_groups:	Default attributes of the mediated device.
+ * @supported_config:	Called to get information about supported types.
+ *			@dev : device structure of parent device.
+ *			@config: should return string listing supported config
+ *			Returns integer: success (0) or error (< 0)
+ * @create:		Called to allocate basic resources in parent device's
+ *			driver for a particular mediated device. It is
+ *			mandatory to provide create ops.
+ *			@mdev: mdev_device structure on of mediated device
+ *			      that is being created
+ *			@mdev_params: extra parameters required by parent
+ *			device's driver.
+ *			Returns integer: success (0) or error (< 0)
+ * @destroy:		Called to free resources in parent device's driver for a
+ *			a mediated device instance. It is mandatory to provide
+ *			destroy ops.
+ *			@mdev: mdev_device device structure which is being
+ *			       destroyed
+ *			Returns integer: success (0) or error (< 0)
+ *			If VMM is running and destroy() is called that means the
+ *			mdev is being hotunpluged. Return error if VMM is
+ *			running and driver doesn't support mediated device
+ *			hotplug.
+ * @reset:		Called to reset mediated device.
+ *			@mdev: mdev_device device structure
+ *			Returns integer: success (0) or error (< 0)
+ * @start:		Called to initiate mediated device initialization
+ *			process in parent device's driver before VMM starts.
+ *			@uuid: UUID
+ *			Returns integer: success (0) or error (< 0)
+ * @stop:		Called to teardown mediated device related resources
+ *			@uuid: UUID
+ *			Returns integer: success (0) or error (< 0)
+ * @read:		Read emulation callback
+ *			@mdev: mediated device structure
+ *			@buf: read buffer
+ *			@count: number of bytes to read
+ *			@pos: address.
+ *			Retuns number on bytes read on success or error.
+ * @write:		Write emulation callback
+ *			@mdev: mediated device structure
+ *			@buf: write buffer
+ *			@count: number of bytes to be written
+ *			@pos: address.
+ *			Retuns number on bytes written on success or error.
+ * @set_irqs:		Called to send about interrupts configuration
+ *			information that VMM sets.
+ *			@mdev: mediated device structure
+ *			@flags, index, start, count and *data : same as that of
+ *			struct vfio_irq_set of VFIO_DEVICE_SET_IRQS API.
+ * @get_region_info:	Called to get VFIO region size and flags of mediated
+ *			device.
+ *			@mdev: mediated device structure
+ *			@region_index: VFIO region index
+ *			@region_info: output, returns size and flags of
+ *				      requested region.
+ *			Returns integer: success (0) or error (< 0)
+ * @validate_map_request: Validate remap pfn request
+ *			@mdev: mediated device structure
+ *			@pos: address
+ *			@virtaddr: target user address to start at. Vendor
+ *				   driver can change if required.
+ *			@pfn: parent address of kernel memory, vendor driver
+ *			      can change if required.
+ *			@size: size of map area, vendor driver can change the
+ *			       size of map area if desired.
+ *			@prot: page protection flags for this mapping, vendor
+ *			       driver can change, if required.
+ *			Returns integer: success (0) or error (< 0)
+ *
+ * Parent device that support mediated device should be registered with mdev
+ * module with parent_ops structure.
+ */
+
+struct parent_ops {
+	struct module   *owner;
+	const struct attribute_group **dev_attr_groups;
+	const struct attribute_group **mdev_attr_groups;
+
+	int	(*supported_config)(struct device *dev, char *config);
+	int     (*create)(struct mdev_device *mdev, char *mdev_params);
+	int     (*destroy)(struct mdev_device *mdev);
+	int     (*reset)(struct mdev_device *mdev);
+	int     (*start)(uuid_le uuid);
+	int     (*stop)(uuid_le uuid);
+	ssize_t (*read)(struct mdev_device *mdev, char *buf, size_t count,
+			loff_t pos);
+	ssize_t (*write)(struct mdev_device *mdev, char *buf, size_t count,
+			 loff_t pos);
+	int     (*set_irqs)(struct mdev_device *mdev, uint32_t flags,
+			    unsigned int index, unsigned int start,
+			    unsigned int count, void *data);
+	int	(*get_region_info)(struct mdev_device *mdev, int region_index,
+				   struct vfio_region_info *region_info);
+	int	(*validate_map_request)(struct mdev_device *mdev, loff_t pos,
+					u64 *virtaddr, unsigned long *pfn,
+					unsigned long *size, pgprot_t *prot);
+};
+
+/*
+ * Parent Device
+ */
+
+struct parent_device {
+	struct device		*dev;
+	const struct parent_ops	*ops;
+
+	/* internal */
+	struct kref		ref;
+	struct list_head	next;
+	struct list_head	mdev_list;
+	struct mutex		mdev_list_lock;
+	wait_queue_head_t	release_done;
+};
+
+/**
+ * struct mdev_driver - Mediated device driver
+ * @name: driver name
+ * @probe: called when new device created
+ * @remove: called when device removed
+ * @match: called when new device or driver is added for this bus. Return 1 if
+ *	   given device can be handled by given driver and zero otherwise.
+ * @driver: device driver structure
+ *
+ **/
+struct mdev_driver {
+	const char *name;
+	int  (*probe)(struct device *dev);
+	void (*remove)(struct device *dev);
+	int  (*match)(struct device *dev);
+	struct device_driver driver;
+};
+
+static inline struct mdev_driver *to_mdev_driver(struct device_driver *drv)
+{
+	return drv ? container_of(drv, struct mdev_driver, driver) : NULL;
+}
+
+static inline struct mdev_device *to_mdev_device(struct device *dev)
+{
+	return dev ? container_of(dev, struct mdev_device, dev) : NULL;
+}
+
+static inline void *mdev_get_drvdata(struct mdev_device *mdev)
+{
+	return mdev->driver_data;
+}
+
+static inline void mdev_set_drvdata(struct mdev_device *mdev, void *data)
+{
+	mdev->driver_data = data;
+}
+
+extern struct bus_type mdev_bus_type;
+
+#define dev_is_mdev(d) ((d)->bus == &mdev_bus_type)
+
+extern int  mdev_register_device(struct device *dev,
+				 const struct parent_ops *ops);
+extern void mdev_unregister_device(struct device *dev);
+
+extern int  mdev_register_driver(struct mdev_driver *drv, struct module *owner);
+extern void mdev_unregister_driver(struct mdev_driver *drv);
+
+extern struct mdev_device *mdev_get_device(struct mdev_device *mdev);
+extern void mdev_put_device(struct mdev_device *mdev);
+
+extern struct mdev_device *mdev_get_device_by_group(struct iommu_group *group);
+
+extern int mdev_device_invalidate_mapping(struct mdev_device *mdev,
+					unsigned long addr, unsigned long size);
+
+extern int mdev_add_phys_mapping(struct mdev_device *mdev,
+				 struct address_space *mapping,
+				 unsigned long addr, unsigned long size);
+
+
+extern void mdev_del_phys_mapping(struct mdev_device *mdev, unsigned long addr);
+#endif /* MDEV_H */