[RFC,v3,1/3] vGPU Core driver

Message ID 1462214441-3732-2-git-send-email-kwankhede@nvidia.com
State New

Commit Message

Kirti Wankhede May 2, 2016, 6:40 p.m. UTC
Design for vGPU Driver:
The main purpose of the vGPU driver is to provide a common interface for
vGPU management that can be used by different GPU drivers.

This module provides a generic interface to create a device, add it to the
vGPU bus, add the device to an IOMMU group, and then add it to a VFIO group.

High Level block diagram:

+--------------+    vgpu_register_driver()+---------------+
|     __init() +------------------------->+               |
|              |                          |               |
|              +<-------------------------+    vgpu.ko    |
| vgpu_vfio.ko |   probe()/remove()       |               |
|              |                +---------+               +---------+
+--------------+                |         +-------+-------+         |
                                |                 ^                 |
                                | callback        |                 |
                                |         +-------+--------+        |
                                |         |vgpu_register_device()   |
                                |         |                |        |
                                +---^-----+-----+    +-----+------+-+
                                    | nvidia.ko |    |  i915.ko   |
                                    |           |    |            |
                                    +-----------+    +------------+

vGPU driver provides two types of registration interfaces:
1. Registration interface for vGPU bus driver:

/**
  * struct vgpu_driver - vGPU device driver
  * @name: driver name
  * @probe: called when new device created
  * @remove: called when device removed
  * @driver: device driver structure
  *
  **/
struct vgpu_driver {
         const char *name;
         int  (*probe)  (struct device *dev);
         void (*remove) (struct device *dev);
         struct device_driver    driver;
};

int  vgpu_register_driver(struct vgpu_driver *drv, struct module *owner);
void vgpu_unregister_driver(struct vgpu_driver *drv);

The VFIO bus driver for vGPU should use this interface to register with the
vGPU driver. With this, the VFIO bus driver for vGPU devices is responsible
for adding vGPU devices to a VFIO group.
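
For illustration only, a minimal, hypothetical skeleton of such a bus driver
might look like this (callback bodies elided; the actual vgpu_vfio module is
part of this series):

static int vgpu_vfio_probe(struct device *dev)
{
	/* bind the new vGPU device into a VFIO group here */
	return 0;
}

static void vgpu_vfio_remove(struct device *dev)
{
	/* undo the VFIO group membership here */
}

static struct vgpu_driver vgpu_vfio_driver = {
	.name   = "vgpu_vfio",
	.probe  = vgpu_vfio_probe,
	.remove = vgpu_vfio_remove,
};

static int __init vgpu_vfio_init(void)
{
	return vgpu_register_driver(&vgpu_vfio_driver, THIS_MODULE);
}

static void __exit vgpu_vfio_exit(void)
{
	vgpu_unregister_driver(&vgpu_vfio_driver);
}

module_init(vgpu_vfio_init);
module_exit(vgpu_vfio_exit);
MODULE_LICENSE("GPL");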

2. GPU driver interface
The GPU driver interface provides GPU drivers with a set of APIs to manage
GPU-driver-related work in their own driver. The APIs are:
- vgpu_supported_config: provide the list of configurations supported by
  the GPU.
- vgpu_create: allocate basic resources in the GPU driver for a vGPU device.
- vgpu_destroy: free resources in the GPU driver when a vGPU device is
  destroyed.
- vgpu_start: initiate the vGPU initialization process from the GPU driver
  when a VM boots, before QEMU starts.
- vgpu_shutdown: tear down vGPU resources during VM teardown.
- read: read emulation callback.
- write: write emulation callback.
- vgpu_set_irqs: pass on the interrupt configuration information that QEMU
  sets.
- vgpu_bar_info: provide the BAR size and its flags for the vGPU device.
- validate_map_request: validate a remap pfn request.

This registration interface should be used by GPU drivers to register each
physical device with the vGPU driver.
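
For illustration, registration from a GPU driver might look like the
following sketch (the my_* callback names are hypothetical):

static const struct gpu_device_ops my_gpu_ops = {
	.owner                 = THIS_MODULE,
	.vgpu_supported_config = my_supported_config,
	.vgpu_create           = my_vgpu_create,
	.vgpu_destroy          = my_vgpu_destroy,
	.read                  = my_read,
	.write                 = my_write,
};

/* in the GPU driver's PCI probe path */
ret = vgpu_register_device(pdev, &my_gpu_ops);
if (ret)
	return ret;

/* and in its remove path */
vgpu_unregister_device(pdev);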

Updated this patch with a couple more functions in the GPU driver interface
that were discussed during v1 of this RFC.

Thanks,
Kirti.

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Signed-off-by: Neo Jia <cjia@nvidia.com>
Change-Id: I1c13c411f61b7b2e750e85adfe1b097f9fd218b9
---
 drivers/Kconfig             |    2 +
 drivers/Makefile            |    1 +
 drivers/vgpu/Kconfig        |   21 ++
 drivers/vgpu/Makefile       |    4 +
 drivers/vgpu/vgpu-core.c    |  424 +++++++++++++++++++++++++++++++++++++++++++
 drivers/vgpu/vgpu-driver.c  |  136 ++++++++++++++
 drivers/vgpu/vgpu-sysfs.c   |  365 +++++++++++++++++++++++++++++++++++++
 drivers/vgpu/vgpu_private.h |   36 ++++
 include/linux/vgpu.h        |  216 ++++++++++++++++++++++
 9 files changed, 1205 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vgpu/Kconfig
 create mode 100644 drivers/vgpu/Makefile
 create mode 100644 drivers/vgpu/vgpu-core.c
 create mode 100644 drivers/vgpu/vgpu-driver.c
 create mode 100644 drivers/vgpu/vgpu-sysfs.c
 create mode 100644 drivers/vgpu/vgpu_private.h
 create mode 100644 include/linux/vgpu.h

Comments

Alex Williamson May 3, 2016, 10:43 p.m. UTC | #1
On Tue, 3 May 2016 00:10:39 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> Design for vGPU Driver:
> The main purpose of the vGPU driver is to provide a common interface for
> vGPU management that can be used by different GPU drivers.
> 
> This module provides a generic interface to create a device, add it to the
> vGPU bus, add the device to an IOMMU group, and then add it to a VFIO group.
> 
> High Level block diagram:
> 
> +--------------+    vgpu_register_driver()+---------------+
> |     __init() +------------------------->+               |
> |              |                          |               |
> |              +<-------------------------+    vgpu.ko    |
> | vgpu_vfio.ko |   probe()/remove()       |               |
> |              |                +---------+               +---------+
> +--------------+                |         +-------+-------+         |
>                                 |                 ^                 |
>                                 | callback        |                 |
>                                 |         +-------+--------+        |
>                                 |         |vgpu_register_device()   |
>                                 |         |                |        |
>                                 +---^-----+-----+    +-----+------+-+
>                                     | nvidia.ko |    |  i915.ko   |
>                                     |           |    |            |
>                                     +-----------+    +------------+
> 
> vGPU driver provides two types of registration interfaces:
> 1. Registration interface for vGPU bus driver:
> 
> /**
>   * struct vgpu_driver - vGPU device driver
>   * @name: driver name
>   * @probe: called when new device created
>   * @remove: called when device removed
>   * @driver: device driver structure
>   *
>   **/
> struct vgpu_driver {
>          const char *name;
>          int  (*probe)  (struct device *dev);
>          void (*remove) (struct device *dev);
>          struct device_driver    driver;
> };
> 
> int  vgpu_register_driver(struct vgpu_driver *drv, struct module *owner);
> void vgpu_unregister_driver(struct vgpu_driver *drv);
> 
> The VFIO bus driver for vGPU should use this interface to register with the
> vGPU driver. With this, the VFIO bus driver for vGPU devices is responsible
> for adding vGPU devices to a VFIO group.
> 
> 2. GPU driver interface
> The GPU driver interface provides GPU drivers with a set of APIs to manage
> GPU-driver-related work in their own driver. The APIs are:
> - vgpu_supported_config: provide the list of configurations supported by
>   the GPU.
> - vgpu_create: allocate basic resources in the GPU driver for a vGPU device.
> - vgpu_destroy: free resources in the GPU driver when a vGPU device is
>   destroyed.
> - vgpu_start: initiate the vGPU initialization process from the GPU driver
>   when a VM boots, before QEMU starts.
> - vgpu_shutdown: tear down vGPU resources during VM teardown.
> - read: read emulation callback.
> - write: write emulation callback.
> - vgpu_set_irqs: pass on the interrupt configuration information that QEMU
>   sets.
> - vgpu_bar_info: provide the BAR size and its flags for the vGPU device.
> - validate_map_request: validate a remap pfn request.
> 
> This registration interface should be used by GPU drivers to register each
> physical device with the vGPU driver.
> 
> Updated this patch with a couple more functions in the GPU driver interface
> that were discussed during v1 of this RFC.
> 
> Thanks,
> Kirti.
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Signed-off-by: Neo Jia <cjia@nvidia.com>
> Change-Id: I1c13c411f61b7b2e750e85adfe1b097f9fd218b9
> ---
>  drivers/Kconfig             |    2 +
>  drivers/Makefile            |    1 +
>  drivers/vgpu/Kconfig        |   21 ++
>  drivers/vgpu/Makefile       |    4 +
>  drivers/vgpu/vgpu-core.c    |  424 +++++++++++++++++++++++++++++++++++++++++++
>  drivers/vgpu/vgpu-driver.c  |  136 ++++++++++++++
>  drivers/vgpu/vgpu-sysfs.c   |  365 +++++++++++++++++++++++++++++++++++++
>  drivers/vgpu/vgpu_private.h |   36 ++++
>  include/linux/vgpu.h        |  216 ++++++++++++++++++++++
>  9 files changed, 1205 insertions(+), 0 deletions(-)
>  create mode 100644 drivers/vgpu/Kconfig
>  create mode 100644 drivers/vgpu/Makefile
>  create mode 100644 drivers/vgpu/vgpu-core.c
>  create mode 100644 drivers/vgpu/vgpu-driver.c
>  create mode 100644 drivers/vgpu/vgpu-sysfs.c
>  create mode 100644 drivers/vgpu/vgpu_private.h
>  create mode 100644 include/linux/vgpu.h
> 
> diff --git a/drivers/Kconfig b/drivers/Kconfig
> index d2ac339..5fd9eae 100644
> --- a/drivers/Kconfig
> +++ b/drivers/Kconfig
> @@ -122,6 +122,8 @@ source "drivers/uio/Kconfig"
>  
>  source "drivers/vfio/Kconfig"
>  
> +source "drivers/vgpu/Kconfig"
> +
>  source "drivers/vlynq/Kconfig"
>  
>  source "drivers/virt/Kconfig"
> diff --git a/drivers/Makefile b/drivers/Makefile
> index 8f5d076..36f1110 100644
> --- a/drivers/Makefile
> +++ b/drivers/Makefile
> @@ -84,6 +84,7 @@ obj-$(CONFIG_FUSION)		+= message/
>  obj-y				+= firewire/
>  obj-$(CONFIG_UIO)		+= uio/
>  obj-$(CONFIG_VFIO)		+= vfio/
> +obj-$(CONFIG_VFIO)		+= vgpu/
>  obj-y				+= cdrom/
>  obj-y				+= auxdisplay/
>  obj-$(CONFIG_PCCARD)		+= pcmcia/
> diff --git a/drivers/vgpu/Kconfig b/drivers/vgpu/Kconfig
> new file mode 100644
> index 0000000..792eb48
> --- /dev/null
> +++ b/drivers/vgpu/Kconfig
> @@ -0,0 +1,21 @@
> +
> +menuconfig VGPU
> +    tristate "VGPU driver framework"
> +    depends on VFIO
> +    select VGPU_VFIO
> +    help
> +        VGPU provides a framework to virtualize GPU without SR-IOV cap
> +        See Documentation/vgpu.txt for more details.
> +
> +        If you don't know what to do here, say N.
> +
> +config VGPU
> +    tristate
> +    depends on VFIO
> +    default n
> +
> +config VGPU_VFIO
> +    tristate
> +    depends on VGPU
> +    default n
> +

This is a little bit convoluted; it seems like everything added in this
patch is vfio agnostic, it doesn't necessarily care what the consumer
is.  That makes me think we should only be adding CONFIG_VGPU here and
it should not depend on CONFIG_VFIO or be enabling CONFIG_VGPU_VFIO.
The middle config entry is also redundant with the first; just move the
default line up to the first and remove the rest.
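
For illustration, the suggestion would collapse the three entries into
something like this (untested sketch):

menuconfig VGPU
    tristate "VGPU driver framework"
    default n
    help
        VGPU provides a framework to virtualize GPU without SR-IOV cap
        See Documentation/vgpu.txt for more details.

        If you don't know what to do here, say N.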

> diff --git a/drivers/vgpu/Makefile b/drivers/vgpu/Makefile
> new file mode 100644
> index 0000000..f5be980
> --- /dev/null
> +++ b/drivers/vgpu/Makefile
> @@ -0,0 +1,4 @@
> +
> +vgpu-y := vgpu-core.o vgpu-sysfs.o vgpu-driver.o
> +
> +obj-$(CONFIG_VGPU)			+= vgpu.o
> diff --git a/drivers/vgpu/vgpu-core.c b/drivers/vgpu/vgpu-core.c
> new file mode 100644
> index 0000000..1a7d274
> --- /dev/null
> +++ b/drivers/vgpu/vgpu-core.c
> @@ -0,0 +1,424 @@
> +/*
> + * VGPU Core Driver
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *     Author: Neo Jia <cjia@nvidia.com>
> + *	       Kirti Wankhede <kwankhede@nvidia.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/init.h>
> +#include <linux/module.h>
> +#include <linux/device.h>
> +#include <linux/kernel.h>
> +#include <linux/fs.h>
> +#include <linux/poll.h>
> +#include <linux/slab.h>
> +#include <linux/cdev.h>
> +#include <linux/sched.h>
> +#include <linux/wait.h>
> +#include <linux/uuid.h>
> +#include <linux/vfio.h>
> +#include <linux/iommu.h>
> +#include <linux/sysfs.h>
> +#include <linux/ctype.h>
> +#include <linux/vgpu.h>
> +
> +#include "vgpu_private.h"
> +
> +#define DRIVER_VERSION	"0.1"
> +#define DRIVER_AUTHOR	"NVIDIA Corporation"
> +#define DRIVER_DESC	"VGPU Core Driver"
> +
> +/*
> + * #defines
> + */
> +
> +#define VGPU_CLASS_NAME		"vgpu"
> +
> +/*
> + * Global Structures
> + */
> +
> +static struct vgpu {
> +	struct list_head    vgpu_devices_list;
> +	struct mutex        vgpu_devices_lock;
> +	struct list_head    gpu_devices_list;
> +	struct mutex        gpu_devices_lock;
> +} vgpu;
> +
> +static struct class vgpu_class;
> +
> +/*
> + * Functions
> + */
> +
> +struct vgpu_device *get_vgpu_device_from_group(struct iommu_group *group)
> +{
> +	struct vgpu_device *vdev = NULL;
> +
> +	mutex_lock(&vgpu.vgpu_devices_lock);
> +	list_for_each_entry(vdev, &vgpu.vgpu_devices_list, list) {
> +		if (vdev->group) {
> +			if (iommu_group_id(vdev->group) == iommu_group_id(group)) {
> +				mutex_unlock(&vgpu.vgpu_devices_lock);
> +				return vdev;
> +			}
> +		}
> +	}
> +	mutex_unlock(&vgpu.vgpu_devices_lock);
> +	return NULL;
> +}
> +
> +EXPORT_SYMBOL_GPL(get_vgpu_device_from_group);
> +
> +static int vgpu_add_attribute_group(struct device *dev,
> +			            const struct attribute_group **groups)
> +{
> +        return sysfs_create_groups(&dev->kobj, groups);
> +}
> +
> +static void vgpu_remove_attribute_group(struct device *dev,
> +			                const struct attribute_group **groups)
> +{
> +        sysfs_remove_groups(&dev->kobj, groups);
> +}
> +
> +int vgpu_register_device(struct pci_dev *dev, const struct gpu_device_ops *ops)

To make the API abundantly clear, how about vgpu_register_gpu_device()
to avoid confusion with a vgpu device?

Why do we care that it's a pci_dev?  It seems like there's only a very
small portion of the API that cares about pci_devs in order to describe
BARs, which could be switched based on the device type.  Otherwise we
could operate on a struct device here.

> +{
> +	int ret = 0;
> +	struct gpu_device *gpu_dev, *tmp;
> +
> +	if (!dev)
> +		return -EINVAL;
> +
> +        gpu_dev = kzalloc(sizeof(*gpu_dev), GFP_KERNEL);
> +        if (!gpu_dev)
> +                return -ENOMEM;
> +
> +	gpu_dev->dev = dev;
> +        gpu_dev->ops = ops;
> +
> +        mutex_lock(&vgpu.gpu_devices_lock);
> +
> +        /* Check for duplicates */
> +        list_for_each_entry(tmp, &vgpu.gpu_devices_list, gpu_next) {
> +                if (tmp->dev == dev) {
> +			ret = -EINVAL;

Maybe -EEXIST here to get a different error value.

> +			goto add_error;
> +                }
> +        }
> +
> +	ret = vgpu_create_pci_device_files(dev);

I don't actually see anything pci specific in that function.

> +	if (ret)
> +		goto add_error;
> +
> +	ret = vgpu_add_attribute_group(&dev->dev, ops->dev_attr_groups);
> +	if (ret)
> +		goto add_group_error;
> +
> +        list_add(&gpu_dev->gpu_next, &vgpu.gpu_devices_list);

Whitespace issues, please run scripts/checkpatch.pl on patches before
posting.
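
For example (patch file name hypothetical):

    $ ./scripts/checkpatch.pl 0001-vGPU-Core-driver.patch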

> +
> +	printk(KERN_INFO "VGPU: Registered dev 0x%x 0x%x, class 0x%x\n",
> +			 dev->vendor, dev->device, dev->class);

This is a place where we're using pci_dev specific fields, but it's not
very useful.  We're registering a specific device, not everything that
matches this set of vendor/device/class, so what is the user supposed
to learn from this?  A dev_info here would give us the name of the
specific device we're registering and be device type agnostic.

> +        mutex_unlock(&vgpu.gpu_devices_lock);
> +
> +        return 0;
> +
> +add_group_error:
> +	vgpu_remove_pci_device_files(dev);
> +add_error:
> +	mutex_unlock(&vgpu.gpu_devices_lock);
> +	kfree(gpu_dev);
> +	return ret;
> +
> +}
> +EXPORT_SYMBOL(vgpu_register_device);
> +
> +void vgpu_unregister_device(struct pci_dev *dev)
> +{
> +        struct gpu_device *gpu_dev;
> +
> +        mutex_lock(&vgpu.gpu_devices_lock);
> +        list_for_each_entry(gpu_dev, &vgpu.gpu_devices_list, gpu_next) {
> +		struct vgpu_device *vdev = NULL;
> +
> +                if (gpu_dev->dev != dev)
> +			continue;
> +
> +		printk(KERN_INFO "VGPU: Unregistered dev 0x%x 0x%x, class 0x%x\n",
> +				dev->vendor, dev->device, dev->class);

Same comments as above for function name, device type, and this print.

> +
> +		list_for_each_entry(vdev, &vgpu.vgpu_devices_list, list) {

How can we walk this list without holding vgpu_devices_lock?

> +			if (vdev->gpu_dev != gpu_dev)
> +				continue;
> +			destroy_vgpu_device(vdev);
> +		}
> +		vgpu_remove_attribute_group(&dev->dev, gpu_dev->ops->dev_attr_groups);
> +		vgpu_remove_pci_device_files(dev);
> +		list_del(&gpu_dev->gpu_next);
> +		mutex_unlock(&vgpu.gpu_devices_lock);

It's often desirable to avoid multiple exit points, especially when
locking is involved, to simplify the code flow.  It would be very easy
to accomplish that here.
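
For illustration, a single-exit shape might look like this (sketch only;
note the vgpu device list walk still needs vgpu_devices_lock, per the
comment above):

void vgpu_unregister_device(struct pci_dev *dev)
{
	struct gpu_device *gpu_dev;

	mutex_lock(&vgpu.gpu_devices_lock);
	list_for_each_entry(gpu_dev, &vgpu.gpu_devices_list, gpu_next) {
		struct vgpu_device *vdev, *tmp;

		if (gpu_dev->dev != dev)
			continue;

		/* destroy may delete list entries, so use the _safe walk */
		list_for_each_entry_safe(vdev, tmp,
					 &vgpu.vgpu_devices_list, list) {
			if (vdev->gpu_dev == gpu_dev)
				destroy_vgpu_device(vdev);
		}
		vgpu_remove_attribute_group(&dev->dev,
					    gpu_dev->ops->dev_attr_groups);
		vgpu_remove_pci_device_files(dev);
		list_del(&gpu_dev->gpu_next);
		kfree(gpu_dev);
		break;
	}
	mutex_unlock(&vgpu.gpu_devices_lock);
}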

> +		kfree(gpu_dev);
> +		return;
> +        }
> +        mutex_unlock(&vgpu.gpu_devices_lock);
> +}
> +EXPORT_SYMBOL(vgpu_unregister_device);
> +
> +/*
> + * Helper Functions
> + */
> +
> +static struct vgpu_device *vgpu_device_alloc(uuid_le uuid, int instance, char *name)
> +{
> +	struct vgpu_device *vgpu_dev = NULL;
> +
> +	vgpu_dev = kzalloc(sizeof(*vgpu_dev), GFP_KERNEL);
> +	if (!vgpu_dev)
> +		return ERR_PTR(-ENOMEM);
> +
> +	kref_init(&vgpu_dev->kref);
> +	memcpy(&vgpu_dev->uuid, &uuid, sizeof(uuid_le));
> +	vgpu_dev->vgpu_instance = instance;
> +	strcpy(vgpu_dev->dev_name, name);
> +
> +	mutex_lock(&vgpu.vgpu_devices_lock);
> +	list_add(&vgpu_dev->list, &vgpu.vgpu_devices_list);
> +	mutex_unlock(&vgpu.vgpu_devices_lock);
> +
> +	return vgpu_dev;
> +}
> +
> +static void vgpu_device_free(struct vgpu_device *vgpu_dev)
> +{
> +	if (vgpu_dev) {
> +		mutex_lock(&vgpu.vgpu_devices_lock);
> +		list_del(&vgpu_dev->list);
> +		mutex_unlock(&vgpu.vgpu_devices_lock);
> +		kfree(vgpu_dev);
> +	}

Why aren't we using the kref to remove and free the vgpu when the last
reference is released?

> +	return;

Unnecessary

> +}
> +
> +struct vgpu_device *vgpu_drv_get_vgpu_device(uuid_le uuid, int instance)
> +{
> +	struct vgpu_device *vdev = NULL;
> +
> +	mutex_lock(&vgpu.vgpu_devices_lock);
> +	list_for_each_entry(vdev, &vgpu.vgpu_devices_list, list) {
> +		if ((uuid_le_cmp(vdev->uuid, uuid) == 0) &&
> +		    (vdev->vgpu_instance == instance)) {
> +			mutex_unlock(&vgpu.vgpu_devices_lock);
> +			return vdev;

We're not taking any sort of reference to the vgpu; what prevents races
with it being removed?  A common exit path would be easy to achieve
here too.
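
A sketch of both suggestions combined (assumes the kref release path is
wired up to actually free the device):

struct vgpu_device *vgpu_drv_get_vgpu_device(uuid_le uuid, int instance)
{
	struct vgpu_device *vdev, *found = NULL;

	mutex_lock(&vgpu.vgpu_devices_lock);
	list_for_each_entry(vdev, &vgpu.vgpu_devices_list, list) {
		if (uuid_le_cmp(vdev->uuid, uuid) == 0 &&
		    vdev->vgpu_instance == instance) {
			kref_get(&vdev->kref);	/* pin against removal */
			found = vdev;
			break;
		}
	}
	mutex_unlock(&vgpu.vgpu_devices_lock);

	return found;	/* caller drops the reference when done */
}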

> +		}
> +	}
> +	mutex_unlock(&vgpu.vgpu_devices_lock);
> +	return NULL;
> +}
> +
> +static void vgpu_device_release(struct device *dev)
> +{
> +	struct vgpu_device *vgpu_dev = to_vgpu_device(dev);
> +	vgpu_device_free(vgpu_dev);
> +}
> +
> +int create_vgpu_device(struct pci_dev *pdev, uuid_le uuid, uint32_t instance, char *vgpu_params)
> +{

I'm not seeing anything here that really cares if the host gpu is a
struct device vs pci_dev either.

> +	char name[64];
> +	int numChar = 0;
> +	int retval = 0;
> +	struct vgpu_device *vgpu_dev = NULL;
> +	struct gpu_device *gpu_dev;
> +
> +	printk(KERN_INFO "VGPU: %s: device ", __FUNCTION__);

pr_info() would be preferred, but this seems like leftover debug and
should be removed.

> +
> +	numChar = sprintf(name, "%pUb-%d", uuid.b, instance);

Use snprintf even though this shouldn't be able to overflow.
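
I.e., something like:

	numChar = snprintf(name, sizeof(name), "%pUb-%d", uuid.b, instance);

(The explicit NUL-termination on the following line is also redundant,
since sprintf/snprintf already terminate the string.)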

> +	name[numChar] = '\0';
> +
> +	vgpu_dev = vgpu_device_alloc(uuid, instance, name);
> +	if (IS_ERR(vgpu_dev)) {
> +		return PTR_ERR(vgpu_dev);
> +	}
> +
> +	vgpu_dev->dev.parent  = &pdev->dev;
> +	vgpu_dev->dev.bus     = &vgpu_bus_type;
> +	vgpu_dev->dev.release = vgpu_device_release;
> +	dev_set_name(&vgpu_dev->dev, "%s", name);
> +
> +	retval = device_register(&vgpu_dev->dev);
> +	if (retval)
> +		goto create_failed1;
> +
> +	printk(KERN_INFO "UUID %pUb \n", vgpu_dev->uuid.b);

This also looks like debug.

> +
> +	mutex_lock(&vgpu.gpu_devices_lock);
> +	list_for_each_entry(gpu_dev, &vgpu.gpu_devices_list, gpu_next) {
> +		if (gpu_dev->dev != pdev)
> +			continue;
> +
> +		vgpu_dev->gpu_dev = gpu_dev;
> +		if (gpu_dev->ops->vgpu_create) {
> +			retval = gpu_dev->ops->vgpu_create(pdev, vgpu_dev->uuid,
> +							   instance, vgpu_params);
> +			if (retval) {
> +				mutex_unlock(&vgpu.gpu_devices_lock);
> +				goto create_failed2;
> +			}
> +		}
> +		break;
> +	}
> +	if (!vgpu_dev->gpu_dev) {
> +		retval = -EINVAL;
> +		mutex_unlock(&vgpu.gpu_devices_lock);
> +		goto create_failed2;
> +	}
> +
> +	mutex_unlock(&vgpu.gpu_devices_lock);
> +
> +	retval = vgpu_add_attribute_group(&vgpu_dev->dev, gpu_dev->ops->vgpu_attr_groups);
> +	if (retval)
> +		goto create_attr_error;
> +
> +	return retval;
> +
> +create_attr_error:
> +	if (gpu_dev->ops->vgpu_destroy) {
> +		int ret = 0;
> +		ret = gpu_dev->ops->vgpu_destroy(gpu_dev->dev,
> +						 vgpu_dev->uuid,
> +						 vgpu_dev->vgpu_instance);

Unnecessary initialization, and we don't do anything with the result.
Below, the lack of a vgpu_destroy callback indicates the vendor doesn't
support unplug, but doesn't that break our error cleanup path here?

> +	}
> +
> +create_failed2:
> +	device_unregister(&vgpu_dev->dev);
> +
> +create_failed1:
> +	vgpu_device_free(vgpu_dev);
> +
> +	return retval;
> +}
> +
> +void destroy_vgpu_device(struct vgpu_device *vgpu_dev)
> +{
> +	struct gpu_device *gpu_dev = vgpu_dev->gpu_dev;
> +
> +	printk(KERN_INFO "VGPU: destroying device %s ", vgpu_dev->dev_name);

dev_info()

> +	if (gpu_dev->ops->vgpu_destroy) {
> +		int retval = 0;

Unnecessary initialization, in fact this entire variable is unnecessary.

> +		retval = gpu_dev->ops->vgpu_destroy(gpu_dev->dev,
> +						    vgpu_dev->uuid,
> +						    vgpu_dev->vgpu_instance);
> +	/* if vendor driver doesn't return success that means vendor driver doesn't
> +	 * support hot-unplug */
> +		if (retval)
> +			return;

Should we return an error code then?  Inconsistent comment style.

> +	}
> +
> +	vgpu_remove_attribute_group(&vgpu_dev->dev, gpu_dev->ops->vgpu_attr_groups);
> +	device_unregister(&vgpu_dev->dev);
> +}
> +
> +void get_vgpu_supported_types(struct device *dev, char *str)
> +{
> +	struct gpu_device *gpu_dev;
> +
> +	mutex_lock(&vgpu.gpu_devices_lock);
> +	list_for_each_entry(gpu_dev, &vgpu.gpu_devices_list, gpu_next) {
> +		if (&gpu_dev->dev->dev == dev) {
> +			if (gpu_dev->ops->vgpu_supported_config)
> +				gpu_dev->ops->vgpu_supported_config(gpu_dev->dev, str);
> +			break;
> +		}
> +	}
> +	mutex_unlock(&vgpu.gpu_devices_lock);
> +}
> +
> +int vgpu_start_callback(struct vgpu_device *vgpu_dev)
> +{
> +	int ret = 0;
> +	struct gpu_device *gpu_dev = vgpu_dev->gpu_dev;
> +
> +	mutex_lock(&vgpu.gpu_devices_lock);
> +	if (gpu_dev->ops->vgpu_start)
> +		ret = gpu_dev->ops->vgpu_start(vgpu_dev->uuid);
> +	mutex_unlock(&vgpu.gpu_devices_lock);
> +	return ret;
> +}
> +
> +int vgpu_shutdown_callback(struct vgpu_device *vgpu_dev)
> +{
> +	int ret = 0;
> +	struct gpu_device *gpu_dev = vgpu_dev->gpu_dev;
> +
> +	mutex_lock(&vgpu.gpu_devices_lock);
> +	if (gpu_dev->ops->vgpu_shutdown)
> +		ret = gpu_dev->ops->vgpu_shutdown(vgpu_dev->uuid);
> +	mutex_unlock(&vgpu.gpu_devices_lock);
> +	return ret;
> +}
> +
> +char *vgpu_devnode(struct device *dev, umode_t *mode)
> +{
> +	return kasprintf(GFP_KERNEL, "vgpu/%s", dev_name(dev));
> +}
> +
> +static void release_vgpubus_dev(struct device *dev)
> +{
> +	struct vgpu_device *vgpu_dev = to_vgpu_device(dev);
> +	destroy_vgpu_device(vgpu_dev);
> +}
> +
> +static struct class vgpu_class = {
> +	.name		= VGPU_CLASS_NAME,
> +	.owner		= THIS_MODULE,
> +	.class_attrs	= vgpu_class_attrs,
> +	.dev_groups	= vgpu_dev_groups,
> +	.devnode	= vgpu_devnode,
> +	.dev_release    = release_vgpubus_dev,
> +};
> +
> +static int __init vgpu_init(void)
> +{
> +	int rc = 0;
> +
> +	memset(&vgpu, 0 , sizeof(vgpu));

Unnecessary, this is declared in the bss and zero initialized.

> +
> +	mutex_init(&vgpu.vgpu_devices_lock);
> +	INIT_LIST_HEAD(&vgpu.vgpu_devices_list);
> +	mutex_init(&vgpu.gpu_devices_lock);
> +	INIT_LIST_HEAD(&vgpu.gpu_devices_list);
> +
> +	rc = class_register(&vgpu_class);
> +	if (rc < 0) {
> +		printk(KERN_ERR "Error: failed to register vgpu class\n");

pr_err()

> +		goto failed1;
> +	}
> +
> +	rc = vgpu_bus_register();
> +	if (rc < 0) {
> +		printk(KERN_ERR "Error: failed to register vgpu bus\n");
> +		class_unregister(&vgpu_class);
> +	}
> +
> +    request_module_nowait("vgpu_vfio");
> +
> +failed1:
> +	return rc;

While common exit points are good, if there's no cleanup and no
locking, why do we need failed1?

> +}
> +
> +static void __exit vgpu_exit(void)
> +{
> +	vgpu_bus_unregister();
> +	class_unregister(&vgpu_class);
> +}
> +
> +module_init(vgpu_init)
> +module_exit(vgpu_exit)
> +
> +MODULE_VERSION(DRIVER_VERSION);
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR(DRIVER_AUTHOR);
> +MODULE_DESCRIPTION(DRIVER_DESC);
> diff --git a/drivers/vgpu/vgpu-driver.c b/drivers/vgpu/vgpu-driver.c
> new file mode 100644
> index 0000000..c4c2e9f
> --- /dev/null
> +++ b/drivers/vgpu/vgpu-driver.c
> @@ -0,0 +1,136 @@
> +/*
> + * VGPU driver
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *     Author: Neo Jia <cjia@nvidia.com>
> + *	       Kirti Wankhede <kwankhede@nvidia.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/init.h>
> +#include <linux/module.h>
> +#include <linux/device.h>
> +#include <linux/kernel.h>
> +#include <linux/fs.h>

I don't see any vfio here, or fs or sysfs or ctype.

> +#include <linux/vfio.h>
> +#include <linux/iommu.h>
> +#include <linux/sysfs.h>
> +#include <linux/ctype.h>
> +#include <linux/vgpu.h>
> +
> +#include "vgpu_private.h"
> +
> +static int vgpu_device_attach_iommu(struct vgpu_device *vgpu_dev)
> +{
> +        int retval = 0;
> +        struct iommu_group *group = NULL;
> +
> +        group = iommu_group_alloc();
> +        if (IS_ERR(group)) {
> +                printk(KERN_ERR "VGPU: failed to allocate group!\n");
> +                return PTR_ERR(group);
> +        }
> +
> +        retval = iommu_group_add_device(group, &vgpu_dev->dev);
> +        if (retval) {
> +                printk(KERN_ERR "VGPU: failed to add dev to group!\n");

dev_err()

> +                iommu_group_put(group);

The iommu group should be put regardless of error; the device holds a
reference to the group to keep it around.
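
I.e., roughly (sketch):

static int vgpu_device_attach_iommu(struct vgpu_device *vgpu_dev)
{
	struct iommu_group *group;
	int retval;

	group = iommu_group_alloc();
	if (IS_ERR(group))
		return PTR_ERR(group);

	retval = iommu_group_add_device(group, &vgpu_dev->dev);
	if (!retval)
		vgpu_dev->group = group;

	/* the device member now holds its own group reference, drop ours */
	iommu_group_put(group);
	return retval;
}

The matching detach then reduces to iommu_group_remove_device(), per the
next comment below.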

> +                return retval;
> +        }
> +
> +        vgpu_dev->group = group;
> +
> +        printk(KERN_INFO "VGPU: group_id = %d \n", iommu_group_id(group));
> +        return retval;
> +}
> +
> +static void vgpu_device_detach_iommu(struct vgpu_device *vgpu_dev)
> +{
> +        iommu_group_put(vgpu_dev->dev.iommu_group);
> +        iommu_group_remove_device(&vgpu_dev->dev);

Only the iommu_group_remove_device() should be needed here; the group
reference should have been released above, otherwise we're double
incrementing and double decrementing.

> +        printk(KERN_INFO "VGPU: detaching iommu \n");

debug.

> +}
> +
> +static int vgpu_device_probe(struct device *dev)
> +{
> +	struct vgpu_driver *drv = to_vgpu_driver(dev->driver);
> +	struct vgpu_device *vgpu_dev = to_vgpu_device(dev);
> +	int status = 0;
> +
> +	status = vgpu_device_attach_iommu(vgpu_dev);
> +	if (status) {
> +		printk(KERN_ERR "Failed to attach IOMMU\n");
> +		return status;
> +	}
> +
> +	if (drv && drv->probe) {
> +		status = drv->probe(dev);
> +	}
> +
> +	return status;
> +}
> +
> +static int vgpu_device_remove(struct device *dev)
> +{
> +	struct vgpu_driver *drv = to_vgpu_driver(dev->driver);
> +	struct vgpu_device *vgpu_dev = to_vgpu_device(dev);
> +	int status = 0;
> +
> +	if (drv && drv->remove) {
> +		drv->remove(dev);
> +	}
> +
> +	vgpu_device_detach_iommu(vgpu_dev);
> +
> +	return status;

return 0;  Or make this void.  .remove functions often return void, for
better or worse.

> +}
> +
> +struct bus_type vgpu_bus_type = {
> +	.name		= "vgpu",
> +	.probe		= vgpu_device_probe,
> +	.remove		= vgpu_device_remove,
> +};
> +EXPORT_SYMBOL_GPL(vgpu_bus_type);
> +
> +/**
> + * vgpu_register_driver - register a new vGPU driver
> + * @drv: the driver to register
> + * @owner: owner module of driver ro register
> + *
> + * Returns a negative value on error, otherwise 0.
> + */
> +int vgpu_register_driver(struct vgpu_driver *drv, struct module *owner)
> +{
> +	/* initialize common driver fields */
> +	drv->driver.name = drv->name;
> +	drv->driver.bus = &vgpu_bus_type;
> +	drv->driver.owner = owner;
> +
> +	/* register with core */
> +	return driver_register(&drv->driver);
> +}
> +EXPORT_SYMBOL(vgpu_register_driver);
> +
> +/**
> + * vgpu_unregister_driver - unregister vGPU driver
> + * @drv: the driver to unregister
> + *
> + */
> +void vgpu_unregister_driver(struct vgpu_driver *drv)
> +{
> +	driver_unregister(&drv->driver);
> +}
> +EXPORT_SYMBOL(vgpu_unregister_driver);
> +
> +int vgpu_bus_register(void)
> +{
> +	return bus_register(&vgpu_bus_type);
> +}
> +
> +void vgpu_bus_unregister(void)
> +{
> +	bus_unregister(&vgpu_bus_type);
> +}
> diff --git a/drivers/vgpu/vgpu-sysfs.c b/drivers/vgpu/vgpu-sysfs.c
> new file mode 100644
> index 0000000..b740f9a
> --- /dev/null
> +++ b/drivers/vgpu/vgpu-sysfs.c
> @@ -0,0 +1,365 @@
> +/*
> + * File attributes for vGPU devices
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *     Author: Neo Jia <cjia@nvidia.com>
> + *	       Kirti Wankhede <kwankhede@nvidia.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/sched.h>
> +#include <linux/fs.h>
> +#include <linux/sysfs.h>
> +#include <linux/ctype.h>
> +#include <linux/uuid.h>
> +#include <linux/vfio.h>

No vfio, fs, or ctype here either

> +#include <linux/vgpu.h>
> +
> +#include "vgpu_private.h"
> +
> +/* Prototypes */
> +
> +static ssize_t vgpu_supported_types_show(struct device *dev,
> +					 struct device_attribute *attr,
> +					 char *buf);
> +static DEVICE_ATTR_RO(vgpu_supported_types);
> +
> +static ssize_t vgpu_create_store(struct device *dev,
> +				 struct device_attribute *attr,
> +				 const char *buf, size_t count);
> +static DEVICE_ATTR_WO(vgpu_create);
> +
> +static ssize_t vgpu_destroy_store(struct device *dev,
> +				  struct device_attribute *attr,
> +				  const char *buf, size_t count);
> +static DEVICE_ATTR_WO(vgpu_destroy);
> +
> +
> +/* Static functions */
> +
> +static bool is_uuid_sep(char sep)
> +{
> +	if (sep == '\n' || sep == '-' || sep == ':' || sep == '\0')
> +		return true;
> +	return false;
> +}
> +
> +
> +static int uuid_parse(const char *str, uuid_le *uuid)
> +{
> +	int i;
> +
> +	if (strlen(str) < 36)
> +		return -1;
> +
> +	for (i = 0; i < 16; i++) {
> +		if (!isxdigit(str[0]) || !isxdigit(str[1])) {
> +			printk(KERN_ERR "%s err", __FUNCTION__);
> +			return -EINVAL;
> +		}
> +
> +		uuid->b[i] = (hex_to_bin(str[0]) << 4) | hex_to_bin(str[1]);
> +		str += 2;
> +		if (is_uuid_sep(*str))
> +			str++;
> +	}
> +
> +	return 0;
> +}
> +
> +
> +/* Functions */
> +static ssize_t vgpu_supported_types_show(struct device *dev,
> +					 struct device_attribute *attr,
> +					 char *buf)
> +{
> +	char *str;
> +	ssize_t n;
> +
> +        str = kzalloc(sizeof(*str) * 512, GFP_KERNEL);

Arbitrary size limit?  Do we even need a separate buffer?
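
E.g., a sketch that writes straight into the sysfs buffer (assuming the
vendor callback NUL-terminates and stays within PAGE_SIZE):

static ssize_t vgpu_supported_types_show(struct device *dev,
					 struct device_attribute *attr,
					 char *buf)
{
	buf[0] = '\0';
	get_vgpu_supported_types(dev, buf);

	return strlen(buf);
}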

> +        if (!str)
> +                return -ENOMEM;
> +
> +	get_vgpu_supported_types(dev, str);
> +
> +	n = sprintf(buf,"%s\n", str);
> +	kfree(str);
> +
> +	return n;
> +}
> +
> +static ssize_t vgpu_create_store(struct device *dev,
> +				 struct device_attribute *attr,
> +				 const char *buf, size_t count)
> +{
> +	char *str, *pstr;
> +	char *uuid_str, *instance_str, *vgpu_params = NULL;
> +	uuid_le uuid;
> +	uint32_t instance;
> +	struct pci_dev *pdev;
> +	int ret = 0;
> +
> +	pstr = str = kstrndup(buf, count, GFP_KERNEL);
> +
> +	if (!str)
> +		return -ENOMEM;
> +
> +	if ((uuid_str = strsep(&str, ":")) == NULL) {
> +		printk(KERN_ERR "%s Empty UUID or string %s \n",
> +				 __FUNCTION__, buf);

pr_err() for all these.

> +		ret = -EINVAL;
> +		goto create_error;
> +	}
> +
> +	if (!str) {
> +		printk(KERN_ERR "%s vgpu instance not specified %s \n",
> +				 __FUNCTION__, buf);
> +		ret = -EINVAL;
> +		goto create_error;
> +	}
> +
> +	if ((instance_str = strsep(&str, ":")) == NULL) {
> +		printk(KERN_ERR "%s Empty instance or string %s \n",
> +				 __FUNCTION__, buf);
> +		ret = -EINVAL;
> +		goto create_error;
> +	}
> +
> +	instance = (unsigned int)simple_strtoul(instance_str, NULL, 0);
> +
> +	if (!str) {
> +		printk(KERN_ERR "%s vgpu params not specified %s \n",
> +				 __FUNCTION__, buf);
> +		ret = -EINVAL;
> +		goto create_error;
> +	}
> +
> +	vgpu_params = kstrdup(str, GFP_KERNEL);
> +
> +	if (!vgpu_params) {
> +		printk(KERN_ERR "%s vgpu params allocation failed \n",
> +				 __FUNCTION__);
> +		ret = -EINVAL;
> +		goto create_error;
> +	}
> +
> +	if (uuid_parse(uuid_str, &uuid) < 0) {
> +		printk(KERN_ERR "%s UUID parse error  %s \n", __FUNCTION__, buf);
> +		ret = -EINVAL;
> +		goto create_error;
> +	}
> +
> +	if (dev_is_pci(dev)) {
> +		pdev = to_pci_dev(dev);
> +
> +		if (create_vgpu_device(pdev, uuid, instance, vgpu_params) < 0) {

Why do we care?  I still haven't seen anything that requires the gpu to
be a pci device.

> +			printk(KERN_ERR "%s vgpu create error \n", __FUNCTION__);
> +			ret = -EINVAL;
> +			goto create_error;
> +		}
> +		ret = count;
> +	}
> +
> +create_error:
> +	if (vgpu_params)
> +		kfree(vgpu_params);
> +
> +	if (pstr)
> +		kfree(pstr);

kfree(NULL) does the right thing.

> +	return ret;
> +}
> +
> +static ssize_t vgpu_destroy_store(struct device *dev,
> +				  struct device_attribute *attr,
> +				  const char *buf, size_t count)
> +{
> +	char *uuid_str, *str;
> +	uuid_le uuid;
> +	unsigned int instance;
> +	struct vgpu_device *vgpu_dev = NULL;
> +
> +	str = kstrndup(buf, count, GFP_KERNEL);
> +
> +	if (!str)
> +		return -ENOMEM;
> +
> +	if ((uuid_str = strsep(&str, ":")) == NULL) {
> +		printk(KERN_ERR "%s Empty UUID or string %s \n", __FUNCTION__, buf);
> +		return -EINVAL;
> +	}
> +
> +	if (str == NULL) {
> +		printk(KERN_ERR "%s instance not specified %s \n", __FUNCTION__, buf);
> +		return -EINVAL;
> +	}
> +
> +	instance = (unsigned int)simple_strtoul(str, NULL, 0);
> +
> +	if (uuid_parse(uuid_str, &uuid) < 0) {
> +		printk(KERN_ERR "%s UUID parse error  %s \n", __FUNCTION__, buf);
> +		return -EINVAL;
> +	}
> +
> +	printk(KERN_INFO "%s UUID %pUb - %d \n", __FUNCTION__, uuid.b, instance);
> +
> +	vgpu_dev = vgpu_drv_get_vgpu_device(uuid, instance);

Since we have no reference counting, all we need to do to crash this is
race against this destroy sysfs entry.

> +
> +	if (vgpu_dev)
> +		destroy_vgpu_device(vgpu_dev);
> +
> +	return count;

An error if not found might be nice.

> +}
> +
> +static ssize_t
> +vgpu_uuid_show(struct device *dev, struct device_attribute *attr, char *buf)
> +{
> +	struct vgpu_device *drv = to_vgpu_device(dev);
> +
> +	if (drv)
> +		return sprintf(buf, "%pUb \n", drv->uuid.b);
> +
> +	return sprintf(buf, " \n");
> +}
> +
> +static DEVICE_ATTR_RO(vgpu_uuid);
> +
> +static ssize_t
> +vgpu_group_id_show(struct device *dev, struct device_attribute *attr, char *buf)
> +{
> +	struct vgpu_device *drv = to_vgpu_device(dev);
> +
> +	if (drv && drv->group)
> +		return sprintf(buf, "%d \n", iommu_group_id(drv->group));
> +
> +	return sprintf(buf, " \n");

There should be an iommu_group link from the device to the group in
sysfs, otherwise this is inconsistent with real devices.

> +}
> +
> +static DEVICE_ATTR_RO(vgpu_group_id);
> +
> +
> +static struct attribute *vgpu_dev_attrs[] = {
> +	&dev_attr_vgpu_uuid.attr,
> +	&dev_attr_vgpu_group_id.attr,
> +	NULL,
> +};
> +
> +static const struct attribute_group vgpu_dev_group = {
> +	.attrs = vgpu_dev_attrs,
> +};
> +
> +const struct attribute_group *vgpu_dev_groups[] = {
> +	&vgpu_dev_group,
> +	NULL,
> +};
> +
> +
> +ssize_t vgpu_start_store(struct class *class, struct class_attribute *attr,
> +			 const char *buf, size_t count)
> +{
> +	char *uuid_str;
> +	uuid_le uuid;
> +	struct vgpu_device *vgpu_dev = NULL;
> +	int ret;
> +
> +	uuid_str = kstrndup(buf, count, GFP_KERNEL);
> +
> +	if (!uuid_str)
> +		return -ENOMEM;
> +
> +	if (uuid_parse(uuid_str, &uuid) < 0) {
> +		printk(KERN_ERR "%s UUID parse error  %s \n", __FUNCTION__, buf);
> +		return -EINVAL;
> +	}
> +
> +	vgpu_dev = vgpu_drv_get_vgpu_device(uuid, 0);

No reference counting, so we hope nobody destroys the device while we
have it...

> +
> +	if (vgpu_dev && dev_is_vgpu(&vgpu_dev->dev)) {
> +		kobject_uevent(&vgpu_dev->dev.kobj, KOBJ_ONLINE);
> +
> +		ret = vgpu_start_callback(vgpu_dev);
> +		if (ret < 0) {
> +			printk(KERN_ERR "%s vgpu_start callback failed  %d \n",
> +					 __FUNCTION__, ret);
> +			return ret;
> +		}
> +	}
> +
> +	return count;
> +}
> +
> +ssize_t vgpu_shutdown_store(struct class *class, struct class_attribute *attr,
> +			    const char *buf, size_t count)
> +{
> +	char *uuid_str;
> +	uuid_le uuid;
> +	struct vgpu_device *vgpu_dev = NULL;
> +	int ret;
> +
> +	uuid_str = kstrndup(buf, count, GFP_KERNEL);
> +
> +	if (!uuid_str)
> +		return -ENOMEM;
> +
> +	if (uuid_parse(uuid_str, &uuid) < 0) {
> +		printk(KERN_ERR "%s UUID parse error  %s \n", __FUNCTION__, buf);
> +		return -EINVAL;
> +	}
> +	vgpu_dev = vgpu_drv_get_vgpu_device(uuid, 0);
> +
> +	if (vgpu_dev && dev_is_vgpu(&vgpu_dev->dev)) {
> +		kobject_uevent(&vgpu_dev->dev.kobj, KOBJ_OFFLINE);
> +
> +		ret = vgpu_shutdown_callback(vgpu_dev);
> +		if (ret < 0) {
> +			printk(KERN_ERR "%s vgpu_shutdown callback failed  %d \n",
> +					 __FUNCTION__, ret);
> +			return ret;
> +		}
> +	}
> +
> +	return count;
> +}
> +
> +struct class_attribute vgpu_class_attrs[] = {
> +	__ATTR_WO(vgpu_start),
> +	__ATTR_WO(vgpu_shutdown),
> +	__ATTR_NULL
> +};
> +
> +int vgpu_create_pci_device_files(struct pci_dev *dev)

What's pci specific about this?

> +{
> +	int retval;
> +
> +	retval = sysfs_create_file(&dev->dev.kobj,
> +				   &dev_attr_vgpu_supported_types.attr);
> +	if (retval) {
> +		printk(KERN_ERR "VGPU-VFIO: failed to create vgpu_supported_types sysfs entry\n");
> +		return retval;
> +	}
> +
> +	retval = sysfs_create_file(&dev->dev.kobj, &dev_attr_vgpu_create.attr);
> +	if (retval) {
> +		printk(KERN_ERR "VGPU-VFIO: failed to create vgpu_create sysfs entry\n");
> +		return retval;
> +	}
> +
> +	retval = sysfs_create_file(&dev->dev.kobj, &dev_attr_vgpu_destroy.attr);
> +	if (retval) {
> +		printk(KERN_ERR "VGPU-VFIO: failed to create vgpu_destroy sysfs entry\n");
> +		return retval;
> +	}
> +
> +	return 0;
> +}
> +
> +
> +void vgpu_remove_pci_device_files(struct pci_dev *dev)

Or this?

> +{
> +	sysfs_remove_file(&dev->dev.kobj, &dev_attr_vgpu_supported_types.attr);
> +	sysfs_remove_file(&dev->dev.kobj, &dev_attr_vgpu_create.attr);
> +	sysfs_remove_file(&dev->dev.kobj, &dev_attr_vgpu_destroy.attr);
> +}
> diff --git a/drivers/vgpu/vgpu_private.h b/drivers/vgpu/vgpu_private.h
> new file mode 100644
> index 0000000..35158ef
> --- /dev/null
> +++ b/drivers/vgpu/vgpu_private.h
> @@ -0,0 +1,36 @@
> +/*
> + * VGPU internal definition
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *     Author:
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#ifndef VGPU_PRIVATE_H
> +#define VGPU_PRIVATE_H
> +
> +struct vgpu_device *vgpu_drv_get_vgpu_device(uuid_le uuid, int instance);
> +
> +int  create_vgpu_device(struct pci_dev *pdev, uuid_le uuid, uint32_t instance,
> +		       char *vgpu_params);
> +void destroy_vgpu_device(struct vgpu_device *vgpu_dev);
> +
> +int  vgpu_bus_register(void);
> +void vgpu_bus_unregister(void);
> +
> +/* Function prototypes for vgpu_sysfs */
> +
> +extern struct class_attribute vgpu_class_attrs[];
> +extern const struct attribute_group *vgpu_dev_groups[];
> +
> +int  vgpu_create_pci_device_files(struct pci_dev *dev);
> +void vgpu_remove_pci_device_files(struct pci_dev *dev);
> +
> +void get_vgpu_supported_types(struct device *dev, char *str);
> +int  vgpu_start_callback(struct vgpu_device *vgpu_dev);
> +int  vgpu_shutdown_callback(struct vgpu_device *vgpu_dev);
> +
> +#endif /* VGPU_PRIVATE_H */
> diff --git a/include/linux/vgpu.h b/include/linux/vgpu.h
> new file mode 100644
> index 0000000..03a77cf
> --- /dev/null
> +++ b/include/linux/vgpu.h
> @@ -0,0 +1,216 @@
> +/*
> + * VGPU definition
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *     Author:
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#ifndef VGPU_H
> +#define VGPU_H
> +
> +// Common Data structures
> +
> +struct pci_bar_info {
> +	uint64_t start;
> +	uint64_t size;
> +	uint32_t flags;
> +};
> +
> +enum vgpu_emul_space_e {
> +	vgpu_emul_space_config = 0, /*!< PCI configuration space */
> +	vgpu_emul_space_io = 1,     /*!< I/O register space */
> +	vgpu_emul_space_mmio = 2    /*!< Memory-mapped I/O space */
> +};

Actual PCI specific stuff, but should it be in the vgpu core or where
it's actually used?

> +
> +struct gpu_device;
> +
> +/*
> + * VGPU device
> + */
> +struct vgpu_device {
> +	struct kref		kref;

This should really be used more for reference counting.

> +	struct device		dev;
> +	struct gpu_device	*gpu_dev;

And the vgpu_device really should hold a reference to the gpu_device.

> +	struct iommu_group	*group;

Like it does for the iommu_group.

> +#define DEVICE_NAME_LEN		(64)
> +	char			dev_name[DEVICE_NAME_LEN];
> +	uuid_le			uuid;
> +	uint32_t		vgpu_instance;

prefixing vgpu_ on vgpu_device fields seems redundant, and inconsistent
since it's not vgpu_uuid.

> +	struct device_attribute	*dev_attr_vgpu_status;
> +	int			vgpu_device_status;
> +
> +	void			*driver_data;
> +
> +	struct list_head	list;
> +};
> +
> +
> +/**
> + * struct gpu_device_ops - Structure to be registered for each physical GPU to
> + * register the device with the vgpu module.
> + *
> + * @owner:			The module owner.
> + * @dev_attr_groups:		Default attributes of the physical device.
> + * @vgpu_attr_groups:		Default attributes of the vGPU device.
> + * @vgpu_supported_config:	Called to get information about supported vgpu types.
> + *				@dev: pci device structure of the physical GPU.
> + *				@config: should return a string listing the supported configs
> + *				Returns integer: success (0) or error (< 0)
> + * @vgpu_create:		Called to allocate basic resources in the graphics
> + *				driver for a particular vgpu.
> + *				@dev: physical pci device structure on which the vgpu
> + *				      should be created
> + *				@uuid: UUID of the VM for which it is intended
> + *				@instance: vgpu instance in that VM
> + *				@vgpu_params: extra parameters required by the GPU driver.
> + *				Returns integer: success (0) or error (< 0)
> + * @vgpu_destroy:		Called to free resources in the graphics driver for
> + *				a vgpu instance of that VM.
> + *				@dev: physical pci device structure to which
> + *				this vgpu points.
> + *				@uuid: UUID of the VM to which the vgpu belongs.
> + *				@instance: vgpu instance in that VM
> + *				Returns integer: success (0) or error (< 0)
> + *				If the VM is running and vgpu_destroy is called, that
> + *				means the vGPU is being hot-unplugged. Return an error
> + *				if the VM is running and the graphics driver doesn't
> + *				support vgpu hotplug.
> + * @vgpu_start:			Called to initiate the vGPU initialization
> + *				process in the graphics driver when the VM boots,
> + *				before QEMU starts.
> + *				@uuid: UUID of the VM which is booting.
> + *				Returns integer: success (0) or error (< 0)
> + * @vgpu_shutdown:		Called to tear down vGPU-related resources for
> + *				the VM.
> + *				@uuid: UUID of the VM which is shutting down.
> + *				Returns integer: success (0) or error (< 0)
> + * @read:			Read emulation callback
> + *				@vdev: vgpu device structure
> + *				@buf: read buffer
> + *				@count: number of bytes to read
> + *				@address_space: specifies for which address space
> + *				the request is: pci_config_space, IO register
> + *				space or MMIO space.
> + *				@pos: offset from the base address.
> + *				Returns number of bytes read on success, or an error.
> + * @write:			Write emulation callback
> + *				@vdev: vgpu device structure
> + *				@buf: write buffer
> + *				@count: number of bytes to be written
> + *				@address_space: specifies for which address space
> + *				the request is: pci_config_space, IO register
> + *				space or MMIO space.
> + *				@pos: offset from the base address.
> + *				Returns number of bytes written on success, or an error.

How do these support multiple MMIO spaces or IO port spaces?  GPUs, and
therefore I assume vGPUs, often have more than one MMIO space, how
does the enum above tell us which one?  We could simply make this be a
region index.
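
E.g., the callbacks could take a region index instead (sketch):

	ssize_t (*read) (struct vgpu_device *vdev, char *buf, size_t count,
			 unsigned int region_index, loff_t pos);
	ssize_t (*write)(struct vgpu_device *vdev, char *buf, size_t count,
			 unsigned int region_index, loff_t pos);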

> + * @vgpu_set_irqs:		Called to convey the interrupt configuration
> + *				information that QEMU set.
> + *				@vdev: vgpu device structure
> + *				@flags, index, start, count and *data : same as
> + *				that of struct vfio_irq_set of
> + *				VFIO_DEVICE_SET_IRQS API.

How do we learn about the supported interrupt types?  Should this be
called vgpu_vfio_set_irqs if it's following the vfio API?

> + * @vgpu_bar_info:		Called to get BAR size and flags of vGPU device.
> + *				@vdev: vgpu device structure
> + *				@bar_index: BAR index
> + *				@bar_info: output, returns size and flags of
> + *				requested BAR
> + *				Returns integer: success (0) or error (< 0)

This is called bar_info, but the bar_index is actually the vfio region
index and things like the config region info is being overloaded
through it.  We already have a structure defined for getting a generic
region index, why not use it?  Maybe this should just be
vgpu_vfio_get_region_info.
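
I.e., reusing the existing vfio UAPI structure, something like (sketch):

	int (*vgpu_vfio_get_region_info)(struct vgpu_device *vdev,
					 struct vfio_region_info *info);

where the caller fills in info->index and the vendor driver returns the
region's size, offset, and flags.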

> + * @validate_map_request:	Validate remap pfn request
> + *				@vdev: vgpu device structure
> + *				@virtaddr: target user address to start at
> + *				@pfn: physical address of kernel memory, GPU
> + *				driver can change if required.
> + *				@size: size of map area, GPU driver can change
> + *				the size of map area if desired.
> + *				@prot: page protection flags for this mapping,
> + *				GPU driver can change, if required.
> + *				Returns integer: success (0) or error (< 0)

It was not at all clear to me what this did until I got to patch 2; this
is actually providing the fault handling for mmap'ing a vGPU mmio BAR.
It needs a better name or a better description.

> + *
> + * Physical GPU that support vGPU should be register with vgpu module with
> + * gpu_device_ops structure.
> + */
> +
> +struct gpu_device_ops {
> +	struct module   *owner;
> +	const struct attribute_group **dev_attr_groups;
> +	const struct attribute_group **vgpu_attr_groups;
> +
> +	int	(*vgpu_supported_config)(struct pci_dev *dev, char *config);
> +	int     (*vgpu_create)(struct pci_dev *dev, uuid_le uuid,
> +			       uint32_t instance, char *vgpu_params);
> +	int     (*vgpu_destroy)(struct pci_dev *dev, uuid_le uuid,
> +			        uint32_t instance);
> +
> +	int     (*vgpu_start)(uuid_le uuid);
> +	int     (*vgpu_shutdown)(uuid_le uuid);
> +
> +	ssize_t (*read) (struct vgpu_device *vdev, char *buf, size_t count,
> +			 uint32_t address_space, loff_t pos);
> +	ssize_t (*write)(struct vgpu_device *vdev, char *buf, size_t count,
> +			 uint32_t address_space, loff_t pos);

Aren't these really 'enum vgpu_emul_space_e', not uint32_t?

> +	int     (*vgpu_set_irqs)(struct vgpu_device *vdev, uint32_t flags,
> +				 unsigned index, unsigned start, unsigned count,
> +				 void *data);
> +	int	(*vgpu_bar_info)(struct vgpu_device *vdev, int bar_index,
> +				 struct pci_bar_info *bar_info);
> +	int	(*validate_map_request)(struct vgpu_device *vdev,
> +					unsigned long virtaddr,
> +					unsigned long *pfn, unsigned long *size,
> +					pgprot_t *prot);
> +};
> +
> +/*
> + * Physical GPU
> + */
> +struct gpu_device {
> +	struct pci_dev                  *dev;
> +	const struct gpu_device_ops     *ops;
> +	struct list_head                gpu_next;
> +};
> +
> +/**
> + * struct vgpu_driver - vGPU device driver
> + * @name: driver name
> + * @probe: called when new device created
> + * @remove: called when device removed
> + * @driver: device driver structure
> + *
> + **/
> +struct vgpu_driver {
> +	const char *name;
> +	int  (*probe)  (struct device *dev);
> +	void (*remove) (struct device *dev);
> +	struct device_driver	driver;
> +};
> +
> +static inline struct vgpu_driver *to_vgpu_driver(struct device_driver *drv)
> +{
> +	return drv ? container_of(drv, struct vgpu_driver, driver) : NULL;
> +}
> +
> +static inline struct vgpu_device *to_vgpu_device(struct device *dev)
> +{
> +	return dev ? container_of(dev, struct vgpu_device, dev) : NULL;
> +}
> +
> +extern struct bus_type vgpu_bus_type;
> +
> +#define dev_is_vgpu(d) ((d)->bus == &vgpu_bus_type)
> +
> +extern int  vgpu_register_device(struct pci_dev *dev,
> +				 const struct gpu_device_ops *ops);
> +extern void vgpu_unregister_device(struct pci_dev *dev);
> +
> +extern int  vgpu_register_driver(struct vgpu_driver *drv, struct module *owner);
> +extern void vgpu_unregister_driver(struct vgpu_driver *drv);
> +
> +extern int vgpu_map_virtual_bar(uint64_t virt_bar_addr, uint64_t phys_bar_addr,
> +				uint32_t len, uint32_t flags);
> +extern int vgpu_dma_do_translate(dma_addr_t * gfn_buffer, uint32_t count);
> +
> +struct vgpu_device *get_vgpu_device_from_group(struct iommu_group *group);
> +
> +#endif /* VGPU_H */


The sysfs ABI needs to be documented in
Documentation/ABI/testing/sysfs-vgpu.  This is particularly important
for things like the format used for the create/destroy interfaces.
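
For example, an entry might look like this (sketch; path and wording
illustrative):

What:		/sys/devices/<parent GPU>/vgpu_create
Date:		May 2016
Contact:	<maintainer address>
Description:
		Writing "<UUID>:<instance>:<vgpu_params>" creates a vGPU
		instance with the given UUID and instance number on this
		physical GPU.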
Thanks,

Alex
Tian, Kevin May 4, 2016, 2:45 a.m. UTC | #2
> From: Alex Williamson
> Sent: Wednesday, May 04, 2016 6:44 AM
> 
> > diff --git a/drivers/vgpu/Kconfig b/drivers/vgpu/Kconfig
> > new file mode 100644
> > index 0000000..792eb48
> > --- /dev/null
> > +++ b/drivers/vgpu/Kconfig
> > @@ -0,0 +1,21 @@
> > +
> > +menuconfig VGPU
> > +    tristate "VGPU driver framework"
> > +    depends on VFIO
> > +    select VGPU_VFIO
> > +    help
> > +        VGPU provides a framework to virtualize GPU without SR-IOV cap
> > +        See Documentation/vgpu.txt for more details.
> > +
> > +        If you don't know what to do here, say N.
> > +
> > +config VGPU
> > +    tristate
> > +    depends on VFIO
> > +    default n
> > +
> > +config VGPU_VFIO
> > +    tristate
> > +    depends on VGPU
> > +    default n
> > +
> 
> This is a little bit convoluted; it seems like everything added in this
> patch is vfio agnostic, it doesn't necessarily care what the consumer
> is.  That makes me think we should only be adding CONFIG_VGPU here and
> it should not depend on CONFIG_VFIO or be enabling CONFIG_VGPU_VFIO.
> The middle config entry is also redundant with the first; just move the
> default line up to the first and remove the rest.

Agree. Removing such a dependency also benefits other hypervisors if
VFIO is not used.

Alex, there is one idea on which I'd like to hear your comment. Looking at
the whole series, we can see that the majority of the logic (maybe I cannot
say 100%) is GPU agnostic. The same frameworks in VFIO and the vGPU core
are actually neutral to the underlying device type, and could e.g. easily
be applied to a NIC card too if a similar technology is developed there.

Do you think we'd better make the framework not GPU-specific now (mostly a
naming change), or continue with the current style and change it later,
only when there is a real implementation on a different device?

Thanks
Kevin
Tian, Kevin May 4, 2016, 2:58 a.m. UTC | #3
> From: Alex Williamson
> Sent: Wednesday, May 04, 2016 6:44 AM
> > +/**
> > + * struct gpu_device_ops - Structure to be registered for each physical GPU to
> > + * register the device with the vgpu module.
> > + *
> > + * @owner:			The module owner.
> > + * @dev_attr_groups:		Default attributes of the physical device.
> > + * @vgpu_attr_groups:		Default attributes of the vGPU device.
> > + * @vgpu_supported_config:	Called to get information about supported vgpu types.
> > + *				@dev: pci device structure of the physical GPU.
> > + *				@config: should return a string listing the supported configs
> > + *				Returns integer: success (0) or error (< 0)
> > + * @vgpu_create:		Called to allocate basic resources in the graphics

It's redundant to have vgpu prefixed to every op here. The same comment
applies to the later sysfs nodes.

> > + *				driver for a particular vgpu.
> > + *				@dev: physical pci device structure on which the vgpu
> > + *				      should be created
> > + *				@uuid: UUID of the VM for which it is intended
> > + *				@instance: vgpu instance in that VM

I didn't quite get @instance here. Is it whatever vendor-specific format is
used to indicate a vgpu?

> > + *				@vgpu_params: extra parameters required by the GPU driver.
> > + *				Returns integer: success (0) or error (< 0)
> > + * @vgpu_destroy:		Called to free resources in the graphics driver for
> > + *				a vgpu instance of that VM.
> > + *				@dev: physical pci device structure to which
> > + *				this vgpu points.
> > + *				@uuid: UUID of the VM to which the vgpu belongs.
> > + *				@instance: vgpu instance in that VM
> > + *				Returns integer: success (0) or error (< 0)
> > + *				If the VM is running and vgpu_destroy is called, that
> > + *				means the vGPU is being hot-unplugged. Return an error
> > + *				if the VM is running and the graphics driver doesn't
> > + *				support vgpu hotplug.
> > + * @vgpu_start:			Called to initiate the vGPU initialization
> > + *				process in the graphics driver when the VM boots,
> > + *				before QEMU starts.
> > + *				@uuid: UUID of the VM which is booting.
> > + *				Returns integer: success (0) or error (< 0)
> > + * @vgpu_shutdown:		Called to tear down vGPU-related resources for
> > + *				the VM.
> > + *				@uuid: UUID of the VM which is shutting down.
> > + *				Returns integer: success (0) or error (< 0)

Can you give a specific example of the difference between destroy and
shutdown? I want to map it correctly on our side, e.g. whether we need to
implement both or just one of them.

Another optional op is 'stop', allowing the physical device to stop
scheduling the vGPU, including waiting for in-flight DMA to complete. It
would be useful to support VM live migration with a vGPU assigned.
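
A sketch of such an op in gpu_device_ops (name and signature hypothetical):

	int (*vgpu_stop)(uuid_le uuid);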

> > + * @read:			Read emulation callback
> > + *				@vdev: vgpu device structure
> > + *				@buf: read buffer
> > + *				@count: number of bytes to read
> > + *				@address_space: specifies for which address space
> > + *				the request is: pci_config_space, IO register
> > + *				space or MMIO space.
> > + *				@pos: offset from the base address.
> > + *				Returns number of bytes read on success, or an error.
> > + * @write:			Write emulation callback
> > + *				@vdev: vgpu device structure
> > + *				@buf: write buffer
> > + *				@count: number of bytes to be written
> > + *				@address_space: specifies for which address space
> > + *				the request is: pci_config_space, IO register
> > + *				space or MMIO space.
> > + *				@pos: offset from the base address.
> > + *				Returns number of bytes written on success, or an error.
> 
> How do these support multiple MMIO spaces or IO port spaces?  GPUs, and
> therefore I assume vGPUs, often have more than one MMIO space, how
> does the enum above tell us which one?  We could simply make this be a
> region index.
> 
> > + * @vgpu_set_irqs:		Called to convey the interrupt configuration
> > + *				information that QEMU set.
> > + *				@vdev: vgpu device structure
> > + *				@flags, index, start, count and *data : same as
> > + *				that of struct vfio_irq_set of
> > + *				VFIO_DEVICE_SET_IRQS API.
> 
> How do we learn about the supported interrupt types?  Should this be
> called vgpu_vfio_set_irqs if it's following the vfio API?
> 
> > + * @vgpu_bar_info:		Called to get BAR size and flags of vGPU device.
> > + *				@vdev: vgpu device structure
> > + *				@bar_index: BAR index
> > + *				@bar_info: output, returns size and flags of
> > + *				requested BAR
> > + *				Returns integer: success (0) or error (< 0)
> 
> This is called bar_info, but the bar_index is actually the vfio region
> index and things like the config region info is being overloaded
> through it.  We already have a structure defined for getting a generic
> region index, why not use it?  Maybe this should just be
> vgpu_vfio_get_region_info.

Or is it more extensible to allow reporting the whole vconfig space?

Thanks
Kevin
Kirti Wankhede May 4, 2016, 1:31 p.m. UTC | #4
Thanks, Alex.

 >> +config VGPU_VFIO
 >> +    tristate
 >> +    depends on VGPU
 >> +    default n
 >> +
 >
 > This is a little bit convoluted, it seems like everything added in this
 > patch is vfio agnostic, it doesn't necessarily care what the consumer
 > is.  That makes me think we should only be adding CONFIG_VGPU here and
 > it should not depend on CONFIG_VFIO or be enabling CONFIG_VGPU_VFIO.
 > The middle config entry is also redundant to the first, just move the
 > default line up to the first and remove the rest.

CONFIG_VGPU doesn't directly depend on VFIO; CONFIG_VGPU_VFIO is 
directly dependent on VFIO. But devices created by the VGPU core module need 
a driver to manage them, and CONFIG_VGPU_VFIO is the driver which 
will manage vgpu devices. So I think CONFIG_VGPU_VFIO should be enabled 
by CONFIG_VGPU.

This would look like:
menuconfig VGPU
     tristate "VGPU driver framework"
     select VGPU_VFIO
     default n
     help
         VGPU provides a framework to virtualize GPU without SR-IOV cap
         See Documentation/vgpu.txt for more details.

         If you don't know what to do here, say N.

config VGPU_VFIO
     tristate
     depends on VGPU
     depends on VFIO
     default n



 >> +int vgpu_register_device(struct pci_dev *dev, const struct gpu_device_ops *ops)
 >
 > Why do we care that it's a pci_dev?  It seems like there's only a very
 > small portion of the API that cares about pci_devs in order to describe
 > BARs, which could be switched based on the device type.  Otherwise we
 > could operate on a struct device here.
 >

GPUs are PCI devices, hence I used pci_dev. I agree with you; I'll change 
it to operate on struct device and add checks in vgpu_vfio.c where the 
config space and BARs are populated.

 >> +static void vgpu_device_free(struct vgpu_device *vgpu_dev)
 >> +{
 >> +	if (vgpu_dev) {
 >> +		mutex_lock(&vgpu.vgpu_devices_lock);
 >> +		list_del(&vgpu_dev->list);
 >> +		mutex_unlock(&vgpu.vgpu_devices_lock);
 >> +		kfree(vgpu_dev);
 >> +	}
 >
 > Why aren't we using the kref to remove and free the vgpu when the last
 > reference is released?

vgpu_device_free() is called from 2 places:
1. from create_vgpu_device(), when device_register() fails.
2. from vgpu_device_release(), which is set as the release function of the 
device registered by device_register():
vgpu_dev->dev.release = vgpu_device_release;
That release function is invoked from the device_unregister() kernel path, 
which already uses a kref to call it. I don't think I need to do it again.


 >> +struct vgpu_device *vgpu_drv_get_vgpu_device(uuid_le uuid, int instance)
 >> +{
 >> +	struct vgpu_device *vdev = NULL;
 >> +
 >> +	mutex_lock(&vgpu.vgpu_devices_lock);
 >> +	list_for_each_entry(vdev, &vgpu.vgpu_devices_list, list) {
 >> +		if ((uuid_le_cmp(vdev->uuid, uuid) == 0) &&
 >> +		    (vdev->vgpu_instance == instance)) {
 >> +			mutex_unlock(&vgpu.vgpu_devices_lock);
 >> +			return vdev;
 >
 > We're not taking any sort of reference to the vgpu, what prevents races
 > with it being removed?  A common exit path would be easy to achieve
 > here too.
 >

I'll add a reference count.


 >> +create_attr_error:
 >> +	if (gpu_dev->ops->vgpu_destroy) {
 >> +		int ret = 0;
 >> +		ret = gpu_dev->ops->vgpu_destroy(gpu_dev->dev,
 >> +						 vgpu_dev->uuid,
 >> +						 vgpu_dev->vgpu_instance);
 >
 > Unnecessary initialization and we don't do anything with the result.
 > Below indicates lack of vgpu_destroy indicates the vendor doesn't
 > support unplug, but doesn't that break our error cleanup path here?
 >

The comment about vgpu_destroy says:
If the VM is running and vgpu_destroy is called, that 
means the vGPU is being hot-unplugged. Return an
error if the VM is running and the graphics driver
doesn't support vgpu hotplug.

It's the GPU driver's responsibility to check whether the VM is running and 
return accordingly. This is the vgpu creation path; the vgpu device would be 
hotplugged into the VM on vgpu_start.

 >> +		retval = gpu_dev->ops->vgpu_destroy(gpu_dev->dev,
 >> +						    vgpu_dev->uuid,
 >> +						    vgpu_dev->vgpu_instance);
 >> +	/* if vendor driver doesn't return success that means vendor driver doesn't
 >> +	 * support hot-unplug */
 >> +		if (retval)
 >> +			return;
 >
 > Should we return an error code then?  Inconsistent comment style.
 >

destroy_vgpu_device() is called from:
- release_vgpubus_dev(), which is the release function of vgpu_class and 
has return type void.
- vgpu_unregister_device(), which again returns void.

Even if an error code were returned from here, it would not be used.

I'll change the comment style in the next patch update.



 >> + * @write:			Write emulation callback
 >> + *				@vdev: vgpu device structure
 >> + *				@buf: write buffer
 >> + *				@count: number bytes to be written
 >> + *				@address_space: specifies for which address space
 >> + *				the request is: pci_config_space, IO register
 >> + *				space or MMIO space.
 >> + *				@pos: offset from base address.
 >> + *				Retuns number on bytes written on success or error.
 >
 > How do these support multiple MMIO spaces or IO port spaces?  GPUs, and
 > therefore I assume vGPUs, often have more than one MMIO space, how
 > does the enum above tell us which one?  We could simply make this be a
 > region index.
 >

Yes, address_space should be one of these; it is of type
'enum vgpu_emul_space_e'. I'll change the uint32_t to vgpu_emul_space_e:
enum vgpu_emul_space_e {
         vgpu_emul_space_config = 0, /*!< PCI configuration space */
         vgpu_emul_space_io = 1,     /*!< I/O register space */
         vgpu_emul_space_mmio = 2    /*!< Memory-mapped I/O space */
};

If you look at vgpu_dev_bar_rw() in the 2nd patch, in vgpu_vfio.c:
int bar_index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);

@pos, the offset from the base address, is calculated as:
pos = vdev->bar_info[bar_index].start + offset

The GPU driver can find the BAR index from 'pos'.
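
For example, the GPU driver could recover the BAR index from 'pos' with a
lookup like the one below (just a sketch; it assumes bar_info[] also records
each BAR's size alongside the start offset used above):

static int vgpu_pos_to_bar_index(struct vgpu_device *vdev, loff_t pos)
{
	int i;

	/* walk the six standard BARs and find the one containing pos */
	for (i = 0; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
		loff_t start = vdev->bar_info[i].start;

		if (pos >= start && pos < start + vdev->bar_info[i].size)
			return i;
	}

	return -EINVAL;
}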

 >> + * @vgpu_set_irqs:		Called to send about interrupts configuration
 >> + *				information that qemu set.
 >> + *				@vdev: vgpu device structure
 >> + *				@flags, index, start, count and *data : same as
 >> + *				that of struct vfio_irq_set of
 >> + *				VFIO_DEVICE_SET_IRQS API.
 >
 > How do we learn about the supported interrupt types?  Should this be
 > called vgpu_vfio_set_irqs if it's following the vfio API?
 >

The GPU driver provides the config space of the vgpu device.
QEMU learns about the supported interrupt types by reading this config 
space of the vgpu device and checking its PCI capabilities.
Yes, this follows the vfio API; I'll change the function name.
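
As an illustration, an MSI-only vendor implementation following the
VFIO_DEVICE_SET_IRQS semantics might look like the sketch below (the
my_vgpu_* names are hypothetical helpers; only the VFIO uapi constants
are real):

static int my_vgpu_set_irqs(struct vgpu_device *vdev, uint32_t flags,
			    unsigned int index, unsigned int start,
			    unsigned int count, void *data)
{
	if (index != VFIO_PCI_MSI_IRQ_INDEX)
		return -EINVAL;	/* this hypothetical vGPU exposes MSI only */

	if ((flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) == VFIO_IRQ_SET_ACTION_TRIGGER &&
	    (flags & VFIO_IRQ_SET_DATA_TYPE_MASK) == VFIO_IRQ_SET_DATA_EVENTFD) {
		int32_t *fds = data;

		/* hook 'count' eventfds for vectors [start, start + count) */
		return my_vgpu_enable_msi(vdev, start, count, fds);
	}

	return -EINVAL;
}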

 >> + * @vgpu_bar_info:		Called to get BAR size and flags of vGPU device.
 >> + *				@vdev: vgpu device structure
 >> + *				@bar_index: BAR index
 >> + *				@bar_info: output, returns size and flags of
 >> + *				requested BAR
 >> + *				Returns integer: success (0) or error (< 0)
 >
 > This is called bar_info, but the bar_index is actually the vfio region
 > index and things like the config region info is being overloaded
 > through it.  We already have a structure defined for getting a generic
 > region index, why not use it?  Maybe this should just be
 > vgpu_vfio_get_region_info.
 >

Ok. Will do.


 >> + * @validate_map_request:	Validate remap pfn request
 >> + *				@vdev: vgpu device structure
 >> + *				@virtaddr: target user address to start at
 >> + *				@pfn: physical address of kernel memory, GPU
 >> + *				driver can change if required.
 >> + *				@size: size of map area, GPU driver can change
 >> + *				the size of map area if desired.
 >> + *				@prot: page protection flags for this mapping,
 >> + *				GPU driver can change, if required.
 >> + *				Returns integer: success (0) or error (< 0)
 >
 > Was not at all clear to me what this did until I got to patch 2, this
 > is actually providing the fault handling for mmap'ing a vGPU mmio BAR.
 > Needs a better name or better description.
 >

If, say, the VMM mmaps the whole BAR1 of the GPU, say 128MB, a fault occurs 
when BAR1 is accessed, and the size is calculated as:
req_size = vma->vm_end - virtaddr
Since the GPU is being shared by multiple vGPUs, the GPU driver might not 
remap the whole BAR1 for only one vGPU device and would prefer to map, say, 
one page at a time; the GPU driver then returns PAGE_SIZE. This is used by 
remap_pfn_range(). On the next access to BAR1 outside that page, we will 
again get a fault().
As the name says, this call lets the GPU driver validate the size and prot 
of the map area; the GPU driver can change the size and prot for this map area.
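
Roughly, the fault path looks like the simplified sketch below (not the
exact patch 2 code; the handler name and the bar_info[1]-based pfn
derivation are assumptions, and the 4.x two-argument fault signature is
used):

static int vgpu_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct vgpu_device *vdev = vma->vm_private_data;
	unsigned long virtaddr = (unsigned long)vmf->virtual_address;
	unsigned long req_size = vma->vm_end - virtaddr;
	unsigned long offset = virtaddr - vma->vm_start;
	/* assumed: BAR1 base is tracked per-vgpu as bar_info[1].start */
	unsigned long pfn = (vdev->bar_info[1].start + offset) >> PAGE_SHIFT;
	pgprot_t prot = vma->vm_page_prot;

	/* GPU driver may shrink req_size (e.g. to PAGE_SIZE) and adjust pfn/prot */
	if (vdev->gpu_dev->ops->validate_map_request &&
	    vdev->gpu_dev->ops->validate_map_request(vdev, virtaddr, &pfn,
						     &req_size, &prot))
		return VM_FAULT_SIGBUS;

	if (remap_pfn_range(vma, virtaddr, pfn, req_size, prot))
		return VM_FAULT_SIGBUS;

	return VM_FAULT_NOPAGE;
}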


 >> +	ssize_t (*write)(struct vgpu_device *vdev, char *buf, size_t count,
 >> +			 uint32_t address_space, loff_t pos);
 >
 > Aren't these really 'enum vgpu_emul_space_e', not uint32_t?
 >

Yes, I'll change it to enum vgpu_emul_space_e.

Thanks,
Kirti.
Alex Williamson May 4, 2016, 4:57 p.m. UTC | #5
On Wed, 4 May 2016 02:45:59 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:

> > From: Alex Williamson
> > Sent: Wednesday, May 04, 2016 6:44 AM
> >   
> > > diff --git a/drivers/vgpu/Kconfig b/drivers/vgpu/Kconfig
> > > new file mode 100644
> > > index 0000000..792eb48
> > > --- /dev/null
> > > +++ b/drivers/vgpu/Kconfig
> > > @@ -0,0 +1,21 @@
> > > +
> > > +menuconfig VGPU
> > > +    tristate "VGPU driver framework"
> > > +    depends on VFIO
> > > +    select VGPU_VFIO
> > > +    help
> > > +        VGPU provides a framework to virtualize GPU without SR-IOV cap
> > > +        See Documentation/vgpu.txt for more details.
> > > +
> > > +        If you don't know what do here, say N.
> > > +
> > > +config VGPU
> > > +    tristate
> > > +    depends on VFIO
> > > +    default n
> > > +
> > > +config VGPU_VFIO
> > > +    tristate
> > > +    depends on VGPU
> > > +    default n
> > > +  
> > 
> > This is a little bit convoluted, it seems like everything added in this
> > patch is vfio agnostic, it doesn't necessarily care what the consumer
> > is.  That makes me think we should only be adding CONFIG_VGPU here and
> > it should not depend on CONFIG_VFIO or be enabling CONFIG_VGPU_VFIO.
> > The middle config entry is also redundant to the first, just move the
> > default line up to the first and remove the rest.  
> 
> Agree. Removing such dependency also benefits other hypervisor if
> VFIO is not used.
> 
> Alex, there is one idea which I'd like to hear your comment. When looking at
> the whole series, we can see the majority logic (maybe I cannot say 100%)
> is GPU agnostic. Same frameworks in VFIO and vGPU core are actually neutral
> to underlying device type, which e.g. can be easily applied to a NIC card too
> if a similar technology is developed there.
> 
> Do you think whether we'd better make framework not GPU specific now
> (mostly naming change), or continue current style and change later only 
> when there is a real implementation on a different device? 

Yeah, I see that too and I made a bunch of comments in patch 3 that
we're not doing anything vGPU specific and we should be careful about
assuming the user for the various interfaces.  In patch 1, we are
fairly v/GPU specific because we're dealing with how vGPUs are created
from the physical GPU.  Maybe the interface is general, maybe it's not,
it's hard to say.  Starting with patch 2 though, we really shouldn't
know or care what the device is beyond a PCI compatible device.  We're
just trying to create a vfio bus driver compatible with vfio-pci and
offload enough generic operations so that we don't need to pass
everything back to the vendor driver.  Patch 3 of course should be
completely device agnostic, we should only care that the vfio backend
provides mediation of the device, so an iommu is not required.  It may
be too much of a rathole to try to completely generalize the interface
at this point, but let's certainly try not to let vgpu specific ideas
spread beyond where we need.  Thanks,

Alex
Tian, Kevin May 5, 2016, 8:58 a.m. UTC | #6
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Thursday, May 05, 2016 12:57 AM
> 
> On Wed, 4 May 2016 02:45:59 +0000
> "Tian, Kevin" <kevin.tian@intel.com> wrote:
> 
> > > From: Alex Williamson
> > > Sent: Wednesday, May 04, 2016 6:44 AM
> > >
> > > > diff --git a/drivers/vgpu/Kconfig b/drivers/vgpu/Kconfig
> > > > new file mode 100644
> > > > index 0000000..792eb48
> > > > --- /dev/null
> > > > +++ b/drivers/vgpu/Kconfig
> > > > @@ -0,0 +1,21 @@
> > > > +
> > > > +menuconfig VGPU
> > > > +    tristate "VGPU driver framework"
> > > > +    depends on VFIO
> > > > +    select VGPU_VFIO
> > > > +    help
> > > > +        VGPU provides a framework to virtualize GPU without SR-IOV cap
> > > > +        See Documentation/vgpu.txt for more details.
> > > > +
> > > > +        If you don't know what do here, say N.
> > > > +
> > > > +config VGPU
> > > > +    tristate
> > > > +    depends on VFIO
> > > > +    default n
> > > > +
> > > > +config VGPU_VFIO
> > > > +    tristate
> > > > +    depends on VGPU
> > > > +    default n
> > > > +
> > >
> > > This is a little bit convoluted, it seems like everything added in this
> > > patch is vfio agnostic, it doesn't necessarily care what the consumer
> > > is.  That makes me think we should only be adding CONFIG_VGPU here and
> > > it should not depend on CONFIG_VFIO or be enabling CONFIG_VGPU_VFIO.
> > > The middle config entry is also redundant to the first, just move the
> > > default line up to the first and remove the rest.
> >
> > Agree. Removing such dependency also benefits other hypervisor if
> > VFIO is not used.
> >
> > Alex, there is one idea which I'd like to hear your comment. When looking at
> > the whole series, we can see the majority logic (maybe I cannot say 100%)
> > is GPU agnostic. Same frameworks in VFIO and vGPU core are actually neutral
> > to underlying device type, which e.g. can be easily applied to a NIC card too
> > if a similar technology is developed there.
> >
> > Do you think whether we'd better make framework not GPU specific now
> > (mostly naming change), or continue current style and change later only
> > when there is a real implementation on a different device?
> 
> Yeah, I see that too and I made a bunch of comments in patch 3 that
> we're not doing anything vGPU specific and we should be careful about
> assuming the user for the various interfaces.  In patch 1, we are
> fairly v/GPU specific because we're dealing with how vGPUs are created
> from the physical GPU.  Maybe the interface is general, maybe it's not,
> it's hard to say.  Starting with patch 2 though, we really shouldn't
> know or care what the device is beyond a PCI compatible device.  We're
> just trying to create a vfio bus driver compatible with vfio-pci and
> offload enough generic operations so that we don't need to pass
> everything back to the vendor driver.  Patch 3 of course should be
> completely device agnostic, we should only care that the vfio backend
> provides mediation of the device, so an iommu is not required.  It may
> be too much of a rathole to try to completely generalize the interface
> at this point, but let's certainly try not to let vgpu specific ideas
> spread beyond where we need.  Thanks,
> 

Even for patch 1, the current implementation can apply to any PCI device
if we just replace vgpu with another name. There is nothing (or minimal)
that is really GPU specific. But I'm not strong on this point, since somehow
I agree that without a 2nd actual user some abstractions here may only make
sense for vgpu. So... just raising this thought to hear your comments. :-)

btw, a curious question: I know you, Alex, can give the final call on VFIO-specific
code. What about the vGPU core framework? It creates a new category under
the driver directory (drivers/vgpu). Who else is required to review and ack
that part?

Thanks
Kevin
Tian, Kevin May 5, 2016, 9:06 a.m. UTC | #7
> From: Kirti Wankhede
> Sent: Wednesday, May 04, 2016 9:32 PM
> 
> Thanks Alex.
> 
>  >> +config VGPU_VFIO
>  >> +    tristate
>  >> +    depends on VGPU
>  >> +    default n
>  >> +
>  >
>  > This is a little bit convoluted, it seems like everything added in this
>  > patch is vfio agnostic, it doesn't necessarily care what the consumer
>  > is.  That makes me think we should only be adding CONFIG_VGPU here and
>  > it should not depend on CONFIG_VFIO or be enabling CONFIG_VGPU_VFIO.
>  > The middle config entry is also redundant to the first, just move the
>  > default line up to the first and remove the rest.
> 
> CONFIG_VGPU doesn't directly depend on VFIO. CONFIG_VGPU_VFIO is
> directly dependent on VFIO. But devices created by VGPU core module need
> a driver to manage those devices. CONFIG_VGPU_VFIO is the driver which
> will manage vgpu devices. So I think CONFIG_VGPU_VFIO should be enabled
> by CONFIG_VGPU.
> 
> This would look like:
> menuconfig VGPU
>      tristate "VGPU driver framework"
>      select VGPU_VFIO
>      default n
>      help
>          VGPU provides a framework to virtualize GPU without SR-IOV cap
>          See Documentation/vgpu.txt for more details.
> 
>          If you don't know what do here, say N.
> 
> config VGPU_VFIO
>      tristate
>      depends on VGPU
>      depends on VFIO
>      default n
> 

There could be multiple drivers operating VGPU. Why do we restrict
it to VFIO here?

>  >> +create_attr_error:
>  >> +	if (gpu_dev->ops->vgpu_destroy) {
>  >> +		int ret = 0;
>  >> +		ret = gpu_dev->ops->vgpu_destroy(gpu_dev->dev,
>  >> +						 vgpu_dev->uuid,
>  >> +						 vgpu_dev->vgpu_instance);
>  >
>  > Unnecessary initialization and we don't do anything with the result.
>  > Below indicates lack of vgpu_destroy indicates the vendor doesn't
>  > support unplug, but doesn't that break our error cleanup path here?
>  >
> 
> Comment about vgpu_destroy:
> If VM is running and vgpu_destroy is called that
> means the vGPU is being hotunpluged. Return
> error if VM is running and graphics driver
> doesn't support vgpu hotplug.
> 
> Its GPU drivers responsibility to check if VM is running and return
> accordingly. This is vgpu creation path. Vgpu device would be hotplug to
> VM on vgpu_start.

How does the GPU driver know whether the VM is running? The VM is managed
by KVM here.

Maybe it's clearer to say whether the vGPU is busy, meaning some work
has been loaded onto the vGPU. That's something the GPU driver can tell.

> 
>  >> + * @vgpu_bar_info:		Called to get BAR size and flags of vGPU device.
>  >> + *				@vdev: vgpu device structure
>  >> + *				@bar_index: BAR index
>  >> + *				@bar_info: output, returns size and flags of
>  >> + *				requested BAR
>  >> + *				Returns integer: success (0) or error (< 0)
>  >
>  > This is called bar_info, but the bar_index is actually the vfio region
>  > index and things like the config region info is being overloaded
>  > through it.  We already have a structure defined for getting a generic
>  > region index, why not use it?  Maybe this should just be
>  > vgpu_vfio_get_region_info.
>  >
> 
> Ok. Will do.

As you commented earlier, the GPU driver is required to provide config
space (which I agree with), so what's the point of introducing another
BAR-specific structure? VFIO can use @write to get BAR information
from the vgpu config space, just like how it's done on a physical device today.

> 
> 
>  >> + * @validate_map_request:	Validate remap pfn request
>  >> + *				@vdev: vgpu device structure
>  >> + *				@virtaddr: target user address to start at
>  >> + *				@pfn: physical address of kernel memory, GPU
>  >> + *				driver can change if required.
>  >> + *				@size: size of map area, GPU driver can change
>  >> + *				the size of map area if desired.
>  >> + *				@prot: page protection flags for this mapping,
>  >> + *				GPU driver can change, if required.
>  >> + *				Returns integer: success (0) or error (< 0)
>  >
>  > Was not at all clear to me what this did until I got to patch 2, this
>  > is actually providing the fault handling for mmap'ing a vGPU mmio BAR.
>  > Needs a better name or better description.
>  >
> 
> If say VMM mmap whole BAR1 of GPU, say 128MB, so fault would occur when
> BAR1 is tried to access then the size is calculated as:
> req_size = vma->vm_end - virtaddr
> Since GPU is being shared by multiple vGPUs, GPU driver might not remap
> whole BAR1 for only one vGPU device, so would prefer, say map one page
> at a time. GPU driver returns PAGE_SIZE. This is used by
> remap_pfn_range(). Now on next access to BAR1 other than that page, we
> will again get a fault().
> As the name says this call is to validate from GPU driver for the size
> and prot of map area. GPU driver can change size and prot for this map area.

Currently we don't require such an interface for Intel vGPU. We need to think
about its rationale carefully (it's still not clear to me). Jike, do you have
any thoughts on this?

Thanks
Kevin
Kirti Wankhede May 5, 2016, 10:44 a.m. UTC | #8
On 5/5/2016 2:36 PM, Tian, Kevin wrote:
>> From: Kirti Wankhede
>> Sent: Wednesday, May 04, 2016 9:32 PM
>>
>> Thanks Alex.
>>
>>  >> +config VGPU_VFIO
>>  >> +    tristate
>>  >> +    depends on VGPU
>>  >> +    default n
>>  >> +
>>  >
>>  > This is a little bit convoluted, it seems like everything added in this
>>  > patch is vfio agnostic, it doesn't necessarily care what the consumer
>>  > is.  That makes me think we should only be adding CONFIG_VGPU here and
>>  > it should not depend on CONFIG_VFIO or be enabling CONFIG_VGPU_VFIO.
>>  > The middle config entry is also redundant to the first, just move the
>>  > default line up to the first and remove the rest.
>>
>> CONFIG_VGPU doesn't directly depend on VFIO. CONFIG_VGPU_VFIO is
>> directly dependent on VFIO. But devices created by VGPU core module need
>> a driver to manage those devices. CONFIG_VGPU_VFIO is the driver which
>> will manage vgpu devices. So I think CONFIG_VGPU_VFIO should be enabled
>> by CONFIG_VGPU.
>>
>> This would look like:
>> menuconfig VGPU
>>      tristate "VGPU driver framework"
>>      select VGPU_VFIO
>>      default n
>>      help
>>          VGPU provides a framework to virtualize GPU without SR-IOV cap
>>          See Documentation/vgpu.txt for more details.
>>
>>          If you don't know what do here, say N.
>>
>> config VGPU_VFIO
>>      tristate
>>      depends on VGPU
>>      depends on VFIO
>>      default n
>>
>
> There could be multiple drivers operating VGPU. Why do we restrict
> it to VFIO here?
>

VGPU_VFIO uses VFIO APIs, so it depends on VFIO.
Since there is no driver other than VGPU_VFIO for vGPU devices, I think 
we should keep the default selection of VGPU_VFIO on VGPU. Maybe in the 
future, if another driver is added to operate vGPU devices, the default 
selection can be removed.

>>  >> +create_attr_error:
>>  >> +	if (gpu_dev->ops->vgpu_destroy) {
>>  >> +		int ret = 0;
>>  >> +		ret = gpu_dev->ops->vgpu_destroy(gpu_dev->dev,
>>  >> +						 vgpu_dev->uuid,
>>  >> +						 vgpu_dev->vgpu_instance);
>>  >
>>  > Unnecessary initialization and we don't do anything with the result.
>>  > Below indicates lack of vgpu_destroy indicates the vendor doesn't
>>  > support unplug, but doesn't that break our error cleanup path here?
>>  >
>>
>> Comment about vgpu_destroy:
>> If VM is running and vgpu_destroy is called that
>> means the vGPU is being hotunpluged. Return
>> error if VM is running and graphics driver
>> doesn't support vgpu hotplug.
>>
>> Its GPU drivers responsibility to check if VM is running and return
>> accordingly. This is vgpu creation path. Vgpu device would be hotplug to
>> VM on vgpu_start.
>
> How does GPU driver know whether VM is running? VM is managed
> by KVM here.
>
> Maybe it's clearer to say whether vGPU is busy which means some work
> has been loaded to vGPU. That's something GPU driver can tell.
>

The GPU driver can detect this based on the resources allocated for the VM 
via vgpu_create/vgpu_start.

>>
>>  >> + * @vgpu_bar_info:		Called to get BAR size and flags of vGPU device.
>>  >> + *				@vdev: vgpu device structure
>>  >> + *				@bar_index: BAR index
>>  >> + *				@bar_info: output, returns size and flags of
>>  >> + *				requested BAR
>>  >> + *				Returns integer: success (0) or error (< 0)
>>  >
>>  > This is called bar_info, but the bar_index is actually the vfio region
>>  > index and things like the config region info is being overloaded
>>  > through it.  We already have a structure defined for getting a generic
>>  > region index, why not use it?  Maybe this should just be
>>  > vgpu_vfio_get_region_info.
>>  >
>>
>> Ok. Will do.
>
> As you commented earlier that GPU driver is required to provide config
> space (which I agree), then what's the point of introducing another
> bar specific structure? VFIO can use @write to get bar information
> from vgpu config space, just like how it's done on physical device today.
>

It is required not only for the size, but also to fetch the flags. Region 
flags could be a combination of:

#define VFIO_REGION_INFO_FLAG_READ      (1 << 0) /* Region supports read */
#define VFIO_REGION_INFO_FLAG_WRITE     (1 << 1) /* Region supports write */
#define VFIO_REGION_INFO_FLAG_MMAP      (1 << 2) /* Region supports mmap */
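
For example, a hypothetical vendor callback reporting a 16MB, read/write,
mmap-capable BAR1 could look like this (the callback and struct field names
are illustrative; only the VFIO flag and index definitions are real):

static int my_vgpu_bar_info(struct vgpu_device *vdev, int bar_index,
			    struct pci_bar_info *bar_info)
{
	if (bar_index != VFIO_PCI_BAR1_REGION_INDEX)
		return -EINVAL;

	bar_info->size = 16 * 1024 * 1024;	/* 16MB vGPU BAR1 */
	bar_info->flags = VFIO_REGION_INFO_FLAG_READ |
			  VFIO_REGION_INFO_FLAG_WRITE |
			  VFIO_REGION_INFO_FLAG_MMAP;

	return 0;
}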

Thanks,
Kirti.

>>
>>
>>  >> + * @validate_map_request:	Validate remap pfn request
>>  >> + *				@vdev: vgpu device structure
>>  >> + *				@virtaddr: target user address to start at
>>  >> + *				@pfn: physical address of kernel memory, GPU
>>  >> + *				driver can change if required.
>>  >> + *				@size: size of map area, GPU driver can change
>>  >> + *				the size of map area if desired.
>>  >> + *				@prot: page protection flags for this mapping,
>>  >> + *				GPU driver can change, if required.
>>  >> + *				Returns integer: success (0) or error (< 0)
>>  >
>>  > Was not at all clear to me what this did until I got to patch 2, this
>>  > is actually providing the fault handling for mmap'ing a vGPU mmio BAR.
>>  > Needs a better name or better description.
>>  >
>>
>> If say VMM mmap whole BAR1 of GPU, say 128MB, so fault would occur when
>> BAR1 is tried to access then the size is calculated as:
>> req_size = vma->vm_end - virtaddr
>> Since GPU is being shared by multiple vGPUs, GPU driver might not remap
>> whole BAR1 for only one vGPU device, so would prefer, say map one page
>> at a time. GPU driver returns PAGE_SIZE. This is used by
>> remap_pfn_range(). Now on next access to BAR1 other than that page, we
>> will again get a fault().
>> As the name says this call is to validate from GPU driver for the size
>> and prot of map area. GPU driver can change size and prot for this map area.
>
> Currently we don't require such interface for Intel vGPU. Need to think about
> its rationale carefully (still not clear to me). Jike, do you have any thought on
> this?
>
> Thanks
> Kevin
>
Tian, Kevin May 5, 2016, 12:07 p.m. UTC | #9
> From: Kirti Wankhede [mailto:kwankhede@nvidia.com]
> Sent: Thursday, May 05, 2016 6:45 PM
> 
> 
> On 5/5/2016 2:36 PM, Tian, Kevin wrote:
> >> From: Kirti Wankhede
> >> Sent: Wednesday, May 04, 2016 9:32 PM
> >>
> >> Thanks Alex.
> >>
> >>  >> +config VGPU_VFIO
> >>  >> +    tristate
> >>  >> +    depends on VGPU
> >>  >> +    default n
> >>  >> +
> >>  >
> >>  > This is a little bit convoluted, it seems like everything added in this
> >>  > patch is vfio agnostic, it doesn't necessarily care what the consumer
> >>  > is.  That makes me think we should only be adding CONFIG_VGPU here and
> >>  > it should not depend on CONFIG_VFIO or be enabling CONFIG_VGPU_VFIO.
> >>  > The middle config entry is also redundant to the first, just move the
> >>  > default line up to the first and remove the rest.
> >>
> >> CONFIG_VGPU doesn't directly depend on VFIO. CONFIG_VGPU_VFIO is
> >> directly dependent on VFIO. But devices created by VGPU core module need
> >> a driver to manage those devices. CONFIG_VGPU_VFIO is the driver which
> >> will manage vgpu devices. So I think CONFIG_VGPU_VFIO should be enabled
> >> by CONFIG_VGPU.
> >>
> >> This would look like:
> >> menuconfig VGPU
> >>      tristate "VGPU driver framework"
> >>      select VGPU_VFIO
> >>      default n
> >>      help
> >>          VGPU provides a framework to virtualize GPU without SR-IOV cap
> >>          See Documentation/vgpu.txt for more details.
> >>
> >>          If you don't know what do here, say N.
> >>
> >> config VGPU_VFIO
> >>      tristate
> >>      depends on VGPU
> >>      depends on VFIO
> >>      default n
> >>
> >
> > There could be multiple drivers operating VGPU. Why do we restrict
> > it to VFIO here?
> >
> 
> VGPU_VFIO uses VFIO APIs, it depends on VFIO.
> I think since there is no other driver than VGPU_VFIO for VGPU devices,
> we should keep default selection of VGPU_VFIO on VGPU. May be in future
> if other driver is add ti operate vGPU devices, then default selection
> can be removed.

What's your plan to support Xen here?

> 
> >>  >> +create_attr_error:
> >>  >> +	if (gpu_dev->ops->vgpu_destroy) {
> >>  >> +		int ret = 0;
> >>  >> +		ret = gpu_dev->ops->vgpu_destroy(gpu_dev->dev,
> >>  >> +						 vgpu_dev->uuid,
> >>  >> +						 vgpu_dev->vgpu_instance);
> >>  >
> >>  > Unnecessary initialization and we don't do anything with the result.
> >>  > Below indicates lack of vgpu_destroy indicates the vendor doesn't
> >>  > support unplug, but doesn't that break our error cleanup path here?
> >>  >
> >>
> >> Comment about vgpu_destroy:
> >> If VM is running and vgpu_destroy is called that
> >> means the vGPU is being hotunpluged. Return
> >> error if VM is running and graphics driver
> >> doesn't support vgpu hotplug.
> >>
> >> Its GPU drivers responsibility to check if VM is running and return
> >> accordingly. This is vgpu creation path. Vgpu device would be hotplug to
> >> VM on vgpu_start.
> >
> > How does GPU driver know whether VM is running? VM is managed
> > by KVM here.
> >
> > Maybe it's clearer to say whether vGPU is busy which means some work
> > has been loaded to vGPU. That's something GPU driver can tell.
> >
> 
> GPU driver can detect based on resources allocated for the VM from
> vgpu_create/vgpu_start.

Yes, in that case the GPU driver only knows that a vGPU is in use, not who is
using the vGPU (now it's a VM; in the future it could be a container). Anyway,
my point is just not to assume a VM and add limitations where they're not necessary. :-)

> 
> >>
> >>  >> + * @vgpu_bar_info:		Called to get BAR size and flags of vGPU device.
> >>  >> + *				@vdev: vgpu device structure
> >>  >> + *				@bar_index: BAR index
> >>  >> + *				@bar_info: output, returns size and flags of
> >>  >> + *				requested BAR
> >>  >> + *				Returns integer: success (0) or error (< 0)
> >>  >
> >>  > This is called bar_info, but the bar_index is actually the vfio region
> >>  > index and things like the config region info is being overloaded
> >>  > through it.  We already have a structure defined for getting a generic
> >>  > region index, why not use it?  Maybe this should just be
> >>  > vgpu_vfio_get_region_info.
> >>  >
> >>
> >> Ok. Will do.
> >
> > As you commented earlier that GPU driver is required to provide config
> > space (which I agree), then what's the point of introducing another
> > bar specific structure? VFIO can use @write to get bar information
> > from vgpu config space, just like how it's done on physical device today.
> >
> 
> It is required not only for size, but also to fetch flags. Region flags
> could be combination of:
> 
> #define VFIO_REGION_INFO_FLAG_READ      (1 << 0) /* Region supports read */
> #define VFIO_REGION_INFO_FLAG_WRITE     (1 << 1) /* Region supports write */
> #define VFIO_REGION_INFO_FLAG_MMAP      (1 << 2) /* Region supports mmap */
> 
> Thanks,
> Kirti.

That makes sense.

Thanks
Kevin
Kirti Wankhede May 5, 2016, 12:57 p.m. UTC | #10
On 5/5/2016 5:37 PM, Tian, Kevin wrote:
>> From: Kirti Wankhede [mailto:kwankhede@nvidia.com]
>> Sent: Thursday, May 05, 2016 6:45 PM
>>
>>
>> On 5/5/2016 2:36 PM, Tian, Kevin wrote:
>>>> From: Kirti Wankhede
>>>> Sent: Wednesday, May 04, 2016 9:32 PM
>>>>
>>>> Thanks Alex.
>>>>
>>>>  >> +config VGPU_VFIO
>>>>  >> +    tristate
>>>>  >> +    depends on VGPU
>>>>  >> +    default n
>>>>  >> +
>>>>  >
>>>>  > This is a little bit convoluted, it seems like everything added in this
>>>>  > patch is vfio agnostic, it doesn't necessarily care what the consumer
>>>>  > is.  That makes me think we should only be adding CONFIG_VGPU here and
>>>>  > it should not depend on CONFIG_VFIO or be enabling CONFIG_VGPU_VFIO.
>>>>  > The middle config entry is also redundant to the first, just move the
>>>>  > default line up to the first and remove the rest.
>>>>
>>>> CONFIG_VGPU doesn't directly depend on VFIO. CONFIG_VGPU_VFIO is
>>>> directly dependent on VFIO. But devices created by VGPU core module need
>>>> a driver to manage those devices. CONFIG_VGPU_VFIO is the driver which
>>>> will manage vgpu devices. So I think CONFIG_VGPU_VFIO should be enabled
>>>> by CONFIG_VGPU.
>>>>
>>>> This would look like:
>>>> menuconfig VGPU
>>>>      tristate "VGPU driver framework"
>>>>      select VGPU_VFIO
>>>>      default n
>>>>      help
>>>>          VGPU provides a framework to virtualize GPU without SR-IOV cap
>>>>          See Documentation/vgpu.txt for more details.
>>>>
>>>>          If you don't know what do here, say N.
>>>>
>>>> config VGPU_VFIO
>>>>      tristate
>>>>      depends on VGPU
>>>>      depends on VFIO
>>>>      default n
>>>>
>>>
>>> There could be multiple drivers operating VGPU. Why do we restrict
>>> it to VFIO here?
>>>
>>
>> VGPU_VFIO uses VFIO APIs, it depends on VFIO.
>> I think since there is no other driver than VGPU_VFIO for VGPU devices,
>> we should keep default selection of VGPU_VFIO on VGPU. May be in future
>> if other driver is add ti operate vGPU devices, then default selection
>> can be removed.
>
> What's your plan to support Xen here?
>

No plans to support Xen.

Thanks,
Kirti
Jike Song May 6, 2016, 12:14 p.m. UTC | #11
On 05/05/2016 05:06 PM, Tian, Kevin wrote:
>> From: Kirti Wankhede
>>
>>  >> + * @validate_map_request:	Validate remap pfn request
>>  >> + *				@vdev: vgpu device structure
>>  >> + *				@virtaddr: target user address to start at
>>  >> + *				@pfn: physical address of kernel memory, GPU
>>  >> + *				driver can change if required.
>>  >> + *				@size: size of map area, GPU driver can change
>>  >> + *				the size of map area if desired.
>>  >> + *				@prot: page protection flags for this mapping,
>>  >> + *				GPU driver can change, if required.
>>  >> + *				Returns integer: success (0) or error (< 0)
>>  >
>>  > Was not at all clear to me what this did until I got to patch 2, this
>>  > is actually providing the fault handling for mmap'ing a vGPU mmio BAR.
>>  > Needs a better name or better description.
>>  >
>>
>> If say VMM mmap whole BAR1 of GPU, say 128MB, so fault would occur when
>> BAR1 is tried to access then the size is calculated as:
>> req_size = vma->vm_end - virtaddr
Hi Kirti,

virtaddr is the faulted address, and vma->vm_end is the end vaddr of the mmap-ed 128MB BAR1?

Would you elaborate on why (vm_end - fault_addr) gives the requested size? 


>> Since GPU is being shared by multiple vGPUs, GPU driver might not remap
>> whole BAR1 for only one vGPU device, so would prefer, say map one page
>> at a time. GPU driver returns PAGE_SIZE. This is used by
>> remap_pfn_range(). Now on next access to BAR1 other than that page, we
>> will again get a fault().
>> As the name says this call is to validate from GPU driver for the size
>> and prot of map area. GPU driver can change size and prot for this map area.

If I understand correctly, you are trying to share a physical BAR among
multiple vGPUs, by mapping a single pfn each time a fault happens?

> 
> Currently we don't require such interface for Intel vGPU. Need to think about
> its rationale carefully (still not clear to me). Jike, do you have any thought on
> this?

We need the mmap method of vgpu_device to be implemented, but I was
expecting something else, like calling remap_pfn_range() directly from
the mmap.

>
> Thanks
> Kevin
>

--
Thanks,
Jike
Kirti Wankhede May 6, 2016, 4:16 p.m. UTC | #12
On 5/6/2016 5:44 PM, Jike Song wrote:
> On 05/05/2016 05:06 PM, Tian, Kevin wrote:
>>> From: Kirti Wankhede
>>>
>>>  >> + * @validate_map_request:	Validate remap pfn request
>>>  >> + *				@vdev: vgpu device structure
>>>  >> + *				@virtaddr: target user address to start at
>>>  >> + *				@pfn: physical address of kernel memory, GPU
>>>  >> + *				driver can change if required.
>>>  >> + *				@size: size of map area, GPU driver can change
>>>  >> + *				the size of map area if desired.
>>>  >> + *				@prot: page protection flags for this mapping,
>>>  >> + *				GPU driver can change, if required.
>>>  >> + *				Returns integer: success (0) or error (< 0)
>>>  >
>>>  > Was not at all clear to me what this did until I got to patch 2, this
>>>  > is actually providing the fault handling for mmap'ing a vGPU mmio BAR.
>>>  > Needs a better name or better description.
>>>  >
>>>
>>> If say VMM mmap whole BAR1 of GPU, say 128MB, so fault would occur when
>>> BAR1 is tried to access then the size is calculated as:
>>> req_size = vma->vm_end - virtaddr
> Hi Kirti,
> 
> virtaddr is the faulted one, vma->vm_end the vaddr of the mmap-ed 128MB BAR1?
> 
> Would you elaborate why (vm_end - fault_addr) results the requested size? 
> 
> 

If the first access is at the start address of the mmap-ed region, fault_addr
is vma->vm_start, and (vm_end - vm_start) is then the size of the mmap-ed region.

req_size should not exceed (vm_end - vm_start).


>>> Since GPU is being shared by multiple vGPUs, GPU driver might not remap
>>> whole BAR1 for only one vGPU device, so would prefer, say map one page
>>> at a time. GPU driver returns PAGE_SIZE. This is used by
>>> remap_pfn_range(). Now on next access to BAR1 other than that page, we
>>> will again get a fault().
>>> As the name says this call is to validate from GPU driver for the size
>>> and prot of map area. GPU driver can change size and prot for this map area.
> 
> If I understand correctly, you are trying to share a physical BAR among
> multiple vGPUs, by mapping a single pfn each time, when fault happens?
> 

Yes.

>>
>> Currently we don't require such interface for Intel vGPU. Need to think about
>> its rationale carefully (still not clear to me). Jike, do you have any thought on
>> this?
> 
> We need the mmap method of vgpu_device to be implemented, but I was
> expecting something else, like calling remap_pfn_range() directly from
> the mmap.
>

Calling remap_pfn_range() directly from mmap means you would like to remap
the pfns for the whole BAR1 during mmap, right?

In that case, don't set validate_map_request(); on access to the start of the
mmap-ed address, the first fault will do remap_pfn_range() for
(vm_end - vm_start), along the lines of the sketch below.
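
Something like this (simplified sketch; vgpu_dev_mmap and the bar_base
field are assumed names, not the actual patch 2 code):

static int vgpu_dev_mmap(struct vgpu_device *vdev, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;
	unsigned long pfn = vdev->bar_base >> PAGE_SHIFT;

	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

	/* map the whole region up front; no per-page fault() needed */
	return remap_pfn_range(vma, vma->vm_start, pfn, size,
			       vma->vm_page_prot);
}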

Thanks,
Kirti


>>
>> Thanks
>> Kevin
>>
> 
> --
> Thanks,
> Jike
>
Jike Song May 9, 2016, 12:12 p.m. UTC | #13
On 05/07/2016 12:16 AM, Kirti Wankhede wrote:
> 
> 
> On 5/6/2016 5:44 PM, Jike Song wrote:
>> On 05/05/2016 05:06 PM, Tian, Kevin wrote:
>>>> From: Kirti Wankhede
>>>>
>>>>  >> + * @validate_map_request:	Validate remap pfn request
>>>>  >> + *				@vdev: vgpu device structure
>>>>  >> + *				@virtaddr: target user address to start at
>>>>  >> + *				@pfn: physical address of kernel memory, GPU
>>>>  >> + *				driver can change if required.
>>>>  >> + *				@size: size of map area, GPU driver can change
>>>>  >> + *				the size of map area if desired.
>>>>  >> + *				@prot: page protection flags for this mapping,
>>>>  >> + *				GPU driver can change, if required.
>>>>  >> + *				Returns integer: success (0) or error (< 0)
>>>>  >
>>>>  > Was not at all clear to me what this did until I got to patch 2, this
>>>>  > is actually providing the fault handling for mmap'ing a vGPU mmio BAR.
>>>>  > Needs a better name or better description.
>>>>  >
>>>>
>>>> If say VMM mmap whole BAR1 of GPU, say 128MB, so fault would occur when
>>>> BAR1 is tried to access then the size is calculated as:
>>>> req_size = vma->vm_end - virtaddr
>> Hi Kirti,
>>
>> virtaddr is the faulted one, vma->vm_end the vaddr of the mmap-ed 128MB BAR1?
>>
>> Would you elaborate why (vm_end - fault_addr) results the requested size? 
>>
>>
> 
> If first access is at start address of mmaped address, fault_addr is
> vma->vm_start. Then (vm_end - vm_start) is the size mmapped region.
> 
> req_size should not exceed (vm_end - vm_start).
> 

[Thanks for the kind explanation; I spent some time digging up & recalling the details]


So this consists of two checks:

	1) vm_end >= vm_start
	2) fault_addr >= vm_start && fault_addr < vm_end (vm_end is exclusive)

>>>> Since GPU is being shared by multiple vGPUs, GPU driver might not remap
>>>> whole BAR1 for only one vGPU device, so would prefer, say map one page
>>>> at a time. GPU driver returns PAGE_SIZE. This is used by
>>>> remap_pfn_range(). Now on next access to BAR1 other than that page, we
>>>> will again get a fault().
>>>> As the name says this call is to validate from GPU driver for the size
>>>> and prot of map area. GPU driver can change size and prot for this map area.
>>
>> If I understand correctly, you are trying to share a physical BAR among
>> multiple vGPUs, by mapping a single pfn each time, when fault happens?
>>
> 
> Yes.
> 

Thanks.

For a vma with vm_ops, where each fault proceeds with only one pfn, can
we replace remap_pfn_range() with vm_insert_pfn()? I had a quick check of the
kernel repo; it seems that remap_pfn_range() is only called from fops.mmap,
not from vma->vm_ops->fault.
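
Something along these lines, for example (rough sketch with assumed field
names; the vma would need VM_PFNMAP set at mmap time for vm_insert_pfn()):

static int vgpu_mmio_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct vgpu_device *vdev = vma->vm_private_data;
	unsigned long address = (unsigned long)vmf->virtual_address;
	unsigned long offset = address - vma->vm_start;
	unsigned long pfn = (vdev->bar_base + offset) >> PAGE_SHIFT;

	/* vm_insert_pfn() installs exactly one pfn and is meant for .fault */
	if (vm_insert_pfn(vma, address, pfn))
		return VM_FAULT_SIGBUS;

	return VM_FAULT_NOPAGE;
}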

>>>
>>> Currently we don't require such interface for Intel vGPU. Need to think about
>>> its rationale carefully (still not clear to me). Jike, do you have any thought on
>>> this?
>>
>> We need the mmap method of vgpu_device to be implemented, but I was
>> expecting something else, like calling remap_pfn_range() directly from
>> the mmap.
>>
> 
> Calling remap_pfn_range directly from mmap means you would like to remap
> pfn for whole BAR1 during mmap, right?
> 
> In that case, don't set validate_map_request() and access start of mmap
> address, so that on first access it will do remap_pfn_range() for
> (vm_end - vm_start).

No. I'd like QEMU to be aware that only a *portion* of the physical BAR1
is available for the vGPU, like:

	pGPU	: 1GB size BAR1
	vGPU	: 128MB size BAR1

QEMU has the information about the available size for a particular vGPU
and calls mmap() with that.

I'd say that your implementation is nice and flexible, but in order to
ensure any level of resource QoS, you have to account for it in the
device-model (where validate_map_request is implemented), right?

How about making QEMU aware that only a portion of the MMIO is available?
I would appreciate hearing your opinion on this. Thanks!


> Thanks,
> Kirti
>

--
Thanks,
Jike
Tian, Kevin May 11, 2016, 6:37 a.m. UTC | #14
> From: Kirti Wankhede [mailto:kwankhede@nvidia.com]
> Sent: Thursday, May 05, 2016 8:57 PM
> 
> 
> On 5/5/2016 5:37 PM, Tian, Kevin wrote:
> >> From: Kirti Wankhede [mailto:kwankhede@nvidia.com]
> >> Sent: Thursday, May 05, 2016 6:45 PM
> >>
> >>
> >> On 5/5/2016 2:36 PM, Tian, Kevin wrote:
> >>>> From: Kirti Wankhede
> >>>> Sent: Wednesday, May 04, 2016 9:32 PM
> >>>>
> >>>> Thanks Alex.
> >>>>
> >>>>  >> +config VGPU_VFIO
> >>>>  >> +    tristate
> >>>>  >> +    depends on VGPU
> >>>>  >> +    default n
> >>>>  >> +
> >>>>  >
> >>>>  > This is a little bit convoluted, it seems like everything added in this
> >>>>  > patch is vfio agnostic, it doesn't necessarily care what the consumer
> >>>>  > is.  That makes me think we should only be adding CONFIG_VGPU here and
> >>>>  > it should not depend on CONFIG_VFIO or be enabling CONFIG_VGPU_VFIO.
> >>>>  > The middle config entry is also redundant to the first, just move the
> >>>>  > default line up to the first and remove the rest.
> >>>>
> >>>> CONFIG_VGPU doesn't directly depend on VFIO. CONFIG_VGPU_VFIO is
> >>>> directly dependent on VFIO. But devices created by VGPU core module need
> >>>> a driver to manage those devices. CONFIG_VGPU_VFIO is the driver which
> >>>> will manage vgpu devices. So I think CONFIG_VGPU_VFIO should be enabled
> >>>> by CONFIG_VGPU.
> >>>>
> >>>> This would look like:
> >>>> menuconfig VGPU
> >>>>      tristate "VGPU driver framework"
> >>>>      select VGPU_VFIO
> >>>>      default n
> >>>>      help
> >>>>          VGPU provides a framework to virtualize GPU without SR-IOV cap
> >>>>          See Documentation/vgpu.txt for more details.
> >>>>
> >>>>          If you don't know what do here, say N.
> >>>>
> >>>> config VGPU_VFIO
> >>>>      tristate
> >>>>      depends on VGPU
> >>>>      depends on VFIO
> >>>>      default n
> >>>>
> >>>
> >>> There could be multiple drivers operating VGPU. Why do we restrict
> >>> it to VFIO here?
> >>>
> >>
> >> VGPU_VFIO uses VFIO APIs, it depends on VFIO.
> >> I think since there is no other driver than VGPU_VFIO for VGPU devices,
> >> we should keep default selection of VGPU_VFIO on VGPU. May be in future
> >> if other driver is add ti operate vGPU devices, then default selection
> >> can be removed.
> >
> > What's your plan to support Xen here?
> >
> 
> No plans to support Xen.
> 

Intel will support both KVM and Xen based on this framework.

Also, such hard binding between components is better avoided if this framework
is designed for multiple drivers (that's why you introduced vgpu_register_driver()).

Thanks
Kevin
Tian, Kevin May 12, 2016, 8:22 a.m. UTC | #15
Hi Kirti/Neo, any response to the comment below?

> From: Tian, Kevin
> Sent: Wednesday, May 04, 2016 10:59 AM
> 
> > From: Alex Williamson
> > Sent: Wednesday, May 04, 2016 6:44 AM
> > > +/**
> > > + * struct gpu_device_ops - Structure to be registered for each physical GPU to
> > > + * register the device to vgpu module.
> > > + *
> > > + * @owner:			The module owner.
> > > + * @dev_attr_groups:		Default attributes of the physical device.
> > > + * @vgpu_attr_groups:		Default attributes of the vGPU device.
> > > + * @vgpu_supported_config:	Called to get information about supported vgpu
> types.
> > > + *				@dev : pci device structure of physical GPU.
> > > + *				@config: should return string listing supported config
> > > + *				Returns integer: success (0) or error (< 0)
> > > + * @vgpu_create:		Called to allocate basic resouces in graphics
> 
> It's redundant to have vgpu prefixed to every op here. Same comment
> for later sysfs node.
> 
> > > + *				driver for a particular vgpu.
> > > + *				@dev: physical pci device structure on which vgpu
> > > + *				      should be created
> > > + *				@uuid: VM's uuid for which VM it is intended to
> > > + *				@instance: vgpu instance in that VM
> 
> I didn't quite get @instance here. Is it whatever vendor specific format
> to indicate a vgpu?
> 
> > > + *				@vgpu_params: extra parameters required by GPU driver.
> > > + *				Returns integer: success (0) or error (< 0)
> > > + * @vgpu_destroy:		Called to free resources in graphics driver for
> > > + *				a vgpu instance of that VM.
> > > + *				@dev: physical pci device structure to which
> > > + *				this vgpu points to.
> > > + *				@uuid: VM's uuid for which the vgpu belongs to.
> > > + *				@instance: vgpu instance in that VM
> > > + *				Returns integer: success (0) or error (< 0)
> > > + *				If VM is running and vgpu_destroy is called that
> > > + *				means the vGPU is being hotunpluged. Return error
> > > + *				if VM is running and graphics driver doesn't
> > > + *				support vgpu hotplug.
> > > + * @vgpu_start:			Called to do initiate vGPU initialization
> > > + *				process in graphics driver when VM boots before
> > > + *				qemu starts.
> > > + *				@uuid: VM's UUID which is booting.
> > > + *				Returns integer: success (0) or error (< 0)
> > > + * @vgpu_shutdown:		Called to teardown vGPU related resources for
> > > + *				the VM
> > > + *				@uuid: VM's UUID which is shutting down .
> > > + *				Returns integer: success (0) or error (< 0)
> 
> Can you give some specific example about difference between destroy
> and shutdown? Want to map it correctly into our side, e.g. whether we
> need implement both or just one of them.
> 
> Another optional op is 'stop', allowing physical device to stop scheduling
> vGPU including wait for in-flight DMA done. It would be useful to support
> VM live migration with vGPU assigned.
>
Patch

diff --git a/drivers/Kconfig b/drivers/Kconfig
index d2ac339..5fd9eae 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -122,6 +122,8 @@  source "drivers/uio/Kconfig"
 
 source "drivers/vfio/Kconfig"
 
+source "drivers/vgpu/Kconfig"
+
 source "drivers/vlynq/Kconfig"
 
 source "drivers/virt/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index 8f5d076..36f1110 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -84,6 +84,7 @@  obj-$(CONFIG_FUSION)		+= message/
 obj-y				+= firewire/
 obj-$(CONFIG_UIO)		+= uio/
 obj-$(CONFIG_VFIO)		+= vfio/
+obj-$(CONFIG_VGPU)		+= vgpu/
 obj-y				+= cdrom/
 obj-y				+= auxdisplay/
 obj-$(CONFIG_PCCARD)		+= pcmcia/
diff --git a/drivers/vgpu/Kconfig b/drivers/vgpu/Kconfig
new file mode 100644
index 0000000..792eb48
--- /dev/null
+++ b/drivers/vgpu/Kconfig
@@ -0,0 +1,21 @@ 
+
+menuconfig VGPU
+    tristate "VGPU driver framework"
+    depends on VFIO
+    select VGPU_VFIO
+    help
+        VGPU provides a framework to virtualize GPU without SR-IOV cap
+        See Documentation/vgpu.txt for more details.
+
+        If you don't know what to do here, say N.
+
+config VGPU
+    tristate
+    depends on VFIO
+    default n
+
+config VGPU_VFIO
+    tristate
+    depends on VGPU
+    default n
+
diff --git a/drivers/vgpu/Makefile b/drivers/vgpu/Makefile
new file mode 100644
index 0000000..f5be980
--- /dev/null
+++ b/drivers/vgpu/Makefile
@@ -0,0 +1,4 @@ 
+
+vgpu-y := vgpu-core.o vgpu-sysfs.o vgpu-driver.o
+
+obj-$(CONFIG_VGPU)			+= vgpu.o
diff --git a/drivers/vgpu/vgpu-core.c b/drivers/vgpu/vgpu-core.c
new file mode 100644
index 0000000..1a7d274
--- /dev/null
+++ b/drivers/vgpu/vgpu-core.c
@@ -0,0 +1,425 @@ 
+/*
+ * VGPU Core Driver
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *     Author: Neo Jia <cjia@nvidia.com>
+ *	       Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/cdev.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/uuid.h>
+#include <linux/vfio.h>
+#include <linux/iommu.h>
+#include <linux/sysfs.h>
+#include <linux/ctype.h>
+#include <linux/vgpu.h>
+
+#include "vgpu_private.h"
+
+#define DRIVER_VERSION	"0.1"
+#define DRIVER_AUTHOR	"NVIDIA Corporation"
+#define DRIVER_DESC	"VGPU Core Driver"
+
+/*
+ * #defines
+ */
+
+#define VGPU_CLASS_NAME		"vgpu"
+
+/*
+ * Global Structures
+ */
+
+static struct vgpu {
+	struct list_head    vgpu_devices_list;
+	struct mutex        vgpu_devices_lock;
+	struct list_head    gpu_devices_list;
+	struct mutex        gpu_devices_lock;
+} vgpu;
+
+static struct class vgpu_class;
+
+/*
+ * Functions
+ */
+
+struct vgpu_device *get_vgpu_device_from_group(struct iommu_group *group)
+{
+	struct vgpu_device *vdev = NULL;
+
+	mutex_lock(&vgpu.vgpu_devices_lock);
+	list_for_each_entry(vdev, &vgpu.vgpu_devices_list, list) {
+		if (vdev->group) {
+			if (iommu_group_id(vdev->group) == iommu_group_id(group)) {
+				mutex_unlock(&vgpu.vgpu_devices_lock);
+				return vdev;
+			}
+		}
+	}
+	mutex_unlock(&vgpu.vgpu_devices_lock);
+	return NULL;
+}
+
+EXPORT_SYMBOL_GPL(get_vgpu_device_from_group);
+
+static int vgpu_add_attribute_group(struct device *dev,
+			            const struct attribute_group **groups)
+{
+        return sysfs_create_groups(&dev->kobj, groups);
+}
+
+static void vgpu_remove_attribute_group(struct device *dev,
+			                const struct attribute_group **groups)
+{
+        sysfs_remove_groups(&dev->kobj, groups);
+}
+
+int vgpu_register_device(struct pci_dev *dev, const struct gpu_device_ops *ops)
+{
+	int ret = 0;
+	struct gpu_device *gpu_dev, *tmp;
+
+	if (!dev)
+		return -EINVAL;
+
+        gpu_dev = kzalloc(sizeof(*gpu_dev), GFP_KERNEL);
+        if (!gpu_dev)
+                return -ENOMEM;
+
+	gpu_dev->dev = dev;
+        gpu_dev->ops = ops;
+
+        mutex_lock(&vgpu.gpu_devices_lock);
+
+        /* Check for duplicates */
+        list_for_each_entry(tmp, &vgpu.gpu_devices_list, gpu_next) {
+                if (tmp->dev == dev) {
+			ret = -EINVAL;
+			goto add_error;
+                }
+        }
+
+	ret = vgpu_create_pci_device_files(dev);
+	if (ret)
+		goto add_error;
+
+	ret = vgpu_add_attribute_group(&dev->dev, ops->dev_attr_groups);
+	if (ret)
+		goto add_group_error;
+
+        list_add(&gpu_dev->gpu_next, &vgpu.gpu_devices_list);
+
+	printk(KERN_INFO "VGPU: Registered dev 0x%x 0x%x, class 0x%x\n",
+			 dev->vendor, dev->device, dev->class);
+        mutex_unlock(&vgpu.gpu_devices_lock);
+
+        return 0;
+
+add_group_error:
+	vgpu_remove_pci_device_files(dev);
+add_error:
+	mutex_unlock(&vgpu.gpu_devices_lock);
+	kfree(gpu_dev);
+	return ret;
+
+}
+EXPORT_SYMBOL(vgpu_register_device);
+
+void vgpu_unregister_device(struct pci_dev *dev)
+{
+        struct gpu_device *gpu_dev;
+
+        mutex_lock(&vgpu.gpu_devices_lock);
+        list_for_each_entry(gpu_dev, &vgpu.gpu_devices_list, gpu_next) {
+		struct vgpu_device *vdev = NULL;
+
+                if (gpu_dev->dev != dev)
+			continue;
+
+		printk(KERN_INFO "VGPU: Unregistered dev 0x%x 0x%x, class 0x%x\n",
+				dev->vendor, dev->device, dev->class);
+
+		list_for_each_entry(vdev, &vgpu.vgpu_devices_list, list) {
+			if (vdev->gpu_dev != gpu_dev)
+				continue;
+			destroy_vgpu_device(vdev);
+		}
+		vgpu_remove_attribute_group(&dev->dev, gpu_dev->ops->dev_attr_groups);
+		vgpu_remove_pci_device_files(dev);
+		list_del(&gpu_dev->gpu_next);
+		mutex_unlock(&vgpu.gpu_devices_lock);
+		kfree(gpu_dev);
+		return;
+        }
+        mutex_unlock(&vgpu.gpu_devices_lock);
+}
+EXPORT_SYMBOL(vgpu_unregister_device);
+
+/*
+ * Helper Functions
+ */
+
+static struct vgpu_device *vgpu_device_alloc(uuid_le uuid, int instance, char *name)
+{
+	struct vgpu_device *vgpu_dev = NULL;
+
+	vgpu_dev = kzalloc(sizeof(*vgpu_dev), GFP_KERNEL);
+	if (!vgpu_dev)
+		return ERR_PTR(-ENOMEM);
+
+	kref_init(&vgpu_dev->kref);
+	memcpy(&vgpu_dev->uuid, &uuid, sizeof(uuid_le));
+	vgpu_dev->vgpu_instance = instance;
+	strcpy(vgpu_dev->dev_name, name);
+
+	mutex_lock(&vgpu.vgpu_devices_lock);
+	list_add(&vgpu_dev->list, &vgpu.vgpu_devices_list);
+	mutex_unlock(&vgpu.vgpu_devices_lock);
+
+	return vgpu_dev;
+}
+
+static void vgpu_device_free(struct vgpu_device *vgpu_dev)
+{
+	if (vgpu_dev) {
+		mutex_lock(&vgpu.vgpu_devices_lock);
+		list_del(&vgpu_dev->list);
+		mutex_unlock(&vgpu.vgpu_devices_lock);
+		kfree(vgpu_dev);
+	}
+	return;
+}
+
+struct vgpu_device *vgpu_drv_get_vgpu_device(uuid_le uuid, int instance)
+{
+	struct vgpu_device *vdev = NULL;
+
+	mutex_lock(&vgpu.vgpu_devices_lock);
+	list_for_each_entry(vdev, &vgpu.vgpu_devices_list, list) {
+		if ((uuid_le_cmp(vdev->uuid, uuid) == 0) &&
+		    (vdev->vgpu_instance == instance)) {
+			mutex_unlock(&vgpu.vgpu_devices_lock);
+			return vdev;
+		}
+	}
+	mutex_unlock(&vgpu.vgpu_devices_lock);
+	return NULL;
+}
+
+static void vgpu_device_release(struct device *dev)
+{
+	struct vgpu_device *vgpu_dev = to_vgpu_device(dev);
+	vgpu_device_free(vgpu_dev);
+}
+
+int create_vgpu_device(struct pci_dev *pdev, uuid_le uuid, uint32_t instance, char *vgpu_params)
+{
+	char name[64];
+	int numChar = 0;
+	int retval = 0;
+	struct vgpu_device *vgpu_dev = NULL;
+	struct gpu_device *gpu_dev;
+
+	printk(KERN_INFO "VGPU: %s: device ", __FUNCTION__);
+
+	numChar = sprintf(name, "%pUb-%d", uuid.b, instance);
+	name[numChar] = '\0';
+
+	vgpu_dev = vgpu_device_alloc(uuid, instance, name);
+	if (IS_ERR(vgpu_dev)) {
+		return PTR_ERR(vgpu_dev);
+	}
+
+	vgpu_dev->dev.parent  = &pdev->dev;
+	vgpu_dev->dev.bus     = &vgpu_bus_type;
+	vgpu_dev->dev.release = vgpu_device_release;
+	dev_set_name(&vgpu_dev->dev, "%s", name);
+
+	retval = device_register(&vgpu_dev->dev);
+	if (retval)
+		goto create_failed1;
+
+	printk(KERN_INFO "UUID %pUb \n", vgpu_dev->uuid.b);
+
+	mutex_lock(&vgpu.gpu_devices_lock);
+	list_for_each_entry(gpu_dev, &vgpu.gpu_devices_list, gpu_next) {
+		if (gpu_dev->dev != pdev)
+			continue;
+
+		vgpu_dev->gpu_dev = gpu_dev;
+		if (gpu_dev->ops->vgpu_create) {
+			retval = gpu_dev->ops->vgpu_create(pdev, vgpu_dev->uuid,
+							   instance, vgpu_params);
+			if (retval) {
+				mutex_unlock(&vgpu.gpu_devices_lock);
+				goto create_failed2;
+			}
+		}
+		break;
+	}
+	if (!vgpu_dev->gpu_dev) {
+		retval = -EINVAL;
+		mutex_unlock(&vgpu.gpu_devices_lock);
+		goto create_failed2;
+	}
+
+	mutex_unlock(&vgpu.gpu_devices_lock);
+
+	retval = vgpu_add_attribute_group(&vgpu_dev->dev, gpu_dev->ops->vgpu_attr_groups);
+	if (retval)
+		goto create_attr_error;
+
+	return retval;
+
+create_attr_error:
+	if (gpu_dev->ops->vgpu_destroy)
+		gpu_dev->ops->vgpu_destroy(gpu_dev->dev, vgpu_dev->uuid,
+					   vgpu_dev->vgpu_instance);
+
+create_failed2:
+	/* the release callback, vgpu_device_release(), frees vgpu_dev */
+	device_unregister(&vgpu_dev->dev);
+	return retval;
+
+create_failed1:
+	/* device_register() failed: drop the reference to invoke release */
+	put_device(&vgpu_dev->dev);
+	return retval;
+}
+
+void destroy_vgpu_device(struct vgpu_device *vgpu_dev)
+{
+	struct gpu_device *gpu_dev = vgpu_dev->gpu_dev;
+
+	printk(KERN_INFO "VGPU: destroying device %s ", vgpu_dev->dev_name);
+	if (gpu_dev->ops->vgpu_destroy) {
+		int retval = 0;
+		retval = gpu_dev->ops->vgpu_destroy(gpu_dev->dev,
+						    vgpu_dev->uuid,
+						    vgpu_dev->vgpu_instance);
+	/* if vendor driver doesn't return success that means vendor driver doesn't
+	 * support hot-unplug */
+		if (retval)
+			return;
+	}
+
+	vgpu_remove_attribute_group(&vgpu_dev->dev, gpu_dev->ops->vgpu_attr_groups);
+	device_unregister(&vgpu_dev->dev);
+}
+
+void get_vgpu_supported_types(struct device *dev, char *str)
+{
+	struct gpu_device *gpu_dev;
+
+	mutex_lock(&vgpu.gpu_devices_lock);
+	list_for_each_entry(gpu_dev, &vgpu.gpu_devices_list, gpu_next) {
+		if (&gpu_dev->dev->dev == dev) {
+			if (gpu_dev->ops->vgpu_supported_config)
+				gpu_dev->ops->vgpu_supported_config(gpu_dev->dev, str);
+			break;
+		}
+	}
+	mutex_unlock(&vgpu.gpu_devices_lock);
+}
+
+int vgpu_start_callback(struct vgpu_device *vgpu_dev)
+{
+	int ret = 0;
+	struct gpu_device *gpu_dev = vgpu_dev->gpu_dev;
+
+	mutex_lock(&vgpu.gpu_devices_lock);
+	if (gpu_dev->ops->vgpu_start)
+		ret = gpu_dev->ops->vgpu_start(vgpu_dev->uuid);
+	mutex_unlock(&vgpu.gpu_devices_lock);
+	return ret;
+}
+
+int vgpu_shutdown_callback(struct vgpu_device *vgpu_dev)
+{
+	int ret = 0;
+	struct gpu_device *gpu_dev = vgpu_dev->gpu_dev;
+
+	mutex_lock(&vgpu.gpu_devices_lock);
+	if (gpu_dev->ops->vgpu_shutdown)
+		ret = gpu_dev->ops->vgpu_shutdown(vgpu_dev->uuid);
+	mutex_unlock(&vgpu.gpu_devices_lock);
+	return ret;
+}
+
+static char *vgpu_devnode(struct device *dev, umode_t *mode)
+{
+	return kasprintf(GFP_KERNEL, "vgpu/%s", dev_name(dev));
+}
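+
+/*
+ * With the devnode callback above, udev creates the character device node
+ * under a "vgpu/" subdirectory; given the name set by create_vgpu_device(),
+ * the node appears as (illustrative path):
+ *
+ *	/dev/vgpu/<uuid>-<instance>
+ */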
+
+static void release_vgpubus_dev(struct device *dev)
+{
+	struct vgpu_device *vgpu_dev = to_vgpu_device(dev);
+	destroy_vgpu_device(vgpu_dev);
+}
+
+static struct class vgpu_class = {
+	.name		= VGPU_CLASS_NAME,
+	.owner		= THIS_MODULE,
+	.class_attrs	= vgpu_class_attrs,
+	.dev_groups	= vgpu_dev_groups,
+	.devnode	= vgpu_devnode,
+	.dev_release    = release_vgpubus_dev,
+};
+
+static int __init vgpu_init(void)
+{
+	int rc = 0;
+
+	memset(&vgpu, 0 , sizeof(vgpu));
+
+	mutex_init(&vgpu.vgpu_devices_lock);
+	INIT_LIST_HEAD(&vgpu.vgpu_devices_list);
+	mutex_init(&vgpu.gpu_devices_lock);
+	INIT_LIST_HEAD(&vgpu.gpu_devices_list);
+
+	rc = class_register(&vgpu_class);
+	if (rc < 0) {
+		printk(KERN_ERR "Error: failed to register vgpu class\n");
+		goto failed1;
+	}
+
+	rc = vgpu_bus_register();
+	if (rc < 0) {
+		printk(KERN_ERR "Error: failed to register vgpu bus\n");
+		class_unregister(&vgpu_class);
+		goto failed1;
+	}
+
+	request_module_nowait("vgpu_vfio");
+
+	return 0;
+
+failed1:
+	return rc;
+}
+
+static void __exit vgpu_exit(void)
+{
+	vgpu_bus_unregister();
+	class_unregister(&vgpu_class);
+}
+
+module_init(vgpu_init);
+module_exit(vgpu_exit);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/drivers/vgpu/vgpu-driver.c b/drivers/vgpu/vgpu-driver.c
new file mode 100644
index 0000000..c4c2e9f
--- /dev/null
+++ b/drivers/vgpu/vgpu-driver.c
@@ -0,0 +1,136 @@ 
+/*
+ * VGPU driver
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *     Author: Neo Jia <cjia@nvidia.com>
+ *	       Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/vfio.h>
+#include <linux/iommu.h>
+#include <linux/sysfs.h>
+#include <linux/ctype.h>
+#include <linux/vgpu.h>
+
+#include "vgpu_private.h"
+
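+/*
+ * Each vGPU device gets its own IOMMU group here so that, as described in
+ * the design overview, the device can then be added to a VFIO group by the
+ * VFIO bus driver for vGPU.
+ */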
+static int vgpu_device_attach_iommu(struct vgpu_device *vgpu_dev)
+{
+	int retval;
+	struct iommu_group *group;
+
+	group = iommu_group_alloc();
+	if (IS_ERR(group)) {
+		printk(KERN_ERR "VGPU: failed to allocate group!\n");
+		return PTR_ERR(group);
+	}
+
+	retval = iommu_group_add_device(group, &vgpu_dev->dev);
+	if (retval) {
+		printk(KERN_ERR "VGPU: failed to add dev to group!\n");
+		iommu_group_put(group);
+		return retval;
+	}
+
+	vgpu_dev->group = group;
+
+	printk(KERN_INFO "VGPU: group_id = %d\n", iommu_group_id(group));
+	return retval;
+}
+
+static void vgpu_device_detach_iommu(struct vgpu_device *vgpu_dev)
+{
+	printk(KERN_INFO "VGPU: detaching iommu\n");
+	iommu_group_remove_device(&vgpu_dev->dev);
+	iommu_group_put(vgpu_dev->group);
+}
+
+static int vgpu_device_probe(struct device *dev)
+{
+	struct vgpu_driver *drv = to_vgpu_driver(dev->driver);
+	struct vgpu_device *vgpu_dev = to_vgpu_device(dev);
+	int status = 0;
+
+	status = vgpu_device_attach_iommu(vgpu_dev);
+	if (status) {
+		printk(KERN_ERR "Failed to attach IOMMU\n");
+		return status;
+	}
+
+	if (drv && drv->probe) {
+		status = drv->probe(dev);
+		if (status)
+			vgpu_device_detach_iommu(vgpu_dev);
+	}
+
+	return status;
+}
+
+static int vgpu_device_remove(struct device *dev)
+{
+	struct vgpu_driver *drv = to_vgpu_driver(dev->driver);
+	struct vgpu_device *vgpu_dev = to_vgpu_device(dev);
+	int status = 0;
+
+	if (drv && drv->remove) {
+		drv->remove(dev);
+	}
+
+	vgpu_device_detach_iommu(vgpu_dev);
+
+	return status;
+}
+
+struct bus_type vgpu_bus_type = {
+	.name		= "vgpu",
+	.probe		= vgpu_device_probe,
+	.remove		= vgpu_device_remove,
+};
+EXPORT_SYMBOL_GPL(vgpu_bus_type);
+
+/**
+ * vgpu_register_driver - register a new vGPU driver
+ * @drv: the driver to register
+ * @owner: owner module of the driver to register
+ *
+ * Returns a negative value on error, otherwise 0.
+ */
+int vgpu_register_driver(struct vgpu_driver *drv, struct module *owner)
+{
+	/* initialize common driver fields */
+	drv->driver.name = drv->name;
+	drv->driver.bus = &vgpu_bus_type;
+	drv->driver.owner = owner;
+
+	/* register with core */
+	return driver_register(&drv->driver);
+}
+EXPORT_SYMBOL(vgpu_register_driver);
+
+/**
+ * vgpu_unregister_driver - unregister vGPU driver
+ * @drv: the driver to unregister
+ *
+ */
+void vgpu_unregister_driver(struct vgpu_driver *drv)
+{
+	driver_unregister(&drv->driver);
+}
+EXPORT_SYMBOL(vgpu_unregister_driver);
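+
+/*
+ * Example usage (illustrative sketch only, not part of this interface): a
+ * VFIO bus driver such as vgpu_vfio.ko registers itself as below; the
+ * vgpu_vfio_probe()/vgpu_vfio_remove() names are hypothetical.
+ *
+ *	static struct vgpu_driver vgpu_vfio_driver = {
+ *		.name   = "vgpu_vfio",
+ *		.probe  = vgpu_vfio_probe,
+ *		.remove = vgpu_vfio_remove,
+ *	};
+ *
+ *	static int __init vgpu_vfio_init(void)
+ *	{
+ *		return vgpu_register_driver(&vgpu_vfio_driver, THIS_MODULE);
+ *	}
+ */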
+
+int vgpu_bus_register(void)
+{
+	return bus_register(&vgpu_bus_type);
+}
+
+void vgpu_bus_unregister(void)
+{
+	bus_unregister(&vgpu_bus_type);
+}
diff --git a/drivers/vgpu/vgpu-sysfs.c b/drivers/vgpu/vgpu-sysfs.c
new file mode 100644
index 0000000..b740f9a
--- /dev/null
+++ b/drivers/vgpu/vgpu-sysfs.c
@@ -0,0 +1,365 @@ 
+/*
+ * File attributes for vGPU devices
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *     Author: Neo Jia <cjia@nvidia.com>
+ *	       Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/sysfs.h>
+#include <linux/ctype.h>
+#include <linux/uuid.h>
+#include <linux/vfio.h>
+#include <linux/vgpu.h>
+
+#include "vgpu_private.h"
+
+/* Prototypes */
+
+static ssize_t vgpu_supported_types_show(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf);
+static DEVICE_ATTR_RO(vgpu_supported_types);
+
+static ssize_t vgpu_create_store(struct device *dev,
+				 struct device_attribute *attr,
+				 const char *buf, size_t count);
+static DEVICE_ATTR_WO(vgpu_create);
+
+static ssize_t vgpu_destroy_store(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count);
+static DEVICE_ATTR_WO(vgpu_destroy);
+
+
+/* Static functions */
+
+static bool is_uuid_sep(char sep)
+{
+	return sep == '\n' || sep == '-' || sep == ':' || sep == '\0';
+}
+
+
+static int uuid_parse(const char *str, uuid_le *uuid)
+{
+	int i;
+
+	if (strlen(str) < 36)
+		return -EINVAL;
+
+	for (i = 0; i < 16; i++) {
+		if (!isxdigit(str[0]) || !isxdigit(str[1])) {
+			printk(KERN_ERR "%s: invalid UUID %s\n", __func__, str);
+			return -EINVAL;
+		}
+
+		uuid->b[i] = (hex_to_bin(str[0]) << 4) | hex_to_bin(str[1]);
+		str += 2;
+		if (is_uuid_sep(*str))
+			str++;
+	}
+
+	return 0;
+}
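+
+/*
+ * uuid_parse() above accepts the canonical textual UUID form, tolerating a
+ * single '-', ':' or '\n' separator between hex byte pairs, e.g.:
+ *
+ *	12345678-1234-1234-1234-123456789abc
+ */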
+
+
+/* Functions */
+static ssize_t vgpu_supported_types_show(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf)
+{
+	char *str;
+	ssize_t n;
+
+	str = kzalloc(512, GFP_KERNEL);
+	if (!str)
+		return -ENOMEM;
+
+	get_vgpu_supported_types(dev, str);
+
+	n = sprintf(buf, "%s\n", str);
+	kfree(str);
+
+	return n;
+}
+
+static ssize_t vgpu_create_store(struct device *dev,
+				 struct device_attribute *attr,
+				 const char *buf, size_t count)
+{
+	char *str, *pstr;
+	char *uuid_str, *instance_str, *vgpu_params = NULL;
+	uuid_le uuid;
+	uint32_t instance;
+	struct pci_dev *pdev;
+	int ret = 0;
+
+	pstr = str = kstrndup(buf, count, GFP_KERNEL);
+
+	if (!str)
+		return -ENOMEM;
+
+	uuid_str = strsep(&str, ":");
+	if (!uuid_str) {
+		printk(KERN_ERR "%s: empty UUID string %s\n", __func__, buf);
+		ret = -EINVAL;
+		goto create_error;
+	}
+
+	if (!str) {
+		printk(KERN_ERR "%s: vgpu instance not specified %s\n",
+		       __func__, buf);
+		ret = -EINVAL;
+		goto create_error;
+	}
+
+	instance_str = strsep(&str, ":");
+	if (!instance_str) {
+		printk(KERN_ERR "%s: empty instance string %s\n",
+		       __func__, buf);
+		ret = -EINVAL;
+		goto create_error;
+	}
+
+	ret = kstrtouint(instance_str, 0, &instance);
+	if (ret) {
+		printk(KERN_ERR "%s: invalid instance %s\n", __func__, buf);
+		ret = -EINVAL;
+		goto create_error;
+	}
+
+	if (!str) {
+		printk(KERN_ERR "%s: vgpu params not specified %s\n",
+		       __func__, buf);
+		ret = -EINVAL;
+		goto create_error;
+	}
+
+	vgpu_params = kstrdup(str, GFP_KERNEL);
+	if (!vgpu_params) {
+		ret = -ENOMEM;
+		goto create_error;
+	}
+
+	if (uuid_parse(uuid_str, &uuid) < 0) {
+		printk(KERN_ERR "%s: UUID parse error %s\n", __func__, buf);
+		ret = -EINVAL;
+		goto create_error;
+	}
+
+	if (!dev_is_pci(dev)) {
+		ret = -ENODEV;
+		goto create_error;
+	}
+
+	pdev = to_pci_dev(dev);
+
+	if (create_vgpu_device(pdev, uuid, instance, vgpu_params) < 0) {
+		printk(KERN_ERR "%s: vgpu create error %s\n", __func__, buf);
+		ret = -EINVAL;
+		goto create_error;
+	}
+	ret = count;
+
+create_error:
+	kfree(vgpu_params);
+	kfree(pstr);
+	return ret;
+}
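+
+/*
+ * Example (illustrative; the params string is defined by the vendor GPU
+ * driver, "vgpu_type=1" is hypothetical): create vGPU instance 0 for a VM
+ * by writing "<uuid>:<instance>:<params>" to the physical GPU's
+ * vgpu_create attribute:
+ *
+ *	# echo "12345678-1234-1234-1234-123456789abc:0:vgpu_type=1" > \
+ *		/sys/bus/pci/devices/0000:01:00.0/vgpu_create
+ */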
+
+static ssize_t vgpu_destroy_store(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
+{
+	char *uuid_str, *str, *pstr;
+	uuid_le uuid;
+	unsigned int instance;
+	struct vgpu_device *vgpu_dev = NULL;
+	ssize_t ret = count;
+
+	pstr = str = kstrndup(buf, count, GFP_KERNEL);
+	if (!str)
+		return -ENOMEM;
+
+	uuid_str = strsep(&str, ":");
+	if (!uuid_str) {
+		printk(KERN_ERR "%s: empty UUID string %s\n", __func__, buf);
+		ret = -EINVAL;
+		goto destroy_error;
+	}
+
+	if (!str) {
+		printk(KERN_ERR "%s: instance not specified %s\n",
+		       __func__, buf);
+		ret = -EINVAL;
+		goto destroy_error;
+	}
+
+	if (kstrtouint(str, 0, &instance)) {
+		printk(KERN_ERR "%s: invalid instance %s\n", __func__, buf);
+		ret = -EINVAL;
+		goto destroy_error;
+	}
+
+	if (uuid_parse(uuid_str, &uuid) < 0) {
+		printk(KERN_ERR "%s: UUID parse error %s\n", __func__, buf);
+		ret = -EINVAL;
+		goto destroy_error;
+	}
+
+	printk(KERN_INFO "%s: UUID %pUb - %d\n", __func__, uuid.b, instance);
+
+	vgpu_dev = vgpu_drv_get_vgpu_device(uuid, instance);
+	if (vgpu_dev)
+		destroy_vgpu_device(vgpu_dev);
+
+destroy_error:
+	kfree(pstr);
+	return ret;
+}
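+
+/*
+ * Example (illustrative): destroy the vGPU created above by writing
+ * "<uuid>:<instance>" to the vgpu_destroy attribute:
+ *
+ *	# echo "12345678-1234-1234-1234-123456789abc:0" > \
+ *		/sys/bus/pci/devices/0000:01:00.0/vgpu_destroy
+ */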
+
+static ssize_t
+vgpu_uuid_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct vgpu_device *vgpu_dev = to_vgpu_device(dev);
+
+	if (vgpu_dev)
+		return sprintf(buf, "%pUb\n", vgpu_dev->uuid.b);
+
+	return sprintf(buf, "\n");
+}
+
+static DEVICE_ATTR_RO(vgpu_uuid);
+
+static ssize_t
+vgpu_group_id_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct vgpu_device *vgpu_dev = to_vgpu_device(dev);
+
+	if (vgpu_dev && vgpu_dev->group)
+		return sprintf(buf, "%d\n", iommu_group_id(vgpu_dev->group));
+
+	return sprintf(buf, "\n");
+}
+
+static DEVICE_ATTR_RO(vgpu_group_id);
+
+
+static struct attribute *vgpu_dev_attrs[] = {
+	&dev_attr_vgpu_uuid.attr,
+	&dev_attr_vgpu_group_id.attr,
+	NULL,
+};
+
+static const struct attribute_group vgpu_dev_group = {
+	.attrs = vgpu_dev_attrs,
+};
+
+const struct attribute_group *vgpu_dev_groups[] = {
+	&vgpu_dev_group,
+	NULL,
+};
+
+
+ssize_t vgpu_start_store(struct class *class, struct class_attribute *attr,
+			 const char *buf, size_t count)
+{
+	char *uuid_str;
+	uuid_le uuid;
+	struct vgpu_device *vgpu_dev = NULL;
+	int ret;
+
+	uuid_str = kstrndup(buf, count, GFP_KERNEL);
+	if (!uuid_str)
+		return -ENOMEM;
+
+	if (uuid_parse(uuid_str, &uuid) < 0) {
+		printk(KERN_ERR "%s: UUID parse error %s\n", __func__, buf);
+		kfree(uuid_str);
+		return -EINVAL;
+	}
+	kfree(uuid_str);
+
+	vgpu_dev = vgpu_drv_get_vgpu_device(uuid, 0);
+
+	if (vgpu_dev && dev_is_vgpu(&vgpu_dev->dev)) {
+		kobject_uevent(&vgpu_dev->dev.kobj, KOBJ_ONLINE);
+
+		ret = vgpu_start_callback(vgpu_dev);
+		if (ret < 0) {
+			printk(KERN_ERR "%s: vgpu_start callback failed %d\n",
+			       __func__, ret);
+			return ret;
+		}
+	}
+
+	return count;
+}
+
+ssize_t vgpu_shutdown_store(struct class *class, struct class_attribute *attr,
+			    const char *buf, size_t count)
+{
+	char *uuid_str;
+	uuid_le uuid;
+	struct vgpu_device *vgpu_dev = NULL;
+	int ret;
+
+	uuid_str = kstrndup(buf, count, GFP_KERNEL);
+	if (!uuid_str)
+		return -ENOMEM;
+
+	if (uuid_parse(uuid_str, &uuid) < 0) {
+		printk(KERN_ERR "%s: UUID parse error %s\n", __func__, buf);
+		kfree(uuid_str);
+		return -EINVAL;
+	}
+	kfree(uuid_str);
+
+	vgpu_dev = vgpu_drv_get_vgpu_device(uuid, 0);
+
+	if (vgpu_dev && dev_is_vgpu(&vgpu_dev->dev)) {
+		kobject_uevent(&vgpu_dev->dev.kobj, KOBJ_OFFLINE);
+
+		ret = vgpu_shutdown_callback(vgpu_dev);
+		if (ret < 0) {
+			printk(KERN_ERR "%s: vgpu_shutdown callback failed %d\n",
+			       __func__, ret);
+			return ret;
+		}
+	}
+
+	return count;
+}
+
+struct class_attribute vgpu_class_attrs[] = {
+	__ATTR_WO(vgpu_start),
+	__ATTR_WO(vgpu_shutdown),
+	__ATTR_NULL
+};
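+
+/*
+ * Example (illustrative, assuming VGPU_CLASS_NAME is "vgpu"): the class
+ * attributes above take a VM UUID and are written before QEMU starts and
+ * after the VM is torn down, respectively:
+ *
+ *	# echo "12345678-1234-1234-1234-123456789abc" > /sys/class/vgpu/vgpu_start
+ *	# echo "12345678-1234-1234-1234-123456789abc" > /sys/class/vgpu/vgpu_shutdown
+ */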
+
+int vgpu_create_pci_device_files(struct pci_dev *dev)
+{
+	int retval;
+
+	retval = sysfs_create_file(&dev->dev.kobj,
+				   &dev_attr_vgpu_supported_types.attr);
+	if (retval) {
+		printk(KERN_ERR "VGPU: failed to create vgpu_supported_types sysfs entry\n");
+		return retval;
+	}
+
+	retval = sysfs_create_file(&dev->dev.kobj, &dev_attr_vgpu_create.attr);
+	if (retval) {
+		printk(KERN_ERR "VGPU: failed to create vgpu_create sysfs entry\n");
+		goto remove_supported_types;
+	}
+
+	retval = sysfs_create_file(&dev->dev.kobj, &dev_attr_vgpu_destroy.attr);
+	if (retval) {
+		printk(KERN_ERR "VGPU: failed to create vgpu_destroy sysfs entry\n");
+		goto remove_create;
+	}
+
+	return 0;
+
+remove_create:
+	sysfs_remove_file(&dev->dev.kobj, &dev_attr_vgpu_create.attr);
+remove_supported_types:
+	sysfs_remove_file(&dev->dev.kobj, &dev_attr_vgpu_supported_types.attr);
+	return retval;
+}
+
+
+void vgpu_remove_pci_device_files(struct pci_dev *dev)
+{
+	sysfs_remove_file(&dev->dev.kobj, &dev_attr_vgpu_supported_types.attr);
+	sysfs_remove_file(&dev->dev.kobj, &dev_attr_vgpu_create.attr);
+	sysfs_remove_file(&dev->dev.kobj, &dev_attr_vgpu_destroy.attr);
+}
diff --git a/drivers/vgpu/vgpu_private.h b/drivers/vgpu/vgpu_private.h
new file mode 100644
index 0000000..35158ef
--- /dev/null
+++ b/drivers/vgpu/vgpu_private.h
@@ -0,0 +1,36 @@ 
+/*
+ * VGPU internal definitions
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *     Author:
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef VGPU_PRIVATE_H
+#define VGPU_PRIVATE_H
+
+struct vgpu_device *vgpu_drv_get_vgpu_device(uuid_le uuid, int instance);
+
+int  create_vgpu_device(struct pci_dev *pdev, uuid_le uuid, uint32_t instance,
+		       char *vgpu_params);
+void destroy_vgpu_device(struct vgpu_device *vgpu_dev);
+
+int  vgpu_bus_register(void);
+void vgpu_bus_unregister(void);
+
+/* Function prototypes for vgpu_sysfs */
+
+extern struct class_attribute vgpu_class_attrs[];
+extern const struct attribute_group *vgpu_dev_groups[];
+
+int  vgpu_create_pci_device_files(struct pci_dev *dev);
+void vgpu_remove_pci_device_files(struct pci_dev *dev);
+
+void get_vgpu_supported_types(struct device *dev, char *str);
+int  vgpu_start_callback(struct vgpu_device *vgpu_dev);
+int  vgpu_shutdown_callback(struct vgpu_device *vgpu_dev);
+
+#endif /* VGPU_PRIVATE_H */
diff --git a/include/linux/vgpu.h b/include/linux/vgpu.h
new file mode 100644
index 0000000..03a77cf
--- /dev/null
+++ b/include/linux/vgpu.h
@@ -0,0 +1,216 @@ 
+/*
+ * VGPU definition
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *     Author:
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef VGPU_H
+#define VGPU_H
+
+/* Common data structures */
+
+struct pci_bar_info {
+	uint64_t start;
+	uint64_t size;
+	uint32_t flags;
+};
+
+enum vgpu_emul_space_e {
+	vgpu_emul_space_config = 0,	/* PCI configuration space */
+	vgpu_emul_space_io = 1,		/* I/O register space */
+	vgpu_emul_space_mmio = 2	/* Memory-mapped I/O space */
+};
+
+struct gpu_device;
+
+/*
+ * VGPU device
+ */
+struct vgpu_device {
+	struct kref		kref;
+	struct device		dev;
+	struct gpu_device	*gpu_dev;
+	struct iommu_group	*group;
+#define DEVICE_NAME_LEN		(64)
+	char			dev_name[DEVICE_NAME_LEN];
+	uuid_le			uuid;
+	uint32_t		vgpu_instance;
+	struct device_attribute	*dev_attr_vgpu_status;
+	int			vgpu_device_status;
+
+	void			*driver_data;
+
+	struct list_head	list;
+};
+
+
+/**
+ * struct gpu_device_ops - Structure to be registered for each physical GPU to
+ * register the device to vgpu module.
+ *
+ * @owner:			The module owner.
+ * @dev_attr_groups:		Default attributes of the physical device.
+ * @vgpu_attr_groups:		Default attributes of the vGPU device.
+ * @vgpu_supported_config:	Called to get information about the supported
+ *				vgpu types.
+ *				@dev: pci device structure of the physical GPU.
+ *				@config: returns a string listing the supported
+ *				configurations.
+ *				Returns integer: success (0) or error (< 0)
+ * @vgpu_create:		Called to allocate basic resources in the
+ *				graphics driver for a particular vgpu.
+ *				@dev: physical pci device structure on which the
+ *				vgpu should be created.
+ *				@uuid: uuid of the VM the vgpu is intended for.
+ *				@instance: vgpu instance in that VM
+ *				@vgpu_params: extra parameters required by the
+ *				GPU driver.
+ *				Returns integer: success (0) or error (< 0)
+ * @vgpu_destroy:		Called to free resources in the graphics driver
+ *				for a vgpu instance of that VM.
+ *				@dev: physical pci device structure to which
+ *				this vgpu points.
+ *				@uuid: uuid of the VM the vgpu belongs to.
+ *				@instance: vgpu instance in that VM
+ *				Returns integer: success (0) or error (< 0)
+ *				If vgpu_destroy is called while the VM is
+ *				running, the vGPU is being hot-unplugged;
+ *				return an error if the graphics driver doesn't
+ *				support vgpu hotplug.
+ * @vgpu_start:			Called to initiate the vGPU initialization
+ *				process in the graphics driver when the VM
+ *				boots, before QEMU starts.
+ *				@uuid: uuid of the VM that is booting.
+ *				Returns integer: success (0) or error (< 0)
+ * @vgpu_shutdown:		Called to tear down vGPU-related resources for
+ *				the VM.
+ *				@uuid: uuid of the VM that is shutting down.
+ *				Returns integer: success (0) or error (< 0)
+ * @read:			Read emulation callback
+ *				@vdev: vgpu device structure
+ *				@buf: read buffer
+ *				@count: number of bytes to read
+ *				@address_space: specifies for which address
+ *				space the request is: pci_config_space, IO
+ *				register space or MMIO space.
+ *				@pos: offset from base address.
+ *				Returns number of bytes read on success, or an
+ *				error.
+ * @write:			Write emulation callback
+ *				@vdev: vgpu device structure
+ *				@buf: write buffer
+ *				@count: number of bytes to write
+ *				@address_space: specifies for which address
+ *				space the request is: pci_config_space, IO
+ *				register space or MMIO space.
+ *				@pos: offset from base address.
+ *				Returns number of bytes written on success, or
+ *				an error.
+ * @vgpu_set_irqs:		Called to pass on the interrupt configuration
+ *				that QEMU sets.
+ *				@vdev: vgpu device structure
+ *				@flags, index, start, count and *data: same as
+ *				for struct vfio_irq_set of the
+ *				VFIO_DEVICE_SET_IRQS API.
+ * @vgpu_bar_info:		Called to get BAR size and flags of vGPU device.
+ *				@vdev: vgpu device structure
+ *				@bar_index: BAR index
+ *				@bar_info: output, returns size and flags of
+ *				requested BAR
+ *				Returns integer: success (0) or error (< 0)
+ * @validate_map_request:	Validate remap pfn request
+ *				@vdev: vgpu device structure
+ *				@virtaddr: target user address to start at
+ *				@pfn: physical address of kernel memory, GPU
+ *				driver can change if required.
+ *				@size: size of map area, GPU driver can change
+ *				the size of map area if desired.
+ *				@prot: page protection flags for this mapping,
+ *				GPU driver can change, if required.
+ *				Returns integer: success (0) or error (< 0)
+ *
+ * A physical GPU that supports vGPU should be registered with the vgpu
+ * module using a gpu_device_ops structure.
+ */
+
+struct gpu_device_ops {
+	struct module   *owner;
+	const struct attribute_group **dev_attr_groups;
+	const struct attribute_group **vgpu_attr_groups;
+
+	int	(*vgpu_supported_config)(struct pci_dev *dev, char *config);
+	int     (*vgpu_create)(struct pci_dev *dev, uuid_le uuid,
+			       uint32_t instance, char *vgpu_params);
+	int     (*vgpu_destroy)(struct pci_dev *dev, uuid_le uuid,
+			        uint32_t instance);
+
+	int     (*vgpu_start)(uuid_le uuid);
+	int     (*vgpu_shutdown)(uuid_le uuid);
+
+	ssize_t (*read) (struct vgpu_device *vdev, char *buf, size_t count,
+			 uint32_t address_space, loff_t pos);
+	ssize_t (*write)(struct vgpu_device *vdev, char *buf, size_t count,
+			 uint32_t address_space, loff_t pos);
+	int     (*vgpu_set_irqs)(struct vgpu_device *vdev, uint32_t flags,
+				 unsigned index, unsigned start, unsigned count,
+				 void *data);
+	int	(*vgpu_bar_info)(struct vgpu_device *vdev, int bar_index,
+				 struct pci_bar_info *bar_info);
+	int	(*validate_map_request)(struct vgpu_device *vdev,
+					unsigned long virtaddr,
+					unsigned long *pfn, unsigned long *size,
+					pgprot_t *prot);
+};
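+
+/*
+ * Example (illustrative sketch, not a definitive implementation): a vendor
+ * GPU driver fills in gpu_device_ops and registers each physical device it
+ * probes; all example_* names are hypothetical.
+ *
+ *	static const struct gpu_device_ops example_vgpu_ops = {
+ *		.owner			= THIS_MODULE,
+ *		.vgpu_supported_config	= example_vgpu_supported_config,
+ *		.vgpu_create		= example_vgpu_create,
+ *		.vgpu_destroy		= example_vgpu_destroy,
+ *		.read			= example_vgpu_read,
+ *		.write			= example_vgpu_write,
+ *	};
+ *
+ * In the vendor driver's PCI probe routine:
+ *
+ *	ret = vgpu_register_device(pdev, &example_vgpu_ops);
+ */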
+
+/*
+ * Physical GPU
+ */
+struct gpu_device {
+	struct pci_dev                  *dev;
+	const struct gpu_device_ops     *ops;
+	struct list_head                gpu_next;
+};
+
+/**
+ * struct vgpu_driver - vGPU device driver
+ * @name: driver name
+ * @probe: called when new device created
+ * @remove: called when device removed
+ * @driver: device driver structure
+ *
+ **/
+struct vgpu_driver {
+	const char *name;
+	int  (*probe)  (struct device *dev);
+	void (*remove) (struct device *dev);
+	struct device_driver	driver;
+};
+
+static inline struct vgpu_driver *to_vgpu_driver(struct device_driver *drv)
+{
+	return drv ? container_of(drv, struct vgpu_driver, driver) : NULL;
+}
+
+static inline struct vgpu_device *to_vgpu_device(struct device *dev)
+{
+	return dev ? container_of(dev, struct vgpu_device, dev) : NULL;
+}
+
+extern struct bus_type vgpu_bus_type;
+
+#define dev_is_vgpu(d) ((d)->bus == &vgpu_bus_type)
+
+extern int  vgpu_register_device(struct pci_dev *dev,
+				 const struct gpu_device_ops *ops);
+extern void vgpu_unregister_device(struct pci_dev *dev);
+
+extern int  vgpu_register_driver(struct vgpu_driver *drv, struct module *owner);
+extern void vgpu_unregister_driver(struct vgpu_driver *drv);
+
+extern int vgpu_map_virtual_bar(uint64_t virt_bar_addr, uint64_t phys_bar_addr,
+				uint32_t len, uint32_t flags);
+extern int vgpu_dma_do_translate(dma_addr_t *gfn_buffer, uint32_t count);
+
+struct vgpu_device *get_vgpu_device_from_group(struct iommu_group *group);
+
+#endif /* VGPU_H */