
[RFC,v2,3/3] VFIO: Type1 IOMMU mapping support for vGPU

Message ID 1456244666-25369-3-git-send-email-kwankhede@nvidia.com
State New

Commit Message

Kirti Wankhede Feb. 23, 2016, 4:24 p.m. UTC
The aim of this module is to pin and unpin guest memory.
It provides an interface that a GPU driver can use to map guest
physical memory into its kernel-space driver.
Currently this module duplicates code from vfio_iommu_type1.c.
We are working on refactoring these functions to reuse the existing code in
vfio_iommu_type1.c, and with that we will add an API to unpin pages.
This is posted for reference, to allow review of the overall vGPU design.

Thanks,
Kirti.

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Signed-off-by: Neo Jia <cjia@nvidia.com>
---
 drivers/vgpu/Makefile                |    1 +
 drivers/vgpu/vfio_iommu_type1_vgpu.c |  423 ++++++++++++++++++++++++++++++++++
 2 files changed, 424 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vgpu/vfio_iommu_type1_vgpu.c

Comments

Jike Song March 2, 2016, 8:38 a.m. UTC | #1
On 02/24/2016 12:24 AM, Kirti Wankhede wrote:
> The aim of this module is to pin and unpin guest memory.
> It provides an interface that a GPU driver can use to map guest
> physical memory into its kernel-space driver.
> Currently this module duplicates code from vfio_iommu_type1.c.
> We are working on refactoring these functions to reuse the existing code in
> vfio_iommu_type1.c, and with that we will add an API to unpin pages.
> This is posted for reference, to allow review of the overall vGPU design.
> 
> Thanks,
> Kirti.
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Signed-off-by: Neo Jia <cjia@nvidia.com>
> ---
>  drivers/vgpu/Makefile                |    1 +
>  drivers/vgpu/vfio_iommu_type1_vgpu.c |  423 ++++++++++++++++++++++++++++++++++
>  2 files changed, 424 insertions(+), 0 deletions(-)
>  create mode 100644 drivers/vgpu/vfio_iommu_type1_vgpu.c
> 
> diff --git a/drivers/vgpu/Makefile b/drivers/vgpu/Makefile
> index a0a2655..8ace18d 100644
> --- a/drivers/vgpu/Makefile
> +++ b/drivers/vgpu/Makefile
> @@ -3,3 +3,4 @@ vgpu-y := vgpu-core.o vgpu-sysfs.o vgpu-driver.o
>  
>  obj-$(CONFIG_VGPU)			+= vgpu.o
>  obj-$(CONFIG_VGPU_VFIO)                 += vgpu_vfio.o
> +obj-$(CONFIG_VFIO_IOMMU_TYPE1_VGPU)     += vfio_iommu_type1_vgpu.o
> diff --git a/drivers/vgpu/vfio_iommu_type1_vgpu.c b/drivers/vgpu/vfio_iommu_type1_vgpu.c
> new file mode 100644
> index 0000000..0b36ae5
> --- /dev/null
> +++ b/drivers/vgpu/vfio_iommu_type1_vgpu.c
> @@ -0,0 +1,423 @@
> +/*
> + * VGPU : IOMMU DMA mapping support for VGPU
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *     Author: Neo Jia <cjia@nvidia.com>
> + *	       Kirti Wankhede <kwankhede@nvidia.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/init.h>
> +#include <linux/module.h>
> +#include <linux/compat.h>
> +#include <linux/device.h>
> +#include <linux/kernel.h>
> +#include <linux/fs.h>
> +#include <linux/miscdevice.h>
> +#include <linux/sched.h>
> +#include <linux/wait.h>
> +#include <linux/uuid.h>
> +#include <linux/vfio.h>
> +#include <linux/iommu.h>
> +#include <linux/vgpu.h>
> +
> +#include "vgpu_private.h"
> +
> +#define DRIVER_VERSION	"0.1"
> +#define DRIVER_AUTHOR	"NVIDIA Corporation"
> +#define DRIVER_DESC     "VGPU Type1 IOMMU driver for VFIO"
> +
> +// VFIO structures
> +
> +struct vfio_iommu_vgpu {
> +	struct mutex lock;
> +	struct iommu_group *group;
> +	struct vgpu_device *vgpu_dev;
> +	struct rb_root dma_list;
> +	struct mm_struct * vm_mm;
> +};
> +
> +struct vgpu_vfio_dma {
> +	struct rb_node node;
> +	dma_addr_t iova;
> +	unsigned long vaddr;
> +	size_t size;
> +	int prot;
> +};
> +
> +/*
> + * VGPU VFIO FOPs definition
> + *
> + */
> +
> +/*
> + * Duplicated from vfio_link_dma, just quick hack ... should
> + * reuse code later
> + */
> +
> +static void vgpu_link_dma(struct vfio_iommu_vgpu *iommu,
> +			  struct vgpu_vfio_dma *new)
> +{
> +	struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
> +	struct vgpu_vfio_dma *dma;
> +
> +	while (*link) {
> +		parent = *link;
> +		dma = rb_entry(parent, struct vgpu_vfio_dma, node);
> +
> +		if (new->iova + new->size <= dma->iova)
> +			link = &(*link)->rb_left;
> +		else
> +			link = &(*link)->rb_right;
> +	}
> +
> +	rb_link_node(&new->node, parent, link);
> +	rb_insert_color(&new->node, &iommu->dma_list);
> +}
> +
> +static struct vgpu_vfio_dma *vgpu_find_dma(struct vfio_iommu_vgpu *iommu,
> +					   dma_addr_t start, size_t size)
> +{
> +	struct rb_node *node = iommu->dma_list.rb_node;
> +
> +	while (node) {
> +		struct vgpu_vfio_dma *dma = rb_entry(node, struct vgpu_vfio_dma, node);
> +
> +		if (start + size <= dma->iova)
> +			node = node->rb_left;
> +		else if (start >= dma->iova + dma->size)
> +			node = node->rb_right;
> +		else
> +			return dma;
> +	}
> +
> +	return NULL;
> +}
> +
> +static void vgpu_unlink_dma(struct vfio_iommu_vgpu *iommu, struct vgpu_vfio_dma *old)
> +{
> +	rb_erase(&old->node, &iommu->dma_list);
> +}
> +
> +static void vgpu_dump_dma(struct vfio_iommu_vgpu *iommu)
> +{
> +	struct vgpu_vfio_dma *c, *n;
> +	uint32_t i = 0;
> +
> +	rbtree_postorder_for_each_entry_safe(c, n, &iommu->dma_list, node)
> +		printk(KERN_INFO "%s: dma[%d] iova:0x%llx, vaddr:0x%lx, size:0x%lx\n",
> +		       __FUNCTION__, i++, c->iova, c->vaddr, c->size);
> +}
> +
> +static int vgpu_dma_do_track(struct vfio_iommu_vgpu * vgpu_iommu,
> +	struct vfio_iommu_type1_dma_map *map)
> +{
> +	dma_addr_t iova = map->iova;
> +	unsigned long vaddr = map->vaddr;
> +	int ret = 0, prot = 0;
> +	struct vgpu_vfio_dma *vgpu_dma;
> +
> +	mutex_lock(&vgpu_iommu->lock);
> +
> +	if (vgpu_find_dma(vgpu_iommu, map->iova, map->size)) {
> +		mutex_unlock(&vgpu_iommu->lock);
> +		return -EEXIST;
> +	}
> +
> +	vgpu_dma = kzalloc(sizeof(*vgpu_dma), GFP_KERNEL);
> +
> +	if (!vgpu_dma) {
> +		mutex_unlock(&vgpu_iommu->lock);
> +		return -ENOMEM;
> +	}
> +
> +	vgpu_dma->iova = iova;
> +	vgpu_dma->vaddr = vaddr;
> +	vgpu_dma->prot = prot;
> +	vgpu_dma->size = map->size;
> +
> +	vgpu_link_dma(vgpu_iommu, vgpu_dma);

Hi Kirti & Neo,

seems that no one actually setup mappings for IOMMU here?

> +
> +	mutex_unlock(&vgpu_iommu->lock);
> +	return ret;
> +}
> +
> +static int vgpu_dma_do_untrack(struct vfio_iommu_vgpu * vgpu_iommu,
> +	struct vfio_iommu_type1_dma_unmap *unmap)
> +{
> +	struct vgpu_vfio_dma *vgpu_dma;
> +	size_t unmapped = 0;
> +	int ret = 0;
> +
> +	mutex_lock(&vgpu_iommu->lock);
> +
> +	vgpu_dma = vgpu_find_dma(vgpu_iommu, unmap->iova, 0);
> +	if (vgpu_dma && vgpu_dma->iova != unmap->iova) {
> +		ret = -EINVAL;
> +		goto unlock;
> +	}
> +
> +	vgpu_dma = vgpu_find_dma(vgpu_iommu, unmap->iova + unmap->size - 1, 0);
> +	if (vgpu_dma && vgpu_dma->iova + vgpu_dma->size != unmap->iova + unmap->size) {
> +		ret = -EINVAL;
> +		goto unlock;
> +	}
> +
> +	while (( vgpu_dma = vgpu_find_dma(vgpu_iommu, unmap->iova, unmap->size))) {
> +		unmapped += vgpu_dma->size;
> +		vgpu_unlink_dma(vgpu_iommu, vgpu_dma);
> +	}
> +
> +unlock:
> +	mutex_unlock(&vgpu_iommu->lock);
> +	unmap->size = unmapped;
> +
> +	return ret;
> +}
> +
> +/* Ugly hack to quickly test single device ... */
> +
> +static struct vfio_iommu_vgpu *_local_iommu = NULL;
> +
> +int vgpu_dma_do_translate(dma_addr_t *gfn_buffer, uint32_t count)
> +{
> +	int i = 0, ret = 0, prot = 0;
> +	unsigned long remote_vaddr = 0, pfn = 0;
> +	struct vfio_iommu_vgpu *vgpu_iommu = _local_iommu;
> +	struct vgpu_vfio_dma *vgpu_dma;
> +	struct page *page[1];
> +	// unsigned long * addr = NULL;
> +	struct mm_struct *mm = vgpu_iommu->vm_mm;
> +
> +	prot = IOMMU_READ | IOMMU_WRITE;
> +
> +	printk(KERN_INFO "%s: >>>>\n", __FUNCTION__);
> +
> +	mutex_lock(&vgpu_iommu->lock);
> +
> +	vgpu_dump_dma(vgpu_iommu);
> +
> +	for (i = 0; i < count; i++) {
> +		dma_addr_t iova = gfn_buffer[i] << PAGE_SHIFT;
> +		vgpu_dma = vgpu_find_dma(vgpu_iommu, iova, 0 /*  size */);
> +
> +		if (!vgpu_dma) {
> +			printk(KERN_INFO "%s: fail locate iova[%d]:0x%llx\n", __FUNCTION__, i, iova);
> +			ret = -EINVAL;
> +			goto unlock;
> +		}
> +
> +		remote_vaddr = vgpu_dma->vaddr + iova - vgpu_dma->iova;
> +		printk(KERN_INFO "%s: find dma iova[%d]:0x%llx, vaddr:0x%lx, size:0x%lx, remote_vaddr:0x%lx\n",
> +			__FUNCTION__, i, vgpu_dma->iova,
> +			vgpu_dma->vaddr, vgpu_dma->size, remote_vaddr);
> +
> +		if (get_user_pages_unlocked(NULL, mm, remote_vaddr, 1, 1, 0, page) == 1) {
> +			pfn = page_to_pfn(page[0]);
> +			printk(KERN_INFO "%s: pfn[%d]:0x%lx\n", __FUNCTION__, i, pfn);
> +			// addr = vmap(page, 1, VM_MAP, PAGE_KERNEL);
> +		}
> +		else {
> +			printk(KERN_INFO "%s: fail to pin pfn[%d]\n", __FUNCTION__, i);
> +			ret = -ENOMEM;
> +			goto unlock;
> +		}
> +
> +		gfn_buffer[i] = pfn;
> +		// vunmap(addr);
> +
> +	}
> +
> +unlock:
> +	mutex_unlock(&vgpu_iommu->lock);
> +	printk(KERN_INFO "%s: <<<<\n", __FUNCTION__);
> +	return ret;
> +}
> +EXPORT_SYMBOL(vgpu_dma_do_translate);
> +
> +static void *vfio_iommu_vgpu_open(unsigned long arg)
> +{
> +	struct vfio_iommu_vgpu *iommu;
> +
> +	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
> +
> +	if (!iommu)
> +		return ERR_PTR(-ENOMEM);
> +
> +	mutex_init(&iommu->lock);
> +
> +	printk(KERN_INFO "%s", __FUNCTION__);
> +
> +	/* TODO: Keep track the v2 vs. v1, for now only assume
> +	 * we are v2 due to QEMU code */
> +	_local_iommu = iommu;
> +	return iommu;
> +}
> +
> +static void vfio_iommu_vgpu_release(void *iommu_data)
> +{
> +	struct vfio_iommu_vgpu *iommu = iommu_data;
> +	kfree(iommu);
> +	printk(KERN_INFO "%s", __FUNCTION__);
> +}
> +
> +static long vfio_iommu_vgpu_ioctl(void *iommu_data,
> +		unsigned int cmd, unsigned long arg)
> +{
> +	int ret = 0;
> +	unsigned long minsz;
> +	struct vfio_iommu_vgpu *vgpu_iommu = iommu_data;
> +
> +	switch (cmd) {
> +	case VFIO_CHECK_EXTENSION:
> +	{
> +		if ((arg == VFIO_TYPE1_IOMMU) || (arg == VFIO_TYPE1v2_IOMMU))
> +			return 1;
> +		else
> +			return 0;
> +	}
> +
> +	case VFIO_IOMMU_GET_INFO:
> +	{
> +		struct vfio_iommu_type1_info info;
> +		minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
> +
> +		if (copy_from_user(&info, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (info.argsz < minsz)
> +			return -EINVAL;
> +
> +		info.flags = 0;
> +
> +		return copy_to_user((void __user *)arg, &info, minsz);
> +	}
> +	case VFIO_IOMMU_MAP_DMA:
> +	{
> +		// TODO
> +		struct vfio_iommu_type1_dma_map map;
> +		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
> +
> +		if (copy_from_user(&map, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (map.argsz < minsz)
> +			return -EINVAL;
> +
> +		printk(KERN_INFO "VGPU-IOMMU:MAP_DMA flags:%d, vaddr:0x%llx, iova:0x%llx, size:0x%llx\n",
> +			map.flags, map.vaddr, map.iova, map.size);
> +
> +		/*
> +		 * TODO: Tracking code is mostly duplicated from TYPE1 IOMMU, ideally,
> +		 * this should be merged into one single file and reuse data
> +		 * structure
> +		 *
> +		 */
> +		ret = vgpu_dma_do_track(vgpu_iommu, &map);
> +		break;
> +	}
> +	case VFIO_IOMMU_UNMAP_DMA:
> +	{
> +		// TODO
> +		struct vfio_iommu_type1_dma_unmap unmap;
> +
> +		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
> +
> +		if (copy_from_user(&unmap, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (unmap.argsz < minsz)
> +			return -EINVAL;
> +
> +		ret = vgpu_dma_do_untrack(vgpu_iommu, &unmap);
> +		break;
> +	}
> +	default:
> +	{
> +		printk(KERN_INFO "%s cmd default ", __FUNCTION__);
> +		ret = -ENOTTY;
> +		break;
> +	}
> +	}
> +
> +	return ret;
> +}
> +
> +
> +static int vfio_iommu_vgpu_attach_group(void *iommu_data,
> +		                        struct iommu_group *iommu_group)
> +{
> +	struct vfio_iommu_vgpu *iommu = iommu_data;
> +	struct vgpu_device *vgpu_dev = NULL;
> +
> +	printk(KERN_INFO "%s", __FUNCTION__);
> +
> +	vgpu_dev = get_vgpu_device_from_group(iommu_group);
> +	if (vgpu_dev) {
> +		iommu->vgpu_dev = vgpu_dev;
> +		iommu->group = iommu_group;
> +
> +		/* IOMMU shares the same life cycle as VM MM */
> +		iommu->vm_mm = current->mm;
> +
> +		return 0;
> +	}
> +	iommu->group = iommu_group;
> +	return 1;
> +}
> +
> +static void vfio_iommu_vgpu_detach_group(void *iommu_data,
> +		struct iommu_group *iommu_group)
> +{
> +	struct vfio_iommu_vgpu *iommu = iommu_data;
> +
> +	printk(KERN_INFO "%s", __FUNCTION__);
> +	iommu->vm_mm = NULL;
> +	iommu->group = NULL;
> +
> +	return;
> +}
> +
> +
> +static const struct vfio_iommu_driver_ops vfio_iommu_vgpu_driver_ops = {
> +	.name           = "vgpu_vfio",
> +	.owner          = THIS_MODULE,
> +	.open           = vfio_iommu_vgpu_open,
> +	.release        = vfio_iommu_vgpu_release,
> +	.ioctl          = vfio_iommu_vgpu_ioctl,
> +	.attach_group   = vfio_iommu_vgpu_attach_group,
> +	.detach_group   = vfio_iommu_vgpu_detach_group,
> +};
> +
> +
> +int vgpu_vfio_iommu_init(void)
> +{
> +	int rc = vfio_register_iommu_driver(&vfio_iommu_vgpu_driver_ops);
> +
> +	printk(KERN_INFO "%s\n", __FUNCTION__);
> +	if (rc < 0) {
> +		printk(KERN_ERR "Error: failed to register vfio iommu, err:%d\n", rc);
> +	}
> +
> +	return rc;
> +}
> +
> +void vgpu_vfio_iommu_exit(void)
> +{
> +	// unregister vgpu_vfio driver
> +	vfio_unregister_iommu_driver(&vfio_iommu_vgpu_driver_ops);
> +	printk(KERN_INFO "%s\n", __FUNCTION__);
> +}
> +
> +
> +module_init(vgpu_vfio_iommu_init);
> +module_exit(vgpu_vfio_iommu_exit);
> +
> +MODULE_VERSION(DRIVER_VERSION);
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR(DRIVER_AUTHOR);
> +MODULE_DESCRIPTION(DRIVER_DESC);
> +
> 

--
Thanks,
Jike
Neo Jia March 4, 2016, 7 a.m. UTC | #2
On Wed, Mar 02, 2016 at 04:38:34PM +0800, Jike Song wrote:
> On 02/24/2016 12:24 AM, Kirti Wankhede wrote:
> > +	vgpu_dma->size = map->size;
> > +
> > +	vgpu_link_dma(vgpu_iommu, vgpu_dma);
> 
> Hi Kirti & Neo,
> 
> seems that no one actually setup mappings for IOMMU here?
> 

Hi Jike,

Yes.

The actual mapping should be done by the host kernel driver after calling the
translation/pinning API vgpu_dma_do_translate.

Thanks,
Neo

> > 
> 
> --
> Thanks,
> Jike
>
Jike Song March 7, 2016, 6:07 a.m. UTC | #3
Hi Neo,

On Fri, Mar 4, 2016 at 3:00 PM, Neo Jia <cjia@nvidia.com> wrote:
> On Wed, Mar 02, 2016 at 04:38:34PM +0800, Jike Song wrote:
>> On 02/24/2016 12:24 AM, Kirti Wankhede wrote:
>> > +   vgpu_dma->size = map->size;
>> > +
>> > +   vgpu_link_dma(vgpu_iommu, vgpu_dma);
>>
>> Hi Kirti & Neo,
>>
>> seems that no one actually setup mappings for IOMMU here?
>>
>
> Hi Jike,
>
> Yes.
>
> The actual mapping should be done by the host kernel driver after calling the
> translation/pinning API vgpu_dma_do_translate.

Thanks for the reply. I mis-deleted the mail in my intel account, so
reply with private mail account, sorry for that.


In vgpu_dma_do_translate():

for (i = 0; i < count; i++) {
   {snip}
   dma_addr_t iova = gfn_buffer[i] << PAGE_SHIFT;
   vgpu_dma = vgpu_find_dma(vgpu_iommu, iova, 0 /*  size */);

    remote_vaddr = vgpu_dma->vaddr + iova - vgpu_dma->iova;
    if (get_user_pages_unlocked(NULL, mm, remote_vaddr, 1, 1, 0, page) == 1) {
        pfn = page_to_pfn(page[0]);
    }
    gfn_buffer[i] = pfn;
}

If I understand correctly, the purpose of above code, is given an
array of gfns, try to pin & return associated pfns. There is still no
IOMMU mappings here.  Is it supposed to be the caller who should set
up IOMMU by DMA api such as dma_map_page(), after calling
vgpu_dma_do_translate()?
Neo Jia March 8, 2016, 12:31 a.m. UTC | #4
On Mon, Mar 07, 2016 at 02:07:15PM +0800, Jike Song wrote:
> Hi Neo,
> 
> On Fri, Mar 4, 2016 at 3:00 PM, Neo Jia <cjia@nvidia.com> wrote:
> > On Wed, Mar 02, 2016 at 04:38:34PM +0800, Jike Song wrote:
> >> On 02/24/2016 12:24 AM, Kirti Wankhede wrote:
> >> > +   vgpu_dma->size = map->size;
> >> > +
> >> > +   vgpu_link_dma(vgpu_iommu, vgpu_dma);
> >>
> >> Hi Kirti & Neo,
> >>
> >> seems that no one actually setup mappings for IOMMU here?
> >>
> >
> > Hi Jike,
> >
> > Yes.
> >
> > The actual mapping should be done by the host kernel driver after calling the
> > translation/pinning API vgpu_dma_do_translate.
> 
> Thanks for the reply. I mis-deleted the mail in my intel account, so
> reply with private mail account, sorry for that.
> 
> 
> In vgpu_dma_do_translate():
> 
> for (i = 0; i < count; i++) {
>    {snip}
>    dma_addr_t iova = gfn_buffer[i] << PAGE_SHIFT;
>    vgpu_dma = vgpu_find_dma(vgpu_iommu, iova, 0 /*  size */);
> 
>     remote_vaddr = vgpu_dma->vaddr + iova - vgpu_dma->iova;
>     if (get_user_pages_unlocked(NULL, mm, remote_vaddr, 1, 1, 0, page) == 1) {
>         pfn = page_to_pfn(page[0]);
>     }
>     gfn_buffer[i] = pfn;
> }
> 
> If I understand correctly, the purpose of above code, is given an
> array of gfns, try to pin & return associated pfns. There is still no
> IOMMU mappings here.  

Yes.

> Is it supposed to be the caller who should set
> up IOMMU by DMA api such as dma_map_page(), after calling
> vgpu_dma_do_translate()?
> 

Don't think you need to call dma_map_page here. Once you have the pfn available
to your GPU kernel driver, you can just go ahead to setup the mapping as you
normally do such as calling pci_map_sg and its friends.

Thanks,
Neo

> 
> -- 
> Thanks,
> Jike
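
For readers following the thread, here is a minimal sketch of the flow Neo describes: the vendor driver calls the RFC's vgpu_dma_do_translate() to pin the guest pages, then maps the resulting pfns itself through the DMA API. Only vgpu_dma_do_translate() comes from the posted patch; the helper name vendor_map_guest_pages() and the caller-provided scatterlist are invented for illustration.

#include <linux/mm.h>
#include <linux/pci.h>
#include <linux/scatterlist.h>

/* From the posted patch; assumed visible via the vGPU headers. */
extern int vgpu_dma_do_translate(dma_addr_t *gfn_buffer, uint32_t count);

static int vendor_map_guest_pages(struct pci_dev *pdev,
				  dma_addr_t *gfn_buffer, uint32_t count,
				  struct scatterlist *sgl)
{
	uint32_t i;
	int ret, nents;

	/* Step 1: ask the vGPU IOMMU module to pin the guest pages.
	 * On success gfn_buffer[i] has been rewritten with the host pfn. */
	ret = vgpu_dma_do_translate(gfn_buffer, count);
	if (ret)
		return ret;

	/* Step 2: build a scatterlist over the pinned pages. */
	sg_init_table(sgl, count);
	for (i = 0; i < count; i++)
		sg_set_page(&sgl[i], pfn_to_page(gfn_buffer[i]), PAGE_SIZE, 0);

	/* Step 3: the vendor driver, which owns the physical GPU, programs
	 * the real IOMMU through the DMA API ("pci_map_sg and its friends"). */
	nents = pci_map_sg(pdev, sgl, count, PCI_DMA_BIDIRECTIONAL);
	return nents ? 0 : -EIO;
}
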
Jike Song March 10, 2016, 3:10 a.m. UTC | #5
On 03/08/2016 08:31 AM, Neo Jia wrote:
> On Mon, Mar 07, 2016 at 02:07:15PM +0800, Jike Song wrote:
>> Hi Neo,
>>
>> On Fri, Mar 4, 2016 at 3:00 PM, Neo Jia <cjia@nvidia.com> wrote:
>>> On Wed, Mar 02, 2016 at 04:38:34PM +0800, Jike Song wrote:
>>>> On 02/24/2016 12:24 AM, Kirti Wankhede wrote:
>>>>> +   vgpu_dma->size = map->size;
>>>>> +
>>>>> +   vgpu_link_dma(vgpu_iommu, vgpu_dma);
>>>>
>>>> Hi Kirti & Neo,
>>>>
>>>> seems that no one actually setup mappings for IOMMU here?
>>>>
>>>
>>> Hi Jike,
>>>
>>> Yes.
>>>
>>> The actual mapping should be done by the host kernel driver after calling the
>>> translation/pinning API vgpu_dma_do_translate.
>>
>> Thanks for the reply. I mis-deleted the mail in my intel account, so
>> reply with private mail account, sorry for that.
>>
>>
>> In vgpu_dma_do_translate():
>>
>> for (i = 0; i < count; i++) {
>>    {snip}
>>    dma_addr_t iova = gfn_buffer[i] << PAGE_SHIFT;
>>    vgpu_dma = vgpu_find_dma(vgpu_iommu, iova, 0 /*  size */);
>>
>>     remote_vaddr = vgpu_dma->vaddr + iova - vgpu_dma->iova;
>>     if (get_user_pages_unlocked(NULL, mm, remote_vaddr, 1, 1, 0, page) == 1) {
>>         pfn = page_to_pfn(page[0]);
>>     }
>>     gfn_buffer[i] = pfn;
>> }
>>
>> If I understand correctly, the purpose of above code, is given an
>> array of gfns, try to pin & return associated pfns. There is still no
>> IOMMU mappings here.  
> 
> Yes.
> 

Thanks for the confirmation.

>> Is it supposed to be the caller who should set
>> up IOMMU by DMA api such as dma_map_page(), after calling
>> vgpu_dma_do_translate()?
>>
> 
> Don't think you need to call dma_map_page here. Once you have the pfn available
> to your GPU kernel driver, you can just go ahead to setup the mapping as you
> normally do such as calling pci_map_sg and its friends.
> 

Technically it's definitely OK to call DMA API from the caller rather than here,
however personally I think it is a bit counter-intuitive: IOMMU page tables
should be constructed within the VFIO IOMMU driver.


> Thanks,
> Neo

--
Thanks,
Jike
Neo Jia March 11, 2016, 4:19 a.m. UTC | #6
On Thu, Mar 10, 2016 at 11:10:10AM +0800, Jike Song wrote:
> 
> >> Is it supposed to be the caller who should set
> >> up IOMMU by DMA api such as dma_map_page(), after calling
> >> vgpu_dma_do_translate()?
> >>
> > 
> > Don't think you need to call dma_map_page here. Once you have the pfn available
> > to your GPU kernel driver, you can just go ahead to setup the mapping as you
> > normally do such as calling pci_map_sg and its friends.
> > 
> 
> Technically it's definitely OK to call DMA API from the caller rather than here,
> however personally I think it is a bit counter-intuitive: IOMMU page tables
> should be constructed within the VFIO IOMMU driver.
> 

Hi Jike,

For vGPU, what we have is just a virtual device and a fake IOMMU group, therefore 
the actual interaction with the real GPU should be managed by the GPU vendor driver.

With the default TYPE1 IOMMU, it works with the vfio-pci as it owns the device.

Thanks,
Neo

> --
> Thanks,
> Jike
>
Tian, Kevin March 11, 2016, 4:46 a.m. UTC | #7
> From: Neo Jia [mailto:cjia@nvidia.com]
> Sent: Friday, March 11, 2016 12:20 PM
> 
> On Thu, Mar 10, 2016 at 11:10:10AM +0800, Jike Song wrote:
> >
> > >> Is it supposed to be the caller who should set
> > >> up IOMMU by DMA api such as dma_map_page(), after calling
> > >> vgpu_dma_do_translate()?
> > >>
> > >
> > > Don't think you need to call dma_map_page here. Once you have the pfn available
> > > to your GPU kernel driver, you can just go ahead to setup the mapping as you
> > > normally do such as calling pci_map_sg and its friends.
> > >
> >
> > Technically it's definitely OK to call DMA API from the caller rather than here,
> > however personally I think it is a bit counter-intuitive: IOMMU page tables
> > should be constructed within the VFIO IOMMU driver.
> >
> 
> Hi Jike,
> 
> For vGPU, what we have is just a virtual device and a fake IOMMU group, therefore
> the actual interaction with the real GPU should be managed by the GPU vendor driver.
> 

Hi, Neo,

Seems we have a different thought on this. Regardless of whether it's a virtual/physical 
device, imo, VFIO should manage IOMMU configuration. The only difference is:

- for physical device, VFIO directly invokes IOMMU API to set IOMMU entry (GPA->HPA);
- for virtual device, VFIO invokes kernel DMA APIs which indirectly lead to IOMMU entry 
set if CONFIG_IOMMU is enabled in kernel (GPA->IOVA);

This would provide an unified way to manage the translation in VFIO, and then vendor
specific driver only needs to query and use returned IOVA corresponding to a GPA. 

Doing so has another benefit, to make underlying vGPU driver VMM agnostic. For KVM,
yes we can use pci_map_sg. However for Xen it's different (today Dom0 doesn't see
IOMMU. In the future there'll be a PVIOMMU implementation) so different code path is 
required. It's better to abstract such specific knowledge out of vGPU driver, which just
uses whatever dma_addr returned by other agent (VFIO here, or another Xen specific
agent) in a centralized way.

Alex, what's your opinion on this?

Thanks
Kevin
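
To make the contrast concrete, here is a conceptual sketch of the two MAP_DMA paths Kevin describes above; the helper name and its arguments are placeholders, not anything in the posted patch or an agreed design.

#include <linux/dma-mapping.h>
#include <linux/iommu.h>
#include <linux/io.h>
#include <linux/mm.h>

static dma_addr_t vfio_map_one_guest_page(bool physical_device,
					  struct iommu_domain *domain,
					  struct device *real_gpu_dev,
					  unsigned long gpa, struct page *pg)
{
	if (physical_device) {
		/* Today's Type1 path: VFIO programs the hardware IOMMU
		 * directly, installing a GPA -> HPA entry for the device. */
		if (iommu_map(domain, gpa, page_to_phys(pg), PAGE_SIZE,
			      IOMMU_READ | IOMMU_WRITE))
			return 0;
		return gpa;	/* the device keeps using the GPA as its IOVA */
	}

	/* Kevin's suggestion for vGPU: map the pinned page through the kernel
	 * DMA API (which programs the IOMMU when one is enabled) and record
	 * the returned IOVA against the GPA for the vendor driver to query. */
	return dma_map_page(real_gpu_dev, pg, 0, PAGE_SIZE, DMA_BIDIRECTIONAL);
}
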
Neo Jia March 11, 2016, 6:10 a.m. UTC | #8
On Fri, Mar 11, 2016 at 04:46:23AM +0000, Tian, Kevin wrote:
> > From: Neo Jia [mailto:cjia@nvidia.com]
> > Sent: Friday, March 11, 2016 12:20 PM
> > 
> > On Thu, Mar 10, 2016 at 11:10:10AM +0800, Jike Song wrote:
> > >
> > > >> Is it supposed to be the caller who should set
> > > >> up IOMMU by DMA api such as dma_map_page(), after calling
> > > >> vgpu_dma_do_translate()?
> > > >>
> > > >
> > > > Don't think you need to call dma_map_page here. Once you have the pfn available
> > > > to your GPU kernel driver, you can just go ahead to setup the mapping as you
> > > > normally do such as calling pci_map_sg and its friends.
> > > >
> > >
> > > Technically it's definitely OK to call DMA API from the caller rather than here,
> > > however personally I think it is a bit counter-intuitive: IOMMU page tables
> > > should be constructed within the VFIO IOMMU driver.
> > >
> > 
> > Hi Jike,
> > 
> > For vGPU, what we have is just a virtual device and a fake IOMMU group, therefore
> > the actual interaction with the real GPU should be managed by the GPU vendor driver.
> > 
> 
> Hi, Neo,
> 
> Seems we have a different thought on this. Regardless of whether it's a virtual/physical 
> device, imo, VFIO should manage IOMMU configuration. The only difference is:
> 
> - for physical device, VFIO directly invokes IOMMU API to set IOMMU entry (GPA->HPA);
> - for virtual device, VFIO invokes kernel DMA APIs which indirectly lead to IOMMU entry 
> set if CONFIG_IOMMU is enabled in kernel (GPA->IOVA);

How does it make any sense for us to do a dma_map_page for a physical device that we don't 
have any direct interaction with?

> 
> This would provide an unified way to manage the translation in VFIO, and then vendor
> specific driver only needs to query and use returned IOVA corresponding to a GPA. 
> 
> Doing so has another benefit, to make underlying vGPU driver VMM agnostic. For KVM,
> yes we can use pci_map_sg. However for Xen it's different (today Dom0 doesn't see
> IOMMU. In the future there'll be a PVIOMMU implementation) so different code path is 
> required. It's better to abstract such specific knowledge out of vGPU driver, which just
> uses whatever dma_addr returned by other agent (VFIO here, or another Xen specific
> agent) in a centralized way.
> 
> Alex, what's your opinion on this?
> 
> Thanks
> Kevin
Tian, Kevin March 11, 2016, 8:06 a.m. UTC | #9
> From: Neo Jia
> Sent: Friday, March 11, 2016 2:11 PM
> > > Hi Jike,
> > >
> > > For vGPU, what we have is just a virtual device and a fake IOMMU group, therefore
> > > the actual interaction with the real GPU should be managed by the GPU vendor driver.
> > >
> >
> > Hi, Neo,
> >
> > Seems we have a different thought on this. Regardless of whether it's a virtual/physical
> > device, imo, VFIO should manage IOMMU configuration. The only difference is:
> >
> > - for physical device, VFIO directly invokes IOMMU API to set IOMMU entry (GPA->HPA);
> > - for virtual device, VFIO invokes kernel DMA APIs which indirectly lead to IOMMU entry
> > set if CONFIG_IOMMU is enabled in kernel (GPA->IOVA);
> 
> How does it make any sense for us to do a dma_map_page for a physical device that we
> don't
> have any direct interaction with?
> 

That is also a valid point. It really depends on how we look at this issue.

From VFIO's p.o.v., it needs to enforce DMA isolation for the devices it manages;
in that sense it doesn't matter whether the device is physical or virtual. However,
looking at the specific Linux DMA interface, you are right that it is built around
the physical device instance, which is not managed by VFIO in this case.

On the other hand, your proposal leaves DMA mapping to the vendor-specific
driver which actually manages the physical device. However, this way VFIO relies
on another agent to enforce DMA isolation for the vGPUs. Might not be a real
problem (more a conceptual one)...

So let me do more thinking here (half-way convinced by you) :-)

Thanks
Kevin
Alex Williamson March 11, 2016, 4:13 p.m. UTC | #10
On Fri, 11 Mar 2016 04:46:23 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:

> > From: Neo Jia [mailto:cjia@nvidia.com]
> > Sent: Friday, March 11, 2016 12:20 PM
> > 
> > On Thu, Mar 10, 2016 at 11:10:10AM +0800, Jike Song wrote:  
> > >  
> > > >> Is it supposed to be the caller who should set
> > > >> up IOMMU by DMA api such as dma_map_page(), after calling
> > > >> vgpu_dma_do_translate()?
> > > >>  
> > > >
> > > > Don't think you need to call dma_map_page here. Once you have the pfn available
> > > > to your GPU kernel driver, you can just go ahead to setup the mapping as you
> > > > normally do such as calling pci_map_sg and its friends.
> > > >  
> > >
> > > Technically it's definitely OK to call DMA API from the caller rather than here,
> > > however personally I think it is a bit counter-intuitive: IOMMU page tables
> > > should be constructed within the VFIO IOMMU driver.
> > >  
> > 
> > Hi Jike,
> > 
> > For vGPU, what we have is just a virtual device and a fake IOMMU group, therefore
> > the actual interaction with the real GPU should be managed by the GPU vendor driver.
> >   
> 
> Hi, Neo,
> 
> Seems we have a different thought on this. Regardless of whether it's a virtual/physical 
> device, imo, VFIO should manage IOMMU configuration. The only difference is:
> 
> - for physical device, VFIO directly invokes IOMMU API to set IOMMU entry (GPA->HPA);
> - for virtual device, VFIO invokes kernel DMA APIs which indirectly lead to IOMMU entry 
> set if CONFIG_IOMMU is enabled in kernel (GPA->IOVA);
> 
> This would provide an unified way to manage the translation in VFIO, and then vendor
> specific driver only needs to query and use returned IOVA corresponding to a GPA. 
> 
> Doing so has another benefit, to make underlying vGPU driver VMM agnostic. For KVM,
> yes we can use pci_map_sg. However for Xen it's different (today Dom0 doesn't see
> IOMMU. In the future there'll be a PVIOMMU implementation) so different code path is 
> required. It's better to abstract such specific knowledge out of vGPU driver, which just
> uses whatever dma_addr returned by other agent (VFIO here, or another Xen specific
> agent) in a centralized way.
> 
> Alex, what's your opinion on this?

The sticky point is how vfio, which is only handling the vGPU, has a
reference to the physical GPU on which to call DMA API operations.  If
that reference is provided by the vendor vGPU driver, for example
vgpu_dma_do_translate_for_pci(gpa, pci_dev), I don't see any reason to
be opposed to such an API.  I would not condone vfio deriving or owning
a reference to the physical device on its own though, that's in the
realm of the vendor vGPU driver.  It does seem a bit cleaner and should
reduce duplicate code if the vfio vGPU iommu interface could handle the
iommu mapping for the vendor vgpu driver when necessary.  Thanks,

Alex
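
A rough sketch of the kind of interface Alex suggests here. His mail names it vgpu_dma_do_translate_for_pci(gpa, pci_dev); the array form below simply follows the existing RFC API and is only one possible reading, not an agreed design. Nothing in the posted patch defines this function.

#include <linux/dma-mapping.h>
#include <linux/mm.h>
#include <linux/pci.h>

/* From the posted patch; assumed visible via the vGPU headers. */
extern int vgpu_dma_do_translate(dma_addr_t *gfn_buffer, uint32_t count);

int vgpu_dma_do_translate_for_pci(dma_addr_t *gfn_buffer, uint32_t count,
				  struct pci_dev *pdev)
{
	uint32_t i;
	int ret;

	/* Existing RFC step: pin the pages, rewriting gfn -> host pfn. */
	ret = vgpu_dma_do_translate(gfn_buffer, count);
	if (ret)
		return ret;

	/* Additional step: map each pinned page for the physical GPU that
	 * the vendor driver passed in, returning host IOVAs in place. */
	for (i = 0; i < count; i++) {
		dma_addr_t iova = dma_map_page(&pdev->dev,
					       pfn_to_page(gfn_buffer[i]),
					       0, PAGE_SIZE, DMA_BIDIRECTIONAL);

		if (dma_mapping_error(&pdev->dev, iova))
			return -EIO;
		gfn_buffer[i] = iova;	/* ready for the GPU's DMA engine */
	}
	return 0;
}
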
Neo Jia March 11, 2016, 4:55 p.m. UTC | #11
On Fri, Mar 11, 2016 at 09:13:15AM -0700, Alex Williamson wrote:
> On Fri, 11 Mar 2016 04:46:23 +0000
> "Tian, Kevin" <kevin.tian@intel.com> wrote:
> 
> > > From: Neo Jia [mailto:cjia@nvidia.com]
> > > Sent: Friday, March 11, 2016 12:20 PM
> > > 
> > > On Thu, Mar 10, 2016 at 11:10:10AM +0800, Jike Song wrote:  
> > > >  
> > > > >> Is it supposed to be the caller who should set
> > > > >> up IOMMU by DMA api such as dma_map_page(), after calling
> > > > >> vgpu_dma_do_translate()?
> > > > >>  
> > > > >
> > > > > Don't think you need to call dma_map_page here. Once you have the pfn available
> > > > > to your GPU kernel driver, you can just go ahead to setup the mapping as you
> > > > > normally do such as calling pci_map_sg and its friends.
> > > > >  
> > > >
> > > > Technically it's definitely OK to call DMA API from the caller rather than here,
> > > > however personally I think it is a bit counter-intuitive: IOMMU page tables
> > > > should be constructed within the VFIO IOMMU driver.
> > > >  
> > > 
> > > Hi Jike,
> > > 
> > > For vGPU, what we have is just a virtual device and a fake IOMMU group, therefore
> > > the actual interaction with the real GPU should be managed by the GPU vendor driver.
> > >   
> > 
> > Hi, Neo,
> > 
> > Seems we have a different thought on this. Regardless of whether it's a virtual/physical 
> > device, imo, VFIO should manage IOMMU configuration. The only difference is:
> > 
> > - for physical device, VFIO directly invokes IOMMU API to set IOMMU entry (GPA->HPA);
> > - for virtual device, VFIO invokes kernel DMA APIs which indirectly lead to IOMMU entry 
> > set if CONFIG_IOMMU is enabled in kernel (GPA->IOVA);
> > 
> > This would provide an unified way to manage the translation in VFIO, and then vendor
> > specific driver only needs to query and use returned IOVA corresponding to a GPA. 
> > 
> > Doing so has another benefit, to make underlying vGPU driver VMM agnostic. For KVM,
> > yes we can use pci_map_sg. However for Xen it's different (today Dom0 doesn't see
> > IOMMU. In the future there'll be a PVIOMMU implementation) so different code path is 
> > required. It's better to abstract such specific knowledge out of vGPU driver, which just
> > uses whatever dma_addr returned by other agent (VFIO here, or another Xen specific
> > agent) in a centralized way.
> > 
> > Alex, what's your opinion on this?
> 
> The sticky point is how vfio, which is only handling the vGPU, has a
> reference to the physical GPU on which to call DMA API operations.  If
> that reference is provided by the vendor vGPU driver, for example
> vgpu_dma_do_translate_for_pci(gpa, pci_dev), I don't see any reason to
> be opposed to such an API.  I would not condone vfio deriving or owning
> a reference to the physical device on its own though, that's in the
> realm of the vendor vGPU driver.  It does seem a bit cleaner and should
> reduce duplicate code if the vfio vGPU iommu interface could handle the
> iommu mapping for the vendor vgpu driver when necessary.  Thanks,

Hi Alex,

Since we don't want to allow vfio iommu to derive or own a reference to the
physical device, I think it is still better not providing such pci_dev to the 
vfio iommu type1 driver.

Also, I need to point out that if the vfio iommu is going to set up the iommu page
table for the real underlying physical device, then given the fact that we all share
a single RID here, the iommu mapping code has to return the new "IOVA" that is
mapped to the HPA, which the GPU vendor driver will then have to put on its DMA
engine. This is very different from the current VFIO IOMMU mapping logic.

And we still have to provide another interface to translate the GPA to
HPA for CPU mapping.

In the current RFC, we only need to have a single interface to provide the most
basic information to the GPU vendor driver and without taking the risk of
leaking a ref to VFIO IOMMU.

Thanks,
Neo

> 
> Alex
Alex Williamson March 11, 2016, 5:56 p.m. UTC | #12
On Fri, 11 Mar 2016 08:55:44 -0800
Neo Jia <cjia@nvidia.com> wrote:

> On Fri, Mar 11, 2016 at 09:13:15AM -0700, Alex Williamson wrote:
> > On Fri, 11 Mar 2016 04:46:23 +0000
> > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> >   
> > > > From: Neo Jia [mailto:cjia@nvidia.com]
> > > > Sent: Friday, March 11, 2016 12:20 PM
> > > > 
> > > > On Thu, Mar 10, 2016 at 11:10:10AM +0800, Jike Song wrote:    
> > > > >    
> > > > > >> Is it supposed to be the caller who should set
> > > > > >> up IOMMU by DMA api such as dma_map_page(), after calling
> > > > > >> vgpu_dma_do_translate()?
> > > > > >>    
> > > > > >
> > > > > > Don't think you need to call dma_map_page here. Once you have the pfn available
> > > > > > to your GPU kernel driver, you can just go ahead to setup the mapping as you
> > > > > > normally do such as calling pci_map_sg and its friends.
> > > > > >    
> > > > >
> > > > > Technically it's definitely OK to call DMA API from the caller rather than here,
> > > > > however personally I think it is a bit counter-intuitive: IOMMU page tables
> > > > > should be constructed within the VFIO IOMMU driver.
> > > > >    
> > > > 
> > > > Hi Jike,
> > > > 
> > > > For vGPU, what we have is just a virtual device and a fake IOMMU group, therefore
> > > > the actual interaction with the real GPU should be managed by the GPU vendor driver.
> > > >     
> > > 
> > > Hi, Neo,
> > > 
> > > Seems we have a different thought on this. Regardless of whether it's a virtual/physical 
> > > device, imo, VFIO should manage IOMMU configuration. The only difference is:
> > > 
> > > - for physical device, VFIO directly invokes IOMMU API to set IOMMU entry (GPA->HPA);
> > > - for virtual device, VFIO invokes kernel DMA APIs which indirectly lead to IOMMU entry 
> > > set if CONFIG_IOMMU is enabled in kernel (GPA->IOVA);
> > > 
> > > This would provide an unified way to manage the translation in VFIO, and then vendor
> > > specific driver only needs to query and use returned IOVA corresponding to a GPA. 
> > > 
> > > Doing so has another benefit, to make underlying vGPU driver VMM agnostic. For KVM,
> > > yes we can use pci_map_sg. However for Xen it's different (today Dom0 doesn't see
> > > IOMMU. In the future there'll be a PVIOMMU implementation) so different code path is 
> > > required. It's better to abstract such specific knowledge out of vGPU driver, which just
> > > uses whatever dma_addr returned by other agent (VFIO here, or another Xen specific
> > > agent) in a centralized way.
> > > 
> > > Alex, what's your opinion on this?  
> > 
> > The sticky point is how vfio, which is only handling the vGPU, has a
> > reference to the physical GPU on which to call DMA API operations.  If
> > that reference is provided by the vendor vGPU driver, for example
> > vgpu_dma_do_translate_for_pci(gpa, pci_dev), I don't see any reason to
> > be opposed to such an API.  I would not condone vfio deriving or owning
> > a reference to the physical device on its own though, that's in the
> > realm of the vendor vGPU driver.  It does seem a bit cleaner and should
> > reduce duplicate code if the vfio vGPU iommu interface could handle the
> > iommu mapping for the vendor vgpu driver when necessary.  Thanks,  
> 
> Hi Alex,
> 
> Since we don't want to allow vfio iommu to derive or own a reference to the
> physical device, I think it is still better not providing such pci_dev to the 
> vfio iommu type1 driver.
> 
> Also, I need to point out that if the vfio iommu is going to set up iommu page
> table for the real underlying physical device, giving the fact of single RID we
> are all having here, the iommu mapping code has to return the new "IOVA" that is
> mapped to the HPA, which the GPU vendro driver will have to put on its DMA
> engine. This is very different than the current VFIO IOMMU mapping logic.
> 
> And we still have to provide another interface to translate the GPA to
> HPA for CPU mapping.
> 
> In the current RFC, we only need to have a single interface to provide the most
> basic information to the GPU vendor driver and without taking the risk of
> leaking a ref to VFIO IOMMU.

I don't see this as some fundamental difference of opinion, it's really
just whether vfio provides a "pin this GFN and return the HPA" function
or whether that function could be extended to include "... and also map
it through the DMA API for the provided device and return the host
IOVA".  It might even still be a single function to vfio for CPU vs
device mapping where the device and IOVA return pointer are NULL when
only pinning is required for CPU access (though maybe there are better
ways to provide CPU access than pinning).  A wrapper could even give the
appearance that those are two separate functions.

So long as vfio isn't owning or deriving the device for the DMA API
calls and we don't introduce some complication in page accounting, this
really just seems like a question of whether moving the DMA API
handling into vfio is common between the vendor vGPU drivers and are we
reducing the overall amount and complexity of code by giving the vendor
drivers the opportunity to do both operations with one interface.
If as Kevin suggest it also provides some additional abstractions
for Xen vs KVM, even better.  Thanks,

Alex
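
A possible shape for the single entry point (plus thin wrappers) that Alex sketches in words above; all of these names are invented for illustration and none of them exist in the posted RFC.

#include <linux/device.h>
#include <linux/types.h>

/*
 * dev == NULL && iova == NULL : only pin the gfn and return the host pfn
 *                               (CPU access, what the current RFC does);
 * dev != NULL                 : additionally map the page through the DMA
 *                               API for that device and return the host
 *                               IOVA via *iova.
 */
int vgpu_pin_guest_page(dma_addr_t gfn, unsigned long *pfn,
			struct device *dev, dma_addr_t *iova);

/* Wrappers giving "the appearance that those are two separate functions". */
static inline int vgpu_pin_for_cpu(dma_addr_t gfn, unsigned long *pfn)
{
	return vgpu_pin_guest_page(gfn, pfn, NULL, NULL);
}

static inline int vgpu_pin_and_map_for_device(dma_addr_t gfn, unsigned long *pfn,
					      struct device *dev, dma_addr_t *iova)
{
	return vgpu_pin_guest_page(gfn, pfn, dev, iova);
}
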
Neo Jia March 11, 2016, 6:18 p.m. UTC | #13
On Fri, Mar 11, 2016 at 10:56:24AM -0700, Alex Williamson wrote:
> On Fri, 11 Mar 2016 08:55:44 -0800
> Neo Jia <cjia@nvidia.com> wrote:
> 
> > > > Alex, what's your opinion on this?  
> > > 
> > > The sticky point is how vfio, which is only handling the vGPU, has a
> > > reference to the physical GPU on which to call DMA API operations.  If
> > > that reference is provided by the vendor vGPU driver, for example
> > > vgpu_dma_do_translate_for_pci(gpa, pci_dev), I don't see any reason to
> > > be opposed to such an API.  I would not condone vfio deriving or owning
> > > a reference to the physical device on its own though, that's in the
> > > realm of the vendor vGPU driver.  It does seem a bit cleaner and should
> > > reduce duplicate code if the vfio vGPU iommu interface could handle the
> > > iommu mapping for the vendor vgpu driver when necessary.  Thanks,  
> > 
> > Hi Alex,
> > 
> > Since we don't want to allow vfio iommu to derive or own a reference to the
> > physical device, I think it is still better not providing such pci_dev to the 
> > vfio iommu type1 driver.
> > 
> > Also, I need to point out that if the vfio iommu is going to set up iommu page
> > table for the real underlying physical device, giving the fact of single RID we
> > are all having here, the iommu mapping code has to return the new "IOVA" that is
> > mapped to the HPA, which the GPU vendro driver will have to put on its DMA
> > engine. This is very different than the current VFIO IOMMU mapping logic.
> > 
> > And we still have to provide another interface to translate the GPA to
> > HPA for CPU mapping.
> > 
> > In the current RFC, we only need to have a single interface to provide the most
> > basic information to the GPU vendor driver and without taking the risk of
> > leaking a ref to VFIO IOMMU.
> 
> I don't see this as some fundamental difference of opinion, it's really
> just whether vfio provides a "pin this GFN and return the HPA" function
> or whether that function could be extended to include "... and also map
> it through the DMA API for the provided device and return the host
> IOVA".  It might even still be a single function to vfio for CPU vs
> device mapping where the device and IOVA return pointer are NULL when
> only pinning is required for CPU access (though maybe there are better
> ways to provide CPU access than pinning).  A wrapper could even give the
> appearance that those are two separate functions.
> 
> So long as vfio isn't owning or deriving the device for the DMA API
> calls and we don't introduce some complication in page accounting, this
> really just seems like a question of whether moving the DMA API
> handling into vfio is common between the vendor vGPU drivers and are we
> reducing the overall amount and complexity of code by giving the vendor
> drivers the opportunity to do both operations with one interface.

Hi Alex,

OK, I will look into adding such a facility and will probably include it in a
later rev of the vGPU IOMMU, provided we don't run into any surprises or the
issues you mentioned above.

Thanks,
Neo

> If as Kevin suggest it also provides some additional abstractions
> for Xen vs KVM, even better.  Thanks,
> 
> Alex

Patch

diff --git a/drivers/vgpu/Makefile b/drivers/vgpu/Makefile
index a0a2655..8ace18d 100644
--- a/drivers/vgpu/Makefile
+++ b/drivers/vgpu/Makefile
@@ -3,3 +3,4 @@  vgpu-y := vgpu-core.o vgpu-sysfs.o vgpu-driver.o
 
 obj-$(CONFIG_VGPU)			+= vgpu.o
 obj-$(CONFIG_VGPU_VFIO)                 += vgpu_vfio.o
+obj-$(CONFIG_VFIO_IOMMU_TYPE1_VGPU)     += vfio_iommu_type1_vgpu.o
diff --git a/drivers/vgpu/vfio_iommu_type1_vgpu.c b/drivers/vgpu/vfio_iommu_type1_vgpu.c
new file mode 100644
index 0000000..0b36ae5
--- /dev/null
+++ b/drivers/vgpu/vfio_iommu_type1_vgpu.c
@@ -0,0 +1,423 @@ 
+/*
+ * VGPU : IOMMU DMA mapping support for VGPU
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *     Author: Neo Jia <cjia@nvidia.com>
+ *	       Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/compat.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/uuid.h>
+#include <linux/vfio.h>
+#include <linux/iommu.h>
+#include <linux/vgpu.h>
+
+#include "vgpu_private.h"
+
+#define DRIVER_VERSION	"0.1"
+#define DRIVER_AUTHOR	"NVIDIA Corporation"
+#define DRIVER_DESC     "VGPU Type1 IOMMU driver for VFIO"
+
+// VFIO structures
+
+struct vfio_iommu_vgpu {
+	struct mutex lock;
+	struct iommu_group *group;
+	struct vgpu_device *vgpu_dev;
+	struct rb_root dma_list;
+	struct mm_struct * vm_mm;
+};
+
+struct vgpu_vfio_dma {
+	struct rb_node node;
+	dma_addr_t iova;
+	unsigned long vaddr;
+	size_t size;
+	int prot;
+};
+
+/*
+ * VGPU VFIO FOPs definition
+ *
+ */
+
+/*
+ * Duplicated from vfio_link_dma, just quick hack ... should
+ * reuse code later
+ */
+
+static void vgpu_link_dma(struct vfio_iommu_vgpu *iommu,
+			  struct vgpu_vfio_dma *new)
+{
+	struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
+	struct vgpu_vfio_dma *dma;
+
+	while (*link) {
+		parent = *link;
+		dma = rb_entry(parent, struct vgpu_vfio_dma, node);
+
+		if (new->iova + new->size <= dma->iova)
+			link = &(*link)->rb_left;
+		else
+			link = &(*link)->rb_right;
+	}
+
+	rb_link_node(&new->node, parent, link);
+	rb_insert_color(&new->node, &iommu->dma_list);
+}
+
+static struct vgpu_vfio_dma *vgpu_find_dma(struct vfio_iommu_vgpu *iommu,
+					   dma_addr_t start, size_t size)
+{
+	struct rb_node *node = iommu->dma_list.rb_node;
+
+	while (node) {
+		struct vgpu_vfio_dma *dma = rb_entry(node, struct vgpu_vfio_dma, node);
+
+		if (start + size <= dma->iova)
+			node = node->rb_left;
+		else if (start >= dma->iova + dma->size)
+			node = node->rb_right;
+		else
+			return dma;
+	}
+
+	return NULL;
+}
+
+static void vgpu_unlink_dma(struct vfio_iommu_vgpu *iommu, struct vgpu_vfio_dma *old)
+{
+	rb_erase(&old->node, &iommu->dma_list);
+}
+
+static void vgpu_dump_dma(struct vfio_iommu_vgpu *iommu)
+{
+	struct vgpu_vfio_dma *c, *n;
+	uint32_t i = 0;
+
+	rbtree_postorder_for_each_entry_safe(c, n, &iommu->dma_list, node)
+		printk(KERN_INFO "%s: dma[%d] iova:0x%llx, vaddr:0x%lx, size:0x%lx\n",
+		       __FUNCTION__, i++, c->iova, c->vaddr, c->size);
+}
+
+static int vgpu_dma_do_track(struct vfio_iommu_vgpu * vgpu_iommu,
+	struct vfio_iommu_type1_dma_map *map)
+{
+	dma_addr_t iova = map->iova;
+	unsigned long vaddr = map->vaddr;
+	int ret = 0, prot = 0;
+	struct vgpu_vfio_dma *vgpu_dma;
+
+	mutex_lock(&vgpu_iommu->lock);
+
+	if (vgpu_find_dma(vgpu_iommu, map->iova, map->size)) {
+		mutex_unlock(&vgpu_iommu->lock);
+		return -EEXIST;
+	}
+
+	vgpu_dma = kzalloc(sizeof(*vgpu_dma), GFP_KERNEL);
+
+	if (!vgpu_dma) {
+		mutex_unlock(&vgpu_iommu->lock);
+		return -ENOMEM;
+	}
+
+	vgpu_dma->iova = iova;
+	vgpu_dma->vaddr = vaddr;
+	vgpu_dma->prot = prot;
+	vgpu_dma->size = map->size;
+
+	vgpu_link_dma(vgpu_iommu, vgpu_dma);
+
+	mutex_unlock(&vgpu_iommu->lock);
+	return ret;
+}
+
+static int vgpu_dma_do_untrack(struct vfio_iommu_vgpu * vgpu_iommu,
+	struct vfio_iommu_type1_dma_unmap *unmap)
+{
+	struct vgpu_vfio_dma *vgpu_dma;
+	size_t unmapped = 0;
+	int ret = 0;
+
+	mutex_lock(&vgpu_iommu->lock);
+
+	vgpu_dma = vgpu_find_dma(vgpu_iommu, unmap->iova, 0);
+	if (vgpu_dma && vgpu_dma->iova != unmap->iova) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	vgpu_dma = vgpu_find_dma(vgpu_iommu, unmap->iova + unmap->size - 1, 0);
+	if (vgpu_dma && vgpu_dma->iova + vgpu_dma->size != unmap->iova + unmap->size) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	while (( vgpu_dma = vgpu_find_dma(vgpu_iommu, unmap->iova, unmap->size))) {
+		unmapped += vgpu_dma->size;
+		vgpu_unlink_dma(vgpu_iommu, vgpu_dma);
+	}
+
+unlock:
+	mutex_unlock(&vgpu_iommu->lock);
+	unmap->size = unmapped;
+
+	return ret;
+}
+
+/* Ugly hack to quickly test single device ... */
+
+static struct vfio_iommu_vgpu *_local_iommu = NULL;
+
+int vgpu_dma_do_translate(dma_addr_t *gfn_buffer, uint32_t count)
+{
+	int i = 0, ret = 0, prot = 0;
+	unsigned long remote_vaddr = 0, pfn = 0;
+	struct vfio_iommu_vgpu *vgpu_iommu = _local_iommu;
+	struct vgpu_vfio_dma *vgpu_dma;
+	struct page *page[1];
+	// unsigned long * addr = NULL;
+	struct mm_struct *mm = vgpu_iommu->vm_mm;
+
+	prot = IOMMU_READ | IOMMU_WRITE;
+
+	printk(KERN_INFO "%s: >>>>\n", __FUNCTION__);
+
+	mutex_lock(&vgpu_iommu->lock);
+
+	vgpu_dump_dma(vgpu_iommu);
+
+	for (i = 0; i < count; i++) {
+		dma_addr_t iova = gfn_buffer[i] << PAGE_SHIFT;
+		vgpu_dma = vgpu_find_dma(vgpu_iommu, iova, 0 /*  size */);
+
+		if (!vgpu_dma) {
+			printk(KERN_INFO "%s: fail locate iova[%d]:0x%llx\n", __FUNCTION__, i, iova);
+			ret = -EINVAL;
+			goto unlock;
+		}
+
+		remote_vaddr = vgpu_dma->vaddr + iova - vgpu_dma->iova;
+		printk(KERN_INFO "%s: find dma iova[%d]:0x%llx, vaddr:0x%lx, size:0x%lx, remote_vaddr:0x%lx\n",
+			__FUNCTION__, i, vgpu_dma->iova,
+			vgpu_dma->vaddr, vgpu_dma->size, remote_vaddr);
+
+		if (get_user_pages_unlocked(NULL, mm, remote_vaddr, 1, 1, 0, page) == 1) {
+			pfn = page_to_pfn(page[0]);
+			printk(KERN_INFO "%s: pfn[%d]:0x%lx\n", __FUNCTION__, i, pfn);
+			// addr = vmap(page, 1, VM_MAP, PAGE_KERNEL);
+		}
+		else {
+			printk(KERN_INFO "%s: fail to pin pfn[%d]\n", __FUNCTION__, i);
+			ret = -ENOMEM;
+			goto unlock;
+		}
+
+		gfn_buffer[i] = pfn;
+		// vunmap(addr);
+
+	}
+
+unlock:
+	mutex_unlock(&vgpu_iommu->lock);
+	printk(KERN_INFO "%s: <<<<\n", __FUNCTION__);
+	return ret;
+}
+EXPORT_SYMBOL(vgpu_dma_do_translate);
+
+static void *vfio_iommu_vgpu_open(unsigned long arg)
+{
+	struct vfio_iommu_vgpu *iommu;
+
+	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
+
+	if (!iommu)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&iommu->lock);
+
+	printk(KERN_INFO "%s", __FUNCTION__);
+
+	/* TODO: Keep track the v2 vs. v1, for now only assume
+	 * we are v2 due to QEMU code */
+	_local_iommu = iommu;
+	return iommu;
+}
+
+static void vfio_iommu_vgpu_release(void *iommu_data)
+{
+	struct vfio_iommu_vgpu *iommu = iommu_data;
+	kfree(iommu);
+	printk(KERN_INFO "%s", __FUNCTION__);
+}
+
+static long vfio_iommu_vgpu_ioctl(void *iommu_data,
+		unsigned int cmd, unsigned long arg)
+{
+	int ret = 0;
+	unsigned long minsz;
+	struct vfio_iommu_vgpu *vgpu_iommu = iommu_data;
+
+	switch (cmd) {
+	case VFIO_CHECK_EXTENSION:
+	{
+		if ((arg == VFIO_TYPE1_IOMMU) || (arg == VFIO_TYPE1v2_IOMMU))
+			return 1;
+		else
+			return 0;
+	}
+
+	case VFIO_IOMMU_GET_INFO:
+	{
+		struct vfio_iommu_type1_info info;
+		minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		info.flags = 0;
+
+		return copy_to_user((void __user *)arg, &info, minsz);
+	}
+	case VFIO_IOMMU_MAP_DMA:
+	{
+		// TODO
+		struct vfio_iommu_type1_dma_map map;
+		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
+
+		if (copy_from_user(&map, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (map.argsz < minsz)
+			return -EINVAL;
+
+		printk(KERN_INFO "VGPU-IOMMU:MAP_DMA flags:%d, vaddr:0x%llx, iova:0x%llx, size:0x%llx\n",
+			map.flags, map.vaddr, map.iova, map.size);
+
+		/*
+		 * TODO: Tracking code is mostly duplicated from TYPE1 IOMMU, ideally,
+		 * this should be merged into one single file and reuse data
+		 * structure
+		 *
+		 */
+		ret = vgpu_dma_do_track(vgpu_iommu, &map);
+		break;
+	}
+	case VFIO_IOMMU_UNMAP_DMA:
+	{
+		// TODO
+		struct vfio_iommu_type1_dma_unmap unmap;
+
+		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
+
+		if (copy_from_user(&unmap, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (unmap.argsz < minsz)
+			return -EINVAL;
+
+		ret = vgpu_dma_do_untrack(vgpu_iommu, &unmap);
+		break;
+	}
+	default:
+	{
+		printk(KERN_INFO "%s cmd default ", __FUNCTION__);
+		ret = -ENOTTY;
+		break;
+	}
+	}
+
+	return ret;
+}
+
+
+static int vfio_iommu_vgpu_attach_group(void *iommu_data,
+		                        struct iommu_group *iommu_group)
+{
+	struct vfio_iommu_vgpu *iommu = iommu_data;
+	struct vgpu_device *vgpu_dev = NULL;
+
+	printk(KERN_INFO "%s", __FUNCTION__);
+
+	vgpu_dev = get_vgpu_device_from_group(iommu_group);
+	if (vgpu_dev) {
+		iommu->vgpu_dev = vgpu_dev;
+		iommu->group = iommu_group;
+
+		/* IOMMU shares the same life cycle as VM MM */
+		iommu->vm_mm = current->mm;
+
+		return 0;
+	}
+	iommu->group = iommu_group;
+	return 1;
+}
+
+static void vfio_iommu_vgpu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct vfio_iommu_vgpu *iommu = iommu_data;
+
+	printk(KERN_INFO "%s", __FUNCTION__);
+	iommu->vm_mm = NULL;
+	iommu->group = NULL;
+
+	return;
+}
+
+
+static const struct vfio_iommu_driver_ops vfio_iommu_vgpu_driver_ops = {
+	.name           = "vgpu_vfio",
+	.owner          = THIS_MODULE,
+	.open           = vfio_iommu_vgpu_open,
+	.release        = vfio_iommu_vgpu_release,
+	.ioctl          = vfio_iommu_vgpu_ioctl,
+	.attach_group   = vfio_iommu_vgpu_attach_group,
+	.detach_group   = vfio_iommu_vgpu_detach_group,
+};
+
+
+int vgpu_vfio_iommu_init(void)
+{
+	int rc = vfio_register_iommu_driver(&vfio_iommu_vgpu_driver_ops);
+
+	printk(KERN_INFO "%s\n", __FUNCTION__);
+	if (rc < 0) {
+		printk(KERN_ERR "Error: failed to register vfio iommu, err:%d\n", rc);
+	}
+
+	return rc;
+}
+
+void vgpu_vfio_iommu_exit(void)
+{
+	// unregister vgpu_vfio driver
+	vfio_unregister_iommu_driver(&vfio_iommu_vgpu_driver_ops);
+	printk(KERN_INFO "%s\n", __FUNCTION__);
+}
+
+
+module_init(vgpu_vfio_iommu_init);
+module_exit(vgpu_vfio_iommu_exit);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+