
[v7,3/4] vfio iommu: Add support for mediated devices

Message ID 1472097235-6332-4-git-send-email-kwankhede@nvidia.com
State New
Headers show

Commit Message

Kirti Wankhede Aug. 25, 2016, 3:53 a.m. UTC
VFIO IOMMU drivers are designed for devices that are IOMMU capable. A
mediated device only uses the IOMMU APIs; the underlying hardware can be
managed by an IOMMU domain.

The aims of this change are:
- To reuse most of the code of the TYPE1 IOMMU driver for mediated devices
- To support directly assigned devices and mediated devices in a single module

Added two new callback functions to struct vfio_iommu_driver_ops. A backend
IOMMU module that supports pinning and unpinning pages for mdev devices
should provide these functions.
Added APIs for pinning and unpinning pages to the VFIO module. These call
back into the backend IOMMU module to actually pin and unpin pages.
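For illustration, a vendor driver of a mediated device would use the new
exported APIs roughly as below. This is a hypothetical sketch, not part of
this patch; 'dev' stands for the mdev's struct device and the array size is
arbitrary:

	unsigned long user_pfn[16], host_pfn[16];
	long nr;

	/* translate guest PFNs (iova >> PAGE_SHIFT) to pinned host PFNs */
	nr = vfio_pin_pages(dev, user_pfn, 16,
			    IOMMU_READ | IOMMU_WRITE, host_pfn);
	if (nr <= 0)
		return nr;	/* no local domain, unmapped iova, ... */

	/* ... program device DMA using host_pfn[0..nr-1] ... */

	/* unpin using the host PFNs returned above */
	vfio_unpin_pages(dev, host_pfn, nr);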

This change adds pin and unpin support for mediated devices to the TYPE1
IOMMU backend module. More details:
- When the iommu_group of a mediated device is attached, the task structure
  is cached; it is used later for pinning pages and page accounting.
- It keeps track of pinned pages for the mediated domain. This data is used
  to verify unpinning requests and to unpin any remaining pages while
  detaching.
- The existing mechanism is used for page accounting. If an IOMMU-capable
  domain exists in the container, then all pages are already pinned and
  accounted. Accounting for an mdev device is only done if there is no
  IOMMU-capable domain in the container.

Tested by assigning the following combinations of devices to a single VM:
- GPU pass-through only
- vGPU device only
- One GPU pass-through and one vGPU device
- Two GPU pass-throughs

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Signed-off-by: Neo Jia <cjia@nvidia.com>
Change-Id: I295d6f0f2e0579b8d9882bfd8fd5a4194b97bd9a
Reviewed-on: http://git-master/r/1175707
Reviewed-by: Automatic_Commit_Validation_User
---
 drivers/vfio/vfio.c             | 117 ++++++++++
 drivers/vfio/vfio_iommu_type1.c | 498 ++++++++++++++++++++++++++++++++++++----
 include/linux/vfio.h            |  13 +-
 3 files changed, 580 insertions(+), 48 deletions(-)

Comments

Dong Jia Shi Aug. 25, 2016, 7:29 a.m. UTC | #1
On Thu, 25 Aug 2016 09:23:54 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> @@ -769,6 +1090,33 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	if (ret)
>  		goto out_free;
> 
> +	if (IS_ENABLED(CONFIF_VFIO_MDEV) && !iommu_present(bus) &&
s/CONFIF_VFIO_MDEV/CONFIG_VFIO_MDEV/

> +	    (bus == &mdev_bus_type)) {
> +		if (iommu->local_domain) {
> +			list_add(&group->next,
> +				 &iommu->local_domain->group_list);
> +			kfree(domain);
> +			mutex_unlock(&iommu->lock);
> +			return 0;
> +		}
> +


--------
Dong Jia
Kirti Wankhede Aug. 26, 2016, 1:50 p.m. UTC | #2
Oh, that's a last-minute change made after running checkpatch.pl :(
Thanks for catching that. I'll correct it.

Thanks,
Kirti

On 8/25/2016 12:59 PM, Dong Jia wrote:
> On Thu, 25 Aug 2016 09:23:54 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> @@ -769,6 +1090,33 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>>  	if (ret)
>>  		goto out_free;
>>
>> +	if (IS_ENABLED(CONFIF_VFIO_MDEV) && !iommu_present(bus) &&
> s/CONFIF_VFIO_MDEV/CONFIG_VFIO_MDEV/
> 
>> +	    (bus == &mdev_bus_type)) {
>> +		if (iommu->local_domain) {
>> +			list_add(&group->next,
>> +				 &iommu->local_domain->group_list);
>> +			kfree(domain);
>> +			mutex_unlock(&iommu->lock);
>> +			return 0;
>> +		}
>> +
> 
> 
> --------
> Dong Jia
>
Jike Song Sept. 29, 2016, 2:17 a.m. UTC | #3
+Guangrong

On 08/25/2016 11:53 AM, Kirti Wankhede wrote:
> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
> Mediated device only uses IOMMU APIs, the underlying hardware can be
> managed by an IOMMU domain.
> 
> Aim of this change is:
> - To use most of the code of TYPE1 IOMMU driver for mediated devices
> - To support direct assigned device and mediated device in single module
> 
> Added two new callback functions to struct vfio_iommu_driver_ops. Backend
> IOMMU module that supports pining and unpinning pages for mdev devices
> should provide these functions.
> Added APIs for pining and unpining pages to VFIO module. These calls back
> into backend iommu module to actually pin and unpin pages.
> 
> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
> backend module. More details:
> - When iommu_group of mediated devices is attached, task structure is
>   cached which is used later to pin pages and page accounting.
> - It keeps track of pinned pages for mediated domain. This data is used to
>   verify unpinning request and to unpin remaining pages while detaching, if
>   there are any.
> - Used existing mechanism for page accounting. If iommu capable domain
>   exist in the container then all pages are already pinned and accounted.
>   Accouting for mdev device is only done if there is no iommu capable
>   domain in the container.
> 
> Tested by assigning below combinations of devices to a single VM:
> - GPU pass through only
> - vGPU device only
> - One GPU pass through and one vGPU device
> - two GPU pass through
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Signed-off-by: Neo Jia <cjia@nvidia.com>
> Change-Id: I295d6f0f2e0579b8d9882bfd8fd5a4194b97bd9a
> Reviewed-on: http://git-master/r/1175707
> Reviewed-by: Automatic_Commit_Validation_User
> ---
>  drivers/vfio/vfio.c             | 117 ++++++++++
>  drivers/vfio/vfio_iommu_type1.c | 498 ++++++++++++++++++++++++++++++++++++----
>  include/linux/vfio.h            |  13 +-
>  3 files changed, 580 insertions(+), 48 deletions(-)
> 
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index 6fd6fa5469de..e3e342861e04 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -1782,6 +1782,123 @@ void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
>  }
>  EXPORT_SYMBOL_GPL(vfio_info_cap_shift);
>  
> +static struct vfio_group *vfio_group_from_dev(struct device *dev)
> +{
> +	struct vfio_device *device;
> +	struct vfio_group *group;
> +	int ret;
> +
> +	device = vfio_device_get_from_dev(dev);
> +	if (!device)
> +		return ERR_PTR(-EINVAL);
> +
> +	group = device->group;
> +	if (!atomic_inc_not_zero(&group->container_users)) {
> +		ret = -EINVAL;
> +		goto err_ret;
> +	}
> +
> +	if (group->noiommu) {
> +		atomic_dec(&group->container_users);
> +		ret = -EPERM;
> +		goto err_ret;
> +	}
> +
> +	if (!group->container->iommu_driver ||
> +	    !vfio_group_viable(group)) {
> +		atomic_dec(&group->container_users);
> +		ret = -EINVAL;
> +		goto err_ret;
> +	}
> +
> +	vfio_device_put(device);
> +	return group;
> +
> +err_ret:
> +	vfio_device_put(device);
> +	return ERR_PTR(ret);
> +}
> +
> +/*
> + * Pin a set of guest PFNs and return their associated host PFNs for local
> + * domain only.
> + * @dev [in] : device
> + * @user_pfn [in]: array of user/guest PFNs
> + * @npage [in]: count of array elements
> + * @prot [in] : protection flags
> + * @phys_pfn[out] : array of host PFNs
> + */
> +long vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
> +		    long npage, int prot, unsigned long *phys_pfn)
> +{
> +	struct vfio_container *container;
> +	struct vfio_group *group;
> +	struct vfio_iommu_driver *driver;
> +	ssize_t ret = -EINVAL;
> +
> +	if (!dev || !user_pfn || !phys_pfn)
> +		return -EINVAL;
> +
> +	group = vfio_group_from_dev(dev);
> +	if (IS_ERR(group))
> +		return PTR_ERR(group);
> +
> +	container = group->container;
> +	if (IS_ERR(container))
> +		return PTR_ERR(container);
> +
> +	down_read(&container->group_lock);
> +
> +	driver = container->iommu_driver;
> +	if (likely(driver && driver->ops->pin_pages))
> +		ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
> +					     npage, prot, phys_pfn);
> +
> +	up_read(&container->group_lock);
> +	vfio_group_try_dissolve_container(group);
> +
> +	return ret;
> +
> +}
> +EXPORT_SYMBOL(vfio_pin_pages);
> +
> +/*
> + * Unpin set of host PFNs for local domain only.
> + * @dev [in] : device
> + * @pfn [in] : array of host PFNs to be unpinned.
> + * @npage [in] :count of elements in array, that is number of pages.
> + */
> +long vfio_unpin_pages(struct device *dev, unsigned long *pfn, long npage)
> +{
> +	struct vfio_container *container;
> +	struct vfio_group *group;
> +	struct vfio_iommu_driver *driver;
> +	ssize_t ret = -EINVAL;
> +
> +	if (!dev || !pfn)
> +		return -EINVAL;
> +
> +	group = vfio_group_from_dev(dev);
> +	if (IS_ERR(group))
> +		return PTR_ERR(group);
> +
> +	container = group->container;
> +	if (IS_ERR(container))
> +		return PTR_ERR(container);
> +
> +	down_read(&container->group_lock);
> +
> +	driver = container->iommu_driver;
> +	if (likely(driver && driver->ops->unpin_pages))
> +		ret = driver->ops->unpin_pages(container->iommu_data, pfn,
> +					       npage);
> +
> +	up_read(&container->group_lock);
> +	vfio_group_try_dissolve_container(group);
> +	return ret;
> +}
> +EXPORT_SYMBOL(vfio_unpin_pages);
> +
>  /**
>   * Module/class support
>   */
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 2ba19424e4a1..d52d75fd0f04 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -55,18 +55,26 @@ MODULE_PARM_DESC(disable_hugepages,
>  
>  struct vfio_iommu {
>  	struct list_head	domain_list;
> +	struct vfio_domain	*local_domain;
>  	struct mutex		lock;
>  	struct rb_root		dma_list;
>  	bool			v2;
>  	bool			nesting;
>  };
>  
> +struct local_addr_space {
> +	struct task_struct	*task;
> +	struct rb_root		pfn_list;	/* pinned Host pfn list */
> +	struct mutex		pfn_list_lock;	/* mutex for pfn_list */
> +};
> +
>  struct vfio_domain {
>  	struct iommu_domain	*domain;
>  	struct list_head	next;
>  	struct list_head	group_list;
>  	int			prot;		/* IOMMU_CACHE */
>  	bool			fgsp;		/* Fine-grained super pages */
> +	struct local_addr_space	*local_addr_space;
>  };
>  
>  struct vfio_dma {
> @@ -83,6 +91,22 @@ struct vfio_group {
>  };
>  
>  /*
> + * Guest RAM pinning working set or DMA target
> + */
> +struct vfio_pfn {
> +	struct rb_node		node;
> +	unsigned long		vaddr;		/* virtual addr */
> +	dma_addr_t		iova;		/* IOVA */
> +	unsigned long		pfn;		/* Host pfn */
> +	size_t			prot;
> +	atomic_t		ref_count;
> +};
> +
> +
> +#define IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu)	\
> +			 (list_empty(&iommu->domain_list) ? false : true)
> +
> +/*
>   * This code handles mapping and unmapping of user data buffers
>   * into DMA'ble space using the IOMMU
>   */
> @@ -130,6 +154,84 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
>  	rb_erase(&old->node, &iommu->dma_list);
>  }
>  
> +/*
> + * Helper Functions for host pfn list
> + */
> +
> +static struct vfio_pfn *vfio_find_pfn(struct vfio_domain *domain,
> +				      unsigned long pfn)
> +{
> +	struct rb_node *node;
> +	struct vfio_pfn *vpfn, *ret = NULL;
> +
> +	node = domain->local_addr_space->pfn_list.rb_node;
> +
> +	while (node) {
> +		vpfn = rb_entry(node, struct vfio_pfn, node);
> +
> +		if (pfn < vpfn->pfn)
> +			node = node->rb_left;
> +		else if (pfn > vpfn->pfn)
> +			node = node->rb_right;
> +		else {
> +			ret = vpfn;
> +			break;
> +		}
> +	}
> +
> +	return ret;
> +}
> +
> +static void vfio_link_pfn(struct vfio_domain *domain, struct vfio_pfn *new)
> +{
> +	struct rb_node **link, *parent = NULL;
> +	struct vfio_pfn *vpfn;
> +
> +	link = &domain->local_addr_space->pfn_list.rb_node;
> +	while (*link) {
> +		parent = *link;
> +		vpfn = rb_entry(parent, struct vfio_pfn, node);
> +
> +		if (new->pfn < vpfn->pfn)
> +			link = &(*link)->rb_left;
> +		else
> +			link = &(*link)->rb_right;
> +	}
> +
> +	rb_link_node(&new->node, parent, link);
> +	rb_insert_color(&new->node, &domain->local_addr_space->pfn_list);
> +}
> +
> +static void vfio_unlink_pfn(struct vfio_domain *domain, struct vfio_pfn *old)
> +{
> +	rb_erase(&old->node, &domain->local_addr_space->pfn_list);
> +}
> +
> +static int vfio_add_to_pfn_list(struct vfio_domain *domain, unsigned long vaddr,
> +				dma_addr_t iova, unsigned long pfn, size_t prot)
> +{
> +	struct vfio_pfn *vpfn;
> +
> +	vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
> +	if (!vpfn)
> +		return -ENOMEM;
> +
> +	vpfn->vaddr = vaddr;
> +	vpfn->iova = iova;
> +	vpfn->pfn = pfn;
> +	vpfn->prot = prot;
> +	atomic_set(&vpfn->ref_count, 1);
> +	vfio_link_pfn(domain, vpfn);
> +	return 0;
> +}
> +
> +static void vfio_remove_from_pfn_list(struct vfio_domain *domain,
> +				      struct vfio_pfn *vpfn)
> +{
> +	vfio_unlink_pfn(domain, vpfn);
> +	kfree(vpfn);
> +}
> +
>  struct vwork {
>  	struct mm_struct	*mm;
>  	long			npage;
> @@ -150,17 +252,17 @@ static void vfio_lock_acct_bg(struct work_struct *work)
>  	kfree(vwork);
>  }
>  
> -static void vfio_lock_acct(long npage)
> +static void vfio_lock_acct(struct task_struct *task, long npage)
>  {
>  	struct vwork *vwork;
>  	struct mm_struct *mm;
>  
> -	if (!current->mm || !npage)
> +	if (!task->mm || !npage)
>  		return; /* process exited or nothing to do */
>  
> -	if (down_write_trylock(&current->mm->mmap_sem)) {
> -		current->mm->locked_vm += npage;
> -		up_write(&current->mm->mmap_sem);
> +	if (down_write_trylock(&task->mm->mmap_sem)) {
> +		task->mm->locked_vm += npage;
> +		up_write(&task->mm->mmap_sem);
>  		return;
>  	}
>  
> @@ -172,7 +274,7 @@ static void vfio_lock_acct(long npage)
>  	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
>  	if (!vwork)
>  		return;
> -	mm = get_task_mm(current);
> +	mm = get_task_mm(task);
>  	if (!mm) {
>  		kfree(vwork);
>  		return;
> @@ -228,20 +330,31 @@ static int put_pfn(unsigned long pfn, int prot)
>  	return 0;
>  }
>  
> -static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
> +static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
> +			 int prot, unsigned long *pfn)
>  {
>  	struct page *page[1];
>  	struct vm_area_struct *vma;
> +	struct mm_struct *local_mm = (mm ? mm : current->mm);
>  	int ret = -EFAULT;
>  
> -	if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
> +	if (mm) {
> +		down_read(&local_mm->mmap_sem);
> +		ret = get_user_pages_remote(NULL, local_mm, vaddr, 1,
> +					!!(prot & IOMMU_WRITE), 0, page, NULL);
> +		up_read(&local_mm->mmap_sem);
> +	} else
> +		ret = get_user_pages_fast(vaddr, 1,
> +					  !!(prot & IOMMU_WRITE), page);
> +
> +	if (ret == 1) {
>  		*pfn = page_to_pfn(page[0]);
>  		return 0;
>  	}
>  
> -	down_read(&current->mm->mmap_sem);
> +	down_read(&local_mm->mmap_sem);
>  
> -	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
> +	vma = find_vma_intersection(local_mm, vaddr, vaddr + 1);
>  
>  	if (vma && vma->vm_flags & VM_PFNMAP) {
>  		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
> @@ -249,7 +362,7 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
>  			ret = 0;
>  	}
>  
> -	up_read(&current->mm->mmap_sem);
> +	up_read(&local_mm->mmap_sem);
>  
>  	return ret;
>  }
> @@ -259,8 +372,8 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
>   * the iommu can only map chunks of consecutive pfns anyway, so get the
>   * first page and all consecutive pages with the same locking.
>   */
> -static long vfio_pin_pages(unsigned long vaddr, long npage,
> -			   int prot, unsigned long *pfn_base)
> +static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
> +				    int prot, unsigned long *pfn_base)
>  {
>  	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>  	bool lock_cap = capable(CAP_IPC_LOCK);
> @@ -270,7 +383,7 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>  	if (!current->mm)
>  		return -ENODEV;
>  
> -	ret = vaddr_get_pfn(vaddr, prot, pfn_base);
> +	ret = vaddr_get_pfn(NULL, vaddr, prot, pfn_base);
>  	if (ret)
>  		return ret;
>  
> @@ -285,7 +398,7 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>  
>  	if (unlikely(disable_hugepages)) {
>  		if (!rsvd)
> -			vfio_lock_acct(1);
> +			vfio_lock_acct(current, 1);
>  		return 1;
>  	}
>  
> @@ -293,7 +406,7 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>  	for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
>  		unsigned long pfn = 0;
>  
> -		ret = vaddr_get_pfn(vaddr, prot, &pfn);
> +		ret = vaddr_get_pfn(NULL, vaddr, prot, &pfn);
>  		if (ret)
>  			break;
>  
> @@ -313,13 +426,13 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>  	}
>  
>  	if (!rsvd)
> -		vfio_lock_acct(i);
> +		vfio_lock_acct(current, i);
>  
>  	return i;
>  }
>  
> -static long vfio_unpin_pages(unsigned long pfn, long npage,
> -			     int prot, bool do_accounting)
> +static long __vfio_unpin_pages_remote(unsigned long pfn, long npage, int prot,
> +				      bool do_accounting)
>  {
>  	unsigned long unlocked = 0;
>  	long i;
> @@ -328,7 +441,188 @@ static long vfio_unpin_pages(unsigned long pfn, long npage,
>  		unlocked += put_pfn(pfn++, prot);
>  
>  	if (do_accounting)
> -		vfio_lock_acct(-unlocked);
> +		vfio_lock_acct(current, -unlocked);
> +	return unlocked;
> +}
> +
> +static long __vfio_pin_pages_local(struct vfio_domain *domain,
> +				   unsigned long vaddr, int prot,
> +				   unsigned long *pfn_base,
> +				   bool do_accounting)
> +{
> +	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +	bool lock_cap = capable(CAP_IPC_LOCK);
> +	long ret;
> +	bool rsvd;
> +	struct task_struct *task = domain->local_addr_space->task;
> +
> +	if (!task->mm)
> +		return -ENODEV;
> +
> +	ret = vaddr_get_pfn(task->mm, vaddr, prot, pfn_base);
> +	if (ret)
> +		return ret;
> +
> +	rsvd = is_invalid_reserved_pfn(*pfn_base);
> +
> +	if (!rsvd && !lock_cap && task->mm->locked_vm + 1 > limit) {
> +		put_pfn(*pfn_base, prot);
> +		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
> +			limit << PAGE_SHIFT);
> +		return -ENOMEM;
> +	}
> +
> +	if (!rsvd && do_accounting)
> +		vfio_lock_acct(task, 1);
> +
> +	return 1;
> +}
> +
> +static void __vfio_unpin_pages_local(struct vfio_domain *domain,
> +				     unsigned long pfn, int prot,
> +				     bool do_accounting)
> +{
> +	put_pfn(pfn, prot);
> +
> +	if (do_accounting)
> +		vfio_lock_acct(domain->local_addr_space->task, -1);
> +}
> +
> +static int vfio_unpin_pfn(struct vfio_domain *domain,
> +			  struct vfio_pfn *vpfn, bool do_accounting)
> +{
> +	__vfio_unpin_pages_local(domain, vpfn->pfn, vpfn->prot,
> +				 do_accounting);
> +
> +	if (atomic_dec_and_test(&vpfn->ref_count))
> +		vfio_remove_from_pfn_list(domain, vpfn);
> +
> +	return 1;
> +}
> +
> +static long vfio_iommu_type1_pin_pages(void *iommu_data,
> +				       unsigned long *user_pfn,
> +				       long npage, int prot,
> +				       unsigned long *phys_pfn)
> +{
> +	struct vfio_iommu *iommu = iommu_data;
> +	struct vfio_domain *domain;
> +	int i, j, ret;
> +	long retpage;
> +	unsigned long remote_vaddr;
> +	unsigned long *pfn = phys_pfn;
> +	struct vfio_dma *dma;
> +	bool do_accounting = false;
> +
> +	if (!iommu || !user_pfn || !phys_pfn)
> +		return -EINVAL;
> +
> +	mutex_lock(&iommu->lock);
> +
> +	if (!iommu->local_domain) {
> +		ret = -EINVAL;
> +		goto pin_done;
> +	}
> +
> +	domain = iommu->local_domain;
> +
> +	/*
> +	 * If iommu capable domain exist in the container then all pages are
> +	 * already pinned and accounted. Accouting should be done if there is no
> +	 * iommu capable domain in the container.
> +	 */
> +	do_accounting = !IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu);
> +
> +	for (i = 0; i < npage; i++) {
> +		struct vfio_pfn *p;
> +		dma_addr_t iova;
> +
> +		iova = user_pfn[i] << PAGE_SHIFT;
> +
> +		dma = vfio_find_dma(iommu, iova, 0);
> +		if (!dma) {
> +			ret = -EINVAL;
> +			goto pin_unwind;
> +		}
> +
> +		remote_vaddr = dma->vaddr + iova - dma->iova;
> +
> +		retpage = __vfio_pin_pages_local(domain, remote_vaddr, prot,
> +						 &pfn[i], do_accounting);

Hi Kirti,

Here you call __vfio_pin_pages_local() -> vaddr_get_pfn() -> GUP regardless
of whether the vaddr is already pinned or not. That probably means that, if
the caller calls vfio_pin_pages() with the same GPA multiple times, you get
memory leaks.

GUP always increases the page refcnt.

FWIW, I would like to have the pfn_list implemented with key == iova, so
you can always try to find the PFN for a given iova, and pin it only if it
is not found.

--
Thanks,
Jike


> +		if (retpage <= 0) {
> +			WARN_ON(!retpage);
> +			ret = (int)retpage;
> +			goto pin_unwind;
> +		}
> +
> +		mutex_lock(&domain->local_addr_space->pfn_list_lock);
> +
> +		/* search if pfn exist */
> +		p = vfio_find_pfn(domain, pfn[i]);
> +		if (p) {
> +			atomic_inc(&p->ref_count);
> +			mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> +			continue;
> +		}
> +
> +		ret = vfio_add_to_pfn_list(domain, remote_vaddr, iova,
> +					   pfn[i], prot);
> +		mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> +
> +		if (ret) {
> +			__vfio_unpin_pages_local(domain, pfn[i], prot,
> +						 do_accounting);
> +			goto pin_unwind;
> +		}
> +	}
> +
> +	ret = i;
> +	goto pin_done;
> +
> +pin_unwind:
> +	pfn[i] = 0;
> +	mutex_lock(&domain->local_addr_space->pfn_list_lock);
> +	for (j = 0; j < i; j++) {
> +		struct vfio_pfn *p;
> +
> +		p = vfio_find_pfn(domain, pfn[j]);
> +		if (p)
> +			vfio_unpin_pfn(domain, p, do_accounting);
> +
> +		pfn[j] = 0;
> +	}
> +	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> +
> +pin_done:
> +	mutex_unlock(&iommu->lock);
> +	return ret;
> +}
> +
> +static long vfio_iommu_type1_unpin_pages(void *iommu_data, unsigned long *pfn,
> +					 long npage)
> +{
> +	struct vfio_iommu *iommu = iommu_data;
> +	struct vfio_domain *domain = NULL;
> +	long unlocked = 0;
> +	int i;
> +
> +	if (!iommu || !pfn)
> +		return -EINVAL;
> +
> +	domain = iommu->local_domain;
> +
> +	for (i = 0; i < npage; i++) {
> +		struct vfio_pfn *p;
> +
> +		mutex_lock(&domain->local_addr_space->pfn_list_lock);
> +
> +		/* verify if pfn exist in pfn_list */
> +		p = vfio_find_pfn(domain, pfn[i]);
> +		if (p)
> +			unlocked += vfio_unpin_pfn(domain, p, true);
> +
> +		mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> +	}
>  
>  	return unlocked;
>  }
> @@ -341,6 +635,9 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  
>  	if (!dma->size)
>  		return;
> +
> +	if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu))
> +		return;
>  	/*
>  	 * We use the IOMMU to track the physical addresses, otherwise we'd
>  	 * need a much more complicated tracking system.  Unfortunately that
> @@ -382,15 +679,15 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  		if (WARN_ON(!unmapped))
>  			break;
>  
> -		unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
> -					     unmapped >> PAGE_SHIFT,
> -					     dma->prot, false);
> +		unlocked += __vfio_unpin_pages_remote(phys >> PAGE_SHIFT,
> +						      unmapped >> PAGE_SHIFT,
> +						      dma->prot, false);
>  		iova += unmapped;
>  
>  		cond_resched();
>  	}
>  
> -	vfio_lock_acct(-unlocked);
> +	vfio_lock_acct(current, -unlocked);
>  }
>  
>  static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
> @@ -611,10 +908,16 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  	/* Insert zero-sized and grow as we map chunks of it */
>  	vfio_link_dma(iommu, dma);
>  
> +	/* Don't pin and map if container doesn't contain IOMMU capable domain*/
> +	if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu)) {
> +		dma->size = size;
> +		goto map_done;
> +	}
> +
>  	while (size) {
>  		/* Pin a contiguous chunk of memory */
> -		npage = vfio_pin_pages(vaddr + dma->size,
> -				       size >> PAGE_SHIFT, prot, &pfn);
> +		npage = __vfio_pin_pages_remote(vaddr + dma->size,
> +						size >> PAGE_SHIFT, prot, &pfn);
>  		if (npage <= 0) {
>  			WARN_ON(!npage);
>  			ret = (int)npage;
> @@ -624,7 +927,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  		/* Map it! */
>  		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
>  		if (ret) {
> -			vfio_unpin_pages(pfn, npage, prot, true);
> +			__vfio_unpin_pages_remote(pfn, npage, prot, true);
>  			break;
>  		}
>  
> @@ -635,6 +938,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  	if (ret)
>  		vfio_remove_dma(iommu, dma);
>  
> +map_done:
>  	mutex_unlock(&iommu->lock);
>  	return ret;
>  }
> @@ -734,11 +1038,24 @@ static void vfio_test_domain_fgsp(struct vfio_domain *domain)
>  	__free_pages(pages, order);
>  }
>  
> +static struct vfio_group *find_iommu_group(struct vfio_domain *domain,
> +				   struct iommu_group *iommu_group)
> +{
> +	struct vfio_group *g;
> +
> +	list_for_each_entry(g, &domain->group_list, next) {
> +		if (g->iommu_group == iommu_group)
> +			return g;
> +	}
> +
> +	return NULL;
> +}
> +
>  static int vfio_iommu_type1_attach_group(void *iommu_data,
>  					 struct iommu_group *iommu_group)
>  {
>  	struct vfio_iommu *iommu = iommu_data;
> -	struct vfio_group *group, *g;
> +	struct vfio_group *group;
>  	struct vfio_domain *domain, *d;
>  	struct bus_type *bus = NULL;
>  	int ret;
> @@ -746,10 +1063,14 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	mutex_lock(&iommu->lock);
>  
>  	list_for_each_entry(d, &iommu->domain_list, next) {
> -		list_for_each_entry(g, &d->group_list, next) {
> -			if (g->iommu_group != iommu_group)
> -				continue;
> +		if (find_iommu_group(d, iommu_group)) {
> +			mutex_unlock(&iommu->lock);
> +			return -EINVAL;
> +		}
> +	}
>  
> +	if (iommu->local_domain) {
> +		if (find_iommu_group(iommu->local_domain, iommu_group)) {
>  			mutex_unlock(&iommu->lock);
>  			return -EINVAL;
>  		}
> @@ -769,6 +1090,33 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	if (ret)
>  		goto out_free;
>  
> +	if (IS_ENABLED(CONFIF_VFIO_MDEV) && !iommu_present(bus) &&
> +	    (bus == &mdev_bus_type)) {
> +		if (iommu->local_domain) {
> +			list_add(&group->next,
> +				 &iommu->local_domain->group_list);
> +			kfree(domain);
> +			mutex_unlock(&iommu->lock);
> +			return 0;
> +		}
> +
> +		domain->local_addr_space = kzalloc(sizeof(*domain->local_addr_space),
> +						   GFP_KERNEL);
> +		if (!domain->local_addr_space) {
> +			ret = -ENOMEM;
> +			goto out_free;
> +		}
> +
> +		domain->local_addr_space->task = current;
> +		INIT_LIST_HEAD(&domain->group_list);
> +		list_add(&group->next, &domain->group_list);
> +		domain->local_addr_space->pfn_list = RB_ROOT;
> +		mutex_init(&domain->local_addr_space->pfn_list_lock);
> +		iommu->local_domain = domain;
> +		mutex_unlock(&iommu->lock);
> +		return 0;
> +	}
> +
>  	domain->domain = iommu_domain_alloc(bus);
>  	if (!domain->domain) {
>  		ret = -EIO;
> @@ -859,6 +1207,18 @@ static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
>  		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
>  }
>  
> +static void vfio_local_unpin_all(struct vfio_domain *domain)
> +{
> +	struct rb_node *node;
> +
> +	mutex_lock(&domain->local_addr_space->pfn_list_lock);
> +	while ((node = rb_first(&domain->local_addr_space->pfn_list))) {
> +		vfio_unpin_pfn(domain,
> +				rb_entry(node, struct vfio_pfn, node), false);
> +	}
> +	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> +}
> +
>  static void vfio_iommu_type1_detach_group(void *iommu_data,
>  					  struct iommu_group *iommu_group)
>  {
> @@ -868,31 +1228,52 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
>  
>  	mutex_lock(&iommu->lock);
>  
> -	list_for_each_entry(domain, &iommu->domain_list, next) {
> -		list_for_each_entry(group, &domain->group_list, next) {
> -			if (group->iommu_group != iommu_group)
> -				continue;
> +	if (iommu->local_domain) {
> +		domain = iommu->local_domain;
> +		group = find_iommu_group(domain, iommu_group);
> +		if (group) {
> +			list_del(&group->next);
> +			kfree(group);
>  
> +			if (list_empty(&domain->group_list)) {
> +				vfio_local_unpin_all(domain);
> +				if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu))
> +					vfio_iommu_unmap_unpin_all(iommu);
> +				kfree(domain);
> +				iommu->local_domain = NULL;
> +			}
> +			goto detach_group_done;
> +		}
> +	}
> +
> +	if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu))
> +		goto detach_group_done;
> +
> +	list_for_each_entry(domain, &iommu->domain_list, next) {
> +		group = find_iommu_group(domain, iommu_group);
> +		if (group) {
>  			iommu_detach_group(domain->domain, iommu_group);
>  			list_del(&group->next);
>  			kfree(group);
>  			/*
>  			 * Group ownership provides privilege, if the group
>  			 * list is empty, the domain goes away.  If it's the
> -			 * last domain, then all the mappings go away too.
> +			 * last domain with iommu and local domain doesn't
> +			 * exist, the all the mappings go away too.
>  			 */
>  			if (list_empty(&domain->group_list)) {
> -				if (list_is_singular(&iommu->domain_list))
> +				if (list_is_singular(&iommu->domain_list) &&
> +				   (!iommu->local_domain))
>  					vfio_iommu_unmap_unpin_all(iommu);
>  				iommu_domain_free(domain->domain);
>  				list_del(&domain->next);
>  				kfree(domain);
>  			}
> -			goto done;
> +			break;
>  		}
>  	}
>  
> -done:
> +detach_group_done:
>  	mutex_unlock(&iommu->lock);
>  }
>  
> @@ -924,27 +1305,48 @@ static void *vfio_iommu_type1_open(unsigned long arg)
>  	return iommu;
>  }
>  
> +static void vfio_release_domain(struct vfio_domain *domain)
> +{
> +	struct vfio_group *group, *group_tmp;
> +
> +	list_for_each_entry_safe(group, group_tmp,
> +				 &domain->group_list, next) {
> +		if (!domain->local_addr_space)
> +			iommu_detach_group(domain->domain, group->iommu_group);
> +		list_del(&group->next);
> +		kfree(group);
> +	}
> +
> +	if (domain->local_addr_space)
> +		vfio_local_unpin_all(domain);
> +	else
> +		iommu_domain_free(domain->domain);
> +}
> +
>  static void vfio_iommu_type1_release(void *iommu_data)
>  {
>  	struct vfio_iommu *iommu = iommu_data;
>  	struct vfio_domain *domain, *domain_tmp;
> -	struct vfio_group *group, *group_tmp;
> +
> +	if (iommu->local_domain) {
> +		vfio_release_domain(iommu->local_domain);
> +		kfree(iommu->local_domain);
> +		iommu->local_domain = NULL;
> +	}
>  
>  	vfio_iommu_unmap_unpin_all(iommu);
>  
> +	if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu))
> +		goto release_exit;
> +
>  	list_for_each_entry_safe(domain, domain_tmp,
>  				 &iommu->domain_list, next) {
> -		list_for_each_entry_safe(group, group_tmp,
> -					 &domain->group_list, next) {
> -			iommu_detach_group(domain->domain, group->iommu_group);
> -			list_del(&group->next);
> -			kfree(group);
> -		}
> -		iommu_domain_free(domain->domain);
> +		vfio_release_domain(domain);
>  		list_del(&domain->next);
>  		kfree(domain);
>  	}
>  
> +release_exit:
>  	kfree(iommu);
>  }
>  
> @@ -1048,6 +1450,8 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
>  	.ioctl		= vfio_iommu_type1_ioctl,
>  	.attach_group	= vfio_iommu_type1_attach_group,
>  	.detach_group	= vfio_iommu_type1_detach_group,
> +	.pin_pages	= vfio_iommu_type1_pin_pages,
> +	.unpin_pages	= vfio_iommu_type1_unpin_pages,
>  };
>  
>  static int __init vfio_iommu_type1_init(void)
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 0ecae0b1cd34..0bd25ba6223d 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -17,6 +17,7 @@
>  #include <linux/workqueue.h>
>  #include <linux/poll.h>
>  #include <uapi/linux/vfio.h>
> +#include <linux/mdev.h>
>  
>  /**
>   * struct vfio_device_ops - VFIO bus driver device callbacks
> @@ -75,7 +76,11 @@ struct vfio_iommu_driver_ops {
>  					struct iommu_group *group);
>  	void		(*detach_group)(void *iommu_data,
>  					struct iommu_group *group);
> -
> +	long		(*pin_pages)(void *iommu_data, unsigned long *user_pfn,
> +				     long npage, int prot,
> +				     unsigned long *phys_pfn);
> +	long		(*unpin_pages)(void *iommu_data, unsigned long *pfn,
> +				       long npage);
>  };
>  
>  extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
> @@ -127,6 +132,12 @@ static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
>  }
>  #endif /* CONFIG_EEH */
>  
> +extern long vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
> +			   long npage, int prot, unsigned long *phys_pfn);
> +
> +extern long vfio_unpin_pages(struct device *dev, unsigned long *pfn,
> +			     long npage);
> +
>  /*
>   * IRQfd - generic
>   */
>
Kirti Wankhede Sept. 29, 2016, 3:06 p.m. UTC | #4
On 9/29/2016 7:47 AM, Jike Song wrote:
> +Guangrong
> 
> On 08/25/2016 11:53 AM, Kirti Wankhede wrote:

...

>> +static long vfio_iommu_type1_pin_pages(void *iommu_data,
>> +				       unsigned long *user_pfn,
>> +				       long npage, int prot,
>> +				       unsigned long *phys_pfn)
>> +{
>> +	struct vfio_iommu *iommu = iommu_data;
>> +	struct vfio_domain *domain;
>> +	int i, j, ret;
>> +	long retpage;
>> +	unsigned long remote_vaddr;
>> +	unsigned long *pfn = phys_pfn;
>> +	struct vfio_dma *dma;
>> +	bool do_accounting = false;
>> +
>> +	if (!iommu || !user_pfn || !phys_pfn)
>> +		return -EINVAL;
>> +
>> +	mutex_lock(&iommu->lock);
>> +
>> +	if (!iommu->local_domain) {
>> +		ret = -EINVAL;
>> +		goto pin_done;
>> +	}
>> +
>> +	domain = iommu->local_domain;
>> +
>> +	/*
>> +	 * If iommu capable domain exist in the container then all pages are
>> +	 * already pinned and accounted. Accouting should be done if there is no
>> +	 * iommu capable domain in the container.
>> +	 */
>> +	do_accounting = !IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu);
>> +
>> +	for (i = 0; i < npage; i++) {
>> +		struct vfio_pfn *p;
>> +		dma_addr_t iova;
>> +
>> +		iova = user_pfn[i] << PAGE_SHIFT;
>> +
>> +		dma = vfio_find_dma(iommu, iova, 0);
>> +		if (!dma) {
>> +			ret = -EINVAL;
>> +			goto pin_unwind;
>> +		}
>> +
>> +		remote_vaddr = dma->vaddr + iova - dma->iova;
>> +
>> +		retpage = __vfio_pin_pages_local(domain, remote_vaddr, prot,
>> +						 &pfn[i], do_accounting);
> 
> Hi Kirti,
> 
> Here you call __vfio_pin_pages_local() > vaddr_get_pfn() > GUP regardless
> whether the vaddr already pinned or not. That probably means, if the caller 
> calls vfio_pin_pages() with a GPA for multiple times, you get memory leaks.
> 
> GUP always increases the page refcnt.
> 
> FWIW, I would like to have the pfn_list_lock implemented with key == iova,
> so you can always try to find the PFN for a given iova, and pin it only if
> not found.
> 

I didn't get how there would be a memory leak.

Right, GUP increases the refcnt, so if vfio_pin_pages() is called multiple
times for the same GPA, the refcnt is incremented. In
vfio_iommu_type1_pin_pages(), the pinned-pages list is maintained with a
ref_count. If the pfn is already in the list, its ref_count is incremented,
and the same count is used while unpinning pages.
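To spell out the intended contract (a sketch of the bookkeeping, not patch
code):

	vfio_pin_pages(dev, &gfn, 1, prot, &pfn);  /* GUP, ref_count = 1 */
	vfio_pin_pages(dev, &gfn, 1, prot, &pfn);  /* GUP, ref_count = 2 */

	vfio_unpin_pages(dev, &pfn, 1);  /* put_page(), ref_count = 1 */
	vfio_unpin_pages(dev, &pfn, 1);  /* put_page(), ref_count = 0,
					    node removed from rbtree */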

Kirti
Jike Song Sept. 30, 2016, 2:58 a.m. UTC | #5
On 09/29/2016 11:06 PM, Kirti Wankhede wrote:
> 
> 
> On 9/29/2016 7:47 AM, Jike Song wrote:
>> +Guangrong
>>
>> On 08/25/2016 11:53 AM, Kirti Wankhede wrote:
> 
> ...
> 
>>> +static long vfio_iommu_type1_pin_pages(void *iommu_data,
>>> +				       unsigned long *user_pfn,
>>> +				       long npage, int prot,
>>> +				       unsigned long *phys_pfn)
>>> +{
>>> +	struct vfio_iommu *iommu = iommu_data;
>>> +	struct vfio_domain *domain;
>>> +	int i, j, ret;
>>> +	long retpage;
>>> +	unsigned long remote_vaddr;
>>> +	unsigned long *pfn = phys_pfn;
>>> +	struct vfio_dma *dma;
>>> +	bool do_accounting = false;
>>> +
>>> +	if (!iommu || !user_pfn || !phys_pfn)
>>> +		return -EINVAL;
>>> +
>>> +	mutex_lock(&iommu->lock);
>>> +
>>> +	if (!iommu->local_domain) {
>>> +		ret = -EINVAL;
>>> +		goto pin_done;
>>> +	}
>>> +
>>> +	domain = iommu->local_domain;
>>> +
>>> +	/*
>>> +	 * If iommu capable domain exist in the container then all pages are
>>> +	 * already pinned and accounted. Accouting should be done if there is no
>>> +	 * iommu capable domain in the container.
>>> +	 */
>>> +	do_accounting = !IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu);
>>> +
>>> +	for (i = 0; i < npage; i++) {
>>> +		struct vfio_pfn *p;
>>> +		dma_addr_t iova;
>>> +
>>> +		iova = user_pfn[i] << PAGE_SHIFT;
>>> +
>>> +		dma = vfio_find_dma(iommu, iova, 0);
>>> +		if (!dma) {
>>> +			ret = -EINVAL;
>>> +			goto pin_unwind;
>>> +		}
>>> +
>>> +		remote_vaddr = dma->vaddr + iova - dma->iova;
>>> +
>>> +		retpage = __vfio_pin_pages_local(domain, remote_vaddr, prot,
>>> +						 &pfn[i], do_accounting);
>>
>> Hi Kirti,
>>
>> Here you call __vfio_pin_pages_local() > vaddr_get_pfn() > GUP regardless
>> whether the vaddr already pinned or not. That probably means, if the caller 
>> calls vfio_pin_pages() with a GPA for multiple times, you get memory leaks.
>>
>> GUP always increases the page refcnt.
>>
>> FWIW, I would like to have the pfn_list_lock implemented with key == iova,
>> so you can always try to find the PFN for a given iova, and pin it only if
>> not found.
>>
> 
> I didn't get how there would be a memory leak.
> 
> Right, GUP increases refcnt, so if vfio_pin_pages() is called for
> multiple types for same GPA, refcnt would be incremented. In
> vfio_iommu_type1_pin_pages() pinned pages list is maintained with
> ref_count. If pfn is already in list, ref_count is incremented and same
> is used while unpining pages.
> 

Let's have a close look at vfio_unpin_pfn:

	static int vfio_unpin_pfn(struct vfio_domain *domain,
				  struct vfio_pfn *vpfn, bool do_accounting)
	{
		__vfio_unpin_pages_for_mdev(domain, vpfn->pfn, vpfn->prot,
					    do_accounting);

		if (atomic_dec_and_test(&vpfn->ref_count))
			vfio_remove_from_pfn_list(domain, vpfn);

		return 1;
	}

Here you don't call __vfio_unpin_pages_for_mdev() -- and thereby put_page()
-- vpfn->ref_count times. If page->_refcount was increased by GUP (N) times,
here you only set it back to (N-1).
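Concretely, assuming one GUP per vfio_pin_pages() call on the same GPA, the
refcount arithmetic being described is (sketch):

	N x vfio_pin_pages()  ->  page->_refcount += N, vpfn->ref_count = N
	1 x vfio_unpin_pfn()  ->  page->_refcount -= 1, vpfn->ref_count = N - 1
	/* N - 1 page references remain unless the caller unpins N - 1 more times */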

--
Thanks,
Jike
Jike Song Sept. 30, 2016, 3:10 a.m. UTC | #6
On 09/30/2016 10:58 AM, Jike Song wrote:
> On 09/29/2016 11:06 PM, Kirti Wankhede wrote:
>>
>>
>> On 9/29/2016 7:47 AM, Jike Song wrote:
>>> +Guangrong
>>>
>>> On 08/25/2016 11:53 AM, Kirti Wankhede wrote:
>>
>> ...
>>
>>>> +static long vfio_iommu_type1_pin_pages(void *iommu_data,
>>>> +				       unsigned long *user_pfn,
>>>> +				       long npage, int prot,
>>>> +				       unsigned long *phys_pfn)
>>>> +{
>>>> +	struct vfio_iommu *iommu = iommu_data;
>>>> +	struct vfio_domain *domain;
>>>> +	int i, j, ret;
>>>> +	long retpage;
>>>> +	unsigned long remote_vaddr;
>>>> +	unsigned long *pfn = phys_pfn;
>>>> +	struct vfio_dma *dma;
>>>> +	bool do_accounting = false;
>>>> +
>>>> +	if (!iommu || !user_pfn || !phys_pfn)
>>>> +		return -EINVAL;
>>>> +
>>>> +	mutex_lock(&iommu->lock);
>>>> +
>>>> +	if (!iommu->local_domain) {
>>>> +		ret = -EINVAL;
>>>> +		goto pin_done;
>>>> +	}
>>>> +
>>>> +	domain = iommu->local_domain;
>>>> +
>>>> +	/*
>>>> +	 * If iommu capable domain exist in the container then all pages are
>>>> +	 * already pinned and accounted. Accouting should be done if there is no
>>>> +	 * iommu capable domain in the container.
>>>> +	 */
>>>> +	do_accounting = !IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu);
>>>> +
>>>> +	for (i = 0; i < npage; i++) {
>>>> +		struct vfio_pfn *p;
>>>> +		dma_addr_t iova;
>>>> +
>>>> +		iova = user_pfn[i] << PAGE_SHIFT;
>>>> +
>>>> +		dma = vfio_find_dma(iommu, iova, 0);
>>>> +		if (!dma) {
>>>> +			ret = -EINVAL;
>>>> +			goto pin_unwind;
>>>> +		}
>>>> +
>>>> +		remote_vaddr = dma->vaddr + iova - dma->iova;
>>>> +
>>>> +		retpage = __vfio_pin_pages_local(domain, remote_vaddr, prot,
>>>> +						 &pfn[i], do_accounting);
>>>
>>> Hi Kirti,
>>>
>>> Here you call __vfio_pin_pages_local() > vaddr_get_pfn() > GUP regardless
>>> whether the vaddr already pinned or not. That probably means, if the caller 
>>> calls vfio_pin_pages() with a GPA for multiple times, you get memory leaks.
>>>
>>> GUP always increases the page refcnt.
>>>
>>> FWIW, I would like to have the pfn_list_lock implemented with key == iova,
>>> so you can always try to find the PFN for a given iova, and pin it only if
>>> not found.
>>>
>>
>> I didn't get how there would be a memory leak.
>>
>> Right, GUP increases refcnt, so if vfio_pin_pages() is called for
>> multiple types for same GPA, refcnt would be incremented. In
>> vfio_iommu_type1_pin_pages() pinned pages list is maintained with
>> ref_count. If pfn is already in list, ref_count is incremented and same
>> is used while unpining pages.
>>
> 
> Let's have a close look at vfio_unpin_pfn:
> 
> 	static int vfio_unpin_pfn(struct vfio_domain *domain,
> 				  struct vfio_pfn *vpfn, bool do_accounting)
> 	{
> 		__vfio_unpin_pages_for_mdev(domain, vpfn->pfn, vpfn->prot,
> 					    do_accounting);
> 
> 		if (atomic_dec_and_test(&vpfn->ref_count))
> 			vfio_remove_from_pfn_list(domain, vpfn);
> 
> 		return 1;
> 	}
> 
> Here you didn't call __vfio_unpin_pages_for_mdev -- thereby put_page -- for
> vpfn->ref_count times. If page->_refcount increased by GUP for (N) times, here
> you only set it back to (N-1).
> 

What's more, since all pinned {iova, pfn} pairs are already saved, it's
better to consult that list before calling GUP, which calls get_page()
unconditionally.

--
Thanks,
Jike
Kirti Wankhede Sept. 30, 2016, 11:44 a.m. UTC | #7
On 9/30/2016 8:40 AM, Jike Song wrote:
> On 09/30/2016 10:58 AM, Jike Song wrote:
>> On 09/29/2016 11:06 PM, Kirti Wankhede wrote:
>>>
>>>
>>> On 9/29/2016 7:47 AM, Jike Song wrote:
>>>> +Guangrong
>>>>
>>>> On 08/25/2016 11:53 AM, Kirti Wankhede wrote:
>>>
>>> ...
>>>
>>>>> +static long vfio_iommu_type1_pin_pages(void *iommu_data,
>>>>> +				       unsigned long *user_pfn,
>>>>> +				       long npage, int prot,
>>>>> +				       unsigned long *phys_pfn)
>>>>> +{
>>>>> +	struct vfio_iommu *iommu = iommu_data;
>>>>> +	struct vfio_domain *domain;
>>>>> +	int i, j, ret;
>>>>> +	long retpage;
>>>>> +	unsigned long remote_vaddr;
>>>>> +	unsigned long *pfn = phys_pfn;
>>>>> +	struct vfio_dma *dma;
>>>>> +	bool do_accounting = false;
>>>>> +
>>>>> +	if (!iommu || !user_pfn || !phys_pfn)
>>>>> +		return -EINVAL;
>>>>> +
>>>>> +	mutex_lock(&iommu->lock);
>>>>> +
>>>>> +	if (!iommu->local_domain) {
>>>>> +		ret = -EINVAL;
>>>>> +		goto pin_done;
>>>>> +	}
>>>>> +
>>>>> +	domain = iommu->local_domain;
>>>>> +
>>>>> +	/*
>>>>> +	 * If iommu capable domain exist in the container then all pages are
>>>>> +	 * already pinned and accounted. Accouting should be done if there is no
>>>>> +	 * iommu capable domain in the container.
>>>>> +	 */
>>>>> +	do_accounting = !IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu);
>>>>> +
>>>>> +	for (i = 0; i < npage; i++) {
>>>>> +		struct vfio_pfn *p;
>>>>> +		dma_addr_t iova;
>>>>> +
>>>>> +		iova = user_pfn[i] << PAGE_SHIFT;
>>>>> +
>>>>> +		dma = vfio_find_dma(iommu, iova, 0);
>>>>> +		if (!dma) {
>>>>> +			ret = -EINVAL;
>>>>> +			goto pin_unwind;
>>>>> +		}
>>>>> +
>>>>> +		remote_vaddr = dma->vaddr + iova - dma->iova;
>>>>> +
>>>>> +		retpage = __vfio_pin_pages_local(domain, remote_vaddr, prot,
>>>>> +						 &pfn[i], do_accounting);
>>>>
>>>> Hi Kirti,
>>>>
>>>> Here you call __vfio_pin_pages_local() > vaddr_get_pfn() > GUP regardless
>>>> whether the vaddr already pinned or not. That probably means, if the caller 
>>>> calls vfio_pin_pages() with a GPA for multiple times, you get memory leaks.
>>>>
>>>> GUP always increases the page refcnt.
>>>>
>>>> FWIW, I would like to have the pfn_list_lock implemented with key == iova,
>>>> so you can always try to find the PFN for a given iova, and pin it only if
>>>> not found.
>>>>
>>>
>>> I didn't get how there would be a memory leak.
>>>
>>> Right, GUP increases refcnt, so if vfio_pin_pages() is called for
>>> multiple types for same GPA, refcnt would be incremented. In
>>> vfio_iommu_type1_pin_pages() pinned pages list is maintained with
>>> ref_count. If pfn is already in list, ref_count is incremented and same
>>> is used while unpining pages.
>>>
>>
>> Let's have a close look at vfio_unpin_pfn:
>>
>> 	static int vfio_unpin_pfn(struct vfio_domain *domain,
>> 				  struct vfio_pfn *vpfn, bool do_accounting)
>> 	{
>> 		__vfio_unpin_pages_for_mdev(domain, vpfn->pfn, vpfn->prot,
>> 					    do_accounting);
>>
>> 		if (atomic_dec_and_test(&vpfn->ref_count))
>> 			vfio_remove_from_pfn_list(domain, vpfn);
>>
>> 		return 1;
>> 	}
>>
>> Here you didn't call __vfio_unpin_pages_for_mdev -- thereby put_page -- for
>> vpfn->ref_count times. If page->_refcount increased by GUP for (N) times, here
>> you only set it back to (N-1).
>>

A user of vfio_pin_pages() should also call vfio_unpin_pages(), so here we
unpin it once. If vfio_pin_pages() is called twice for the same page, we
should get vfio_unpin_pages() twice for the same page.

If users of these APIs don't follow this, then
vfio_release_domain() -> vfio_local_unpin_all() takes care of unpinning,
decrementing the ref_count, and deleting the node on (ref_count == 0) for
all remaining pfns.

> 
> What's more, since all pinned {iova, pfni} already saved, it's better to
> consult it before calling GUP, which will get_page() unconditionally.

The pfn is required to unpin a page, so we use the pfn as the key for the
rbtree. vfio_pin_pages() is called with user_pfn, i.e. the iova, which
can't be used to search the rbtree in an optimized way. The raw way would
be to walk each node of the rbtree and check its iova, which would hamper
performance if this is called in a performance-critical path.
So the optimized way here is to first pin the page, get its pfn, and check
whether it already exists in the rbtree. If it exists, increment its
ref_count; else add it to the rbtree.
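In code, the pin path as implemented is roughly (simplified from
vfio_iommu_type1_pin_pages() above):

	/* GUP first to learn the host pfn ... */
	ret = __vfio_pin_pages_local(domain, remote_vaddr, prot,
				     &pfn, do_accounting);
	/* ... then dedup in the rbtree, which is keyed by host pfn */
	p = vfio_find_pfn(domain, pfn);
	if (p)
		atomic_inc(&p->ref_count);
	else
		vfio_add_to_pfn_list(domain, remote_vaddr, iova, pfn, prot);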

Thanks,
Kirti
Jike Song Oct. 8, 2016, 7:09 a.m. UTC | #8
On 09/30/2016 07:44 PM, Kirti Wankhede wrote:
> On 9/30/2016 8:40 AM, Jike Song wrote:
>> On 09/30/2016 10:58 AM, Jike Song wrote:
>>> On 09/29/2016 11:06 PM, Kirti Wankhede wrote:
>>>>
>>>>
>>>> On 9/29/2016 7:47 AM, Jike Song wrote:
>>>>> +Guangrong
>>>>>
>>>>> On 08/25/2016 11:53 AM, Kirti Wankhede wrote:
>>>>
>>>> ...
>>>>
>>>>>> +static long vfio_iommu_type1_pin_pages(void *iommu_data,
>>>>>> +				       unsigned long *user_pfn,
>>>>>> +				       long npage, int prot,
>>>>>> +				       unsigned long *phys_pfn)
>>>>>> +{
>>>>>> +	struct vfio_iommu *iommu = iommu_data;
>>>>>> +	struct vfio_domain *domain;
>>>>>> +	int i, j, ret;
>>>>>> +	long retpage;
>>>>>> +	unsigned long remote_vaddr;
>>>>>> +	unsigned long *pfn = phys_pfn;
>>>>>> +	struct vfio_dma *dma;
>>>>>> +	bool do_accounting = false;
>>>>>> +
>>>>>> +	if (!iommu || !user_pfn || !phys_pfn)
>>>>>> +		return -EINVAL;
>>>>>> +
>>>>>> +	mutex_lock(&iommu->lock);
>>>>>> +
>>>>>> +	if (!iommu->local_domain) {
>>>>>> +		ret = -EINVAL;
>>>>>> +		goto pin_done;
>>>>>> +	}
>>>>>> +
>>>>>> +	domain = iommu->local_domain;
>>>>>> +
>>>>>> +	/*
>>>>>> +	 * If iommu capable domain exist in the container then all pages are
>>>>>> +	 * already pinned and accounted. Accouting should be done if there is no
>>>>>> +	 * iommu capable domain in the container.
>>>>>> +	 */
>>>>>> +	do_accounting = !IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu);
>>>>>> +
>>>>>> +	for (i = 0; i < npage; i++) {
>>>>>> +		struct vfio_pfn *p;
>>>>>> +		dma_addr_t iova;
>>>>>> +
>>>>>> +		iova = user_pfn[i] << PAGE_SHIFT;
>>>>>> +
>>>>>> +		dma = vfio_find_dma(iommu, iova, 0);
>>>>>> +		if (!dma) {
>>>>>> +			ret = -EINVAL;
>>>>>> +			goto pin_unwind;
>>>>>> +		}
>>>>>> +
>>>>>> +		remote_vaddr = dma->vaddr + iova - dma->iova;
>>>>>> +
>>>>>> +		retpage = __vfio_pin_pages_local(domain, remote_vaddr, prot,
>>>>>> +						 &pfn[i], do_accounting);
>>>>>
>>>>> Hi Kirti,
>>>>>
>>>>> Here you call __vfio_pin_pages_local() > vaddr_get_pfn() > GUP regardless
>>>>> whether the vaddr already pinned or not. That probably means, if the caller 
>>>>> calls vfio_pin_pages() with a GPA for multiple times, you get memory leaks.
>>>>>
>>>>> GUP always increases the page refcnt.
>>>>>
>>>>> FWIW, I would like to have the pfn_list_lock implemented with key == iova,
>>>>> so you can always try to find the PFN for a given iova, and pin it only if
>>>>> not found.
>>>>>
>>>>
>>>> I didn't get how there would be a memory leak.
>>>>
>>>> Right, GUP increases refcnt, so if vfio_pin_pages() is called for
>>>> multiple types for same GPA, refcnt would be incremented. In
>>>> vfio_iommu_type1_pin_pages() pinned pages list is maintained with
>>>> ref_count. If pfn is already in list, ref_count is incremented and same
>>>> is used while unpining pages.
>>>>
>>>
>>> Let's have a close look at vfio_unpin_pfn:
>>>
>>> 	static int vfio_unpin_pfn(struct vfio_domain *domain,
>>> 				  struct vfio_pfn *vpfn, bool do_accounting)
>>> 	{
>>> 		__vfio_unpin_pages_for_mdev(domain, vpfn->pfn, vpfn->prot,
>>> 					    do_accounting);
>>>
>>> 		if (atomic_dec_and_test(&vpfn->ref_count))
>>> 			vfio_remove_from_pfn_list(domain, vpfn);
>>>
>>> 		return 1;
>>> 	}
>>>
>>> Here you didn't call __vfio_unpin_pages_for_mdev -- thereby put_page -- for
>>> vpfn->ref_count times. If page->_refcount increased by GUP for (N) times, here
>>> you only set it back to (N-1).
>>>
> 
> User of vfio_pin_pages() should call vfio_unpin_pages() also,  so here
> we unpin it once. If vfio_pin_pages() is called twice for same page, we
> should get vfio_unpin_pages() twice for same page.
>

If this is the deliberate design, why do you need a 'ref_count'? You could
simply drop the 'ref_count' and blame the caller for pinning and unpinning
a different number of times.

> If users of these APIs don't follow this, then
> vfio_release_domain() -> vfio_local_unpin_all() takes care of unpin,
> decrement ref_count and delete node on (ref_count == 0) for all
> remaining pfn.
>

Here you did pay attention to the "caller doesn't follow this" situation.
However, dealing with the 'ref_count' in vfio-iommu is not enough: memory
is still leaked.

>>
>> What's more, since all pinned {iova, pfni} already saved, it's better to
>> consult it before calling GUP, which will get_page() unconditionally.
>
> pfn is required to unpin page, so we have pfn as key for rbtree.
> vfio_pin_pages() is called with user_pfn or iova, which can't be used to
> search in rbtree with iova in optimized way. Raw way would be to goto
> each node of rbtree and check iova which would hamper the performance in
> if this is called in performance critical path.
> So here optimized way is to first pin it, get pfn and check if already
> exist in rbtree. If it exist increment ref_count else add it to the rbtree.
> 

Of course the pfn is required to unpin a page, I 100% agree. But that
doesn't change the argument: using the iova as the key instead, you can
still store the pfn along with it.

By the way, calling GUP unconditionally hurts more than searching the
rbtree.
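A rough sketch of that alternative, with the rbtree keyed by iova and the
pfn stored in the node (vfio_find_vpfn_by_iova() is hypothetical, not in
this patch):

	p = vfio_find_vpfn_by_iova(domain, iova);
	if (p) {
		atomic_inc(&p->ref_count);	/* already pinned: skip GUP */
		*pfn = p->pfn;
	} else {
		ret = __vfio_pin_pages_local(domain, remote_vaddr, prot,
					     pfn, do_accounting);
		vfio_add_to_pfn_list(domain, remote_vaddr, iova, *pfn, prot);
	}

The unpin path would then presumably look up by iova as well, so each pin
call still pairs with exactly one unpin call.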


--
Thanks,
Jike

Patch

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 6fd6fa5469de..e3e342861e04 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1782,6 +1782,123 @@  void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
 }
 EXPORT_SYMBOL_GPL(vfio_info_cap_shift);
 
+static struct vfio_group *vfio_group_from_dev(struct device *dev)
+{
+	struct vfio_device *device;
+	struct vfio_group *group;
+	int ret;
+
+	device = vfio_device_get_from_dev(dev);
+	if (!device)
+		return ERR_PTR(-EINVAL);
+
+	group = device->group;
+	if (!atomic_inc_not_zero(&group->container_users)) {
+		ret = -EINVAL;
+		goto err_ret;
+	}
+
+	if (group->noiommu) {
+		atomic_dec(&group->container_users);
+		ret = -EPERM;
+		goto err_ret;
+	}
+
+	if (!group->container->iommu_driver ||
+	    !vfio_group_viable(group)) {
+		atomic_dec(&group->container_users);
+		ret = -EINVAL;
+		goto err_ret;
+	}
+
+	vfio_device_put(device);
+	return group;
+
+err_ret:
+	vfio_device_put(device);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Pin a set of guest PFNs and return their associated host PFNs for local
+ * domain only.
+ * @dev [in] : device
+ * @user_pfn [in]: array of user/guest PFNs
+ * @npage [in]: count of array elements
+ * @prot [in] : protection flags
+ * @phys_pfn[out] : array of host PFNs
+ */
+long vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
+		    long npage, int prot, unsigned long *phys_pfn)
+{
+	struct vfio_container *container;
+	struct vfio_group *group;
+	struct vfio_iommu_driver *driver;
+	ssize_t ret = -EINVAL;
+
+	if (!dev || !user_pfn || !phys_pfn)
+		return -EINVAL;
+
+	group = vfio_group_from_dev(dev);
+	if (IS_ERR(group))
+		return PTR_ERR(group);
+
+	container = group->container;
+	if (IS_ERR(container))
+		return PTR_ERR(container);
+
+	down_read(&container->group_lock);
+
+	driver = container->iommu_driver;
+	if (likely(driver && driver->ops->pin_pages))
+		ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
+					     npage, prot, phys_pfn);
+
+	up_read(&container->group_lock);
+	vfio_group_try_dissolve_container(group);
+
+	return ret;
+
+}
+EXPORT_SYMBOL(vfio_pin_pages);
+
+/*
+ * Unpin set of host PFNs for local domain only.
+ * @dev [in] : device
+ * @pfn [in] : array of host PFNs to be unpinned.
+ * @npage [in] :count of elements in array, that is number of pages.
+ */
+long vfio_unpin_pages(struct device *dev, unsigned long *pfn, long npage)
+{
+	struct vfio_container *container;
+	struct vfio_group *group;
+	struct vfio_iommu_driver *driver;
+	ssize_t ret = -EINVAL;
+
+	if (!dev || !pfn)
+		return -EINVAL;
+
+	group = vfio_group_from_dev(dev);
+	if (IS_ERR(group))
+		return PTR_ERR(group);
+
+	container = group->container;
+	if (IS_ERR(container))
+		return PTR_ERR(container);
+
+	down_read(&container->group_lock);
+
+	driver = container->iommu_driver;
+	if (likely(driver && driver->ops->unpin_pages))
+		ret = driver->ops->unpin_pages(container->iommu_data, pfn,
+					       npage);
+
+	up_read(&container->group_lock);
+	vfio_group_try_dissolve_container(group);
+	return ret;
+}
+EXPORT_SYMBOL(vfio_unpin_pages);
+
 /**
  * Module/class support
  */
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 2ba19424e4a1..d52d75fd0f04 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -55,18 +55,26 @@  MODULE_PARM_DESC(disable_hugepages,
 
 struct vfio_iommu {
 	struct list_head	domain_list;
+	struct vfio_domain	*local_domain;
 	struct mutex		lock;
 	struct rb_root		dma_list;
 	bool			v2;
 	bool			nesting;
 };
 
+struct local_addr_space {
+	struct task_struct	*task;
+	struct rb_root		pfn_list;	/* pinned Host pfn list */
+	struct mutex		pfn_list_lock;	/* mutex for pfn_list */
+};
+
 struct vfio_domain {
 	struct iommu_domain	*domain;
 	struct list_head	next;
 	struct list_head	group_list;
 	int			prot;		/* IOMMU_CACHE */
 	bool			fgsp;		/* Fine-grained super pages */
+	struct local_addr_space	*local_addr_space;
 };
 
 struct vfio_dma {
@@ -83,6 +91,22 @@  struct vfio_group {
 };
 
 /*
+ * Guest RAM pinning working set or DMA target
+ */
+struct vfio_pfn {
+	struct rb_node		node;
+	unsigned long		vaddr;		/* virtual addr */
+	dma_addr_t		iova;		/* IOVA */
+	unsigned long		pfn;		/* Host pfn */
+	size_t			prot;
+	atomic_t		ref_count;
+};
+
+
+#define IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu)	\
+			 (list_empty(&iommu->domain_list) ? false : true)
+
+/*
  * This code handles mapping and unmapping of user data buffers
  * into DMA'ble space using the IOMMU
  */
@@ -130,6 +154,84 @@  static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
 	rb_erase(&old->node, &iommu->dma_list);
 }
 
+/*
+ * Helper Functions for host pfn list
+ */
+
+static struct vfio_pfn *vfio_find_pfn(struct vfio_domain *domain,
+				      unsigned long pfn)
+{
+	struct rb_node *node;
+	struct vfio_pfn *vpfn, *ret = NULL;
+
+	node = domain->local_addr_space->pfn_list.rb_node;
+
+	while (node) {
+		vpfn = rb_entry(node, struct vfio_pfn, node);
+
+		if (pfn < vpfn->pfn)
+			node = node->rb_left;
+		else if (pfn > vpfn->pfn)
+			node = node->rb_right;
+		else {
+			ret = vpfn;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static void vfio_link_pfn(struct vfio_domain *domain, struct vfio_pfn *new)
+{
+	struct rb_node **link, *parent = NULL;
+	struct vfio_pfn *vpfn;
+
+	link = &domain->local_addr_space->pfn_list.rb_node;
+	while (*link) {
+		parent = *link;
+		vpfn = rb_entry(parent, struct vfio_pfn, node);
+
+		if (new->pfn < vpfn->pfn)
+			link = &(*link)->rb_left;
+		else
+			link = &(*link)->rb_right;
+	}
+
+	rb_link_node(&new->node, parent, link);
+	rb_insert_color(&new->node, &domain->local_addr_space->pfn_list);
+}
+
+static void vfio_unlink_pfn(struct vfio_domain *domain, struct vfio_pfn *old)
+{
+	rb_erase(&old->node, &domain->local_addr_space->pfn_list);
+}
+
+static int vfio_add_to_pfn_list(struct vfio_domain *domain, unsigned long vaddr,
+				dma_addr_t iova, unsigned long pfn, size_t prot)
+{
+	struct vfio_pfn *vpfn;
+
+	vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
+	if (!vpfn)
+		return -ENOMEM;
+
+	vpfn->vaddr = vaddr;
+	vpfn->iova = iova;
+	vpfn->pfn = pfn;
+	vpfn->prot = prot;
+	atomic_set(&vpfn->ref_count, 1);
+	vfio_link_pfn(domain, vpfn);
+	return 0;
+}
+
+static void vfio_remove_from_pfn_list(struct vfio_domain *domain,
+				      struct vfio_pfn *vpfn)
+{
+	vfio_unlink_pfn(domain, vpfn);
+	kfree(vpfn);
+}
+
 struct vwork {
 	struct mm_struct	*mm;
 	long			npage;
@@ -150,17 +252,17 @@  static void vfio_lock_acct_bg(struct work_struct *work)
 	kfree(vwork);
 }
 
-static void vfio_lock_acct(long npage)
+static void vfio_lock_acct(struct task_struct *task, long npage)
 {
 	struct vwork *vwork;
 	struct mm_struct *mm;
 
-	if (!current->mm || !npage)
+	if (!task->mm || !npage)
 		return; /* process exited or nothing to do */
 
-	if (down_write_trylock(&current->mm->mmap_sem)) {
-		current->mm->locked_vm += npage;
-		up_write(&current->mm->mmap_sem);
+	if (down_write_trylock(&task->mm->mmap_sem)) {
+		task->mm->locked_vm += npage;
+		up_write(&task->mm->mmap_sem);
 		return;
 	}
 
@@ -172,7 +274,7 @@  static void vfio_lock_acct(long npage)
 	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
 	if (!vwork)
 		return;
-	mm = get_task_mm(current);
+	mm = get_task_mm(task);
 	if (!mm) {
 		kfree(vwork);
 		return;
@@ -228,20 +330,31 @@  static int put_pfn(unsigned long pfn, int prot)
 	return 0;
 }
 
-static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
+static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
+			 int prot, unsigned long *pfn)
 {
 	struct page *page[1];
 	struct vm_area_struct *vma;
+	struct mm_struct *local_mm = (mm ? mm : current->mm);
 	int ret = -EFAULT;
 
-	if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
+	if (mm) {
+		down_read(&local_mm->mmap_sem);
+		ret = get_user_pages_remote(NULL, local_mm, vaddr, 1,
+					!!(prot & IOMMU_WRITE), 0, page, NULL);
+		up_read(&local_mm->mmap_sem);
+	} else
+		ret = get_user_pages_fast(vaddr, 1,
+					  !!(prot & IOMMU_WRITE), page);
+
+	if (ret == 1) {
 		*pfn = page_to_pfn(page[0]);
 		return 0;
 	}
 
-	down_read(&current->mm->mmap_sem);
+	down_read(&local_mm->mmap_sem);
 
-	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
+	vma = find_vma_intersection(local_mm, vaddr, vaddr + 1);
 
 	if (vma && vma->vm_flags & VM_PFNMAP) {
 		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
@@ -249,7 +362,7 @@  static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
 			ret = 0;
 	}
 
-	up_read(&current->mm->mmap_sem);
+	up_read(&local_mm->mmap_sem);
 
 	return ret;
 }
@@ -259,8 +372,8 @@  static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
  * the iommu can only map chunks of consecutive pfns anyway, so get the
  * first page and all consecutive pages with the same locking.
  */
-static long vfio_pin_pages(unsigned long vaddr, long npage,
-			   int prot, unsigned long *pfn_base)
+static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
+				    int prot, unsigned long *pfn_base)
 {
 	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 	bool lock_cap = capable(CAP_IPC_LOCK);
@@ -270,7 +383,7 @@  static long vfio_pin_pages(unsigned long vaddr, long npage,
 	if (!current->mm)
 		return -ENODEV;
 
-	ret = vaddr_get_pfn(vaddr, prot, pfn_base);
+	ret = vaddr_get_pfn(NULL, vaddr, prot, pfn_base);
 	if (ret)
 		return ret;
 
@@ -285,7 +398,7 @@  static long vfio_pin_pages(unsigned long vaddr, long npage,
 
 	if (unlikely(disable_hugepages)) {
 		if (!rsvd)
-			vfio_lock_acct(1);
+			vfio_lock_acct(current, 1);
 		return 1;
 	}
 
@@ -293,7 +406,7 @@  static long vfio_pin_pages(unsigned long vaddr, long npage,
 	for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
 		unsigned long pfn = 0;
 
-		ret = vaddr_get_pfn(vaddr, prot, &pfn);
+		ret = vaddr_get_pfn(NULL, vaddr, prot, &pfn);
 		if (ret)
 			break;
 
@@ -313,13 +426,13 @@  static long vfio_pin_pages(unsigned long vaddr, long npage,
 	}
 
 	if (!rsvd)
-		vfio_lock_acct(i);
+		vfio_lock_acct(current, i);
 
 	return i;
 }
 
-static long vfio_unpin_pages(unsigned long pfn, long npage,
-			     int prot, bool do_accounting)
+static long __vfio_unpin_pages_remote(unsigned long pfn, long npage, int prot,
+				      bool do_accounting)
 {
 	unsigned long unlocked = 0;
 	long i;
@@ -328,7 +441,188 @@  static long vfio_unpin_pages(unsigned long pfn, long npage,
 		unlocked += put_pfn(pfn++, prot);
 
 	if (do_accounting)
-		vfio_lock_acct(-unlocked);
+		vfio_lock_acct(current, -unlocked);
+	return unlocked;
+}
+
+static long __vfio_pin_pages_local(struct vfio_domain *domain,
+				   unsigned long vaddr, int prot,
+				   unsigned long *pfn_base,
+				   bool do_accounting)
+{
+	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	bool lock_cap = capable(CAP_IPC_LOCK);
+	long ret;
+	bool rsvd;
+	struct task_struct *task = domain->local_addr_space->task;
+
+	if (!task->mm)
+		return -ENODEV;
+
+	ret = vaddr_get_pfn(task->mm, vaddr, prot, pfn_base);
+	if (ret)
+		return ret;
+
+	rsvd = is_invalid_reserved_pfn(*pfn_base);
+
+	if (!rsvd && !lock_cap && task->mm->locked_vm + 1 > limit) {
+		put_pfn(*pfn_base, prot);
+		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
+			limit << PAGE_SHIFT);
+		return -ENOMEM;
+	}
+
+	if (!rsvd && do_accounting)
+		vfio_lock_acct(task, 1);
+
+	return 1;
+}
+
+static void __vfio_unpin_pages_local(struct vfio_domain *domain,
+				     unsigned long pfn, int prot,
+				     bool do_accounting)
+{
+	put_pfn(pfn, prot);
+
+	if (do_accounting)
+		vfio_lock_acct(domain->local_addr_space->task, -1);
+}
+
+static int vfio_unpin_pfn(struct vfio_domain *domain,
+			  struct vfio_pfn *vpfn, bool do_accounting)
+{
+	__vfio_unpin_pages_local(domain, vpfn->pfn, vpfn->prot,
+				 do_accounting);
+
+	if (atomic_dec_and_test(&vpfn->ref_count))
+		vfio_remove_from_pfn_list(domain, vpfn);
+
+	return 1;
+}
+
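+/*
+ * Translate and pin a batch of guest IOVAs (user_pfn[], in pfn units) to
+ * host pfns (phys_pfn[]) on behalf of a mediated device.  Returns the
+ * number of pages pinned, or a negative errno; on failure, pages pinned
+ * so far are unwound.
+ */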
+static long vfio_iommu_type1_pin_pages(void *iommu_data,
+				       unsigned long *user_pfn,
+				       long npage, int prot,
+				       unsigned long *phys_pfn)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	struct vfio_domain *domain;
+	int i, j, ret;
+	long retpage;
+	unsigned long remote_vaddr;
+	unsigned long *pfn = phys_pfn;
+	struct vfio_dma *dma;
+	bool do_accounting = false;
+
+	if (!iommu || !user_pfn || !phys_pfn)
+		return -EINVAL;
+
+	mutex_lock(&iommu->lock);
+
+	if (!iommu->local_domain) {
+		ret = -EINVAL;
+		goto pin_done;
+	}
+
+	domain = iommu->local_domain;
+
+	/*
+	 * If an iommu capable domain exists in the container then all pages
+	 * are already pinned and accounted.  Accounting is done here only if
+	 * there is no iommu capable domain in the container.
+	 */
+	do_accounting = !IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu);
+
+	for (i = 0; i < npage; i++) {
+		struct vfio_pfn *p;
+		dma_addr_t iova;
+
+		iova = user_pfn[i] << PAGE_SHIFT;
+
+		dma = vfio_find_dma(iommu, iova, 0);
+		if (!dma) {
+			ret = -EINVAL;
+			goto pin_unwind;
+		}
+
+		remote_vaddr = dma->vaddr + iova - dma->iova;
+
+		retpage = __vfio_pin_pages_local(domain, remote_vaddr, prot,
+						 &pfn[i], do_accounting);
+		if (retpage <= 0) {
+			WARN_ON(!retpage);
+			ret = (int)retpage;
+			goto pin_unwind;
+		}
+
+		mutex_lock(&domain->local_addr_space->pfn_list_lock);
+
+		/* search if pfn already exists */
+		p = vfio_find_pfn(domain, pfn[i]);
+		if (p) {
+			atomic_inc(&p->ref_count);
+			mutex_unlock(&domain->local_addr_space->pfn_list_lock);
+			continue;
+		}
+
+		ret = vfio_add_to_pfn_list(domain, remote_vaddr, iova,
+					   pfn[i], prot);
+		mutex_unlock(&domain->local_addr_space->pfn_list_lock);
+
+		if (ret) {
+			__vfio_unpin_pages_local(domain, pfn[i], prot,
+						 do_accounting);
+			goto pin_unwind;
+		}
+	}
+
+	ret = i;
+	goto pin_done;
+
+pin_unwind:
+	pfn[i] = 0;
+	mutex_lock(&domain->local_addr_space->pfn_list_lock);
+	for (j = 0; j < i; j++) {
+		struct vfio_pfn *p;
+
+		p = vfio_find_pfn(domain, pfn[j]);
+		if (p)
+			vfio_unpin_pfn(domain, p, do_accounting);
+
+		pfn[j] = 0;
+	}
+	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
+
+pin_done:
+	mutex_unlock(&iommu->lock);
+	return ret;
+}
+
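+/*
+ * Unpin host pfns previously pinned through pin_pages.  Returns the
+ * number of pfns that were found in the pinned-page list and released.
+ */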
+static long vfio_iommu_type1_unpin_pages(void *iommu_data, unsigned long *pfn,
+					 long npage)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	struct vfio_domain *domain = NULL;
+	long unlocked = 0;
+	int i;
+
+	if (!iommu || !pfn)
+		return -EINVAL;
+
+	domain = iommu->local_domain;
+
+	for (i = 0; i < npage; i++) {
+		struct vfio_pfn *p;
+
+		mutex_lock(&domain->local_addr_space->pfn_list_lock);
+
+		/* verify that pfn exists in pfn_list */
+		p = vfio_find_pfn(domain, pfn[i]);
+		if (p)
+			unlocked += vfio_unpin_pfn(domain, p, true);
+
+		mutex_unlock(&domain->local_addr_space->pfn_list_lock);
+	}
 
 	return unlocked;
 }
@@ -341,6 +635,9 @@  static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 
 	if (!dma->size)
 		return;
+
+	if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu))
+		return;
 	/*
 	 * We use the IOMMU to track the physical addresses, otherwise we'd
 	 * need a much more complicated tracking system.  Unfortunately that
@@ -382,15 +679,15 @@  static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 		if (WARN_ON(!unmapped))
 			break;
 
-		unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
-					     unmapped >> PAGE_SHIFT,
-					     dma->prot, false);
+		unlocked += __vfio_unpin_pages_remote(phys >> PAGE_SHIFT,
+						      unmapped >> PAGE_SHIFT,
+						      dma->prot, false);
 		iova += unmapped;
 
 		cond_resched();
 	}
 
-	vfio_lock_acct(-unlocked);
+	vfio_lock_acct(current, -unlocked);
 }
 
 static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
@@ -611,10 +908,16 @@  static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	/* Insert zero-sized and grow as we map chunks of it */
 	vfio_link_dma(iommu, dma);
 
+	/* Don't pin and map if container doesn't contain IOMMU capable domain */
+	if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu)) {
+		dma->size = size;
+		goto map_done;
+	}
+
 	while (size) {
 		/* Pin a contiguous chunk of memory */
-		npage = vfio_pin_pages(vaddr + dma->size,
-				       size >> PAGE_SHIFT, prot, &pfn);
+		npage = __vfio_pin_pages_remote(vaddr + dma->size,
+						size >> PAGE_SHIFT, prot, &pfn);
 		if (npage <= 0) {
 			WARN_ON(!npage);
 			ret = (int)npage;
@@ -624,7 +927,7 @@  static int vfio_dma_do_map(struct vfio_iommu *iommu,
 		/* Map it! */
 		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
 		if (ret) {
-			vfio_unpin_pages(pfn, npage, prot, true);
+			__vfio_unpin_pages_remote(pfn, npage, prot, true);
 			break;
 		}
 
@@ -635,6 +938,7 @@  static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	if (ret)
 		vfio_remove_dma(iommu, dma);
 
+map_done:
 	mutex_unlock(&iommu->lock);
 	return ret;
 }
@@ -734,11 +1038,24 @@  static void vfio_test_domain_fgsp(struct vfio_domain *domain)
 	__free_pages(pages, order);
 }
 
+static struct vfio_group *find_iommu_group(struct vfio_domain *domain,
+				   struct iommu_group *iommu_group)
+{
+	struct vfio_group *g;
+
+	list_for_each_entry(g, &domain->group_list, next) {
+		if (g->iommu_group == iommu_group)
+			return g;
+	}
+
+	return NULL;
+}
+
 static int vfio_iommu_type1_attach_group(void *iommu_data,
 					 struct iommu_group *iommu_group)
 {
 	struct vfio_iommu *iommu = iommu_data;
-	struct vfio_group *group, *g;
+	struct vfio_group *group;
 	struct vfio_domain *domain, *d;
 	struct bus_type *bus = NULL;
 	int ret;
@@ -746,10 +1063,14 @@  static int vfio_iommu_type1_attach_group(void *iommu_data,
 	mutex_lock(&iommu->lock);
 
 	list_for_each_entry(d, &iommu->domain_list, next) {
-		list_for_each_entry(g, &d->group_list, next) {
-			if (g->iommu_group != iommu_group)
-				continue;
+		if (find_iommu_group(d, iommu_group)) {
+			mutex_unlock(&iommu->lock);
+			return -EINVAL;
+		}
+	}
 
+	if (iommu->local_domain) {
+		if (find_iommu_group(iommu->local_domain, iommu_group)) {
 			mutex_unlock(&iommu->lock);
 			return -EINVAL;
 		}
@@ -769,6 +1090,33 @@  static int vfio_iommu_type1_attach_group(void *iommu_data,
 	if (ret)
 		goto out_free;
 
+	if (IS_ENABLED(CONFIG_VFIO_MDEV) && !iommu_present(bus) &&
+	    (bus == &mdev_bus_type)) {
+		if (iommu->local_domain) {
+			list_add(&group->next,
+				 &iommu->local_domain->group_list);
+			kfree(domain);
+			mutex_unlock(&iommu->lock);
+			return 0;
+		}
+
+		domain->local_addr_space = kzalloc(sizeof(*domain->local_addr_space),
+						   GFP_KERNEL);
+		if (!domain->local_addr_space) {
+			ret = -ENOMEM;
+			goto out_free;
+		}
+
+		domain->local_addr_space->task = current;
+		INIT_LIST_HEAD(&domain->group_list);
+		list_add(&group->next, &domain->group_list);
+		domain->local_addr_space->pfn_list = RB_ROOT;
+		mutex_init(&domain->local_addr_space->pfn_list_lock);
+		iommu->local_domain = domain;
+		mutex_unlock(&iommu->lock);
+		return 0;
+	}
+
 	domain->domain = iommu_domain_alloc(bus);
 	if (!domain->domain) {
 		ret = -EIO;
@@ -859,6 +1207,18 @@  static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
 		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
 }
 
+static void vfio_local_unpin_all(struct vfio_domain *domain)
+{
+	struct rb_node *node;
+
+	mutex_lock(&domain->local_addr_space->pfn_list_lock);
+	while ((node = rb_first(&domain->local_addr_space->pfn_list))) {
+		vfio_unpin_pfn(domain,
+				rb_entry(node, struct vfio_pfn, node), false);
+	}
+	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
+}
+
 static void vfio_iommu_type1_detach_group(void *iommu_data,
 					  struct iommu_group *iommu_group)
 {
@@ -868,31 +1228,52 @@  static void vfio_iommu_type1_detach_group(void *iommu_data,
 
 	mutex_lock(&iommu->lock);
 
-	list_for_each_entry(domain, &iommu->domain_list, next) {
-		list_for_each_entry(group, &domain->group_list, next) {
-			if (group->iommu_group != iommu_group)
-				continue;
+	if (iommu->local_domain) {
+		domain = iommu->local_domain;
+		group = find_iommu_group(domain, iommu_group);
+		if (group) {
+			list_del(&group->next);
+			kfree(group);
 
+			if (list_empty(&domain->group_list)) {
+				vfio_local_unpin_all(domain);
+				if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu))
+					vfio_iommu_unmap_unpin_all(iommu);
+				kfree(domain);
+				iommu->local_domain = NULL;
+			}
+			goto detach_group_done;
+		}
+	}
+
+	if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu))
+		goto detach_group_done;
+
+	list_for_each_entry(domain, &iommu->domain_list, next) {
+		group = find_iommu_group(domain, iommu_group);
+		if (group) {
 			iommu_detach_group(domain->domain, iommu_group);
 			list_del(&group->next);
 			kfree(group);
 			/*
 			 * Group ownership provides privilege, if the group
 			 * list is empty, the domain goes away.  If it's the
-			 * last domain, then all the mappings go away too.
+			 * last iommu capable domain and the local domain
+			 * doesn't exist, then all the mappings go away too.
 			 */
 			if (list_empty(&domain->group_list)) {
-				if (list_is_singular(&iommu->domain_list))
+				if (list_is_singular(&iommu->domain_list) &&
+				   (!iommu->local_domain))
 					vfio_iommu_unmap_unpin_all(iommu);
 				iommu_domain_free(domain->domain);
 				list_del(&domain->next);
 				kfree(domain);
 			}
-			goto done;
+			break;
 		}
 	}
 
-done:
+detach_group_done:
 	mutex_unlock(&iommu->lock);
 }
 
@@ -924,27 +1305,48 @@  static void *vfio_iommu_type1_open(unsigned long arg)
 	return iommu;
 }
 
+static void vfio_release_domain(struct vfio_domain *domain)
+{
+	struct vfio_group *group, *group_tmp;
+
+	list_for_each_entry_safe(group, group_tmp,
+				 &domain->group_list, next) {
+		if (!domain->local_addr_space)
+			iommu_detach_group(domain->domain, group->iommu_group);
+		list_del(&group->next);
+		kfree(group);
+	}
+
+	if (domain->local_addr_space)
+		vfio_local_unpin_all(domain);
+	else
+		iommu_domain_free(domain->domain);
+}
+
 static void vfio_iommu_type1_release(void *iommu_data)
 {
 	struct vfio_iommu *iommu = iommu_data;
 	struct vfio_domain *domain, *domain_tmp;
-	struct vfio_group *group, *group_tmp;
+
+	if (iommu->local_domain) {
+		vfio_release_domain(iommu->local_domain);
+		kfree(iommu->local_domain);
+		iommu->local_domain = NULL;
+	}
 
 	vfio_iommu_unmap_unpin_all(iommu);
 
+	if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu))
+		goto release_exit;
+
 	list_for_each_entry_safe(domain, domain_tmp,
 				 &iommu->domain_list, next) {
-		list_for_each_entry_safe(group, group_tmp,
-					 &domain->group_list, next) {
-			iommu_detach_group(domain->domain, group->iommu_group);
-			list_del(&group->next);
-			kfree(group);
-		}
-		iommu_domain_free(domain->domain);
+		vfio_release_domain(domain);
 		list_del(&domain->next);
 		kfree(domain);
 	}
 
+release_exit:
 	kfree(iommu);
 }
 
@@ -1048,6 +1450,8 @@  static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
 	.ioctl		= vfio_iommu_type1_ioctl,
 	.attach_group	= vfio_iommu_type1_attach_group,
 	.detach_group	= vfio_iommu_type1_detach_group,
+	.pin_pages	= vfio_iommu_type1_pin_pages,
+	.unpin_pages	= vfio_iommu_type1_unpin_pages,
 };
 
 static int __init vfio_iommu_type1_init(void)
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 0ecae0b1cd34..0bd25ba6223d 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -17,6 +17,7 @@ 
 #include <linux/workqueue.h>
 #include <linux/poll.h>
 #include <uapi/linux/vfio.h>
+#include <linux/mdev.h>
 
 /**
  * struct vfio_device_ops - VFIO bus driver device callbacks
@@ -75,7 +76,11 @@  struct vfio_iommu_driver_ops {
 					struct iommu_group *group);
 	void		(*detach_group)(void *iommu_data,
 					struct iommu_group *group);
-
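+	/*
+	 * Optional: provided by backends that support mediated devices,
+	 * to pin/unpin pages on behalf of an mdev vendor driver.
+	 */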
+	long		(*pin_pages)(void *iommu_data, unsigned long *user_pfn,
+				     long npage, int prot,
+				     unsigned long *phys_pfn);
+	long		(*unpin_pages)(void *iommu_data, unsigned long *pfn,
+				       long npage);
 };
 
 extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
@@ -127,6 +132,12 @@  static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
 }
 #endif /* CONFIG_EEH */
 
+extern long vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
+			   long npage, int prot, unsigned long *phys_pfn);
+
+extern long vfio_unpin_pages(struct device *dev, unsigned long *pfn,
+			     long npage);
+
 /*
  * IRQfd - generic
  */
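
For reviewers' reference only (not part of the patch): a minimal sketch of
how an mdev vendor driver might use the two new exported calls to translate
a guest IOVA into a host pfn before programming device page tables.  The
helper names, the single-page granularity, and 'mdev_dev' (the mediated
device's struct device) are hypothetical; error handling is abbreviated.

#include <linux/iommu.h>
#include <linux/vfio.h>

/*
 * Hypothetical vendor-driver helper: pin one guest page so the device
 * can DMA to it.  'iova' is the guest IOVA the device was programmed
 * with; on success *hpfn holds the backing host pfn.
 */
static int example_pin_one(struct device *mdev_dev, dma_addr_t iova,
			   unsigned long *hpfn)
{
	unsigned long user_pfn = iova >> PAGE_SHIFT;
	long ret;

	/* Pin a single page for read/write DMA. */
	ret = vfio_pin_pages(mdev_dev, &user_pfn, 1,
			     IOMMU_READ | IOMMU_WRITE, hpfn);

	return (ret == 1) ? 0 : (ret < 0 ? (int)ret : -EFAULT);
}

/* Balance example_pin_one(); the type1 backend adjusts accounting. */
static void example_unpin_one(struct device *mdev_dev, unsigned long hpfn)
{
	vfio_unpin_pages(mdev_dev, &hpfn, 1);
}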