diff mbox

[v9,04/12] vfio iommu: Add support for mediated devices

Message ID 1476739332-4911-5-git-send-email-kwankhede@nvidia.com
State New
Headers show

Commit Message

Kirti Wankhede Oct. 17, 2016, 9:22 p.m. UTC
VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
Mediated device only uses IOMMU APIs, the underlying hardware can be
managed by an IOMMU domain.

Aim of this change is:
- To use most of the code of TYPE1 IOMMU driver for mediated devices
- To support direct assigned device and mediated device in single module

Added two new callback functions to struct vfio_iommu_driver_ops. Backend
IOMMU module that supports pinning and unpinning pages for mdev devices
should provide these functions.
Added APIs for pinning and unpinning pages to VFIO module. These call back
into the backend iommu module to actually pin and unpin pages.

This change adds pin and unpin support for mediated device to TYPE1 IOMMU
backend module. More details:
- When iommu_group of mediated devices is attached, task structure is
  cached which is used later to pin pages and page accounting.
- It keeps track of pinned pages for mediated domain. This data is used to
  verify unpinning request and to unpin remaining pages while detaching, if
  there are any.
- Used existing mechanism for page accounting. If an iommu capable domain
  exists in the container then all pages are already pinned and accounted.
  Accounting for an mdev device is only done if there is no iommu capable
  domain in the container.
- Page accounting is updated on hot plug and unplug of mdev devices and pass
  through devices.

Tested by assigning below combinations of devices to a single VM:
- GPU pass through only
- vGPU device only
- One GPU pass through and one vGPU device
- Linux VM hot plug and unplug vGPU device while GPU pass through device
  exist
- Linux VM hot plug and unplug GPU pass through device while vGPU device
  exist

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Signed-off-by: Neo Jia <cjia@nvidia.com>
Change-Id: I295d6f0f2e0579b8d9882bfd8fd5a4194b97bd9a
---
 drivers/vfio/vfio.c             |  98 ++++++
 drivers/vfio/vfio_iommu_type1.c | 692 ++++++++++++++++++++++++++++++++++------
 include/linux/vfio.h            |  13 +-
 3 files changed, 707 insertions(+), 96 deletions(-)

Comments

Alex Williamson Oct. 19, 2016, 9:02 p.m. UTC | #1
On Tue, 18 Oct 2016 02:52:04 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
> Mediated device only uses IOMMU APIs, the underlying hardware can be
> managed by an IOMMU domain.
> 
> Aim of this change is:
> - To use most of the code of TYPE1 IOMMU driver for mediated devices
> - To support direct assigned device and mediated device in single module
> 
> Added two new callback functions to struct vfio_iommu_driver_ops. Backend
> IOMMU module that supports pining and unpinning pages for mdev devices
> should provide these functions.
> Added APIs for pining and unpining pages to VFIO module. These calls back
> into backend iommu module to actually pin and unpin pages.
> 
> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
> backend module. More details:
> - When iommu_group of mediated devices is attached, task structure is
>   cached which is used later to pin pages and page accounting.
> - It keeps track of pinned pages for mediated domain. This data is used to
>   verify unpinning request and to unpin remaining pages while detaching, if
>   there are any.
> - Used existing mechanism for page accounting. If iommu capable domain
>   exist in the container then all pages are already pinned and accounted.
>   Accouting for mdev device is only done if there is no iommu capable
>   domain in the container.
> - Page accouting is updated on hot plug and unplug mdev device and pass
>   through device.
> 
> Tested by assigning below combinations of devices to a single VM:
> - GPU pass through only
> - vGPU device only
> - One GPU pass through and one vGPU device
> - Linux VM hot plug and unplug vGPU device while GPU pass through device
>   exist
> - Linux VM hot plug and unplug GPU pass through device while vGPU device
>   exist

Were you able to do these with the locked memory limit of the user set
to the minimum required for existing GPU assignment?

> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Signed-off-by: Neo Jia <cjia@nvidia.com>
> Change-Id: I295d6f0f2e0579b8d9882bfd8fd5a4194b97bd9a
> ---
>  drivers/vfio/vfio.c             |  98 ++++++
>  drivers/vfio/vfio_iommu_type1.c | 692 ++++++++++++++++++++++++++++++++++------
>  include/linux/vfio.h            |  13 +-
>  3 files changed, 707 insertions(+), 96 deletions(-)
> 
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index 2e83bdf007fe..a5a210005b65 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -1799,6 +1799,104 @@ void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
>  }
>  EXPORT_SYMBOL_GPL(vfio_info_cap_shift);
>  
> +
> +/*
> + * Pin a set of guest PFNs and return their associated host PFNs for local
> + * domain only.
> + * @dev [in] : device
> + * @user_pfn [in]: array of user/guest PFNs
> + * @npage [in]: count of array elements
> + * @prot [in] : protection flags
> + * @phys_pfn[out] : array of host PFNs
> + */
> +long vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
> +		    long npage, int prot, unsigned long *phys_pfn)
> +{
> +	struct vfio_container *container;
> +	struct vfio_group *group;
> +	struct vfio_iommu_driver *driver;
> +	ssize_t ret = -EINVAL;

Unused initialization.

> +
> +	if (!dev || !user_pfn || !phys_pfn)
> +		return -EINVAL;
> +
> +	group = vfio_group_get_from_dev(dev);
> +	if (IS_ERR(group))
> +		return PTR_ERR(group);
> +
> +	ret = vfio_group_add_container_user(group);
> +	if (ret)
> +		goto err_pin_pages;
> +
> +	container = group->container;
> +	if (IS_ERR(container)) {

I don't see that we ever use an ERR_PTR to set group->container, it
should either be NULL or valid and the fact that we added ourselves to
container_users should mean that it's valid.  The paranoia test here
would be if container is NULL, but IS_ERR() doesn't check NULL.  If we
need that paranoia test, maybe we should just:

if (WARN_ON(!container)) {

I'm not fully convinced it's needed though.

> +		ret = PTR_ERR(container);
> +		goto err_pin_pages;
> +	}
> +
> +	down_read(&container->group_lock);
> +
> +	driver = container->iommu_driver;
> +	if (likely(driver && driver->ops->pin_pages))
> +		ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
> +					     npage, prot, phys_pfn);

The caller is going to need to provide some means for us to callback to
invalidate pinned pages.

ret has already been used, so it's zero at this point.  I expect the
original intention was to let the initialization above fall through
here so that the caller gets an errno if the driver doesn't support
pin_pages.  Returning zero without actually doing anything seems like
an unexpected return value.

> +
> +	up_read(&container->group_lock);
> +	vfio_group_try_dissolve_container(group);
> +
> +err_pin_pages:
> +	vfio_group_put(group);
> +	return ret;
> +
> +}
> +EXPORT_SYMBOL(vfio_pin_pages);
> +
> +/*
> + * Unpin set of host PFNs for local domain only.
> + * @dev [in] : device
> + * @pfn [in] : array of host PFNs to be unpinned.
> + * @npage [in] :count of elements in array, that is number of pages.
> + */
> +long vfio_unpin_pages(struct device *dev, unsigned long *pfn, long npage)
> +{
> +	struct vfio_container *container;
> +	struct vfio_group *group;
> +	struct vfio_iommu_driver *driver;
> +	ssize_t ret = -EINVAL;

Same unused initialization.

> +
> +	if (!dev || !pfn)
> +		return -EINVAL;
> +
> +	group = vfio_group_get_from_dev(dev);
> +	if (IS_ERR(group))
> +		return PTR_ERR(group);
> +
> +	ret = vfio_group_add_container_user(group);
> +	if (ret)
> +		goto err_unpin_pages;
> +
> +	container = group->container;
> +	if (IS_ERR(container)) {

Same container note as above.

> +		ret = PTR_ERR(container);
> +		goto err_unpin_pages;
> +	}
> +
> +	down_read(&container->group_lock);
> +
> +	driver = container->iommu_driver;
> +	if (likely(driver && driver->ops->unpin_pages))
> +		ret = driver->ops->unpin_pages(container->iommu_data, pfn,
> +					       npage);

Same fall through, zero return value as above.

> +
> +	up_read(&container->group_lock);
> +	vfio_group_try_dissolve_container(group);
> +
> +err_unpin_pages:
> +	vfio_group_put(group);
> +	return ret;
> +}
> +EXPORT_SYMBOL(vfio_unpin_pages);
> +
>  /**
>   * Module/class support
>   */
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 2ba19424e4a1..5d67058a611d 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -55,16 +55,24 @@ MODULE_PARM_DESC(disable_hugepages,
>  
>  struct vfio_iommu {
>  	struct list_head	domain_list;
> +	struct vfio_domain	*local_domain;
>  	struct mutex		lock;
>  	struct rb_root		dma_list;
>  	bool			v2;
>  	bool			nesting;
>  };
>  
> +struct local_addr_space {
> +	struct task_struct	*task;
> +	struct rb_root		pfn_list;	/* pinned Host pfn list */
> +	struct mutex		pfn_list_lock;	/* mutex for pfn_list */
> +};
> +
>  struct vfio_domain {
>  	struct iommu_domain	*domain;
>  	struct list_head	next;
>  	struct list_head	group_list;
> +	struct local_addr_space	*local_addr_space;
>  	int			prot;		/* IOMMU_CACHE */
>  	bool			fgsp;		/* Fine-grained super pages */
>  };
> @@ -75,6 +83,7 @@ struct vfio_dma {
>  	unsigned long		vaddr;		/* Process virtual addr */
>  	size_t			size;		/* Map size (bytes) */
>  	int			prot;		/* IOMMU_READ/WRITE */
> +	bool			iommu_mapped;
>  };
>  
>  struct vfio_group {
> @@ -83,6 +92,21 @@ struct vfio_group {
>  };
>  
>  /*
> + * Guest RAM pinning working set or DMA target
> + */
> +struct vfio_pfn {
> +	struct rb_node		node;
> +	unsigned long		vaddr;		/* virtual addr */
> +	dma_addr_t		iova;		/* IOVA */
> +	unsigned long		pfn;		/* Host pfn */
> +	int			prot;
> +	atomic_t		ref_count;
> +};

Somehow we're going to need to fit an invalidation callback here too.
How would we handle a case where there are multiple mdev devices, from
different vendor drivers, that all have the same pfn pinned?  I'm
already concerned about the per pfn overhead we're introducing here so
clearly we cannot store an invalidation callback per pinned page, per
vendor driver.  Perhaps invalidations should be done using a notifier
chain per vfio_iommu, the vendor drivers are required to register on
that chain (fail pinning with empty notifier list) user unmapping
will be broadcast to the notifier chain, the vendor driver will be
responsible for deciding if each unmap is relevant to them (potentially
it's for a pinning from another driver).

I expect we also need to enforce that vendors perform a synchronous
unmap such that after returning from the notifier list call, the
vfio_pfn should no longer exist.  If it does we might need to BUG_ON.
Also be careful to pay attention to the locking of the notifier vs
unpin callbacks to avoid deadlocks.

> +
> +#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> +					(!list_empty(&iommu->domain_list))
> +
> +/*
>   * This code handles mapping and unmapping of user data buffers
>   * into DMA'ble space using the IOMMU
>   */
> @@ -130,6 +154,101 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
>  	rb_erase(&old->node, &iommu->dma_list);
>  }
>  
> +/*
> + * Helper Functions for host pfn list
> + */
> +
> +static struct vfio_pfn *vfio_find_pfn(struct vfio_domain *domain,
> +				      unsigned long pfn)
> +{
> +	struct rb_node *node;
> +	struct vfio_pfn *vpfn;
> +
> +	node = domain->local_addr_space->pfn_list.rb_node;
> +
> +	while (node) {
> +		vpfn = rb_entry(node, struct vfio_pfn, node);
> +
> +		if (pfn < vpfn->pfn)
> +			node = node->rb_left;
> +		else if (pfn > vpfn->pfn)
> +			node = node->rb_right;
> +		else
> +			return vpfn;
> +	}
> +
> +	return NULL;
> +}
> +
> +static void vfio_link_pfn(struct vfio_domain *domain, struct vfio_pfn *new)
> +{
> +	struct rb_node **link, *parent = NULL;
> +	struct vfio_pfn *vpfn;
> +
> +	link = &domain->local_addr_space->pfn_list.rb_node;
> +	while (*link) {
> +		parent = *link;
> +		vpfn = rb_entry(parent, struct vfio_pfn, node);
> +
> +		if (new->pfn < vpfn->pfn)
> +			link = &(*link)->rb_left;
> +		else
> +			link = &(*link)->rb_right;
> +	}
> +
> +	rb_link_node(&new->node, parent, link);
> +	rb_insert_color(&new->node, &domain->local_addr_space->pfn_list);
> +}
> +
> +static void vfio_unlink_pfn(struct vfio_domain *domain, struct vfio_pfn *old)
> +{
> +	rb_erase(&old->node, &domain->local_addr_space->pfn_list);
> +}
> +
> +static int vfio_add_to_pfn_list(struct vfio_domain *domain, unsigned long vaddr,
> +				dma_addr_t iova, unsigned long pfn, int prot)
> +{
> +	struct vfio_pfn *vpfn;
> +
> +	vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
> +	if (!vpfn)
> +		return -ENOMEM;
> +
> +	vpfn->vaddr = vaddr;
> +	vpfn->iova = iova;
> +	vpfn->pfn = pfn;
> +	vpfn->prot = prot;
> +	atomic_set(&vpfn->ref_count, 1);
> +	vfio_link_pfn(domain, vpfn);
> +	return 0;
> +}
> +
> +static void vfio_remove_from_pfn_list(struct vfio_domain *domain,
> +				      struct vfio_pfn *vpfn)
> +{
> +	vfio_unlink_pfn(domain, vpfn);
> +	kfree(vpfn);
> +}
> +
> +static int vfio_pfn_account(struct vfio_iommu *iommu, unsigned long pfn)
> +{
> +	struct vfio_pfn *p;
> +	struct vfio_domain *domain = iommu->local_domain;
> +	int ret = 1;
> +
> +	if (!domain)
> +		return 1;
> +
> +	mutex_lock(&domain->local_addr_space->pfn_list_lock);
> +
> +	p = vfio_find_pfn(domain, pfn);
> +	if (p)
> +		ret = 0;
> +
> +	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> +	return ret;
> +}

So if the vfio_pfn for a given pfn exists, return 0, else return 1.
But do we know that the vfio_pfn exists at the point where we actually
do that accounting?

> +
>  struct vwork {
>  	struct mm_struct	*mm;
>  	long			npage;
> @@ -150,17 +269,17 @@ static void vfio_lock_acct_bg(struct work_struct *work)
>  	kfree(vwork);
>  }
>  
> -static void vfio_lock_acct(long npage)
> +static void vfio_lock_acct(struct task_struct *task, long npage)
>  {
>  	struct vwork *vwork;
>  	struct mm_struct *mm;
>  
> -	if (!current->mm || !npage)
> +	if (!task->mm || !npage)
>  		return; /* process exited or nothing to do */
>  
> -	if (down_write_trylock(&current->mm->mmap_sem)) {
> -		current->mm->locked_vm += npage;
> -		up_write(&current->mm->mmap_sem);
> +	if (down_write_trylock(&task->mm->mmap_sem)) {
> +		task->mm->locked_vm += npage;
> +		up_write(&task->mm->mmap_sem);
>  		return;
>  	}
>  
> @@ -172,7 +291,7 @@ static void vfio_lock_acct(long npage)
>  	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
>  	if (!vwork)
>  		return;
> -	mm = get_task_mm(current);
> +	mm = get_task_mm(task);
>  	if (!mm) {
>  		kfree(vwork);
>  		return;
> @@ -228,20 +347,31 @@ static int put_pfn(unsigned long pfn, int prot)
>  	return 0;
>  }

This conversion of vfio_lock_acct() to pass a task_struct and updating
existing callers to pass current would be a great separate, easily
reviewable patch.

>  
> -static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
> +static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
> +			 int prot, unsigned long *pfn)
>  {
>  	struct page *page[1];
>  	struct vm_area_struct *vma;
> +	struct mm_struct *local_mm = (mm ? mm : current->mm);
>  	int ret = -EFAULT;
>  
> -	if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
> +	if (mm) {
> +		down_read(&local_mm->mmap_sem);
> +		ret = get_user_pages_remote(NULL, local_mm, vaddr, 1,
> +					!!(prot & IOMMU_WRITE), 0, page, NULL);
> +		up_read(&local_mm->mmap_sem);
> +	} else
> +		ret = get_user_pages_fast(vaddr, 1,
> +					  !!(prot & IOMMU_WRITE), page);
> +
> +	if (ret == 1) {
>  		*pfn = page_to_pfn(page[0]);
>  		return 0;
>  	}
>  
> -	down_read(&current->mm->mmap_sem);
> +	down_read(&local_mm->mmap_sem);
>  
> -	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
> +	vma = find_vma_intersection(local_mm, vaddr, vaddr + 1);
>  
>  	if (vma && vma->vm_flags & VM_PFNMAP) {
>  		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
> @@ -249,7 +379,7 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
>  			ret = 0;
>  	}
>  
> -	up_read(&current->mm->mmap_sem);
> +	up_read(&local_mm->mmap_sem);
>  
>  	return ret;
>  }

This would also be a great separate patch.  Have you considered
renaming the mm_struct function arg to "remote_mm" and making the local
variable simply "mm"?  It seems like it would tie nicely with the
remote_mm path using get_user_pages_remote() while passing NULL for
remote_mm uses current->mm and the existing path (and avoid the general
oddness of passing local_mm to a "remote" function).

> @@ -259,33 +389,37 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
>   * the iommu can only map chunks of consecutive pfns anyway, so get the
>   * first page and all consecutive pages with the same locking.
>   */
> -static long vfio_pin_pages(unsigned long vaddr, long npage,
> -			   int prot, unsigned long *pfn_base)
> +static long __vfio_pin_pages_remote(struct vfio_iommu *iommu,
> +				    unsigned long vaddr, long npage,
> +				    int prot, unsigned long *pfn_base)
>  {
>  	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>  	bool lock_cap = capable(CAP_IPC_LOCK);
> -	long ret, i;
> +	long ret, i, lock_acct = 0;
>  	bool rsvd;
>  
>  	if (!current->mm)
>  		return -ENODEV;
>  
> -	ret = vaddr_get_pfn(vaddr, prot, pfn_base);
> +	ret = vaddr_get_pfn(NULL, vaddr, prot, pfn_base);
>  	if (ret)
>  		return ret;
>  
> +	lock_acct = vfio_pfn_account(iommu, *pfn_base);
> +
>  	rsvd = is_invalid_reserved_pfn(*pfn_base);
>  
> -	if (!rsvd && !lock_cap && current->mm->locked_vm + 1 > limit) {
> +	if (!rsvd && !lock_cap && current->mm->locked_vm + lock_acct > limit) {
>  		put_pfn(*pfn_base, prot);
>  		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
>  			limit << PAGE_SHIFT);
>  		return -ENOMEM;
>  	}
>  
> +

Extra whitespace

>  	if (unlikely(disable_hugepages)) {
>  		if (!rsvd)
> -			vfio_lock_acct(1);
> +			vfio_lock_acct(current, lock_acct);
>  		return 1;
>  	}
>  
> @@ -293,7 +427,7 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>  	for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
>  		unsigned long pfn = 0;
>  
> -		ret = vaddr_get_pfn(vaddr, prot, &pfn);
> +		ret = vaddr_get_pfn(NULL, vaddr, prot, &pfn);
>  		if (ret)
>  			break;
>  
> @@ -303,8 +437,10 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>  			break;
>  		}
>  
> +		lock_acct += vfio_pfn_account(iommu, pfn);
> +

I take it that this is the new technique for keeping the accounting
accurate, we only increment the locked accounting by the amount not
already pinned in a vfio_pfn.

>  		if (!rsvd && !lock_cap &&
> -		    current->mm->locked_vm + i + 1 > limit) {
> +		    current->mm->locked_vm + lock_acct > limit) {
>  			put_pfn(pfn, prot);
>  			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
>  				__func__, limit << PAGE_SHIFT);
> @@ -313,23 +449,216 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>  	}
>  
>  	if (!rsvd)
> -		vfio_lock_acct(i);
> +		vfio_lock_acct(current, lock_acct);
>  
>  	return i;
>  }
>  
> -static long vfio_unpin_pages(unsigned long pfn, long npage,
> -			     int prot, bool do_accounting)
> +static long __vfio_unpin_pages_remote(struct vfio_iommu *iommu,
> +				      unsigned long pfn, long npage, int prot,
> +				      bool do_accounting)

Have you noticed that it's kind of confusing that
__vfio_{un}pin_pages_remote() uses current, which does a
get_user_pages_fast() while "local" uses a provided task_struct and
uses get_user_pages_*remote*()?  And also what was effectively local
(ie. we're pinning for our own use here) is now "remote" and pinning
for a remote, vendor driver consumer, is now "local".  It's not very
intuitive.

>  {
> -	unsigned long unlocked = 0;
> +	unsigned long unlocked = 0, unlock_acct = 0;
>  	long i;
>  
> -	for (i = 0; i < npage; i++)
> +	for (i = 0; i < npage; i++) {
> +		if (do_accounting)
> +			unlock_acct += vfio_pfn_account(iommu, pfn);
> +
>  		unlocked += put_pfn(pfn++, prot);
> +	}
>  
>  	if (do_accounting)
> -		vfio_lock_acct(-unlocked);
> +		vfio_lock_acct(current, -unlock_acct);
> +
> +	return unlocked;
> +}
> +
> +static long __vfio_pin_page_local(struct vfio_domain *domain,
> +				  unsigned long vaddr, int prot,
> +				  unsigned long *pfn_base,
> +				  bool do_accounting)
> +{
> +	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +	bool lock_cap = capable(CAP_IPC_LOCK);
> +	long ret;
> +	bool rsvd;
> +	struct task_struct *task = domain->local_addr_space->task;
> +
> +	if (!task->mm)
> +		return -ENODEV;
> +
> +	ret = vaddr_get_pfn(task->mm, vaddr, prot, pfn_base);
> +	if (ret)
> +		return ret;
> +
> +	rsvd = is_invalid_reserved_pfn(*pfn_base);
> +
> +	if (!rsvd && !lock_cap && task->mm->locked_vm + 1 > limit) {
> +		put_pfn(*pfn_base, prot);
> +		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
> +			limit << PAGE_SHIFT);
> +		return -ENOMEM;
> +	}
> +
> +	if (!rsvd && do_accounting)
> +		vfio_lock_acct(task, 1);
> +
> +	return 1;
> +}
> +
> +static void __vfio_unpin_page_local(struct vfio_domain *domain,
> +				    unsigned long pfn, int prot,
> +				    bool do_accounting)
> +{
> +	put_pfn(pfn, prot);
> +
> +	if (do_accounting)
> +		vfio_lock_acct(domain->local_addr_space->task, -1);
> +}
> +
> +static int vfio_unpin_pfn(struct vfio_domain *domain,
> +			  struct vfio_pfn *vpfn, bool do_accounting)
> +{
> +	__vfio_unpin_page_local(domain, vpfn->pfn, vpfn->prot,
> +				do_accounting);
> +
> +	if (atomic_dec_and_test(&vpfn->ref_count))
> +		vfio_remove_from_pfn_list(domain, vpfn);
> +
> +	return 1;
> +}
> +
> +static long vfio_iommu_type1_pin_pages(void *iommu_data,
> +				       unsigned long *user_pfn,
> +				       long npage, int prot,
> +				       unsigned long *phys_pfn)
> +{
> +	struct vfio_iommu *iommu = iommu_data;
> +	struct vfio_domain *domain;
> +	int i, j, ret;
> +	long retpage;
> +	unsigned long remote_vaddr;
> +	unsigned long *pfn = phys_pfn;
> +	struct vfio_dma *dma;
> +	bool do_accounting;
> +
> +	if (!iommu || !user_pfn || !phys_pfn)
> +		return -EINVAL;
> +
> +	mutex_lock(&iommu->lock);
> +
> +	if (!iommu->local_domain) {
> +		ret = -EINVAL;
> +		goto pin_done;
> +	}
> +
> +	domain = iommu->local_domain;
> +
> +	/*
> +	 * If iommu capable domain exist in the container then all pages are
> +	 * already pinned and accounted. Accouting should be done if there is no
> +	 * iommu capable domain in the container.
> +	 */
> +	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
> +
> +	for (i = 0; i < npage; i++) {
> +		struct vfio_pfn *p;
> +		dma_addr_t iova;
> +
> +		iova = user_pfn[i] << PAGE_SHIFT;
> +
> +		dma = vfio_find_dma(iommu, iova, 0);
> +		if (!dma) {
> +			ret = -EINVAL;
> +			goto pin_unwind;
> +		}
> +
> +		remote_vaddr = dma->vaddr + iova - dma->iova;
> +
> +		retpage = __vfio_pin_page_local(domain, remote_vaddr, prot,
> +						&pfn[i], do_accounting);
> +		if (retpage <= 0) {
> +			WARN_ON(!retpage);
> +			ret = (int)retpage;
> +			goto pin_unwind;
> +		}
> +
> +		mutex_lock(&domain->local_addr_space->pfn_list_lock);
> +
> +		/* search if pfn exist */
> +		p = vfio_find_pfn(domain, pfn[i]);
> +		if (p) {
> +			atomic_inc(&p->ref_count);
> +			mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> +			continue;
> +		}
> +
> +		ret = vfio_add_to_pfn_list(domain, remote_vaddr, iova,
> +					   pfn[i], prot);
> +		mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> +
> +		if (ret) {
> +			__vfio_unpin_page_local(domain, pfn[i], prot,
> +						do_accounting);
> +			goto pin_unwind;
> +		}
> +	}
> +
> +	ret = i;
> +	goto pin_done;
> +
> +pin_unwind:
> +	pfn[i] = 0;
> +	mutex_lock(&domain->local_addr_space->pfn_list_lock);
> +	for (j = 0; j < i; j++) {
> +		struct vfio_pfn *p;
> +
> +		p = vfio_find_pfn(domain, pfn[j]);
> +		if (p)
> +			vfio_unpin_pfn(domain, p, do_accounting);
> +
> +		pfn[j] = 0;
> +	}
> +	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> +
> +pin_done:
> +	mutex_unlock(&iommu->lock);
> +	return ret;
> +}
> +
> +static long vfio_iommu_type1_unpin_pages(void *iommu_data, unsigned long *pfn,
> +					 long npage)
> +{
> +	struct vfio_iommu *iommu = iommu_data;
> +	struct vfio_domain *domain = NULL;
> +	bool do_accounting;
> +	long unlocked = 0;
> +	int i;
> +
> +	if (!iommu || !pfn)
> +		return -EINVAL;
> +
> +	mutex_lock(&iommu->lock);
> +
> +	domain = iommu->local_domain;
> +
> +	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
> +
> +	mutex_lock(&domain->local_addr_space->pfn_list_lock);
> +
> +	for (i = 0; i < npage; i++) {
> +		struct vfio_pfn *p;
>  
> +		/* verify if pfn exist in pfn_list */
> +		p = vfio_find_pfn(domain, pfn[i]);
> +		if (p)
> +			unlocked += vfio_unpin_pfn(domain, p, do_accounting);
> +
> +	}
> +	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> +
> +	mutex_unlock(&iommu->lock);
>  	return unlocked;
>  }
>  
> @@ -341,6 +670,10 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  
>  	if (!dma->size)
>  		return;
> +
> +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
> +		return;
> +
>  	/*
>  	 * We use the IOMMU to track the physical addresses, otherwise we'd
>  	 * need a much more complicated tracking system.  Unfortunately that
> @@ -382,15 +715,16 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  		if (WARN_ON(!unmapped))
>  			break;
>  
> -		unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
> -					     unmapped >> PAGE_SHIFT,
> -					     dma->prot, false);
> +		unlocked += __vfio_unpin_pages_remote(iommu, phys >> PAGE_SHIFT,
> +						      unmapped >> PAGE_SHIFT,
> +						      dma->prot, false);
>  		iova += unmapped;
>  
>  		cond_resched();
>  	}
>  
> -	vfio_lock_acct(-unlocked);
> +	dma->iommu_mapped = false;
> +	vfio_lock_acct(current, -unlocked);
>  }
>  
>  static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
> @@ -558,17 +892,57 @@ unwind:
>  	return ret;
>  }
>  
> +static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
> +			    size_t map_size)
> +{
> +	dma_addr_t iova = dma->iova;
> +	unsigned long vaddr = dma->vaddr;
> +	size_t size = map_size;
> +	long npage;
> +	unsigned long pfn;
> +	int ret = 0;
> +
> +	while (size) {
> +		/* Pin a contiguous chunk of memory */
> +		npage = __vfio_pin_pages_remote(iommu, vaddr + dma->size,
> +						size >> PAGE_SHIFT, dma->prot,
> +						&pfn);
> +		if (npage <= 0) {
> +			WARN_ON(!npage);
> +			ret = (int)npage;
> +			break;
> +		}
> +
> +		/* Map it! */
> +		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
> +				     dma->prot);
> +		if (ret) {
> +			__vfio_unpin_pages_remote(iommu, pfn, npage, dma->prot,
> +						  true);
> +			break;
> +		}
> +
> +		size -= npage << PAGE_SHIFT;
> +		dma->size += npage << PAGE_SHIFT;
> +	}
> +
> +	dma->iommu_mapped = true;
> +
> +	if (ret)
> +		vfio_remove_dma(iommu, dma);
> +
> +	return ret;
> +}
> +
>  static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  			   struct vfio_iommu_type1_dma_map *map)
>  {
>  	dma_addr_t iova = map->iova;
>  	unsigned long vaddr = map->vaddr;
>  	size_t size = map->size;
> -	long npage;
>  	int ret = 0, prot = 0;
>  	uint64_t mask;
>  	struct vfio_dma *dma;
> -	unsigned long pfn;
>  
>  	/* Verify that none of our __u64 fields overflow */
>  	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
> @@ -611,29 +985,11 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  	/* Insert zero-sized and grow as we map chunks of it */
>  	vfio_link_dma(iommu, dma);
>  
> -	while (size) {
> -		/* Pin a contiguous chunk of memory */
> -		npage = vfio_pin_pages(vaddr + dma->size,
> -				       size >> PAGE_SHIFT, prot, &pfn);
> -		if (npage <= 0) {
> -			WARN_ON(!npage);
> -			ret = (int)npage;
> -			break;
> -		}
> -
> -		/* Map it! */
> -		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
> -		if (ret) {
> -			vfio_unpin_pages(pfn, npage, prot, true);
> -			break;
> -		}
> -
> -		size -= npage << PAGE_SHIFT;
> -		dma->size += npage << PAGE_SHIFT;
> -	}
> -
> -	if (ret)
> -		vfio_remove_dma(iommu, dma);
> +	/* Don't pin and map if container doesn't contain IOMMU capable domain*/
> +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
> +		dma->size = size;
> +	else
> +		ret = vfio_pin_map_dma(iommu, dma, size);
>  
>  	mutex_unlock(&iommu->lock);
>  	return ret;
> @@ -662,10 +1018,6 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
>  	d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
>  	n = rb_first(&iommu->dma_list);
>  
> -	/* If there's not a domain, there better not be any mappings */
> -	if (WARN_ON(n && !d))
> -		return -EINVAL;
> -
>  	for (; n; n = rb_next(n)) {
>  		struct vfio_dma *dma;
>  		dma_addr_t iova;
> @@ -674,20 +1026,43 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
>  		iova = dma->iova;
>  
>  		while (iova < dma->iova + dma->size) {
> -			phys_addr_t phys = iommu_iova_to_phys(d->domain, iova);
> +			phys_addr_t phys;
>  			size_t size;
>  
> -			if (WARN_ON(!phys)) {
> -				iova += PAGE_SIZE;
> -				continue;
> -			}
> +			if (dma->iommu_mapped) {
> +				phys = iommu_iova_to_phys(d->domain, iova);
> +
> +				if (WARN_ON(!phys)) {
> +					iova += PAGE_SIZE;
> +					continue;
> +				}
>  
> -			size = PAGE_SIZE;
> +				size = PAGE_SIZE;
>  
> -			while (iova + size < dma->iova + dma->size &&
> -			       phys + size == iommu_iova_to_phys(d->domain,
> +				while (iova + size < dma->iova + dma->size &&
> +				    phys + size == iommu_iova_to_phys(d->domain,
>  								 iova + size))
> -				size += PAGE_SIZE;
> +					size += PAGE_SIZE;
> +			} else {
> +				unsigned long pfn;
> +				unsigned long vaddr = dma->vaddr +
> +						     (iova - dma->iova);
> +				size_t n = dma->iova + dma->size - iova;
> +				long npage;
> +
> +				npage = __vfio_pin_pages_remote(iommu, vaddr,
> +								n >> PAGE_SHIFT,
> +								dma->prot,
> +								&pfn);
> +				if (npage <= 0) {
> +					WARN_ON(!npage);
> +					ret = (int)npage;
> +					return ret;
> +				}
> +
> +				phys = pfn << PAGE_SHIFT;
> +				size = npage << PAGE_SHIFT;
> +			}
>  
>  			ret = iommu_map(domain->domain, iova, phys,
>  					size, dma->prot | domain->prot);
> @@ -696,6 +1071,8 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
>  
>  			iova += size;
>  		}
> +
> +		dma->iommu_mapped = true;
>  	}
>  
>  	return 0;
> @@ -734,11 +1111,24 @@ static void vfio_test_domain_fgsp(struct vfio_domain *domain)
>  	__free_pages(pages, order);
>  }
>  
> +static struct vfio_group *find_iommu_group(struct vfio_domain *domain,
> +				   struct iommu_group *iommu_group)
> +{
> +	struct vfio_group *g;
> +
> +	list_for_each_entry(g, &domain->group_list, next) {
> +		if (g->iommu_group == iommu_group)
> +			return g;
> +	}
> +
> +	return NULL;
> +}
> +
>  static int vfio_iommu_type1_attach_group(void *iommu_data,
>  					 struct iommu_group *iommu_group)
>  {
>  	struct vfio_iommu *iommu = iommu_data;
> -	struct vfio_group *group, *g;
> +	struct vfio_group *group;
>  	struct vfio_domain *domain, *d;
>  	struct bus_type *bus = NULL;
>  	int ret;
> @@ -746,10 +1136,14 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	mutex_lock(&iommu->lock);
>  
>  	list_for_each_entry(d, &iommu->domain_list, next) {
> -		list_for_each_entry(g, &d->group_list, next) {
> -			if (g->iommu_group != iommu_group)
> -				continue;
> +		if (find_iommu_group(d, iommu_group)) {
> +			mutex_unlock(&iommu->lock);
> +			return -EINVAL;
> +		}
> +	}

The find_iommu_group() conversion would also be an easy separate patch.

>  
> +	if (iommu->local_domain) {
> +		if (find_iommu_group(iommu->local_domain, iommu_group)) {
>  			mutex_unlock(&iommu->lock);
>  			return -EINVAL;
>  		}
> @@ -769,6 +1163,30 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	if (ret)
>  		goto out_free;
>  
> +	if (IS_ENABLED(CONFIG_VFIO_MDEV) && !iommu_present(bus) &&
> +	    (bus == &mdev_bus_type)) {
> +		if (!iommu->local_domain) {
> +			domain->local_addr_space =
> +				kzalloc(sizeof(*domain->local_addr_space),
> +						GFP_KERNEL);
> +			if (!domain->local_addr_space) {
> +				ret = -ENOMEM;
> +				goto out_free;
> +			}
> +
> +			domain->local_addr_space->task = current;
> +			INIT_LIST_HEAD(&domain->group_list);
> +			domain->local_addr_space->pfn_list = RB_ROOT;
> +			mutex_init(&domain->local_addr_space->pfn_list_lock);
> +			iommu->local_domain = domain;
> +		} else
> +			kfree(domain);
> +
> +		list_add(&group->next, &domain->group_list);

I think you mean s/domain/iommu->local_domain/ here, we just freed
domain in the else path.

> +		mutex_unlock(&iommu->lock);
> +		return 0;
> +	}
> +
>  	domain->domain = iommu_domain_alloc(bus);
>  	if (!domain->domain) {
>  		ret = -EIO;
> @@ -859,6 +1277,41 @@ static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
>  		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
>  }
>  
> +static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
> +{
> +	struct vfio_domain *domain = iommu->local_domain;
> +	struct vfio_dma *dma, *tdma;
> +	struct rb_node *n;
> +	long locked = 0;
> +
> +	rbtree_postorder_for_each_entry_safe(dma, tdma, &iommu->dma_list,
> +					     node) {
> +		vfio_unmap_unpin(iommu, dma);
> +	}
> +
> +	mutex_lock(&domain->local_addr_space->pfn_list_lock);
> +
> +	n = rb_first(&domain->local_addr_space->pfn_list);
> +
> +	for (; n; n = rb_next(n))
> +		locked++;
> +
> +	vfio_lock_acct(domain->local_addr_space->task, locked);
> +	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> +}

Couldn't a properly timed mlock by the user allow them to lock more
memory than they're allowed here?  For instance imagine the vendor
driver has pinned the entire VM memory and the user has exactly the
locked memory limit for that VM.  During the gap here between unpinning
the entire vfio_dma list and re-accounting for the pfn_list, the user
can mlock up to their limit again and now they've doubled the locked
memory they're allowed.

> +
> +static void vfio_local_unpin_all(struct vfio_domain *domain)
> +{
> +	struct rb_node *node;
> +
> +	mutex_lock(&domain->local_addr_space->pfn_list_lock);
> +	while ((node = rb_first(&domain->local_addr_space->pfn_list)))
> +		vfio_unpin_pfn(domain,
> +				rb_entry(node, struct vfio_pfn, node), false);
> +
> +	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> +}
> +
>  static void vfio_iommu_type1_detach_group(void *iommu_data,
>  					  struct iommu_group *iommu_group)
>  {
> @@ -868,31 +1321,57 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
>  
>  	mutex_lock(&iommu->lock);
>  
> -	list_for_each_entry(domain, &iommu->domain_list, next) {
> -		list_for_each_entry(group, &domain->group_list, next) {
> -			if (group->iommu_group != iommu_group)
> -				continue;
> -
> -			iommu_detach_group(domain->domain, iommu_group);
> +	if (iommu->local_domain) {
> +		domain = iommu->local_domain;
> +		group = find_iommu_group(domain, iommu_group);
> +		if (group) {
>  			list_del(&group->next);
>  			kfree(group);
> -			/*
> -			 * Group ownership provides privilege, if the group
> -			 * list is empty, the domain goes away.  If it's the
> -			 * last domain, then all the mappings go away too.
> -			 */
> +
>  			if (list_empty(&domain->group_list)) {
> -				if (list_is_singular(&iommu->domain_list))
> +				vfio_local_unpin_all(domain);
> +				if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
>  					vfio_iommu_unmap_unpin_all(iommu);
> -				iommu_domain_free(domain->domain);
> -				list_del(&domain->next);
>  				kfree(domain);
> +				iommu->local_domain = NULL;
> +			}


I can't quite wrap my head around this, if we have mdev groups attached
and this iommu group matches an mdev group, remove from list and free
the group.  If there are now no more groups in the mdev group list,
then for each vfio_pfn, unpin the pfn, /without/ doing accounting
updates and remove the vfio_pfn, but only if the ref_count is now
zero.  We free the domain, so if the ref_count was non-zero we've now
just leaked memory.  I think that means that if a vendor driver pins a
given page twice, that leak occurs.  Furthermore, if there is not an
iommu capable domain in the container, we remove all the vfio_dma
entries as well, ok.  Maybe the only issue is those leaked vfio_pfns.

> +			goto detach_group_done;
> +		}
> +	}
> +
> +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
> +		goto detach_group_done;
> +
> +	list_for_each_entry(domain, &iommu->domain_list, next) {
> +		group = find_iommu_group(domain, iommu_group);
> +		if (!group)
> +			continue;
> +
> +		iommu_detach_group(domain->domain, iommu_group);
> +		list_del(&group->next);
> +		kfree(group);
> +		/*
> +		 * Group ownership provides privilege, if the group list is
> +		 * empty, the domain goes away. If it's the last domain with
> +		 * iommu and local domain doesn't exist, then all the mappings
> +		 * go away too. If it's the last domain with iommu and local
> +		 * domain exist, update accounting
> +		 */
> +		if (list_empty(&domain->group_list)) {
> +			if (list_is_singular(&iommu->domain_list)) {
> +				if (!iommu->local_domain)
> +					vfio_iommu_unmap_unpin_all(iommu);
> +				else
> +					vfio_iommu_unmap_unpin_reaccount(iommu);
>  			}
> -			goto done;
> +			iommu_domain_free(domain->domain);
> +			list_del(&domain->next);
> +			kfree(domain);
>  		}
> +		break;
>  	}
>  
> -done:
> +detach_group_done:
>  	mutex_unlock(&iommu->lock);
>  }
>  
> @@ -924,27 +1403,48 @@ static void *vfio_iommu_type1_open(unsigned long arg)
>  	return iommu;
>  }
>  
> +static void vfio_release_domain(struct vfio_domain *domain)
> +{
> +	struct vfio_group *group, *group_tmp;
> +
> +	list_for_each_entry_safe(group, group_tmp,
> +				 &domain->group_list, next) {
> +		if (!domain->local_addr_space)
> +			iommu_detach_group(domain->domain, group->iommu_group);
> +		list_del(&group->next);
> +		kfree(group);
> +	}
> +
> +	if (domain->local_addr_space)
> +		vfio_local_unpin_all(domain);
> +	else
> +		iommu_domain_free(domain->domain);
> +}
> +
>  static void vfio_iommu_type1_release(void *iommu_data)
>  {
>  	struct vfio_iommu *iommu = iommu_data;
>  	struct vfio_domain *domain, *domain_tmp;
> -	struct vfio_group *group, *group_tmp;
> +
> +	if (iommu->local_domain) {
> +		vfio_release_domain(iommu->local_domain);
> +		kfree(iommu->local_domain);
> +		iommu->local_domain = NULL;
> +	}
>  
>  	vfio_iommu_unmap_unpin_all(iommu);
>  
> +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
> +		goto release_exit;

This is a bit redundant, the below for_each should just have no entries
and we skip to there anyway.  Thanks,

Alex

> +
>  	list_for_each_entry_safe(domain, domain_tmp,
>  				 &iommu->domain_list, next) {
> -		list_for_each_entry_safe(group, group_tmp,
> -					 &domain->group_list, next) {
> -			iommu_detach_group(domain->domain, group->iommu_group);
> -			list_del(&group->next);
> -			kfree(group);
> -		}
> -		iommu_domain_free(domain->domain);
> +		vfio_release_domain(domain);
>  		list_del(&domain->next);
>  		kfree(domain);
>  	}
>  
> +release_exit:
>  	kfree(iommu);
>  }
>  
> @@ -1048,6 +1548,8 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
>  	.ioctl		= vfio_iommu_type1_ioctl,
>  	.attach_group	= vfio_iommu_type1_attach_group,
>  	.detach_group	= vfio_iommu_type1_detach_group,
> +	.pin_pages	= vfio_iommu_type1_pin_pages,
> +	.unpin_pages	= vfio_iommu_type1_unpin_pages,
>  };
>  
>  static int __init vfio_iommu_type1_init(void)
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 0ecae0b1cd34..0bd25ba6223d 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -17,6 +17,7 @@
>  #include <linux/workqueue.h>
>  #include <linux/poll.h>
>  #include <uapi/linux/vfio.h>
> +#include <linux/mdev.h>
>  
>  /**
>   * struct vfio_device_ops - VFIO bus driver device callbacks
> @@ -75,7 +76,11 @@ struct vfio_iommu_driver_ops {
>  					struct iommu_group *group);
>  	void		(*detach_group)(void *iommu_data,
>  					struct iommu_group *group);
> -
> +	long		(*pin_pages)(void *iommu_data, unsigned long *user_pfn,
> +				     long npage, int prot,
> +				     unsigned long *phys_pfn);
> +	long		(*unpin_pages)(void *iommu_data, unsigned long *pfn,
> +				       long npage);
>  };
>  
>  extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
> @@ -127,6 +132,12 @@ static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
>  }
>  #endif /* CONFIG_EEH */
>  
> +extern long vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
> +			   long npage, int prot, unsigned long *phys_pfn);
> +
> +extern long vfio_unpin_pages(struct device *dev, unsigned long *pfn,
> +			     long npage);
> +
>  /*
>   * IRQfd - generic
>   */
Kirti Wankhede Oct. 20, 2016, 8:17 p.m. UTC | #2
Alex,

Addressing your comments other than invalidation part.

On 10/20/2016 2:32 AM, Alex Williamson wrote:
> On Tue, 18 Oct 2016 02:52:04 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
...
>> Tested by assigning below combinations of devices to a single VM:
>> - GPU pass through only
>> - vGPU device only
>> - One GPU pass through and one vGPU device
>> - Linux VM hot plug and unplug vGPU device while GPU pass through device
>>   exist
>> - Linux VM hot plug and unplug GPU pass through device while vGPU device
>>   exist
> 
> Were you able to do these with the locked memory limit of the user set
> to the minimum required for existing GPU assignment?
> 

No, is there a way to set memory limit through libvirt so that it would
set memory limit to system memory assigned to VM?

>>
...
>> +	container = group->container;
>> +	if (IS_ERR(container)) {
> 
> I don't see that we ever use an ERR_PTR to set group->container, it
> should either be NULL or valid and the fact that we added ourselves to
> container_users should mean that it's valid.  The paranoia test here
> would be if container is NULL, but IS_ERR() doesn't check NULL.  If we
> need that paranoia test, maybe we should just:
> 
> if (WARN_ON(!container)) {
> 
> I'm not fully convinced it's needed though.
> 

Ok removing this check.

>> +		ret = PTR_ERR(container);
>> +		goto err_pin_pages;
>> +	}
>> +
>> +	down_read(&container->group_lock);
>> +
>> +	driver = container->iommu_driver;
>> +	if (likely(driver && driver->ops->pin_pages))
>> +		ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
>> +					     npage, prot, phys_pfn);
> 
> The caller is going to need to provide some means for us to callback to
> invalidate pinned pages.
> 
> ret has already been used, so it's zero at this point.  I expect the
> original intention was to let the initialization above fall through
> here so that the caller gets an errno if the driver doesn't support
> pin_pages.  Returning zero without actually doing anything seems like
> an unexpected return value.
> 

yes, changing it to:

driver = container->iommu_driver;
if (likely(driver && driver->ops->pin_pages))
        ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
                                     npage, prot, phys_pfn);
else
        ret = -EINVAL;




>> +static int vfio_pfn_account(struct vfio_iommu *iommu, unsigned long pfn)
>> +{
>> +	struct vfio_pfn *p;
>> +	struct vfio_domain *domain = iommu->local_domain;
>> +	int ret = 1;
>> +
>> +	if (!domain)
>> +		return 1;
>> +
>> +	mutex_lock(&domain->local_addr_space->pfn_list_lock);
>> +
>> +	p = vfio_find_pfn(domain, pfn);
>> +	if (p)
>> +		ret = 0;
>> +
>> +	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
>> +	return ret;
>> +}
> 
> So if the vfio_pfn for a given pfn exists, return 0, else return 1.
> But do we know that the vfio_pfn exists at the point where we actually
> do that accounting?
>

Only below functions call vfio_pfn_account()
__vfio_pin_pages_remote() -> vfio_pfn_account()
__vfio_unpin_pages_remote() -> vfio_pfn_account()

Consider the case when mdev device is already assigned to VM, run some
app in VM that pins some pages, then hotplug pass through device.
Then __vfio_pin_pages_remote() is called when iommu capable domain is
attached to container to pin all pages from vfio_iommu_replay(). So if
at this time vfio_pfn exist means that the page is pinned through
local_domain when iommu capable domain was not present, so accounting
was already done for that pages. Hence returned 0 here which mean don't
add this page in accounting.


>> +
>>  struct vwork {
>>  	struct mm_struct	*mm;
>>  	long			npage;
>> @@ -150,17 +269,17 @@ static void vfio_lock_acct_bg(struct work_struct *work)
>>  	kfree(vwork);
>>  }
>>  
>> -static void vfio_lock_acct(long npage)
>> +static void vfio_lock_acct(struct task_struct *task, long npage)
>>  {
>>  	struct vwork *vwork;
>>  	struct mm_struct *mm;
>>  
>> -	if (!current->mm || !npage)
>> +	if (!task->mm || !npage)
>>  		return; /* process exited or nothing to do */
>>  
>> -	if (down_write_trylock(&current->mm->mmap_sem)) {
>> -		current->mm->locked_vm += npage;
>> -		up_write(&current->mm->mmap_sem);
>> +	if (down_write_trylock(&task->mm->mmap_sem)) {
>> +		task->mm->locked_vm += npage;
>> +		up_write(&task->mm->mmap_sem);
>>  		return;
>>  	}
>>  
>> @@ -172,7 +291,7 @@ static void vfio_lock_acct(long npage)
>>  	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
>>  	if (!vwork)
>>  		return;
>> -	mm = get_task_mm(current);
>> +	mm = get_task_mm(task);
>>  	if (!mm) {
>>  		kfree(vwork);
>>  		return;
>> @@ -228,20 +347,31 @@ static int put_pfn(unsigned long pfn, int prot)
>>  	return 0;
>>  }
> 
> This coversion of vfio_lock_acct() to pass a task_struct and updating
> existing callers to pass current would be a great separate, easily
> review-able patch.
>

Ok. I'll split this in separate commit.


>>  
>> -static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
>> +static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
>> +			 int prot, unsigned long *pfn)
>>  {
>>  	struct page *page[1];
>>  	struct vm_area_struct *vma;
>> +	struct mm_struct *local_mm = (mm ? mm : current->mm);
>>  	int ret = -EFAULT;
>>  
>> -	if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
>> +	if (mm) {
>> +		down_read(&local_mm->mmap_sem);
>> +		ret = get_user_pages_remote(NULL, local_mm, vaddr, 1,
>> +					!!(prot & IOMMU_WRITE), 0, page, NULL);
>> +		up_read(&local_mm->mmap_sem);
>> +	} else
>> +		ret = get_user_pages_fast(vaddr, 1,
>> +					  !!(prot & IOMMU_WRITE), page);
>> +
>> +	if (ret == 1) {
>>  		*pfn = page_to_pfn(page[0]);
>>  		return 0;
>>  	}
>>  
>> -	down_read(&current->mm->mmap_sem);
>> +	down_read(&local_mm->mmap_sem);
>>  
>> -	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
>> +	vma = find_vma_intersection(local_mm, vaddr, vaddr + 1);
>>  
>>  	if (vma && vma->vm_flags & VM_PFNMAP) {
>>  		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
>> @@ -249,7 +379,7 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
>>  			ret = 0;
>>  	}
>>  
>> -	up_read(&current->mm->mmap_sem);
>> +	up_read(&local_mm->mmap_sem);
>>  
>>  	return ret;
>>  }
> 
> This would also be a great separate patch.

Ok.

>  Have you considered
> renaming the mm_struct function arg to "remote_mm" and making the local
> variable simply "mm"?  It seems like it would tie nicely with the
> remote_mm path using get_user_pages_remote() while passing NULL for
> remote_mm uses current->mm and the existing path (and avoid the general
> oddness of passing local_mm to a "remote" function).
> 

Yes, your suggestion looks good. Updating.


>> @@ -259,33 +389,37 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
>>   * the iommu can only map chunks of consecutive pfns anyway, so get the
>>   * first page and all consecutive pages with the same locking.
>>   */
>> -static long vfio_pin_pages(unsigned long vaddr, long npage,
>> -			   int prot, unsigned long *pfn_base)
>> +static long __vfio_pin_pages_remote(struct vfio_iommu *iommu,
>> +				    unsigned long vaddr, long npage,
>> +				    int prot, unsigned long *pfn_base)

...


>> @@ -303,8 +437,10 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>>  			break;
>>  		}
>>  
>> +		lock_acct += vfio_pfn_account(iommu, pfn);
>> +
> 
> I take it that this is the new technique for keeping the accounting
> accurate, we only increment the locked accounting by the amount not
> already pinned in a vfio_pfn.
>

That's correct.


>>  		if (!rsvd && !lock_cap &&
>> -		    current->mm->locked_vm + i + 1 > limit) {
>> +		    current->mm->locked_vm + lock_acct > limit) {
>>  			put_pfn(pfn, prot);
>>  			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
>>  				__func__, limit << PAGE_SHIFT);
>> @@ -313,23 +449,216 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>>  	}
>>  
>>  	if (!rsvd)
>> -		vfio_lock_acct(i);
>> +		vfio_lock_acct(current, lock_acct);
>>  
>>  	return i;
>>  }
>>  
>> -static long vfio_unpin_pages(unsigned long pfn, long npage,
>> -			     int prot, bool do_accounting)
>> +static long __vfio_unpin_pages_remote(struct vfio_iommu *iommu,
>> +				      unsigned long pfn, long npage, int prot,
>> +				      bool do_accounting)
> 
> Have you noticed that it's kind of confusing that
> __vfio_{un}pin_pages_remote() uses current, which does a
> get_user_pages_fast() while "local" uses a provided task_struct and
> uses get_user_pages_*remote*()?  And also what was effectively local
> (ie. we're pinning for our own use here) is now "remote" and pinning
> for a remote, vendor driver consumer, is now "local".  It's not very
> intuitive.
> 

'local' in local_domain was suggested to describe the domain for local
page tracking. Earlier suggestions to have 'mdev' or 'noiommu' in this
name were discarded. May be we should revisit what the name should be.
Any suggestion?

For local_domain, to pin pages, flow is:

for local_domain
    |- vfio_pin_pages()
        |- vfio_iommu_type1_pin_pages()
            |- __vfio_pin_page_local()
                |-  vaddr_get_pfn(task->mm)
                    |- get_user_pages_remote()

__vfio_pin_page_local() --> get_user_pages_remote()



>>  static int vfio_iommu_type1_attach_group(void *iommu_data,
>>  					 struct iommu_group *iommu_group)
>>  {
>>  	struct vfio_iommu *iommu = iommu_data;
>> -	struct vfio_group *group, *g;
>> +	struct vfio_group *group;
>>  	struct vfio_domain *domain, *d;
>>  	struct bus_type *bus = NULL;
>>  	int ret;
>> @@ -746,10 +1136,14 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>>  	mutex_lock(&iommu->lock);
>>  
>>  	list_for_each_entry(d, &iommu->domain_list, next) {
>> -		list_for_each_entry(g, &d->group_list, next) {
>> -			if (g->iommu_group != iommu_group)
>> -				continue;
>> +		if (find_iommu_group(d, iommu_group)) {
>> +			mutex_unlock(&iommu->lock);
>> +			return -EINVAL;
>> +		}
>> +	}
> 
> The find_iommu_group() conversion would also be an easy separate patch.
> 

Ok.

>>  
>> +	if (iommu->local_domain) {
>> +		if (find_iommu_group(iommu->local_domain, iommu_group)) {
>>  			mutex_unlock(&iommu->lock);
>>  			return -EINVAL;
>>  		}
>> @@ -769,6 +1163,30 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>>  	if (ret)
>>  		goto out_free;
>>  
>> +	if (IS_ENABLED(CONFIG_VFIO_MDEV) && !iommu_present(bus) &&
>> +	    (bus == &mdev_bus_type)) {
>> +		if (!iommu->local_domain) {
>> +			domain->local_addr_space =
>> +				kzalloc(sizeof(*domain->local_addr_space),
>> +						GFP_KERNEL);
>> +			if (!domain->local_addr_space) {
>> +				ret = -ENOMEM;
>> +				goto out_free;
>> +			}
>> +
>> +			domain->local_addr_space->task = current;
>> +			INIT_LIST_HEAD(&domain->group_list);
>> +			domain->local_addr_space->pfn_list = RB_ROOT;
>> +			mutex_init(&domain->local_addr_space->pfn_list_lock);
>> +			iommu->local_domain = domain;
>> +		} else
>> +			kfree(domain);
>> +
>> +		list_add(&group->next, &domain->group_list);
> 
> I think you mean s/domain/iommu->local_domain/ here, we just freed
> domain in the else path.
> 

Yes, corrected.

>> +		mutex_unlock(&iommu->lock);
>> +		return 0;
>> +	}
>> +
>>  	domain->domain = iommu_domain_alloc(bus);
>>  	if (!domain->domain) {
>>  		ret = -EIO;
>> @@ -859,6 +1277,41 @@ static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
>>  		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
>>  }
>>  
>> +static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
>> +{
>> +	struct vfio_domain *domain = iommu->local_domain;
>> +	struct vfio_dma *dma, *tdma;
>> +	struct rb_node *n;
>> +	long locked = 0;
>> +
>> +	rbtree_postorder_for_each_entry_safe(dma, tdma, &iommu->dma_list,
>> +					     node) {
>> +		vfio_unmap_unpin(iommu, dma);
>> +	}
>> +
>> +	mutex_lock(&domain->local_addr_space->pfn_list_lock);
>> +
>> +	n = rb_first(&domain->local_addr_space->pfn_list);
>> +
>> +	for (; n; n = rb_next(n))
>> +		locked++;
>> +
>> +	vfio_lock_acct(domain->local_addr_space->task, locked);
>> +	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
>> +}
> 
> Couldn't a properly timed mlock by the user allow them to lock more
> memory than they're allowed here?  For instance imagine the vendor
> driver has pinned the entire VM memory and the user has exactly the
> locked memory limit for that VM.  During the gap here between unpinning
> the entire vfio_dma list and re-accounting for the pfn_list, the user
> can mlock up to their limit again and now they've doubled the locked
> memory they're allowed.
> 

As per original code, vfio_unmap_unpin() calls
__vfio_unpin_pages_remote(.., false) with do_accounting set to false,
why is that so?

Here if accounting is set to true then we don't have to do re-accounting
here.

>> +
>> +static void vfio_local_unpin_all(struct vfio_domain *domain)
>> +{
>> +	struct rb_node *node;
>> +
>> +	mutex_lock(&domain->local_addr_space->pfn_list_lock);
>> +	while ((node = rb_first(&domain->local_addr_space->pfn_list)))
>> +		vfio_unpin_pfn(domain,
>> +				rb_entry(node, struct vfio_pfn, node), false);
>> +
>> +	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
>> +}
>> +
>>  static void vfio_iommu_type1_detach_group(void *iommu_data,
>>  					  struct iommu_group *iommu_group)
>>  {
>> @@ -868,31 +1321,57 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
>>  
>>  	mutex_lock(&iommu->lock);
>>  
>> -	list_for_each_entry(domain, &iommu->domain_list, next) {
>> -		list_for_each_entry(group, &domain->group_list, next) {
>> -			if (group->iommu_group != iommu_group)
>> -				continue;
>> -
>> -			iommu_detach_group(domain->domain, iommu_group);
>> +	if (iommu->local_domain) {
>> +		domain = iommu->local_domain;
>> +		group = find_iommu_group(domain, iommu_group);
>> +		if (group) {
>>  			list_del(&group->next);
>>  			kfree(group);
>> -			/*
>> -			 * Group ownership provides privilege, if the group
>> -			 * list is empty, the domain goes away.  If it's the
>> -			 * last domain, then all the mappings go away too.
>> -			 */
>> +
>>  			if (list_empty(&domain->group_list)) {
>> -				if (list_is_singular(&iommu->domain_list))
>> +				vfio_local_unpin_all(domain);
>> +				if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
>>  					vfio_iommu_unmap_unpin_all(iommu);
>> -				iommu_domain_free(domain->domain);
>> -				list_del(&domain->next);
>>  				kfree(domain);
>> +				iommu->local_domain = NULL;
>> +			}
> 
> 
> I can't quite wrap my head around this, if we have mdev groups attached
> and this iommu group matches an mdev group, remove from list and free
> the group.  If there are now no more groups in the mdev group list,
> then for each vfio_pfn, unpin the pfn, /without/ doing accounting
> updates 

corrected the code to do accounting here.

> and remove the vfio_pfn, but only if the ref_count is now
> zero.

Yes, if you look at the loop in vfio_local_unpin_all(), it iterates as
long as a node in the rb tree exists

>> +	while ((node = rb_first(&domain->local_addr_space->pfn_list)))
>> +		vfio_unpin_pfn(domain,
>> +				rb_entry(node, struct vfio_pfn, node), false);
>> +


and vfio_unpin_pfn() only remove the node from rb tree if ref count is
zero.

static int vfio_unpin_pfn(struct vfio_domain *domain,
                          struct vfio_pfn *vpfn, bool do_accounting)
{
        __vfio_unpin_page_local(domain, vpfn->pfn, vpfn->prot,
                                do_accounting);

        if (atomic_dec_and_test(&vpfn->ref_count))
                vfio_remove_from_pfn_list(domain, vpfn);

        return 1;
}

so for example for a vfio_pfn ref_count is 2, first iteration would be:
 - call __vfio_unpin_page_local()
 - atomic_dec(ref_count), so now ref_count is 1, but node is not removed
from rb tree.

In next iteration:
 - call __vfio_unpin_page_local()
 - atomic_dec(ref_count), so now ref_count is 0, remove node from rb tree.


>  We free the domain, so if the ref_count was non-zero we've now
> just leaked memory.  I think that means that if a vendor driver pins a
> given page twice, that leak occurs.  Furthermore, if there is not an
> iommu capable domain in the container, we remove all the vfio_dma
> entries as well, ok.  Maybe the only issue is those leaked vfio_pfns.
> 

So if vendor driver pins a page twice, vfio_unpin_pfn() would get called
twice and only when ref count is zero that node is removed from rb tree.
So there is no memory leak.

Kirti
Jike Song Oct. 21, 2016, 7:49 a.m. UTC | #3
On 10/18/2016 05:22 AM, Kirti Wankhede wrote:
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 2ba19424e4a1..5d67058a611d 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
[snip]
>  static int vfio_iommu_type1_attach_group(void *iommu_data,
>  					 struct iommu_group *iommu_group)
>  {
>  	struct vfio_iommu *iommu = iommu_data;
> -	struct vfio_group *group, *g;
> +	struct vfio_group *group;
>  	struct vfio_domain *domain, *d;
>  	struct bus_type *bus = NULL;
>  	int ret;
> @@ -746,10 +1136,14 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	mutex_lock(&iommu->lock);
>  
>  	list_for_each_entry(d, &iommu->domain_list, next) {
> -		list_for_each_entry(g, &d->group_list, next) {
> -			if (g->iommu_group != iommu_group)
> -				continue;
> +		if (find_iommu_group(d, iommu_group)) {
> +			mutex_unlock(&iommu->lock);
> +			return -EINVAL;
> +		}
> +	}
>  
> +	if (iommu->local_domain) {
> +		if (find_iommu_group(iommu->local_domain, iommu_group)) {
>  			mutex_unlock(&iommu->lock);
>  			return -EINVAL;
>  		}
> @@ -769,6 +1163,30 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	if (ret)
>  		goto out_free;
>  
> +	if (IS_ENABLED(CONFIG_VFIO_MDEV) && !iommu_present(bus) &&
> +	    (bus == &mdev_bus_type)) {

Hi Kirti,

By referring to mdev_bus_type directly you are making vfio_iommu_type1.ko depend
on mdev.ko, but Kconfig doesn't guarantee the dependency. For example,
if CONFIG_VFIO_IOMMU_TYPE1=y and CONFIG_VFIO_MDEV=m, the building will fail.


--
Thanks,
Jike
Alex Williamson Oct. 21, 2016, 2:36 p.m. UTC | #4
On Fri, 21 Oct 2016 15:49:07 +0800
Jike Song <jike.song@intel.com> wrote:

> On 10/18/2016 05:22 AM, Kirti Wankhede wrote:
> > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > index 2ba19424e4a1..5d67058a611d 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c  
> [snip]
> >  static int vfio_iommu_type1_attach_group(void *iommu_data,
> >  					 struct iommu_group *iommu_group)
> >  {
> >  	struct vfio_iommu *iommu = iommu_data;
> > -	struct vfio_group *group, *g;
> > +	struct vfio_group *group;
> >  	struct vfio_domain *domain, *d;
> >  	struct bus_type *bus = NULL;
> >  	int ret;
> > @@ -746,10 +1136,14 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
> >  	mutex_lock(&iommu->lock);
> >  
> >  	list_for_each_entry(d, &iommu->domain_list, next) {
> > -		list_for_each_entry(g, &d->group_list, next) {
> > -			if (g->iommu_group != iommu_group)
> > -				continue;
> > +		if (find_iommu_group(d, iommu_group)) {
> > +			mutex_unlock(&iommu->lock);
> > +			return -EINVAL;
> > +		}
> > +	}
> >  
> > +	if (iommu->local_domain) {
> > +		if (find_iommu_group(iommu->local_domain, iommu_group)) {
> >  			mutex_unlock(&iommu->lock);
> >  			return -EINVAL;
> >  		}
> > @@ -769,6 +1163,30 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
> >  	if (ret)
> >  		goto out_free;
> >  
> > +	if (IS_ENABLED(CONFIG_VFIO_MDEV) && !iommu_present(bus) &&
> > +	    (bus == &mdev_bus_type)) {  
> 
> Hi Kirti,
> 
> By referring to mdev_bus_type directly you are making vfio_iommu_type1.ko depend
> on mdev.ko, but Kconfig doesn't guarantee the dependency. For example,
> if CONFIG_VFIO_IOMMU_TYPE1=y and CONFIG_VFIO_MDEV=m, the building will fail.

Good point, Jike.  I don't think we want to make existing vfio modules
dependent on mdev modules.  I wonder if we can lookup the mdev_bus_type
symbol w/o triggering the module load.  Thanks,

Alex
Alex Williamson Oct. 24, 2016, 2:32 a.m. UTC | #5
On Fri, 21 Oct 2016 01:47:25 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> Alex,
> 
> Addressing your comments other than invalidation part.
> 
> On 10/20/2016 2:32 AM, Alex Williamson wrote:
> > On Tue, 18 Oct 2016 02:52:04 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> ...
> >> Tested by assigning below combinations of devices to a single VM:
> >> - GPU pass through only
> >> - vGPU device only
> >> - One GPU pass through and one vGPU device
> >> - Linux VM hot plug and unplug vGPU device while GPU pass through device
> >>   exist
> >> - Linux VM hot plug and unplug GPU pass through device while vGPU device
> >>   exist  
> > 
> > Were you able to do these with the locked memory limit of the user set
> > to the minimum required for existing GPU assignment?
> >   
> 
> No, is there a way to set memory limit through livbirt so that it would
> set memory limit to system memory assigned to VM?

Not that I know of, but I also don't know how you're making use of an
mdev device through libvirt yet since they don't have support for the
vfio-pci sysfsdev option.  I would recommend testing with QEMU manually.

> ...
> >> +	container = group->container;
> >> +	if (IS_ERR(container)) {  
> > 
> > I don't see that we ever use an ERR_PTR to set group->container, it
> > should either be NULL or valid and the fact that we added ourselves to
> > container_users should mean that it's valid.  The paranoia test here
> > would be if container is NULL, but IS_ERR() doesn't check NULL.  If we
> > need that paranoia test, maybe we should just:
> > 
> > if (WARN_ON(!container)) {
> > 
> > I'm not fully convinced it's needed though.
> >   
> 
> Ok removing this check.
> 
> >> +		ret = PTR_ERR(container);
> >> +		goto err_pin_pages;
> >> +	}
> >> +
> >> +	down_read(&container->group_lock);
> >> +
> >> +	driver = container->iommu_driver;
> >> +	if (likely(driver && driver->ops->pin_pages))
> >> +		ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
> >> +					     npage, prot, phys_pfn);  
> > 
> > The caller is going to need to provide some means for us to callback to
> > invalidate pinned pages.
> > 
> > ret has already been used, so it's zero at this point.  I expect the
> > original intention was to let the initialization above fall through
> > here so that the caller gets an errno if the driver doesn't support
> > pin_pages.  Returning zero without actually doing anything seems like
> > an unexpected return value.
> >   
> 
> yes, changing it to:
> 
> driver = container->iommu_driver;
> if (likely(driver && driver->ops->pin_pages))
>         ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
>                                      npage, prot, phys_pfn);
> else
>         ret = -EINVAL;
> 
> 
> 
> 
> >> +static int vfio_pfn_account(struct vfio_iommu *iommu, unsigned long pfn)
> >> +{
> >> +	struct vfio_pfn *p;
> >> +	struct vfio_domain *domain = iommu->local_domain;
> >> +	int ret = 1;
> >> +
> >> +	if (!domain)
> >> +		return 1;
> >> +
> >> +	mutex_lock(&domain->local_addr_space->pfn_list_lock);
> >> +
> >> +	p = vfio_find_pfn(domain, pfn);
> >> +	if (p)
> >> +		ret = 0;
> >> +
> >> +	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> >> +	return ret;
> >> +}  
> > 
> > So if the vfio_pfn for a given pfn exists, return 0, else return 1.
> > But do we know that the vfio_pfn exists at the point where we actually
> > do that accounting?
> >  
> 
> Only below functions call vfio_pfn_account()
> __vfio_pin_pages_remote() -> vfio_pfn_account()
> __vfio_unpin_pages_remote() -> vfio_pfn_account()
> 
> Consider the case when mdev device is already assigned to VM, run some
> app in VM that pins some pages, then hotplug pass through device.
> Then __vfio_pin_pages_remote() is called when iommu capable domain is
> attached to container to pin all pages from vfio_iommu_replay(). So if
> at this time vfio_pfn exist means that the page is pinned through
> local_domain when iommu capable domain was not present, so accounting
> was already done for that pages. Hence returned 0 here which mean don't
> add this page in accounting.

Right, I see that's the intention, I can't pick any holes in the
concept, but I'll continue to try to look for bugs.

> >> +
> >>  struct vwork {
> >>  	struct mm_struct	*mm;
> >>  	long			npage;
> >> @@ -150,17 +269,17 @@ static void vfio_lock_acct_bg(struct work_struct *work)
> >>  	kfree(vwork);
> >>  }
> >>  
> >> -static void vfio_lock_acct(long npage)
> >> +static void vfio_lock_acct(struct task_struct *task, long npage)
> >>  {
> >>  	struct vwork *vwork;
> >>  	struct mm_struct *mm;
> >>  
> >> -	if (!current->mm || !npage)
> >> +	if (!task->mm || !npage)
> >>  		return; /* process exited or nothing to do */
> >>  
> >> -	if (down_write_trylock(&current->mm->mmap_sem)) {
> >> -		current->mm->locked_vm += npage;
> >> -		up_write(&current->mm->mmap_sem);
> >> +	if (down_write_trylock(&task->mm->mmap_sem)) {
> >> +		task->mm->locked_vm += npage;
> >> +		up_write(&task->mm->mmap_sem);
> >>  		return;
> >>  	}
> >>  
> >> @@ -172,7 +291,7 @@ static void vfio_lock_acct(long npage)
> >>  	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
> >>  	if (!vwork)
> >>  		return;
> >> -	mm = get_task_mm(current);
> >> +	mm = get_task_mm(task);
> >>  	if (!mm) {
> >>  		kfree(vwork);
> >>  		return;
> >> @@ -228,20 +347,31 @@ static int put_pfn(unsigned long pfn, int prot)
> >>  	return 0;
> >>  }  
> > 
> > This coversion of vfio_lock_acct() to pass a task_struct and updating
> > existing callers to pass current would be a great separate, easily
> > review-able patch.
> >  
> 
> Ok. I'll split this in separate commit.
> 
> 
> >>  
> >> -static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
> >> +static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
> >> +			 int prot, unsigned long *pfn)
> >>  {
> >>  	struct page *page[1];
> >>  	struct vm_area_struct *vma;
> >> +	struct mm_struct *local_mm = (mm ? mm : current->mm);
> >>  	int ret = -EFAULT;
> >>  
> >> -	if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
> >> +	if (mm) {
> >> +		down_read(&local_mm->mmap_sem);
> >> +		ret = get_user_pages_remote(NULL, local_mm, vaddr, 1,
> >> +					!!(prot & IOMMU_WRITE), 0, page, NULL);
> >> +		up_read(&local_mm->mmap_sem);
> >> +	} else
> >> +		ret = get_user_pages_fast(vaddr, 1,
> >> +					  !!(prot & IOMMU_WRITE), page);
> >> +
> >> +	if (ret == 1) {
> >>  		*pfn = page_to_pfn(page[0]);
> >>  		return 0;
> >>  	}
> >>  
> >> -	down_read(&current->mm->mmap_sem);
> >> +	down_read(&local_mm->mmap_sem);
> >>  
> >> -	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
> >> +	vma = find_vma_intersection(local_mm, vaddr, vaddr + 1);
> >>  
> >>  	if (vma && vma->vm_flags & VM_PFNMAP) {
> >>  		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
> >> @@ -249,7 +379,7 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
> >>  			ret = 0;
> >>  	}
> >>  
> >> -	up_read(&current->mm->mmap_sem);
> >> +	up_read(&local_mm->mmap_sem);
> >>  
> >>  	return ret;
> >>  }  
> > 
> > This would also be a great separate patch.  
> 
> Ok.
> 
> >  Have you considered
> > renaming the mm_struct function arg to "remote_mm" and making the local
> > variable simply "mm"?  It seems like it would tie nicely with the
> > remote_mm path using get_user_pages_remote() while passing NULL for
> > remote_mm uses current->mm and the existing path (and avoid the general
> > oddness of passing local_mm to a "remote" function).
> >   
> 
> Yes, your suggestion looks good. Updating.
> 
> 
> >> @@ -259,33 +389,37 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
> >>   * the iommu can only map chunks of consecutive pfns anyway, so get the
> >>   * first page and all consecutive pages with the same locking.
> >>   */
> >> -static long vfio_pin_pages(unsigned long vaddr, long npage,
> >> -			   int prot, unsigned long *pfn_base)
> >> +static long __vfio_pin_pages_remote(struct vfio_iommu *iommu,
> >> +				    unsigned long vaddr, long npage,
> >> +				    int prot, unsigned long *pfn_base)  
> 
> ...
> 
> 
> >> @@ -303,8 +437,10 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
> >>  			break;
> >>  		}
> >>  
> >> +		lock_acct += vfio_pfn_account(iommu, pfn);
> >> +  
> > 
> > I take it that this is the new technique for keeping the accounting
> > accurate, we only increment the locked accounting by the amount not
> > already pinned in a vfio_pfn.
> >  
> 
> That's correct.
> 
> 
> >>  		if (!rsvd && !lock_cap &&
> >> -		    current->mm->locked_vm + i + 1 > limit) {
> >> +		    current->mm->locked_vm + lock_acct > limit) {
> >>  			put_pfn(pfn, prot);
> >>  			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> >>  				__func__, limit << PAGE_SHIFT);
> >> @@ -313,23 +449,216 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
> >>  	}
> >>  
> >>  	if (!rsvd)
> >> -		vfio_lock_acct(i);
> >> +		vfio_lock_acct(current, lock_acct);
> >>  
> >>  	return i;
> >>  }
> >>  
> >> -static long vfio_unpin_pages(unsigned long pfn, long npage,
> >> -			     int prot, bool do_accounting)
> >> +static long __vfio_unpin_pages_remote(struct vfio_iommu *iommu,
> >> +				      unsigned long pfn, long npage, int prot,
> >> +				      bool do_accounting)  
> > 
> > Have you noticed that it's kind of confusing that
> > __vfio_{un}pin_pages_remote() uses current, which does a
> > get_user_pages_fast() while "local" uses a provided task_struct and
> > uses get_user_pages_*remote*()?  And also what was effectively local
> > (ie. we're pinning for our own use here) is now "remote" and pinning
> > for a remote, vendor driver consumer, is now "local".  It's not very
> > intuitive.
> >   
> 
> 'local' in local_domain was suggested to describe the domain for local
> page tracking. Earlier suggestions to have 'mdev' or 'noimmu' in this
> name were discarded. May be we should revisit what the name should be.
> Any suggestion?
> 
> For local_domain, to pin pages, flow is:
> 
> for local_domain
>     |- vfio_pin_pages()
>         |- vfio_iommu_type1_pin_pages()
>             |- __vfio_pin_page_local()
>                 |-  vaddr_get_pfn(task->mm)
>                     |- get_user_pages_remote()
> 
> __vfio_pin_page_local() --> get_user_pages_remote()


In vfio.c we have the concept of an external user, perhaps that could
be continued here.  An mdev driver would be an external, or remote
pinning.

> >>  static int vfio_iommu_type1_attach_group(void *iommu_data,
> >>  					 struct iommu_group *iommu_group)
> >>  {
> >>  	struct vfio_iommu *iommu = iommu_data;
> >> -	struct vfio_group *group, *g;
> >> +	struct vfio_group *group;
> >>  	struct vfio_domain *domain, *d;
> >>  	struct bus_type *bus = NULL;
> >>  	int ret;
> >> @@ -746,10 +1136,14 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
> >>  	mutex_lock(&iommu->lock);
> >>  
> >>  	list_for_each_entry(d, &iommu->domain_list, next) {
> >> -		list_for_each_entry(g, &d->group_list, next) {
> >> -			if (g->iommu_group != iommu_group)
> >> -				continue;
> >> +		if (find_iommu_group(d, iommu_group)) {
> >> +			mutex_unlock(&iommu->lock);
> >> +			return -EINVAL;
> >> +		}
> >> +	}  
> > 
> > The find_iommu_group() conversion would also be an easy separate patch.
> >   
> 
> Ok.
> 
> >>  
> >> +	if (iommu->local_domain) {
> >> +		if (find_iommu_group(iommu->local_domain, iommu_group)) {
> >>  			mutex_unlock(&iommu->lock);
> >>  			return -EINVAL;
> >>  		}
> >> @@ -769,6 +1163,30 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
> >>  	if (ret)
> >>  		goto out_free;
> >>  
> >> +	if (IS_ENABLED(CONFIG_VFIO_MDEV) && !iommu_present(bus) &&
> >> +	    (bus == &mdev_bus_type)) {
> >> +		if (!iommu->local_domain) {
> >> +			domain->local_addr_space =
> >> +				kzalloc(sizeof(*domain->local_addr_space),
> >> +						GFP_KERNEL);
> >> +			if (!domain->local_addr_space) {
> >> +				ret = -ENOMEM;
> >> +				goto out_free;
> >> +			}
> >> +
> >> +			domain->local_addr_space->task = current;
> >> +			INIT_LIST_HEAD(&domain->group_list);
> >> +			domain->local_addr_space->pfn_list = RB_ROOT;
> >> +			mutex_init(&domain->local_addr_space->pfn_list_lock);
> >> +			iommu->local_domain = domain;
> >> +		} else
> >> +			kfree(domain);
> >> +
> >> +		list_add(&group->next, &domain->group_list);  
> > 
> > I think you mean s/domain/iommu->local_domain/ here, we just freed
> > domain in the else path.
> >   
> 
> Yes, corrected.
> 
> >> +		mutex_unlock(&iommu->lock);
> >> +		return 0;
> >> +	}
> >> +
> >>  	domain->domain = iommu_domain_alloc(bus);
> >>  	if (!domain->domain) {
> >>  		ret = -EIO;
> >> @@ -859,6 +1277,41 @@ static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
> >>  		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
> >>  }
> >>  
> >> +static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
> >> +{
> >> +	struct vfio_domain *domain = iommu->local_domain;
> >> +	struct vfio_dma *dma, *tdma;
> >> +	struct rb_node *n;
> >> +	long locked = 0;
> >> +
> >> +	rbtree_postorder_for_each_entry_safe(dma, tdma, &iommu->dma_list,
> >> +					     node) {
> >> +		vfio_unmap_unpin(iommu, dma);
> >> +	}
> >> +
> >> +	mutex_lock(&domain->local_addr_space->pfn_list_lock);
> >> +
> >> +	n = rb_first(&domain->local_addr_space->pfn_list);
> >> +
> >> +	for (; n; n = rb_next(n))
> >> +		locked++;
> >> +
> >> +	vfio_lock_acct(domain->local_addr_space->task, locked);
> >> +	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> >> +}  
> > 
> > Couldn't a properly timed mlock by the user allow them to lock more
> > memory than they're allowed here?  For instance imagine the vendor
> > driver has pinned the entire VM memory and the user has exactly the
> > locked memory limit for that VM.  During the gap here between unpinning
> > the entire vfio_dma list and re-accounting for the pfn_list, the user
> > can mlock up to their limit again an now they've doubled the locked
> > memory they're allowed.
> >   
> 
> As per original code, vfio_unmap_unpin() calls
> __vfio_unpin_pages_remote(.., false) with do_accounting set to false,
> why is that so?

Because vfio_dma tracks the user granularity of calling MAP_DMA, not
the granularity with which the iommu mapping was actually done.  There
might be multiple non-contiguous chunks to make that mapping and we
don't know how the iommu chose to map a given chunk to support large
page sizes.  If we chose to do accounting on the iommu_unmap()
granularity, we might account for every 4k page separately.  We choose
not to do accounting there so that we can batch the accounting into one
update per range.

> Here if accounting is set to true then we don't have to do re-accounting
> here.

If vfio_unmap_unpin() did not do accounting, you could update
accounting once with the difference between what was pinned and what
remains pinned via the mdev and avoid the gap caused by de-accounting
everything and then re-accounting only for the mdev pinnings.

> >> +
> >> +static void vfio_local_unpin_all(struct vfio_domain *domain)
> >> +{
> >> +	struct rb_node *node;
> >> +
> >> +	mutex_lock(&domain->local_addr_space->pfn_list_lock);
> >> +	while ((node = rb_first(&domain->local_addr_space->pfn_list)))
> >> +		vfio_unpin_pfn(domain,
> >> +				rb_entry(node, struct vfio_pfn, node), false);
> >> +
> >> +	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> >> +}
> >> +
> >>  static void vfio_iommu_type1_detach_group(void *iommu_data,
> >>  					  struct iommu_group *iommu_group)
> >>  {
> >> @@ -868,31 +1321,57 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
> >>  
> >>  	mutex_lock(&iommu->lock);
> >>  
> >> -	list_for_each_entry(domain, &iommu->domain_list, next) {
> >> -		list_for_each_entry(group, &domain->group_list, next) {
> >> -			if (group->iommu_group != iommu_group)
> >> -				continue;
> >> -
> >> -			iommu_detach_group(domain->domain, iommu_group);
> >> +	if (iommu->local_domain) {
> >> +		domain = iommu->local_domain;
> >> +		group = find_iommu_group(domain, iommu_group);
> >> +		if (group) {
> >>  			list_del(&group->next);
> >>  			kfree(group);
> >> -			/*
> >> -			 * Group ownership provides privilege, if the group
> >> -			 * list is empty, the domain goes away.  If it's the
> >> -			 * last domain, then all the mappings go away too.
> >> -			 */
> >> +
> >>  			if (list_empty(&domain->group_list)) {
> >> -				if (list_is_singular(&iommu->domain_list))
> >> +				vfio_local_unpin_all(domain);
> >> +				if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
> >>  					vfio_iommu_unmap_unpin_all(iommu);
> >> -				iommu_domain_free(domain->domain);
> >> -				list_del(&domain->next);
> >>  				kfree(domain);
> >> +				iommu->local_domain = NULL;
> >> +			}  
> > 
> > 
> > I can't quite wrap my head around this, if we have mdev groups attached
> > and this iommu group matches an mdev group, remove from list and free
> > the group.  If there are now no more groups in the mdev group list,
> > then for each vfio_pfn, unpin the pfn, /without/ doing accounting
> > updates   
> 
> corrected the code to do accounting here.
> 
> > and remove the vfio_pfn, but only if the ref_count is now
> > zero.  
> 
> Yes, if you see the loop vfio_local_unpin_all(), it iterates until the
> node in the rb tree exists
> 
> >> +	while ((node = rb_first(&domain->local_addr_space->pfn_list)))
> >> +		vfio_unpin_pfn(domain,
> >> +				rb_entry(node, struct vfio_pfn, node), false);
> >> +  
> 
> 
> and vfio_unpin_pfn() only remove the node from rb tree if ref count is
> zero.
> 
> static int vfio_unpin_pfn(struct vfio_domain *domain,
>                           struct vfio_pfn *vpfn, bool do_accounting)
> {
>         __vfio_unpin_page_local(domain, vpfn->pfn, vpfn->prot,
>                                 do_accounting);
> 
>         if (atomic_dec_and_test(&vpfn->ref_count))
>                 vfio_remove_from_pfn_list(domain, vpfn);
> 
>         return 1;
> }
> 
> so for example for a vfio_pfn ref_count is 2, first iteration would be:
>  - call __vfio_unpin_page_local()
>  - atomic_dec(ref_count), so now ref_count is 1, but node is not removed
> from rb tree.
> 
> In next iteration:
>  - call __vfio_unpin_page_local()
>  - atomic_dec(ref_count), so now ref_count is 0, remove node from rb tree.

Ok, I missed that, thanks.

> >  We free the domain, so if the ref_count was non-zero we've now
> > just leaked memory.  I think that means that if a vendor driver pins a
> > given page twice, that leak occurs.  Furthermore, if there is not an
> > iommu capable domain in the container, we remove all the vfio_dma
> > entries as well, ok.  Maybe the only issue is those leaked vfio_pfns.
> >   
> 
> So if vendor driver pins a page twice, vfio_unpin_pfn() would get called
> twice and only when ref count is zero that node is removed from rb tree.
> So there is no memory leak.

Ok
Kirti Wankhede Oct. 24, 2016, 10:35 a.m. UTC | #6
On 10/21/2016 8:06 PM, Alex Williamson wrote:
> On Fri, 21 Oct 2016 15:49:07 +0800
> Jike Song <jike.song@intel.com> wrote:
> 
>> On 10/18/2016 05:22 AM, Kirti Wankhede wrote:
>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>>> index 2ba19424e4a1..5d67058a611d 100644
>>> --- a/drivers/vfio/vfio_iommu_type1.c
>>> +++ b/drivers/vfio/vfio_iommu_type1.c  
>> [snip]
>>>  static int vfio_iommu_type1_attach_group(void *iommu_data,
>>>  					 struct iommu_group *iommu_group)
>>>  {
>>>  	struct vfio_iommu *iommu = iommu_data;
>>> -	struct vfio_group *group, *g;
>>> +	struct vfio_group *group;
>>>  	struct vfio_domain *domain, *d;
>>>  	struct bus_type *bus = NULL;
>>>  	int ret;
>>> @@ -746,10 +1136,14 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>>>  	mutex_lock(&iommu->lock);
>>>  
>>>  	list_for_each_entry(d, &iommu->domain_list, next) {
>>> -		list_for_each_entry(g, &d->group_list, next) {
>>> -			if (g->iommu_group != iommu_group)
>>> -				continue;
>>> +		if (find_iommu_group(d, iommu_group)) {
>>> +			mutex_unlock(&iommu->lock);
>>> +			return -EINVAL;
>>> +		}
>>> +	}
>>>  
>>> +	if (iommu->local_domain) {
>>> +		if (find_iommu_group(iommu->local_domain, iommu_group)) {
>>>  			mutex_unlock(&iommu->lock);
>>>  			return -EINVAL;
>>>  		}
>>> @@ -769,6 +1163,30 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>>>  	if (ret)
>>>  		goto out_free;
>>>  
>>> +	if (IS_ENABLED(CONFIG_VFIO_MDEV) && !iommu_present(bus) &&
>>> +	    (bus == &mdev_bus_type)) {  
>>
>> Hi Kirti,
>>
>> By referring to mdev_bus_type directly you are making vfio_iommu_type1.ko depend
>> on mdev.ko, but the Kconfig doesn't guarantee the dependency. For example,
>> if CONFIG_VFIO_IOMMU_TYPE1=y and CONFIG_VFIO_MDEV=m, the building will fail.
> 
> Good point, Jike.  I don't think we want to make existing vfio modules
> dependent on mdev modules.  I wonder if we can lookup the mdev_bus_type
> symbol w/o triggering the module load.  Thanks,
> 

Ok. Modifying the check as below works in above case:

        mdev_bus = symbol_get(mdev_bus_type);

        if (mdev_bus && (bus == mdev_bus) && !iommu_present(bus) ) {
                symbol_put(mdev_bus_type);
                ...
        }

Kirti
Tian, Kevin Oct. 26, 2016, 7:19 a.m. UTC | #7
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Monday, October 24, 2016 10:32 AM
> 
> > >> -static long vfio_unpin_pages(unsigned long pfn, long npage,
> > >> -			     int prot, bool do_accounting)
> > >> +static long __vfio_unpin_pages_remote(struct vfio_iommu *iommu,
> > >> +				      unsigned long pfn, long npage, int prot,
> > >> +				      bool do_accounting)
> > >
> > > Have you noticed that it's kind of confusing that
> > > __vfio_{un}pin_pages_remote() uses current, which does a
> > > get_user_pages_fast() while "local" uses a provided task_struct and
> > > uses get_user_pages_*remote*()?  And also what was effectively local
> > > (ie. we're pinning for our own use here) is now "remote" and pinning
> > > for a remote, vendor driver consumer, is now "local".  It's not very
> > > intuitive.
> > >

I questioned this confusing naming in v8 too...

> >
> > 'local' in local_domain was suggested to describe the domain for local
> > page tracking. Earlier suggestions to have 'mdev' or 'noimmu' in this
> > name were discarded. May be we should revisit what the name should be.
> > Any suggestion?
> >
> > For local_domain, to pin pages, flow is:
> >
> > for local_domain
> >     |- vfio_pin_pages()
> >         |- vfio_iommu_type1_pin_pages()
> >             |- __vfio_pin_page_local()
> >                 |-  vaddr_get_pfn(task->mm)
> >                     |- get_user_pages_remote()
> >
> > __vfio_pin_page_local() --> get_user_pages_remote()
> 
> 
> In vfio.c we have the concept of an external user, perhaps that could
> be continued here.  An mdev driver would be an external, or remote
> pinning.
> 

I prefer to use remote here. It's aligned with underlying mm operations

Thanks
Kevin
Tian, Kevin Oct. 26, 2016, 7:53 a.m. UTC | #8
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Thursday, October 20, 2016 5:03 AM
> > @@ -83,6 +92,21 @@ struct vfio_group {
> >  };
> >
> >  /*
> > + * Guest RAM pinning working set or DMA target
> > + */
> > +struct vfio_pfn {
> > +	struct rb_node		node;
> > +	unsigned long		vaddr;		/* virtual addr */
> > +	dma_addr_t		iova;		/* IOVA */
> > +	unsigned long		pfn;		/* Host pfn */
> > +	int			prot;
> > +	atomic_t		ref_count;
> > +};
> 
> Somehow we're going to need to fit an invalidation callback here too.
> How would we handle a case where there are multiple mdev devices, from
> different vendor drivers, that all have the same pfn pinned?  I'm
> already concerned about the per pfn overhead we're introducing here so
> clearly we cannot store an invalidation callback per pinned page, per
> vendor driver.  Perhaps invalidations should be done using a notifier
> chain per vfio_iommu, the vendor drivers are required to register on
> that chain (fail pinning with empty notifier list) user unmapping
> will be broadcast to the notifier chain, the vendor driver will be
> responsible for deciding if each unmap is relevant to them (potentially
> it's for a pinning from another driver).
> 
> I expect we also need to enforce that vendors perform a synchronous
> unmap such that after returning from the notifier list call, the
> vfio_pfn should no longer exist.  If it does we might need to BUG_ON.
> Also be careful to pay attention to the locking of the notifier vs
> unpin callbacks to avoid deadlocks.
> 

What about just requesting vendor driver to provide a callback in parent 
device ops? 

Curious in which scenario the user application (say Qemu here) may 
unmap memory pages which are still pinned by vendor driver... Is it 
purely about a corner case which we want to handle elegantly? 

If yes, possibly a simpler way is to force destroying mdev instead of 
asking vendor driver to take care of each invalidation request under
> such situation. Since anyway the mdev device won't be in a usable
state anymore... (sorry if I missed the key problem here.)

Thanks
Kevin
Tian, Kevin Oct. 26, 2016, 7:54 a.m. UTC | #9
> From: Tian, Kevin
> Sent: Wednesday, October 26, 2016 3:54 PM
> 
> > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > Sent: Thursday, October 20, 2016 5:03 AM
> > > @@ -83,6 +92,21 @@ struct vfio_group {
> > >  };
> > >
> > >  /*
> > > + * Guest RAM pinning working set or DMA target
> > > + */
> > > +struct vfio_pfn {
> > > +	struct rb_node		node;
> > > +	unsigned long		vaddr;		/* virtual addr */
> > > +	dma_addr_t		iova;		/* IOVA */
> > > +	unsigned long		pfn;		/* Host pfn */
> > > +	int			prot;
> > > +	atomic_t		ref_count;
> > > +};
> >
> > Somehow we're going to need to fit an invalidation callback here too.
> > How would we handle a case where there are multiple mdev devices, from
> > different vendor drivers, that all have the same pfn pinned?  I'm
> > already concerned about the per pfn overhead we're introducing here so
> > clearly we cannot store an invalidation callback per pinned page, per
> > vendor driver.  Perhaps invalidations should be done using a notifier
> > chain per vfio_iommu, the vendor drivers are required to register on
> > that chain (fail pinning with empty notifier list) user unmapping
> > will be broadcast to the notifier chain, the vendor driver will be
> > responsible for deciding if each unmap is relevant to them (potentially
> > it's for a pinning from another driver).
> >
> > I expect we also need to enforce that vendors perform a synchronous
> > unmap such that after returning from the notifier list call, the
> > vfio_pfn should no longer exist.  If it does we might need to BUG_ON.
> > Also be careful to pay attention to the locking of the notifier vs
> > unpin callbacks to avoid deadlocks.
> >
> 
> What about just requesting vendor driver to provide a callback in parent
> device ops?
> 
> Curious in which scenario the user application (say Qemu here) may
> unmap memory pages which are still pinned by vendor driver... Is it
> purely about a corner case which we want to handle elegantly?
> 
> If yes, possibly a simpler way is to force destroying mdev instead of
> asking vendor driver to take care of each invalidation request under
> > such situation. Since anyway the mdev device won't be in a usable
> state anymore... (sorry if I missed the key problem here.)
> 

or calling reset callback of parent device driver, if we don't want to
break libvirt's expectation by blindly removing mdev device...
Kirti Wankhede Oct. 26, 2016, 3:06 p.m. UTC | #10
On 10/26/2016 12:49 PM, Tian, Kevin wrote:
>> From: Alex Williamson [mailto:alex.williamson@redhat.com]
>> Sent: Monday, October 24, 2016 10:32 AM
>>
>>>>> -static long vfio_unpin_pages(unsigned long pfn, long npage,
>>>>> -			     int prot, bool do_accounting)
>>>>> +static long __vfio_unpin_pages_remote(struct vfio_iommu *iommu,
>>>>> +				      unsigned long pfn, long npage, int prot,
>>>>> +				      bool do_accounting)
>>>>
>>>> Have you noticed that it's kind of confusing that
>>>> __vfio_{un}pin_pages_remote() uses current, which does a
>>>> get_user_pages_fast() while "local" uses a provided task_struct and
>>>> uses get_user_pages_*remote*()?  And also what was effectively local
>>>> (ie. we're pinning for our own use here) is now "remote" and pinning
>>>> for a remote, vendor driver consumer, is now "local".  It's not very
>>>> intuitive.
>>>>
> 
> I questioned this confusing naming in v8 too...
> 

I did try to address your concerns on v8.

>>>
>>> 'local' in local_domain was suggested to describe the domain for local
>>> page tracking. Earlier suggestions to have 'mdev' or 'noimmu' in this
>>> name were discarded. May be we should revisit what the name should be.
>>> Any suggestion?
>>>
>>> For local_domain, to pin pages, flow is:
>>>
>>> for local_domain
>>>     |- vfio_pin_pages()
>>>         |- vfio_iommu_type1_pin_pages()
>>>             |- __vfio_pin_page_local()
>>>                 |-  vaddr_get_pfn(task->mm)
>>>                     |- get_user_pages_remote()
>>>
>>> __vfio_pin_page_local() --> get_user_pages_remote()
>>
>>
>> In vfio.c we have the concept of an external user, perhaps that could
>> be continued here.  An mdev driver would be an external, or remote
>> pinning.
>>
> 
> I prefer to use remote here. It's aligned with underlying mm operations
> 

Using 'remote' in this case is also confusing since it is already used
in this file. I liked Alex's suggestion to use external and I'll have
those changed in next version of patch set.

Kirti
Alex Williamson Oct. 26, 2016, 3:16 p.m. UTC | #11
On Wed, 26 Oct 2016 07:53:43 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:

> > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > Sent: Thursday, October 20, 2016 5:03 AM  
> > > @@ -83,6 +92,21 @@ struct vfio_group {
> > >  };
> > >
> > >  /*
> > > + * Guest RAM pinning working set or DMA target
> > > + */
> > > +struct vfio_pfn {
> > > +	struct rb_node		node;
> > > +	unsigned long		vaddr;		/* virtual addr */
> > > +	dma_addr_t		iova;		/* IOVA */
> > > +	unsigned long		pfn;		/* Host pfn */
> > > +	int			prot;
> > > +	atomic_t		ref_count;
> > > +};  
> > 
> > Somehow we're going to need to fit an invalidation callback here too.
> > How would we handle a case where there are multiple mdev devices, from
> > different vendor drivers, that all have the same pfn pinned?  I'm
> > already concerned about the per pfn overhead we're introducing here so
> > clearly we cannot store an invalidation callback per pinned page, per
> > vendor driver.  Perhaps invalidations should be done using a notifier
> > chain per vfio_iommu, the vendor drivers are required to register on
> > that chain (fail pinning with empty notifier list) user unmapping
> > will be broadcast to the notifier chain, the vendor driver will be
> > responsible for deciding if each unmap is relevant to them (potentially
> > it's for a pinning from another driver).
> > 
> > I expect we also need to enforce that vendors perform a synchronous
> > unmap such that after returning from the notifier list call, the
> > vfio_pfn should no longer exist.  If it does we might need to BUG_ON.
> > Also be careful to pay attention to the locking of the notifier vs
> > unpin callbacks to avoid deadlocks.
> >   
> 
> What about just requesting vendor driver to provide a callback in parent 
> device ops?

How does the iommu driver get to the mdev vendor driver callback?  We
can also have pages pinned by multiple vendor drivers, I don't think
we want the additional overhead of a per page list of invalidation
callbacks.
 
> Curious in which scenario the user application (say Qemu here) may 
> unmap memory pages which are still pinned by vendor driver... Is it 
> purely about a corner case which we want to handle elegantly? 

The vfio type1 iommu API provides a MAP and UNMAP interface.  The unmap
call is expected to work regardless of how it might inhibit the device
from working.  This is currently true of iommu protected devices today,
a user can unmap pages which might be DMA targets for the device and
the iommu prevents further access to those pages, possibly at the
expense of device operation.  We cannot support an interface where a
user can unmap a set of pages and map in new pages to replace them when
the vendor driver might be caching stale mappings.

In normal VM operation perhaps this is a corner case, but the API is
not defined only for the normal and expected behavior of a VM.
 
> If yes, possibly a simpler way is to force destroying mdev instead of 
> asking vendor driver to take care of each invalidation request under
> such situation. Since anyway the mdev device won't be in an usable
> state anymore... (sorry if I missed the key problem here.)

That's a pretty harsh response for an operation which is completely
valid from an API perspective.  What if the VM does an unmap of all
memory around reset?  We cannot guarantee that the guest driver will
have a chance to do cleanup, the guest may have crashed or a
system_reset invoked.  Would you have the mdev destroyed in this case?
How could QEMU, which has no device specific driver to know that vendor
pinnings are present, recover from this?  Thanks,

Alex
Alex Williamson Oct. 26, 2016, 3:19 p.m. UTC | #12
On Wed, 26 Oct 2016 07:54:56 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:

> > From: Tian, Kevin
> > Sent: Wednesday, October 26, 2016 3:54 PM
> >   
> > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > Sent: Thursday, October 20, 2016 5:03 AM  
> > > > @@ -83,6 +92,21 @@ struct vfio_group {
> > > >  };
> > > >
> > > >  /*
> > > > + * Guest RAM pinning working set or DMA target
> > > > + */
> > > > +struct vfio_pfn {
> > > > +	struct rb_node		node;
> > > > +	unsigned long		vaddr;		/* virtual addr */
> > > > +	dma_addr_t		iova;		/* IOVA */
> > > > +	unsigned long		pfn;		/* Host pfn */
> > > > +	int			prot;
> > > > +	atomic_t		ref_count;
> > > > +};  
> > >
> > > Somehow we're going to need to fit an invalidation callback here too.
> > > How would we handle a case where there are multiple mdev devices, from
> > > different vendor drivers, that all have the same pfn pinned?  I'm
> > > already concerned about the per pfn overhead we're introducing here so
> > > clearly we cannot store an invalidation callback per pinned page, per
> > > vendor driver.  Perhaps invalidations should be done using a notifier
> > > chain per vfio_iommu, the vendor drivers are required to register on
> > > that chain (fail pinning with empty notifier list) user unmapping
> > > will be broadcast to the notifier chain, the vendor driver will be
> > > responsible for deciding if each unmap is relevant to them (potentially
> > > it's for a pinning from another driver).
> > >
> > > I expect we also need to enforce that vendors perform a synchronous
> > > unmap such that after returning from the notifier list call, the
> > > vfio_pfn should no longer exist.  If it does we might need to BUG_ON.
> > > Also be careful to pay attention to the locking of the notifier vs
> > > unpin callbacks to avoid deadlocks.
> > >  
> > 
> > What about just requesting vendor driver to provide a callback in parent
> > device ops?
> > 
> > Curious in which scenario the user application (say Qemu here) may
> > unmap memory pages which are still pinned by vendor driver... Is it
> > purely about a corner case which we want to handle elegantly?
> > 
> > If yes, possibly a simpler way is to force destroying mdev instead of
> > asking vendor driver to take care of each invalidation request under
> > such situation. Since anyway the mdev device won't be in a usable
> > state anymore... (sorry if I missed the key problem here.)
> >   
> 
> or calling reset callback of parent device driver, if we don't want to
> break libvirt's expectation by blindly removing mdev device...

I think we're going off into the weeds here.  mdev devices need to
honor the existing API, therefore an unmap should result in preventing
the device from further access to the unmapped pages, nothing more,
nothing less.  Thanks,

Alex
Alexey Kardashevskiy Oct. 27, 2016, 7:20 a.m. UTC | #13
On 18/10/16 08:22, Kirti Wankhede wrote:
> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
> Mediated device only uses IOMMU APIs, the underlying hardware can be
> managed by an IOMMU domain.
> 
> Aim of this change is:
> - To use most of the code of TYPE1 IOMMU driver for mediated devices
> - To support direct assigned device and mediated device in single module
> 
> Added two new callback functions to struct vfio_iommu_driver_ops. Backend
> IOMMU module that supports pinning and unpinning pages for mdev devices
> should provide these functions.
> Added APIs for pinning and unpinning pages to VFIO module. These calls back
> into backend iommu module to actually pin and unpin pages.
> 
> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
> backend module. More details:
> - When iommu_group of mediated devices is attached, task structure is
>   cached which is used later to pin pages and page accounting.


For SPAPR TCE IOMMU driver, I ended up caching mm_struct with
atomic_inc(&container->mm->mm_count) (patches are on the way) instead of
using @current or task as the process might be gone while VFIO container is
still alive and @mm might be needed to do proper cleanup; this might not be
an issue with this patchset now but still you seem to only use @mm from
task_struct.



> - It keeps track of pinned pages for mediated domain. This data is used to
>   verify unpinning request and to unpin remaining pages while detaching, if
>   there are any.
> - Used existing mechanism for page accounting. If iommu capable domain
>   exist in the container then all pages are already pinned and accounted.
>   Accouting for mdev device is only done if there is no iommu capable
>   domain in the container.
> - Page accouting is updated on hot plug and unplug mdev device and pass
>   through device.
> 
> Tested by assigning below combinations of devices to a single VM:
> - GPU pass through only
> - vGPU device only
> - One GPU pass through and one vGPU device
> - Linux VM hot plug and unplug vGPU device while GPU pass through device
>   exist
> - Linux VM hot plug and unplug GPU pass through device while vGPU device
>   exist
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Signed-off-by: Neo Jia <cjia@nvidia.com>
> Change-Id: I295d6f0f2e0579b8d9882bfd8fd5a4194b97bd9a
Kirti Wankhede Oct. 27, 2016, 12:31 p.m. UTC | #14
On 10/27/2016 12:50 PM, Alexey Kardashevskiy wrote:
> On 18/10/16 08:22, Kirti Wankhede wrote:
>> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
>> Mediated device only uses IOMMU APIs, the underlying hardware can be
>> managed by an IOMMU domain.
>>
>> Aim of this change is:
>> - To use most of the code of TYPE1 IOMMU driver for mediated devices
>> - To support direct assigned device and mediated device in single module
>>
>> Added two new callback functions to struct vfio_iommu_driver_ops. Backend
>> IOMMU module that supports pining and unpinning pages for mdev devices
>> should provide these functions.
>> Added APIs for pining and unpining pages to VFIO module. These calls back
>> into backend iommu module to actually pin and unpin pages.
>>
>> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
>> backend module. More details:
>> - When iommu_group of mediated devices is attached, task structure is
>>   cached which is used later to pin pages and page accounting.
> 
> 
> For SPAPR TCE IOMMU driver, I ended up caching mm_struct with
> atomic_inc(&container->mm->mm_count) (patches are on the way) instead of
> using @current or task as the process might be gone while VFIO container is
> still alive and @mm might be needed to do proper cleanup; this might not be
> an issue with this patchset now but still you seem to only use @mm from
> task_struct.
> 

Consider the example of QEMU process which creates VFIO container, QEMU
in its teardown path would release the container. How could container be
alive when process is gone?

Kirti

> 
> 
>> - It keeps track of pinned pages for mediated domain. This data is used to
>>   verify unpinning request and to unpin remaining pages while detaching, if
>>   there are any.
>> - Used existing mechanism for page accounting. If iommu capable domain
>>   exist in the container then all pages are already pinned and accounted.
>>   Accouting for mdev device is only done if there is no iommu capable
>>   domain in the container.
>> - Page accouting is updated on hot plug and unplug mdev device and pass
>>   through device.
>>
>> Tested by assigning below combinations of devices to a single VM:
>> - GPU pass through only
>> - vGPU device only
>> - One GPU pass through and one vGPU device
>> - Linux VM hot plug and unplug vGPU device while GPU pass through device
>>   exist
>> - Linux VM hot plug and unplug GPU pass through device while vGPU device
>>   exist
>>
>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>> Signed-off-by: Neo Jia <cjia@nvidia.com>
>> Change-Id: I295d6f0f2e0579b8d9882bfd8fd5a4194b97bd9a
> 
>
Alex Williamson Oct. 27, 2016, 2:30 p.m. UTC | #15
On Thu, 27 Oct 2016 18:01:51 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 10/27/2016 12:50 PM, Alexey Kardashevskiy wrote:
> > On 18/10/16 08:22, Kirti Wankhede wrote:  
> >> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
> >> Mediated device only uses IOMMU APIs, the underlying hardware can be
> >> managed by an IOMMU domain.
> >>
> >> Aim of this change is:
> >> - To use most of the code of TYPE1 IOMMU driver for mediated devices
> >> - To support direct assigned device and mediated device in single module
> >>
> >> Added two new callback functions to struct vfio_iommu_driver_ops. Backend
> >> IOMMU module that supports pining and unpinning pages for mdev devices
> >> should provide these functions.
> >> Added APIs for pining and unpining pages to VFIO module. These calls back
> >> into backend iommu module to actually pin and unpin pages.
> >>
> >> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
> >> backend module. More details:
> >> - When iommu_group of mediated devices is attached, task structure is
> >>   cached which is used later to pin pages and page accounting.  
> > 
> > 
> > For SPAPR TCE IOMMU driver, I ended up caching mm_struct with
> > atomic_inc(&container->mm->mm_count) (patches are on the way) instead of
> > using @current or task as the process might be gone while VFIO container is
> > still alive and @mm might be needed to do proper cleanup; this might not be
> > an issue with this patchset now but still you seem to only use @mm from
> > task_struct.
> >   
> 
> Consider the example of QEMU process which creates VFIO container, QEMU
> in its teardown path would release the container. How could container be
> alive when process is gone?

If QEMU is sent a SIGKILL, does the process still exist?  We must be
able to perform cleanup regardless of the state, or existence, of the
task that created it.  Thanks,

Alex
Kirti Wankhede Oct. 27, 2016, 3:59 p.m. UTC | #16
On 10/27/2016 8:00 PM, Alex Williamson wrote:
> On Thu, 27 Oct 2016 18:01:51 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> On 10/27/2016 12:50 PM, Alexey Kardashevskiy wrote:
>>> On 18/10/16 08:22, Kirti Wankhede wrote:  
>>>> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
>>>> Mediated device only uses IOMMU APIs, the underlying hardware can be
>>>> managed by an IOMMU domain.
>>>>
>>>> Aim of this change is:
>>>> - To use most of the code of TYPE1 IOMMU driver for mediated devices
>>>> - To support direct assigned device and mediated device in single module
>>>>
>>>> Added two new callback functions to struct vfio_iommu_driver_ops. Backend
>>>> IOMMU module that supports pining and unpinning pages for mdev devices
>>>> should provide these functions.
>>>> Added APIs for pining and unpining pages to VFIO module. These calls back
>>>> into backend iommu module to actually pin and unpin pages.
>>>>
>>>> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
>>>> backend module. More details:
>>>> - When iommu_group of mediated devices is attached, task structure is
>>>>   cached which is used later to pin pages and page accounting.  
>>>
>>>
>>> For SPAPR TCE IOMMU driver, I ended up caching mm_struct with
>>> atomic_inc(&container->mm->mm_count) (patches are on the way) instead of
>>> using @current or task as the process might be gone while VFIO container is
>>> still alive and @mm might be needed to do proper cleanup; this might not be
>>> an issue with this patchset now but still you seem to only use @mm from
>>> task_struct.
>>>   
>>
>> Consider the example of QEMU process which creates VFIO container, QEMU
>> in its teardown path would release the container. How could container be
>> alive when process is gone?
> 
> If QEMU is sent a SIGKILL, does the process still exist?  We must be
> able to perform cleanup regardless of the state, or existence, of the
> task that created it.  Thanks,
> 

The kernel closes all open file descriptors when any process is
terminated, so .release() from struct vfio_iommu_driver_ops gets called
on SIGKILL or SIGTERM, and the release() function does all the cleanup.

Kirti
Alexey Kardashevskiy Oct. 28, 2016, 2:18 a.m. UTC | #17
On 27/10/16 23:31, Kirti Wankhede wrote:
> 
> 
> On 10/27/2016 12:50 PM, Alexey Kardashevskiy wrote:
>> On 18/10/16 08:22, Kirti Wankhede wrote:
>>> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
>>> Mediated device only uses IOMMU APIs, the underlying hardware can be
>>> managed by an IOMMU domain.
>>>
>>> Aim of this change is:
>>> - To use most of the code of TYPE1 IOMMU driver for mediated devices
>>> - To support direct assigned device and mediated device in single module
>>>
>>> Added two new callback functions to struct vfio_iommu_driver_ops. Backend
>>> IOMMU module that supports pining and unpinning pages for mdev devices
>>> should provide these functions.
>>> Added APIs for pining and unpining pages to VFIO module. These calls back
>>> into backend iommu module to actually pin and unpin pages.
>>>
>>> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
>>> backend module. More details:
>>> - When iommu_group of mediated devices is attached, task structure is
>>>   cached which is used later to pin pages and page accounting.
>>
>>
>> For SPAPR TCE IOMMU driver, I ended up caching mm_struct with
>> atomic_inc(&container->mm->mm_count) (patches are on the way) instead of
>> using @current or task as the process might be gone while VFIO container is
>> still alive and @mm might be needed to do proper cleanup; this might not be
>> an issue with this patchset now but still you seem to only use @mm from
>> task_struct.
>>
> 
> Consider the example of QEMU process which creates VFIO container, QEMU
> in its teardown path would release the container. How could container be
> alive when process is gone?

do_exit() in kernel/exit.c calls exit_mm() (which sets NULL to tsk->mm)
first, and then releases open files by calling  exit_files(). So
container's release() does not have current->mm.



> 
> Kirti
> 
>>
>>
>>> - It keeps track of pinned pages for mediated domain. This data is used to
>>>   verify unpinning request and to unpin remaining pages while detaching, if
>>>   there are any.
>>> - Used existing mechanism for page accounting. If iommu capable domain
>>>   exist in the container then all pages are already pinned and accounted.
>>>   Accouting for mdev device is only done if there is no iommu capable
>>>   domain in the container.
>>> - Page accouting is updated on hot plug and unplug mdev device and pass
>>>   through device.
>>>
>>> Tested by assigning below combinations of devices to a single VM:
>>> - GPU pass through only
>>> - vGPU device only
>>> - One GPU pass through and one vGPU device
>>> - Linux VM hot plug and unplug vGPU device while GPU pass through device
>>>   exist
>>> - Linux VM hot plug and unplug GPU pass through device while vGPU device
>>>   exist
>>>
>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>>> Signed-off-by: Neo Jia <cjia@nvidia.com>
>>> Change-Id: I295d6f0f2e0579b8d9882bfd8fd5a4194b97bd9a
>>
>>
Kirti Wankhede Nov. 1, 2016, 2:01 p.m. UTC | #18
On 10/28/2016 7:48 AM, Alexey Kardashevskiy wrote:
> On 27/10/16 23:31, Kirti Wankhede wrote:
>>
>>
>> On 10/27/2016 12:50 PM, Alexey Kardashevskiy wrote:
>>> On 18/10/16 08:22, Kirti Wankhede wrote:
>>>> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
>>>> Mediated device only uses IOMMU APIs, the underlying hardware can be
>>>> managed by an IOMMU domain.
>>>>
>>>> Aim of this change is:
>>>> - To use most of the code of TYPE1 IOMMU driver for mediated devices
>>>> - To support direct assigned device and mediated device in single module
>>>>
>>>> Added two new callback functions to struct vfio_iommu_driver_ops. Backend
>>>> IOMMU module that supports pining and unpinning pages for mdev devices
>>>> should provide these functions.
>>>> Added APIs for pining and unpining pages to VFIO module. These calls back
>>>> into backend iommu module to actually pin and unpin pages.
>>>>
>>>> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
>>>> backend module. More details:
>>>> - When iommu_group of mediated devices is attached, task structure is
>>>>   cached which is used later to pin pages and page accounting.
>>>
>>>
>>> For SPAPR TCE IOMMU driver, I ended up caching mm_struct with
>>> atomic_inc(&container->mm->mm_count) (patches are on the way) instead of
>>> using @current or task as the process might be gone while VFIO container is
>>> still alive and @mm might be needed to do proper cleanup; this might not be
>>> an issue with this patchset now but still you seem to only use @mm from
>>> task_struct.
>>>
>>
>> Consider the example of QEMU process which creates VFIO container, QEMU
>> in its teardown path would release the container. How could container be
>> alive when process is gone?
> 
> do_exit() in kernel/exit.c calls exit_mm() (which sets NULL to tsk->mm)
> first, and then releases open files by calling  exit_files(). So
> container's release() does not have current->mm.
> 

Incrementing the usage count (get_task_struct()) while saving the task
structure and decrementing it (put_task_struct()) from release() should
work here. Updating the patch.

Thanks,
Kirti
Alexey Kardashevskiy Nov. 2, 2016, 1:24 a.m. UTC | #19
On 02/11/16 01:01, Kirti Wankhede wrote:
> 
> 
> On 10/28/2016 7:48 AM, Alexey Kardashevskiy wrote:
>> On 27/10/16 23:31, Kirti Wankhede wrote:
>>>
>>>
>>> On 10/27/2016 12:50 PM, Alexey Kardashevskiy wrote:
>>>> On 18/10/16 08:22, Kirti Wankhede wrote:
>>>>> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
>>>>> Mediated device only uses IOMMU APIs, the underlying hardware can be
>>>>> managed by an IOMMU domain.
>>>>>
>>>>> Aim of this change is:
>>>>> - To use most of the code of TYPE1 IOMMU driver for mediated devices
>>>>> - To support direct assigned device and mediated device in single module
>>>>>
>>>>> Added two new callback functions to struct vfio_iommu_driver_ops. Backend
>>>>> IOMMU module that supports pining and unpinning pages for mdev devices
>>>>> should provide these functions.
>>>>> Added APIs for pining and unpining pages to VFIO module. These calls back
>>>>> into backend iommu module to actually pin and unpin pages.
>>>>>
>>>>> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
>>>>> backend module. More details:
>>>>> - When iommu_group of mediated devices is attached, task structure is
>>>>>   cached which is used later to pin pages and page accounting.
>>>>
>>>>
>>>> For SPAPR TCE IOMMU driver, I ended up caching mm_struct with
>>>> atomic_inc(&container->mm->mm_count) (patches are on the way) instead of
>>>> using @current or task as the process might be gone while VFIO container is
>>>> still alive and @mm might be needed to do proper cleanup; this might not be
>>>> an issue with this patchset now but still you seem to only use @mm from
>>>> task_struct.
>>>>
>>>
>>> Consider the example of QEMU process which creates VFIO container, QEMU
>>> in its teardown path would release the container. How could container be
>>> alive when process is gone?
>>
>> do_exit() in kernel/exit.c calls exit_mm() (which sets NULL to tsk->mm)
>> first, and then releases open files by calling  exit_files(). So
>> container's release() does not have current->mm.
>>
> 
> Incrementing usage count (get_task_struct()) while saving task structure
> and decementing it (put_task_struct()) from release() should  work here.
> Updating the patch.

I cannot see how the task->usage counter prevents do_exit() from performing
the exit, can you?
Kirti Wankhede Nov. 2, 2016, 3:29 a.m. UTC | #20
On 11/2/2016 6:54 AM, Alexey Kardashevskiy wrote:
> On 02/11/16 01:01, Kirti Wankhede wrote:
>>
>>
>> On 10/28/2016 7:48 AM, Alexey Kardashevskiy wrote:
>>> On 27/10/16 23:31, Kirti Wankhede wrote:
>>>>
>>>>
>>>> On 10/27/2016 12:50 PM, Alexey Kardashevskiy wrote:
>>>>> On 18/10/16 08:22, Kirti Wankhede wrote:
>>>>>> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
>>>>>> Mediated device only uses IOMMU APIs, the underlying hardware can be
>>>>>> managed by an IOMMU domain.
>>>>>>
>>>>>> Aim of this change is:
>>>>>> - To use most of the code of TYPE1 IOMMU driver for mediated devices
>>>>>> - To support direct assigned device and mediated device in single module
>>>>>>
>>>>>> Added two new callback functions to struct vfio_iommu_driver_ops. Backend
>>>>>> IOMMU module that supports pining and unpinning pages for mdev devices
>>>>>> should provide these functions.
>>>>>> Added APIs for pining and unpining pages to VFIO module. These calls back
>>>>>> into backend iommu module to actually pin and unpin pages.
>>>>>>
>>>>>> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
>>>>>> backend module. More details:
>>>>>> - When iommu_group of mediated devices is attached, task structure is
>>>>>>   cached which is used later to pin pages and page accounting.
>>>>>
>>>>>
>>>>> For SPAPR TCE IOMMU driver, I ended up caching mm_struct with
>>>>> atomic_inc(&container->mm->mm_count) (patches are on the way) instead of
>>>>> using @current or task as the process might be gone while VFIO container is
>>>>> still alive and @mm might be needed to do proper cleanup; this might not be
>>>>> an issue with this patchset now but still you seem to only use @mm from
>>>>> task_struct.
>>>>>
>>>>
>>>> Consider the example of QEMU process which creates VFIO container, QEMU
>>>> in its teardown path would release the container. How could container be
>>>> alive when process is gone?
>>>
>>> do_exit() in kernel/exit.c calls exit_mm() (which sets NULL to tsk->mm)
>>> first, and then releases open files by calling  exit_files(). So
>>> container's release() does not have current->mm.
>>>
>>
>> Incrementing usage count (get_task_struct()) while saving task structure
>> and decementing it (put_task_struct()) from release() should  work here.
>> Updating the patch.
> 
> I cannot see how the task->usage counter prevents do_exit() from performing
> the exit, can you?
> 

It will not prevent exit from do_exit(), but it will make sure that we
don't have a stale pointer to the task structure. Then we can check whether
the task is alive and get the mm pointer in the teardown path as below:

{
        struct task_struct *task = domain->external_addr_space->task;
        struct mm_struct *mm = NULL;

        put_pfn(pfn, prot);

        if (pid_alive(task))
                mm = get_task_mm(task);

        if (mm) {
                if (do_accounting)
                        vfio_lock_acct(task, -1);

                mmput(mm);
        }
}

Thanks,
Kirti
Alexey Kardashevskiy Nov. 2, 2016, 4:09 a.m. UTC | #21
On 02/11/16 14:29, Kirti Wankhede wrote:
> 
> 
> On 11/2/2016 6:54 AM, Alexey Kardashevskiy wrote:
>> On 02/11/16 01:01, Kirti Wankhede wrote:
>>>
>>>
>>> On 10/28/2016 7:48 AM, Alexey Kardashevskiy wrote:
>>>> On 27/10/16 23:31, Kirti Wankhede wrote:
>>>>>
>>>>>
>>>>> On 10/27/2016 12:50 PM, Alexey Kardashevskiy wrote:
>>>>>> On 18/10/16 08:22, Kirti Wankhede wrote:
>>>>>>> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
>>>>>>> Mediated device only uses IOMMU APIs, the underlying hardware can be
>>>>>>> managed by an IOMMU domain.
>>>>>>>
>>>>>>> Aim of this change is:
>>>>>>> - To use most of the code of TYPE1 IOMMU driver for mediated devices
>>>>>>> - To support direct assigned device and mediated device in single module
>>>>>>>
>>>>>>> Added two new callback functions to struct vfio_iommu_driver_ops. Backend
>>>>>>> IOMMU module that supports pining and unpinning pages for mdev devices
>>>>>>> should provide these functions.
>>>>>>> Added APIs for pining and unpining pages to VFIO module. These calls back
>>>>>>> into backend iommu module to actually pin and unpin pages.
>>>>>>>
>>>>>>> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
>>>>>>> backend module. More details:
>>>>>>> - When iommu_group of mediated devices is attached, task structure is
>>>>>>>   cached which is used later to pin pages and page accounting.
>>>>>>
>>>>>>
>>>>>> For SPAPR TCE IOMMU driver, I ended up caching mm_struct with
>>>>>> atomic_inc(&container->mm->mm_count) (patches are on the way) instead of
>>>>>> using @current or task as the process might be gone while VFIO container is
>>>>>> still alive and @mm might be needed to do proper cleanup; this might not be
>>>>>> an issue with this patchset now but still you seem to only use @mm from
>>>>>> task_struct.
>>>>>>
>>>>>
>>>>> Consider the example of QEMU process which creates VFIO container, QEMU
>>>>> in its teardown path would release the container. How could container be
>>>>> alive when process is gone?
>>>>
>>>> do_exit() in kernel/exit.c calls exit_mm() (which sets NULL to tsk->mm)
>>>> first, and then releases open files by calling  exit_files(). So
>>>> container's release() does not have current->mm.
>>>>
>>>
>>> Incrementing usage count (get_task_struct()) while saving task structure
>>> and decementing it (put_task_struct()) from release() should  work here.
>>> Updating the patch.
>>
>> I cannot see how the task->usage counter prevents do_exit() from performing
>> the exit, can you?
>>
> 
> It will not prevent exit from do_exit(), but that will make sure that we
> don't have stale pointer of task structure. Then we can check whether
> the task is alive and get mm pointer in teardown path as below:


Or you could just reference and use @mm as KVM and others do. Or there is
anything else you need from @current than just @mm?


> 
> {
>         struct task_struct *task = domain->external_addr_space->task;
>         struct mm_struct *mm = NULL;
> 
>         put_pfn(pfn, prot);
> 
>         if (pid_alive(task))
>                 mm = get_task_mm(task);
> 
>         if (mm) {
>                 if (do_accounting)
>                         vfio_lock_acct(task, -1);
> 
>                 mmput(mm);
>         }
> }
Jike Song Nov. 2, 2016, 12:21 p.m. UTC | #22
On 11/02/2016 12:09 PM, Alexey Kardashevskiy wrote:
> On 02/11/16 14:29, Kirti Wankhede wrote:
>>
>>
>> On 11/2/2016 6:54 AM, Alexey Kardashevskiy wrote:
>>> On 02/11/16 01:01, Kirti Wankhede wrote:
>>>>
>>>>
>>>> On 10/28/2016 7:48 AM, Alexey Kardashevskiy wrote:
>>>>> On 27/10/16 23:31, Kirti Wankhede wrote:
>>>>>>
>>>>>>
>>>>>> On 10/27/2016 12:50 PM, Alexey Kardashevskiy wrote:
>>>>>>> On 18/10/16 08:22, Kirti Wankhede wrote:
>>>>>>>> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
>>>>>>>> Mediated device only uses IOMMU APIs, the underlying hardware can be
>>>>>>>> managed by an IOMMU domain.
>>>>>>>>
>>>>>>>> Aim of this change is:
>>>>>>>> - To use most of the code of TYPE1 IOMMU driver for mediated devices
>>>>>>>> - To support direct assigned device and mediated device in single module
>>>>>>>>
>>>>>>>> Added two new callback functions to struct vfio_iommu_driver_ops. Backend
>>>>>>>> IOMMU module that supports pining and unpinning pages for mdev devices
>>>>>>>> should provide these functions.
>>>>>>>> Added APIs for pining and unpining pages to VFIO module. These calls back
>>>>>>>> into backend iommu module to actually pin and unpin pages.
>>>>>>>>
>>>>>>>> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
>>>>>>>> backend module. More details:
>>>>>>>> - When iommu_group of mediated devices is attached, task structure is
>>>>>>>>   cached which is used later to pin pages and page accounting.
>>>>>>>
>>>>>>>
>>>>>>> For SPAPR TCE IOMMU driver, I ended up caching mm_struct with
>>>>>>> atomic_inc(&container->mm->mm_count) (patches are on the way) instead of
>>>>>>> using @current or task as the process might be gone while VFIO container is
>>>>>>> still alive and @mm might be needed to do proper cleanup; this might not be
>>>>>>> an issue with this patchset now but still you seem to only use @mm from
>>>>>>> task_struct.
>>>>>>>
>>>>>>
>>>>>> Consider the example of QEMU process which creates VFIO container, QEMU
>>>>>> in its teardown path would release the container. How could container be
>>>>>> alive when process is gone?
>>>>>
>>>>> do_exit() in kernel/exit.c calls exit_mm() (which sets NULL to tsk->mm)
>>>>> first, and then releases open files by calling  exit_files(). So
>>>>> container's release() does not have current->mm.
>>>>>
>>>>
>>>> Incrementing usage count (get_task_struct()) while saving task structure
>>>> and decementing it (put_task_struct()) from release() should  work here.
>>>> Updating the patch.
>>>
>>> I cannot see how the task->usage counter prevents do_exit() from performing
>>> the exit, can you?
>>>
>>
>> It will not prevent exit from do_exit(), but that will make sure that we
>> don't have stale pointer of task structure. Then we can check whether
>> the task is alive and get mm pointer in teardown path as below:
> 
> 
> Or you could just reference and use @mm as KVM and others do. Or there is
> anything else you need from @current than just @mm?
> 

I agree. If @mm is the only thing needed, there is really no reason to
refer to the @task :-)

--
Thanks,
Jike
Kirti Wankhede Nov. 2, 2016, 12:41 p.m. UTC | #23
On 11/2/2016 5:51 PM, Jike Song wrote:
> On 11/02/2016 12:09 PM, Alexey Kardashevskiy wrote:
>> On 02/11/16 14:29, Kirti Wankhede wrote:
>>>
>>>
>>> On 11/2/2016 6:54 AM, Alexey Kardashevskiy wrote:
>>>> On 02/11/16 01:01, Kirti Wankhede wrote:
>>>>>
>>>>>
>>>>> On 10/28/2016 7:48 AM, Alexey Kardashevskiy wrote:
>>>>>> On 27/10/16 23:31, Kirti Wankhede wrote:
>>>>>>>
>>>>>>>
>>>>>>> On 10/27/2016 12:50 PM, Alexey Kardashevskiy wrote:
>>>>>>>> On 18/10/16 08:22, Kirti Wankhede wrote:
>>>>>>>>> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
>>>>>>>>> Mediated device only uses IOMMU APIs, the underlying hardware can be
>>>>>>>>> managed by an IOMMU domain.
>>>>>>>>>
>>>>>>>>> Aim of this change is:
>>>>>>>>> - To use most of the code of TYPE1 IOMMU driver for mediated devices
>>>>>>>>> - To support direct assigned device and mediated device in single module
>>>>>>>>>
>>>>>>>>> Added two new callback functions to struct vfio_iommu_driver_ops. Backend
>>>>>>>>> IOMMU module that supports pining and unpinning pages for mdev devices
>>>>>>>>> should provide these functions.
>>>>>>>>> Added APIs for pining and unpining pages to VFIO module. These calls back
>>>>>>>>> into backend iommu module to actually pin and unpin pages.
>>>>>>>>>
>>>>>>>>> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
>>>>>>>>> backend module. More details:
>>>>>>>>> - When iommu_group of mediated devices is attached, task structure is
>>>>>>>>>   cached which is used later to pin pages and page accounting.
>>>>>>>>
>>>>>>>>
>>>>>>>> For SPAPR TCE IOMMU driver, I ended up caching mm_struct with
>>>>>>>> atomic_inc(&container->mm->mm_count) (patches are on the way) instead of
>>>>>>>> using @current or task as the process might be gone while VFIO container is
>>>>>>>> still alive and @mm might be needed to do proper cleanup; this might not be
>>>>>>>> an issue with this patchset now but still you seem to only use @mm from
>>>>>>>> task_struct.
>>>>>>>>
>>>>>>>
>>>>>>> Consider the example of QEMU process which creates VFIO container, QEMU
>>>>>>> in its teardown path would release the container. How could container be
>>>>>>> alive when process is gone?
>>>>>>
>>>>>> do_exit() in kernel/exit.c calls exit_mm() (which sets NULL to tsk->mm)
>>>>>> first, and then releases open files by calling  exit_files(). So
>>>>>> container's release() does not have current->mm.
>>>>>>
>>>>>
>>>>> Incrementing usage count (get_task_struct()) while saving task structure
>>>>> and decementing it (put_task_struct()) from release() should  work here.
>>>>> Updating the patch.
>>>>
>>>> I cannot see how the task->usage counter prevents do_exit() from performing
>>>> the exit, can you?
>>>>
>>>
>>> It will not prevent exit from do_exit(), but that will make sure that we
>>> don't have stale pointer of task structure. Then we can check whether
>>> the task is alive and get mm pointer in teardown path as below:
>>
>>
>> Or you could just reference and use @mm as KVM and others do. Or there is
>> anything else you need from @current than just @mm?
>>
> 
> I agree. If @mm is the only thing needed, there is really no reason to
> refer to the @task :-)
> 

In vfio_lock_acct(), which does page accounting, if mm->mmap_sem is
already held then page accounting is deferred; the task structure is
used to get mm, and the work is deferred only if mm exists:
	mm = get_task_mm(task);

That is where this module needs the task structure.

Thanks,
Kirti
Jike Song Nov. 2, 2016, 1 p.m. UTC | #24
On 11/02/2016 08:41 PM, Kirti Wankhede wrote:
> On 11/2/2016 5:51 PM, Jike Song wrote:
>> On 11/02/2016 12:09 PM, Alexey Kardashevskiy wrote:
>>> Or you could just reference and use @mm as KVM and others do. Or there is
>>> anything else you need from @current than just @mm?
>>>
>>
>> I agree. If @mm is the only thing needed, there is really no reason to
>> refer to the @task :-)
>>
> 
> In vfio_lock_acct(), that is for page accounting, if mm->mmap_sem is
> already held then page accounting is deferred, where task structure is
> used to get mm and work is deferred only if mm exist:
> 	mm = get_task_mm(task);
> 
> That is where this module need task structure.

Kirti,

By calling get_task_mm you hold a ref on @mm and save it in iommu,
whenever you want to do something like vfio_lock_acct(), use that mm
(as you said, if mmap_sem not accessible then defer it to a work, but
still @mm is the whole information), and put it after the usage.

I still can't see any reason that the @task have to be saved. It's
always the @mm all the time. Did I miss anything?

--
Thanks,
Jike
Kirti Wankhede Nov. 2, 2016, 1:18 p.m. UTC | #25
On 11/2/2016 6:30 PM, Jike Song wrote:
> On 11/02/2016 08:41 PM, Kirti Wankhede wrote:
>> On 11/2/2016 5:51 PM, Jike Song wrote:
>>> On 11/02/2016 12:09 PM, Alexey Kardashevskiy wrote:
>>>> Or you could just reference and use @mm as KVM and others do. Or there is
>>>> anything else you need from @current than just @mm?
>>>>
>>>
>>> I agree. If @mm is the only thing needed, there is really no reason to
>>> refer to the @task :-)
>>>
>>
>> In vfio_lock_acct(), that is for page accounting, if mm->mmap_sem is
>> already held then page accounting is deferred, where task structure is
>> used to get mm and work is deferred only if mm exist:
>> 	mm = get_task_mm(task);
>>
>> That is where this module need task structure.
> 
> Kirti,
> 
> By calling get_task_mm you hold a ref on @mm and save it in iommu,
> whenever you want to do something like vfio_lock_acct(), use that mm
> (as you said, if mmap_sem not accessible then defer it to a work, but
> still @mm is the whole information), and put it after the usage.
> 
> I still can't see any reason that the @task have to be saved. It's
> always the @mm all the time. Did I miss anything?
> 

If the process is terminated by SIGKILL, as Alexey mentioned in this
mail thread earlier, exit_mm() is called first and then all files are
closed. From exit_mm(), task->mm is set to NULL. So from the teardown path,
we should call get_task_mm(task) to get the current status instead of using
a stale pointer.

Thanks,
Kirti.
Jike Song Nov. 2, 2016, 1:35 p.m. UTC | #26
On 11/02/2016 09:18 PM, Kirti Wankhede wrote:
> On 11/2/2016 6:30 PM, Jike Song wrote:
>> On 11/02/2016 08:41 PM, Kirti Wankhede wrote:
>>> On 11/2/2016 5:51 PM, Jike Song wrote:
>>>> On 11/02/2016 12:09 PM, Alexey Kardashevskiy wrote:
>>>>> Or you could just reference and use @mm as KVM and others do. Or there is
>>>>> anything else you need from @current than just @mm?
>>>>>
>>>>
>>>> I agree. If @mm is the only thing needed, there is really no reason to
>>>> refer to the @task :-)
>>>>
>>>
>>> In vfio_lock_acct(), that is for page accounting, if mm->mmap_sem is
>>> already held then page accounting is deferred, where task structure is
>>> used to get mm and work is deferred only if mm exist:
>>> 	mm = get_task_mm(task);
>>>
>>> That is where this module need task structure.
>>
>> Kirti,
>>
>> By calling get_task_mm you hold a ref on @mm and save it in iommu,
>> whenever you want to do something like vfio_lock_acct(), use that mm
>> (as you said, if mmap_sem not accessible then defer it to a work, but
>> still @mm is the whole information), and put it after the usage.
>>
>> I still can't see any reason that the @task have to be saved. It's
>> always the @mm all the time. Did I miss anything?
>>
> 
> If the process is terminated by SIGKILL, as Alexey mentioned in this
> mail thread earlier exit_mm() is called first and then all files are
> closed. From exit_mm(), task->mm is set to NULL. So from teardown path,
> we should call get_task_mm(task) to get current status intsead of using
> stale pointer.

You have got the ref on a task->mm and stored it somewhere, then after
that at some time the task->mm was set to NULL -- what's exactly the
problem here? It's perfectly okay per my understanding ...

--
Thanks,
Jike
Alexey Kardashevskiy Nov. 3, 2016, 4:29 a.m. UTC | #27
On 03/11/16 00:18, Kirti Wankhede wrote:
> 
> 
> On 11/2/2016 6:30 PM, Jike Song wrote:
>> On 11/02/2016 08:41 PM, Kirti Wankhede wrote:
>>> On 11/2/2016 5:51 PM, Jike Song wrote:
>>>> On 11/02/2016 12:09 PM, Alexey Kardashevskiy wrote:
>>>>> Or you could just reference and use @mm as KVM and others do. Or there is
>>>>> anything else you need from @current than just @mm?
>>>>>
>>>>
>>>> I agree. If @mm is the only thing needed, there is really no reason to
>>>> refer to the @task :-)
>>>>
>>>
>>> In vfio_lock_acct(), that is for page accounting, if mm->mmap_sem is
>>> already held then page accounting is deferred, where task structure is
>>> used to get mm and work is deferred only if mm exist:
>>> 	mm = get_task_mm(task);

get_task_mm() increments mm_users, which is basically the number of userspaces
holding a reference to mm. As in this case it is not a userspace, mm_count
needs to be incremented imho.


>>>
>>> That is where this module need task structure.
>>
>> Kirti,
>>
>> By calling get_task_mm you hold a ref on @mm and save it in iommu,
>> whenever you want to do something like vfio_lock_acct(), use that mm
>> (as you said, if mmap_sem not accessible then defer it to a work, but
>> still @mm is the whole information), and put it after the usage.
>>
>> I still can't see any reason that the @task have to be saved. It's
>> always the @mm all the time. Did I miss anything?
>>
> 
> If the process is terminated by SIGKILL, as Alexey mentioned in this
> mail thread earlier exit_mm() is called first and then all files are
> closed. From exit_mm(), task->mm is set to NULL. So from teardown path,
> we should call get_task_mm(task)

... which will return NULL, no?

> to get current status intsead of using
> stale pointer.

If you increment either mm_users or mm_count at the exact place where you
want to cache task pointer, why would mm pointer become stale until you do
mmdrop() or mmput()?
diff mbox

Patch

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 2e83bdf007fe..a5a210005b65 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1799,6 +1799,104 @@  void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
 }
 EXPORT_SYMBOL_GPL(vfio_info_cap_shift);
 
+
+/*
+ * Pin a set of guest PFNs and return their associated host PFNs for local
+ * domain only.
+ * @dev [in] : device
+ * @user_pfn [in]: array of user/guest PFNs
+ * @npage [in]: count of array elements
+ * @prot [in] : protection flags
+ * @phys_pfn[out] : array of host PFNs
+ */
+long vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
+		    long npage, int prot, unsigned long *phys_pfn)
+{
+	struct vfio_container *container;
+	struct vfio_group *group;
+	struct vfio_iommu_driver *driver;
+	ssize_t ret = -EINVAL;
+
+	if (!dev || !user_pfn || !phys_pfn)
+		return -EINVAL;
+
+	group = vfio_group_get_from_dev(dev);
+	if (IS_ERR(group))
+		return PTR_ERR(group);
+
+	ret = vfio_group_add_container_user(group);
+	if (ret)
+		goto err_pin_pages;
+
+	container = group->container;
+	if (IS_ERR(container)) {
+		ret = PTR_ERR(container);
+		goto err_pin_pages;
+	}
+
+	down_read(&container->group_lock);
+
+	driver = container->iommu_driver;
+	if (likely(driver && driver->ops->pin_pages))
+		ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
+					     npage, prot, phys_pfn);
+
+	up_read(&container->group_lock);
+	vfio_group_try_dissolve_container(group);
+
+err_pin_pages:
+	vfio_group_put(group);
+	return ret;
+
+}
+EXPORT_SYMBOL(vfio_pin_pages);
+
+/*
+ * Unpin set of host PFNs for local domain only.
+ * @dev [in] : device
+ * @pfn [in] : array of host PFNs to be unpinned.
+ * @npage [in] :count of elements in array, that is number of pages.
+ */
+long vfio_unpin_pages(struct device *dev, unsigned long *pfn, long npage)
+{
+	struct vfio_container *container;
+	struct vfio_group *group;
+	struct vfio_iommu_driver *driver;
+	ssize_t ret = -EINVAL;
+
+	if (!dev || !pfn)
+		return -EINVAL;
+
+	group = vfio_group_get_from_dev(dev);
+	if (IS_ERR(group))
+		return PTR_ERR(group);
+
+	ret = vfio_group_add_container_user(group);
+	if (ret)
+		goto err_unpin_pages;
+
+	container = group->container;
+	if (IS_ERR(container)) {
+		ret = PTR_ERR(container);
+		goto err_unpin_pages;
+	}
+
+	down_read(&container->group_lock);
+
+	driver = container->iommu_driver;
+	if (likely(driver && driver->ops->unpin_pages))
+		ret = driver->ops->unpin_pages(container->iommu_data, pfn,
+					       npage);
+
+	up_read(&container->group_lock);
+	vfio_group_try_dissolve_container(group);
+
+err_unpin_pages:
+	vfio_group_put(group);
+	return ret;
+}
+EXPORT_SYMBOL(vfio_unpin_pages);
+
 /**
  * Module/class support
  */
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 2ba19424e4a1..5d67058a611d 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -55,16 +55,24 @@  MODULE_PARM_DESC(disable_hugepages,
 
 struct vfio_iommu {
 	struct list_head	domain_list;
+	struct vfio_domain	*local_domain;
 	struct mutex		lock;
 	struct rb_root		dma_list;
 	bool			v2;
 	bool			nesting;
 };
 
+struct local_addr_space {
+	struct task_struct	*task;
+	struct rb_root		pfn_list;	/* pinned Host pfn list */
+	struct mutex		pfn_list_lock;	/* mutex for pfn_list */
+};
+
 struct vfio_domain {
 	struct iommu_domain	*domain;
 	struct list_head	next;
 	struct list_head	group_list;
+	struct local_addr_space	*local_addr_space;
 	int			prot;		/* IOMMU_CACHE */
 	bool			fgsp;		/* Fine-grained super pages */
 };
@@ -75,6 +83,7 @@  struct vfio_dma {
 	unsigned long		vaddr;		/* Process virtual addr */
 	size_t			size;		/* Map size (bytes) */
 	int			prot;		/* IOMMU_READ/WRITE */
+	bool			iommu_mapped;
 };
 
 struct vfio_group {
@@ -83,6 +92,21 @@  struct vfio_group {
 };
 
 /*
+ * Guest RAM pinning working set or DMA target
+ */
+struct vfio_pfn {
+	struct rb_node		node;
+	unsigned long		vaddr;		/* virtual addr */
+	dma_addr_t		iova;		/* IOVA */
+	unsigned long		pfn;		/* Host pfn */
+	int			prot;
+	atomic_t		ref_count;
+};
+
+#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
+					(!list_empty(&iommu->domain_list))
+
+/*
  * This code handles mapping and unmapping of user data buffers
  * into DMA'ble space using the IOMMU
  */
@@ -130,6 +154,101 @@  static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
 	rb_erase(&old->node, &iommu->dma_list);
 }
 
+/*
+ * Helper Functions for host pfn list
+ */
+
+static struct vfio_pfn *vfio_find_pfn(struct vfio_domain *domain,
+				      unsigned long pfn)
+{
+	struct rb_node *node;
+	struct vfio_pfn *vpfn;
+
+	node = domain->local_addr_space->pfn_list.rb_node;
+
+	while (node) {
+		vpfn = rb_entry(node, struct vfio_pfn, node);
+
+		if (pfn < vpfn->pfn)
+			node = node->rb_left;
+		else if (pfn > vpfn->pfn)
+			node = node->rb_right;
+		else
+			return vpfn;
+	}
+
+	return NULL;
+}
+
+static void vfio_link_pfn(struct vfio_domain *domain, struct vfio_pfn *new)
+{
+	struct rb_node **link, *parent = NULL;
+	struct vfio_pfn *vpfn;
+
+	link = &domain->local_addr_space->pfn_list.rb_node;
+	while (*link) {
+		parent = *link;
+		vpfn = rb_entry(parent, struct vfio_pfn, node);
+
+		if (new->pfn < vpfn->pfn)
+			link = &(*link)->rb_left;
+		else
+			link = &(*link)->rb_right;
+	}
+
+	rb_link_node(&new->node, parent, link);
+	rb_insert_color(&new->node, &domain->local_addr_space->pfn_list);
+}
+
+static void vfio_unlink_pfn(struct vfio_domain *domain, struct vfio_pfn *old)
+{
+	rb_erase(&old->node, &domain->local_addr_space->pfn_list);
+}
+
+static int vfio_add_to_pfn_list(struct vfio_domain *domain, unsigned long vaddr,
+				dma_addr_t iova, unsigned long pfn, int prot)
+{
+	struct vfio_pfn *vpfn;
+
+	vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
+	if (!vpfn)
+		return -ENOMEM;
+
+	vpfn->vaddr = vaddr;
+	vpfn->iova = iova;
+	vpfn->pfn = pfn;
+	vpfn->prot = prot;
+	atomic_set(&vpfn->ref_count, 1);
+	vfio_link_pfn(domain, vpfn);
+	return 0;
+}
+
+static void vfio_remove_from_pfn_list(struct vfio_domain *domain,
+				      struct vfio_pfn *vpfn)
+{
+	vfio_unlink_pfn(domain, vpfn);
+	kfree(vpfn);
+}
+
+static int vfio_pfn_account(struct vfio_iommu *iommu, unsigned long pfn)
+{
+	struct vfio_pfn *p;
+	struct vfio_domain *domain = iommu->local_domain;
+	int ret = 1;
+
+	if (!domain)
+		return 1;
+
+	mutex_lock(&domain->local_addr_space->pfn_list_lock);
+
+	p = vfio_find_pfn(domain, pfn);
+	if (p)
+		ret = 0;
+
+	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
+	return ret;
+}
+
 struct vwork {
 	struct mm_struct	*mm;
 	long			npage;
@@ -150,17 +269,17 @@  static void vfio_lock_acct_bg(struct work_struct *work)
 	kfree(vwork);
 }
 
-static void vfio_lock_acct(long npage)
+static void vfio_lock_acct(struct task_struct *task, long npage)
 {
 	struct vwork *vwork;
 	struct mm_struct *mm;
 
-	if (!current->mm || !npage)
+	if (!task->mm || !npage)
 		return; /* process exited or nothing to do */
 
-	if (down_write_trylock(&current->mm->mmap_sem)) {
-		current->mm->locked_vm += npage;
-		up_write(&current->mm->mmap_sem);
+	if (down_write_trylock(&task->mm->mmap_sem)) {
+		task->mm->locked_vm += npage;
+		up_write(&task->mm->mmap_sem);
 		return;
 	}
 
@@ -172,7 +291,7 @@  static void vfio_lock_acct(long npage)
 	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
 	if (!vwork)
 		return;
-	mm = get_task_mm(current);
+	mm = get_task_mm(task);
 	if (!mm) {
 		kfree(vwork);
 		return;
@@ -228,20 +347,31 @@  static int put_pfn(unsigned long pfn, int prot)
 	return 0;
 }
 
-static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
+static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
+			 int prot, unsigned long *pfn)
 {
 	struct page *page[1];
 	struct vm_area_struct *vma;
+	struct mm_struct *local_mm = (mm ? mm : current->mm);
 	int ret = -EFAULT;
 
-	if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
+	if (mm) {
+		down_read(&local_mm->mmap_sem);
+		ret = get_user_pages_remote(NULL, local_mm, vaddr, 1,
+					!!(prot & IOMMU_WRITE), 0, page, NULL);
+		up_read(&local_mm->mmap_sem);
+	} else
+		ret = get_user_pages_fast(vaddr, 1,
+					  !!(prot & IOMMU_WRITE), page);
+
+	if (ret == 1) {
 		*pfn = page_to_pfn(page[0]);
 		return 0;
 	}
 
-	down_read(&current->mm->mmap_sem);
+	down_read(&local_mm->mmap_sem);
 
-	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
+	vma = find_vma_intersection(local_mm, vaddr, vaddr + 1);
 
 	if (vma && vma->vm_flags & VM_PFNMAP) {
 		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
@@ -249,7 +379,7 @@  static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
 			ret = 0;
 	}
 
-	up_read(&current->mm->mmap_sem);
+	up_read(&local_mm->mmap_sem);
 
 	return ret;
 }
@@ -259,33 +389,37 @@  static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
  * the iommu can only map chunks of consecutive pfns anyway, so get the
  * first page and all consecutive pages with the same locking.
  */
-static long vfio_pin_pages(unsigned long vaddr, long npage,
-			   int prot, unsigned long *pfn_base)
+static long __vfio_pin_pages_remote(struct vfio_iommu *iommu,
+				    unsigned long vaddr, long npage,
+				    int prot, unsigned long *pfn_base)
 {
 	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 	bool lock_cap = capable(CAP_IPC_LOCK);
-	long ret, i;
+	long ret, i, lock_acct = 0;
 	bool rsvd;
 
 	if (!current->mm)
 		return -ENODEV;
 
-	ret = vaddr_get_pfn(vaddr, prot, pfn_base);
+	ret = vaddr_get_pfn(NULL, vaddr, prot, pfn_base);
 	if (ret)
 		return ret;
 
+	lock_acct = vfio_pfn_account(iommu, *pfn_base);
+
 	rsvd = is_invalid_reserved_pfn(*pfn_base);
 
-	if (!rsvd && !lock_cap && current->mm->locked_vm + 1 > limit) {
+	if (!rsvd && !lock_cap && current->mm->locked_vm + lock_acct > limit) {
 		put_pfn(*pfn_base, prot);
 		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
 			limit << PAGE_SHIFT);
 		return -ENOMEM;
 	}
 
+
 	if (unlikely(disable_hugepages)) {
 		if (!rsvd)
-			vfio_lock_acct(1);
+			vfio_lock_acct(current, lock_acct);
 		return 1;
 	}
 
@@ -293,7 +427,7 @@  static long vfio_pin_pages(unsigned long vaddr, long npage,
 	for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
 		unsigned long pfn = 0;
 
-		ret = vaddr_get_pfn(vaddr, prot, &pfn);
+		ret = vaddr_get_pfn(NULL, vaddr, prot, &pfn);
 		if (ret)
 			break;
 
@@ -303,8 +437,10 @@  static long vfio_pin_pages(unsigned long vaddr, long npage,
 			break;
 		}
 
+		lock_acct += vfio_pfn_account(iommu, pfn);
+
 		if (!rsvd && !lock_cap &&
-		    current->mm->locked_vm + i + 1 > limit) {
+		    current->mm->locked_vm + lock_acct > limit) {
 			put_pfn(pfn, prot);
 			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
 				__func__, limit << PAGE_SHIFT);
@@ -313,23 +449,216 @@  static long vfio_pin_pages(unsigned long vaddr, long npage,
 	}
 
 	if (!rsvd)
-		vfio_lock_acct(i);
+		vfio_lock_acct(current, lock_acct);
 
 	return i;
 }
 
-static long vfio_unpin_pages(unsigned long pfn, long npage,
-			     int prot, bool do_accounting)
+static long __vfio_unpin_pages_remote(struct vfio_iommu *iommu,
+				      unsigned long pfn, long npage, int prot,
+				      bool do_accounting)
 {
-	unsigned long unlocked = 0;
+	unsigned long unlocked = 0, unlock_acct = 0;
 	long i;
 
-	for (i = 0; i < npage; i++)
+	for (i = 0; i < npage; i++) {
+		if (do_accounting)
+			unlock_acct += vfio_pfn_account(iommu, pfn);
+
 		unlocked += put_pfn(pfn++, prot);
+	}
 
 	if (do_accounting)
-		vfio_lock_acct(-unlocked);
+		vfio_lock_acct(current, -unlock_acct);
+
+	return unlocked;
+}
+
+static long __vfio_pin_page_local(struct vfio_domain *domain,
+				  unsigned long vaddr, int prot,
+				  unsigned long *pfn_base,
+				  bool do_accounting)
+{
+	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	bool lock_cap = capable(CAP_IPC_LOCK);
+	long ret;
+	bool rsvd;
+	struct task_struct *task = domain->local_addr_space->task;
+
+	if (!task->mm)
+		return -ENODEV;
+
+	ret = vaddr_get_pfn(task->mm, vaddr, prot, pfn_base);
+	if (ret)
+		return ret;
+
+	rsvd = is_invalid_reserved_pfn(*pfn_base);
+
+	if (!rsvd && !lock_cap && task->mm->locked_vm + 1 > limit) {
+		put_pfn(*pfn_base, prot);
+		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
+			limit << PAGE_SHIFT);
+		return -ENOMEM;
+	}
+
+	if (!rsvd && do_accounting)
+		vfio_lock_acct(task, 1);
+
+	return 1;
+}
+
+static void __vfio_unpin_page_local(struct vfio_domain *domain,
+				    unsigned long pfn, int prot,
+				    bool do_accounting)
+{
+	put_pfn(pfn, prot);
+
+	if (do_accounting)
+		vfio_lock_acct(domain->local_addr_space->task, -1);
+}
+
+static int vfio_unpin_pfn(struct vfio_domain *domain,
+			  struct vfio_pfn *vpfn, bool do_accounting)
+{
+	__vfio_unpin_page_local(domain, vpfn->pfn, vpfn->prot,
+				do_accounting);
+
+	if (atomic_dec_and_test(&vpfn->ref_count))
+		vfio_remove_from_pfn_list(domain, vpfn);
+
+	return 1;
+}
+
+static long vfio_iommu_type1_pin_pages(void *iommu_data,
+				       unsigned long *user_pfn,
+				       long npage, int prot,
+				       unsigned long *phys_pfn)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	struct vfio_domain *domain;
+	int i, j, ret;
+	long retpage;
+	unsigned long remote_vaddr;
+	unsigned long *pfn = phys_pfn;
+	struct vfio_dma *dma;
+	bool do_accounting;
+
+	if (!iommu || !user_pfn || !phys_pfn)
+		return -EINVAL;
+
+	mutex_lock(&iommu->lock);
+
+	if (!iommu->local_domain) {
+		ret = -EINVAL;
+		goto pin_done;
+	}
+
+	domain = iommu->local_domain;
+
+	/*
+	 * If an iommu capable domain exists in the container then all pages are
+	 * already pinned and accounted. Accounting should be done if there is no
+	 * iommu capable domain in the container.
+	 */
+	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
+
+	for (i = 0; i < npage; i++) {
+		struct vfio_pfn *p;
+		dma_addr_t iova;
+
+		iova = user_pfn[i] << PAGE_SHIFT;
+
+		dma = vfio_find_dma(iommu, iova, 0);
+		if (!dma) {
+			ret = -EINVAL;
+			goto pin_unwind;
+		}
+
+		remote_vaddr = dma->vaddr + iova - dma->iova;
+
+		retpage = __vfio_pin_page_local(domain, remote_vaddr, prot,
+						&pfn[i], do_accounting);
+		if (retpage <= 0) {
+			WARN_ON(!retpage);
+			ret = (int)retpage;
+			goto pin_unwind;
+		}
+
+		mutex_lock(&domain->local_addr_space->pfn_list_lock);
+
+		/* search if pfn exist */
+		p = vfio_find_pfn(domain, pfn[i]);
+		if (p) {
+			atomic_inc(&p->ref_count);
+			mutex_unlock(&domain->local_addr_space->pfn_list_lock);
+			continue;
+		}
+
+		ret = vfio_add_to_pfn_list(domain, remote_vaddr, iova,
+					   pfn[i], prot);
+		mutex_unlock(&domain->local_addr_space->pfn_list_lock);
+
+		if (ret) {
+			__vfio_unpin_page_local(domain, pfn[i], prot,
+						do_accounting);
+			goto pin_unwind;
+		}
+	}
+
+	ret = i;
+	goto pin_done;
+
+pin_unwind:
+	pfn[i] = 0;
+	mutex_lock(&domain->local_addr_space->pfn_list_lock);
+	for (j = 0; j < i; j++) {
+		struct vfio_pfn *p;
+
+		p = vfio_find_pfn(domain, pfn[j]);
+		if (p)
+			vfio_unpin_pfn(domain, p, do_accounting);
+
+		pfn[j] = 0;
+	}
+	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
+
+pin_done:
+	mutex_unlock(&iommu->lock);
+	return ret;
+}
+
+static long vfio_iommu_type1_unpin_pages(void *iommu_data, unsigned long *pfn,
+					 long npage)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	struct vfio_domain *domain = NULL;
+	bool do_accounting;
+	long unlocked = 0;
+	int i;
+
+	if (!iommu || !pfn)
+		return -EINVAL;
+
+	mutex_lock(&iommu->lock);
+
+	domain = iommu->local_domain;
+
+	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
+
+	mutex_lock(&domain->local_addr_space->pfn_list_lock);
+
+	for (i = 0; i < npage; i++) {
+		struct vfio_pfn *p;
 
+		/* verify if pfn exist in pfn_list */
+		p = vfio_find_pfn(domain, pfn[i]);
+		if (p)
+			unlocked += vfio_unpin_pfn(domain, p, do_accounting);
+
+	}
+	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
+
+	mutex_unlock(&iommu->lock);
 	return unlocked;
 }
 
@@ -341,6 +670,10 @@  static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 
 	if (!dma->size)
 		return;
+
+	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
+		return;
+
 	/*
 	 * We use the IOMMU to track the physical addresses, otherwise we'd
 	 * need a much more complicated tracking system.  Unfortunately that
@@ -382,15 +715,16 @@  static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 		if (WARN_ON(!unmapped))
 			break;
 
-		unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
-					     unmapped >> PAGE_SHIFT,
-					     dma->prot, false);
+		unlocked += __vfio_unpin_pages_remote(iommu, phys >> PAGE_SHIFT,
+						      unmapped >> PAGE_SHIFT,
+						      dma->prot, false);
 		iova += unmapped;
 
 		cond_resched();
 	}
 
-	vfio_lock_acct(-unlocked);
+	dma->iommu_mapped = false;
+	vfio_lock_acct(current, -unlocked);
 }
 
 static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
@@ -558,17 +892,57 @@  unwind:
 	return ret;
 }
 
+static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
+			    size_t map_size)
+{
+	dma_addr_t iova = dma->iova;
+	unsigned long vaddr = dma->vaddr;
+	size_t size = map_size;
+	long npage;
+	unsigned long pfn;
+	int ret = 0;
+
+	while (size) {
+		/* Pin a contiguous chunk of memory */
+		npage = __vfio_pin_pages_remote(iommu, vaddr + dma->size,
+						size >> PAGE_SHIFT, dma->prot,
+						&pfn);
+		if (npage <= 0) {
+			WARN_ON(!npage);
+			ret = (int)npage;
+			break;
+		}
+
+		/* Map it! */
+		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
+				     dma->prot);
+		if (ret) {
+			__vfio_unpin_pages_remote(iommu, pfn, npage, dma->prot,
+						  true);
+			break;
+		}
+
+		size -= npage << PAGE_SHIFT;
+		dma->size += npage << PAGE_SHIFT;
+	}
+
+	dma->iommu_mapped = true;
+
+	if (ret)
+		vfio_remove_dma(iommu, dma);
+
+	return ret;
+}
+
 static int vfio_dma_do_map(struct vfio_iommu *iommu,
 			   struct vfio_iommu_type1_dma_map *map)
 {
 	dma_addr_t iova = map->iova;
 	unsigned long vaddr = map->vaddr;
 	size_t size = map->size;
-	long npage;
 	int ret = 0, prot = 0;
 	uint64_t mask;
 	struct vfio_dma *dma;
-	unsigned long pfn;
 
 	/* Verify that none of our __u64 fields overflow */
 	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
@@ -611,29 +985,11 @@  static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	/* Insert zero-sized and grow as we map chunks of it */
 	vfio_link_dma(iommu, dma);
 
-	while (size) {
-		/* Pin a contiguous chunk of memory */
-		npage = vfio_pin_pages(vaddr + dma->size,
-				       size >> PAGE_SHIFT, prot, &pfn);
-		if (npage <= 0) {
-			WARN_ON(!npage);
-			ret = (int)npage;
-			break;
-		}
-
-		/* Map it! */
-		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
-		if (ret) {
-			vfio_unpin_pages(pfn, npage, prot, true);
-			break;
-		}
-
-		size -= npage << PAGE_SHIFT;
-		dma->size += npage << PAGE_SHIFT;
-	}
-
-	if (ret)
-		vfio_remove_dma(iommu, dma);
+	/* Don't pin and map if container doesn't contain IOMMU capable domain */
+	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
+		dma->size = size;
+	else
+		ret = vfio_pin_map_dma(iommu, dma, size);
 
 	mutex_unlock(&iommu->lock);
 	return ret;
@@ -662,10 +1018,6 @@  static int vfio_iommu_replay(struct vfio_iommu *iommu,
 	d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
 	n = rb_first(&iommu->dma_list);
 
-	/* If there's not a domain, there better not be any mappings */
-	if (WARN_ON(n && !d))
-		return -EINVAL;
-
 	for (; n; n = rb_next(n)) {
 		struct vfio_dma *dma;
 		dma_addr_t iova;
@@ -674,20 +1026,43 @@  static int vfio_iommu_replay(struct vfio_iommu *iommu,
 		iova = dma->iova;
 
 		while (iova < dma->iova + dma->size) {
-			phys_addr_t phys = iommu_iova_to_phys(d->domain, iova);
+			phys_addr_t phys;
 			size_t size;
 
-			if (WARN_ON(!phys)) {
-				iova += PAGE_SIZE;
-				continue;
-			}
+			if (dma->iommu_mapped) {
+				phys = iommu_iova_to_phys(d->domain, iova);
+
+				if (WARN_ON(!phys)) {
+					iova += PAGE_SIZE;
+					continue;
+				}
 
-			size = PAGE_SIZE;
+				size = PAGE_SIZE;
 
-			while (iova + size < dma->iova + dma->size &&
-			       phys + size == iommu_iova_to_phys(d->domain,
+				while (iova + size < dma->iova + dma->size &&
+				    phys + size == iommu_iova_to_phys(d->domain,
 								 iova + size))
-				size += PAGE_SIZE;
+					size += PAGE_SIZE;
+			} else {
+				unsigned long pfn;
+				unsigned long vaddr = dma->vaddr +
+						     (iova - dma->iova);
+				size_t n = dma->iova + dma->size - iova;
+				long npage;
+
+				npage = __vfio_pin_pages_remote(iommu, vaddr,
+								n >> PAGE_SHIFT,
+								dma->prot,
+								&pfn);
+				if (npage <= 0) {
+					WARN_ON(!npage);
+					ret = (int)npage;
+					return ret;
+				}
+
+				phys = pfn << PAGE_SHIFT;
+				size = npage << PAGE_SHIFT;
+			}
 
 			ret = iommu_map(domain->domain, iova, phys,
 					size, dma->prot | domain->prot);
@@ -696,6 +1071,8 @@  static int vfio_iommu_replay(struct vfio_iommu *iommu,
 
 			iova += size;
 		}
+
+		dma->iommu_mapped = true;
 	}
 
 	return 0;
@@ -734,11 +1111,24 @@  static void vfio_test_domain_fgsp(struct vfio_domain *domain)
 	__free_pages(pages, order);
 }
 
+static struct vfio_group *find_iommu_group(struct vfio_domain *domain,
+				   struct iommu_group *iommu_group)
+{
+	struct vfio_group *g;
+
+	list_for_each_entry(g, &domain->group_list, next) {
+		if (g->iommu_group == iommu_group)
+			return g;
+	}
+
+	return NULL;
+}
+
 static int vfio_iommu_type1_attach_group(void *iommu_data,
 					 struct iommu_group *iommu_group)
 {
 	struct vfio_iommu *iommu = iommu_data;
-	struct vfio_group *group, *g;
+	struct vfio_group *group;
 	struct vfio_domain *domain, *d;
 	struct bus_type *bus = NULL;
 	int ret;
@@ -746,10 +1136,14 @@  static int vfio_iommu_type1_attach_group(void *iommu_data,
 	mutex_lock(&iommu->lock);
 
 	list_for_each_entry(d, &iommu->domain_list, next) {
-		list_for_each_entry(g, &d->group_list, next) {
-			if (g->iommu_group != iommu_group)
-				continue;
+		if (find_iommu_group(d, iommu_group)) {
+			mutex_unlock(&iommu->lock);
+			return -EINVAL;
+		}
+	}
 
+	if (iommu->local_domain) {
+		if (find_iommu_group(iommu->local_domain, iommu_group)) {
 			mutex_unlock(&iommu->lock);
 			return -EINVAL;
 		}
@@ -769,6 +1163,30 @@  static int vfio_iommu_type1_attach_group(void *iommu_data,
 	if (ret)
 		goto out_free;
 
+	if (IS_ENABLED(CONFIG_VFIO_MDEV) && !iommu_present(bus) &&
+	    (bus == &mdev_bus_type)) {
+		if (!iommu->local_domain) {
+			domain->local_addr_space =
+				kzalloc(sizeof(*domain->local_addr_space),
+						GFP_KERNEL);
+			if (!domain->local_addr_space) {
+				ret = -ENOMEM;
+				goto out_free;
+			}
+
+			domain->local_addr_space->task = current;
+			INIT_LIST_HEAD(&domain->group_list);
+			domain->local_addr_space->pfn_list = RB_ROOT;
+			mutex_init(&domain->local_addr_space->pfn_list_lock);
+			iommu->local_domain = domain;
+		} else
+			kfree(domain);
+
+		list_add(&group->next, &domain->group_list);
+		mutex_unlock(&iommu->lock);
+		return 0;
+	}
+
 	domain->domain = iommu_domain_alloc(bus);
 	if (!domain->domain) {
 		ret = -EIO;
@@ -859,6 +1277,41 @@  static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
 		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
 }
 
+static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
+{
+	struct vfio_domain *domain = iommu->local_domain;
+	struct vfio_dma *dma, *tdma;
+	struct rb_node *n;
+	long locked = 0;
+
+	rbtree_postorder_for_each_entry_safe(dma, tdma, &iommu->dma_list,
+					     node) {
+		vfio_unmap_unpin(iommu, dma);
+	}
+
+	mutex_lock(&domain->local_addr_space->pfn_list_lock);
+
+	n = rb_first(&domain->local_addr_space->pfn_list);
+
+	for (; n; n = rb_next(n))
+		locked++;
+
+	vfio_lock_acct(domain->local_addr_space->task, locked);
+	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
+}
+
+static void vfio_local_unpin_all(struct vfio_domain *domain)
+{
+	struct rb_node *node;
+
+	mutex_lock(&domain->local_addr_space->pfn_list_lock);
+	while ((node = rb_first(&domain->local_addr_space->pfn_list)))
+		vfio_unpin_pfn(domain,
+				rb_entry(node, struct vfio_pfn, node), false);
+
+	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
+}
+
 static void vfio_iommu_type1_detach_group(void *iommu_data,
 					  struct iommu_group *iommu_group)
 {
@@ -868,31 +1321,57 @@  static void vfio_iommu_type1_detach_group(void *iommu_data,
 
 	mutex_lock(&iommu->lock);
 
-	list_for_each_entry(domain, &iommu->domain_list, next) {
-		list_for_each_entry(group, &domain->group_list, next) {
-			if (group->iommu_group != iommu_group)
-				continue;
-
-			iommu_detach_group(domain->domain, iommu_group);
+	if (iommu->local_domain) {
+		domain = iommu->local_domain;
+		group = find_iommu_group(domain, iommu_group);
+		if (group) {
 			list_del(&group->next);
 			kfree(group);
-			/*
-			 * Group ownership provides privilege, if the group
-			 * list is empty, the domain goes away.  If it's the
-			 * last domain, then all the mappings go away too.
-			 */
+
 			if (list_empty(&domain->group_list)) {
-				if (list_is_singular(&iommu->domain_list))
+				vfio_local_unpin_all(domain);
+				if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
 					vfio_iommu_unmap_unpin_all(iommu);
-				iommu_domain_free(domain->domain);
-				list_del(&domain->next);
 				kfree(domain);
+				iommu->local_domain = NULL;
+			}
+			goto detach_group_done;
+		}
+	}
+
+	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
+		goto detach_group_done;
+
+	list_for_each_entry(domain, &iommu->domain_list, next) {
+		group = find_iommu_group(domain, iommu_group);
+		if (!group)
+			continue;
+
+		iommu_detach_group(domain->domain, iommu_group);
+		list_del(&group->next);
+		kfree(group);
+		/*
+		 * Group ownership provides privilege, if the group list is
+		 * empty, the domain goes away. If it's the last domain with
+		 * iommu and local domain doesn't exist, then all the mappings
+		 * go away too. If it's the last domain with iommu and local
+		 * domain exists, update accounting
+		 */
+		if (list_empty(&domain->group_list)) {
+			if (list_is_singular(&iommu->domain_list)) {
+				if (!iommu->local_domain)
+					vfio_iommu_unmap_unpin_all(iommu);
+				else
+					vfio_iommu_unmap_unpin_reaccount(iommu);
 			}
-			goto done;
+			iommu_domain_free(domain->domain);
+			list_del(&domain->next);
+			kfree(domain);
 		}
+		break;
 	}
 
-done:
+detach_group_done:
 	mutex_unlock(&iommu->lock);
 }
 
@@ -924,27 +1403,48 @@  static void *vfio_iommu_type1_open(unsigned long arg)
 	return iommu;
 }
 
+static void vfio_release_domain(struct vfio_domain *domain)
+{
+	struct vfio_group *group, *group_tmp;
+
+	list_for_each_entry_safe(group, group_tmp,
+				 &domain->group_list, next) {
+		if (!domain->local_addr_space)
+			iommu_detach_group(domain->domain, group->iommu_group);
+		list_del(&group->next);
+		kfree(group);
+	}
+
+	if (domain->local_addr_space)
+		vfio_local_unpin_all(domain);
+	else
+		iommu_domain_free(domain->domain);
+}
+
 static void vfio_iommu_type1_release(void *iommu_data)
 {
 	struct vfio_iommu *iommu = iommu_data;
 	struct vfio_domain *domain, *domain_tmp;
-	struct vfio_group *group, *group_tmp;
+
+	if (iommu->local_domain) {
+		vfio_release_domain(iommu->local_domain);
+		kfree(iommu->local_domain);
+		iommu->local_domain = NULL;
+	}
 
 	vfio_iommu_unmap_unpin_all(iommu);
 
+	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
+		goto release_exit;
+
 	list_for_each_entry_safe(domain, domain_tmp,
 				 &iommu->domain_list, next) {
-		list_for_each_entry_safe(group, group_tmp,
-					 &domain->group_list, next) {
-			iommu_detach_group(domain->domain, group->iommu_group);
-			list_del(&group->next);
-			kfree(group);
-		}
-		iommu_domain_free(domain->domain);
+		vfio_release_domain(domain);
 		list_del(&domain->next);
 		kfree(domain);
 	}
 
+release_exit:
 	kfree(iommu);
 }
 
@@ -1048,6 +1548,8 @@  static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
 	.ioctl		= vfio_iommu_type1_ioctl,
 	.attach_group	= vfio_iommu_type1_attach_group,
 	.detach_group	= vfio_iommu_type1_detach_group,
+	.pin_pages	= vfio_iommu_type1_pin_pages,
+	.unpin_pages	= vfio_iommu_type1_unpin_pages,
 };
 
 static int __init vfio_iommu_type1_init(void)
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 0ecae0b1cd34..0bd25ba6223d 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -17,6 +17,7 @@ 
 #include <linux/workqueue.h>
 #include <linux/poll.h>
 #include <uapi/linux/vfio.h>
+#include <linux/mdev.h>
 
 /**
  * struct vfio_device_ops - VFIO bus driver device callbacks
@@ -75,7 +76,11 @@  struct vfio_iommu_driver_ops {
 					struct iommu_group *group);
 	void		(*detach_group)(void *iommu_data,
 					struct iommu_group *group);
-
+	long		(*pin_pages)(void *iommu_data, unsigned long *user_pfn,
+				     long npage, int prot,
+				     unsigned long *phys_pfn);
+	long		(*unpin_pages)(void *iommu_data, unsigned long *pfn,
+				       long npage);
 };
 
 extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
@@ -127,6 +132,12 @@  static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
 }
 #endif /* CONFIG_EEH */
 
+extern long vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
+			   long npage, int prot, unsigned long *phys_pfn);
+
+extern long vfio_unpin_pages(struct device *dev, unsigned long *pfn,
+			     long npage);
+
 /*
  * IRQfd - generic
  */