
[v2,13/40] vfio: Add support for Shared Virtual Addressing

Message ID 20180511190641.23008-14-jean-philippe.brucker@arm.com
State Not Applicable
Series Shared Virtual Addressing for the IOMMU

Commit Message

Jean-Philippe Brucker May 11, 2018, 7:06 p.m. UTC
Add two new ioctls for VFIO containers. VFIO_IOMMU_BIND_PROCESS creates a
bond between a container and a process address space, identified by a
Process Address Space ID (PASID). Devices in the container append this
PASID to DMA transactions in order to access the process' address space.
The process page tables are shared with the IOMMU, and mechanisms such as
PCI ATS/PRI are used to handle faults. VFIO_IOMMU_UNBIND_PROCESS removes a
bond created with VFIO_IOMMU_BIND_PROCESS.
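
For illustration only, a minimal hypothetical userspace sequence using these
ioctls could look as follows. It assumes this series is applied so that
<linux/vfio.h> provides VFIO_IOMMU_BIND and the bind structures, and that the
container fd has already gone through VFIO_SET_IOMMU with the device's group
attached; bind_self() is a made-up helper and error handling is minimal:

    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    static int bind_self(int container, __u32 *pasid)
    {
        /* vfio_iommu_type1_bind header followed by the bind_process payload */
        __u8 buf[sizeof(struct vfio_iommu_type1_bind) +
                 sizeof(struct vfio_iommu_type1_bind_process)]
                 __attribute__((aligned(8))) = { 0 };
        struct vfio_iommu_type1_bind *bind = (void *)buf;
        struct vfio_iommu_type1_bind_process *proc = (void *)bind->data;

        bind->argsz = sizeof(buf);
        bind->flags = VFIO_IOMMU_BIND_PROCESS;
        proc->flags = 0;        /* bind the current task, not a foreign pid */

        if (ioctl(container, VFIO_IOMMU_BIND, bind))
            return -1;

        *pasid = proc->pasid;   /* devices tag their DMA with this PASID */
        return 0;
    }

Unbinding would pass the same structure to VFIO_IOMMU_UNBIND.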

Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>

---
v1->v2:
* Simplify mm_exit
* Can be built as module again
---
 drivers/vfio/vfio_iommu_type1.c | 449 ++++++++++++++++++++++++++++++--
 include/uapi/linux/vfio.h       |  76 ++++++
 2 files changed, 504 insertions(+), 21 deletions(-)

Comments

Jonathan Cameron May 17, 2018, 3:58 p.m. UTC | #1
On Fri, 11 May 2018 20:06:14 +0100
Jean-Philippe Brucker <jean-philippe.brucker@arm.com> wrote:

> Add two new ioctls for VFIO containers. VFIO_IOMMU_BIND_PROCESS creates a
> bond between a container and a process address space, identified by a
> Process Address Space ID (PASID). Devices in the container append this
> PASID to DMA transactions in order to access the process' address space.
> The process page tables are shared with the IOMMU, and mechanisms such as
> PCI ATS/PRI are used to handle faults. VFIO_IOMMU_UNBIND_PROCESS removes a
> bond created with VFIO_IOMMU_BIND_PROCESS.
> 
> Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>

A couple of small comments inline..

Jonathan

> 
> ---
> v1->v2:
> * Simplify mm_exit
> * Can be built as module again
> ---
>  drivers/vfio/vfio_iommu_type1.c | 449 ++++++++++++++++++++++++++++++--
>  include/uapi/linux/vfio.h       |  76 ++++++
>  2 files changed, 504 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 5c212bf29640..2902774062b8 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -30,6 +30,7 @@
>  #include <linux/iommu.h>
>  #include <linux/module.h>
>  #include <linux/mm.h>
> +#include <linux/ptrace.h>
>  #include <linux/rbtree.h>
>  #include <linux/sched/signal.h>
>  #include <linux/sched/mm.h>
> @@ -60,6 +61,7 @@ MODULE_PARM_DESC(disable_hugepages,
>  
>  struct vfio_iommu {
>  	struct list_head	domain_list;
> +	struct list_head	mm_list;
>  	struct vfio_domain	*external_domain; /* domain for external user */
>  	struct mutex		lock;
>  	struct rb_root		dma_list;
> @@ -90,6 +92,14 @@ struct vfio_dma {
>  struct vfio_group {
>  	struct iommu_group	*iommu_group;
>  	struct list_head	next;
> +	bool			sva_enabled;
> +};
> +
> +struct vfio_mm {
> +#define VFIO_PASID_INVALID	(-1)
> +	int			pasid;
> +	struct mm_struct	*mm;
> +	struct list_head	next;
>  };
>  
>  /*
> @@ -1238,6 +1248,164 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
>  	return 0;
>  }
>  
> +static int vfio_iommu_mm_exit(struct device *dev, int pasid, void *data)
> +{
> +	struct vfio_mm *vfio_mm;
> +	struct vfio_iommu *iommu = data;
> +
> +	mutex_lock(&iommu->lock);
> +	list_for_each_entry(vfio_mm, &iommu->mm_list, next) {
> +		if (vfio_mm->pasid == pasid) {
> +			list_del(&vfio_mm->next);
> +			kfree(vfio_mm);
> +			break;
> +		}
> +	}
> +	mutex_unlock(&iommu->lock);
> +
> +	return 0;
> +}
> +
> +static int vfio_iommu_sva_init(struct device *dev, void *data)
> +{
> +	return iommu_sva_device_init(dev, IOMMU_SVA_FEAT_IOPF, 0,
> +				     vfio_iommu_mm_exit);
> +}
> +
> +static int vfio_iommu_sva_shutdown(struct device *dev, void *data)
> +{
> +	iommu_sva_device_shutdown(dev);
> +
> +	return 0;
> +}
> +
> +struct vfio_iommu_sva_bind_data {
> +	struct vfio_mm *vfio_mm;
> +	struct vfio_iommu *iommu;
> +	int count;
> +};
> +
> +static int vfio_iommu_sva_bind_dev(struct device *dev, void *data)
> +{
> +	struct vfio_iommu_sva_bind_data *bind_data = data;
> +
> +	/* Multi-device groups aren't supported for SVA */
> +	if (bind_data->count++)
> +		return -EINVAL;
> +
> +	return __iommu_sva_bind_device(dev, bind_data->vfio_mm->mm,
> +				       &bind_data->vfio_mm->pasid,
> +				       IOMMU_SVA_FEAT_IOPF, bind_data->iommu);
> +}
> +
> +static int vfio_iommu_sva_unbind_dev(struct device *dev, void *data)
> +{
> +	struct vfio_mm *vfio_mm = data;
> +
> +	return __iommu_sva_unbind_device(dev, vfio_mm->pasid);
> +}
> +
> +static int vfio_iommu_bind_group(struct vfio_iommu *iommu,
> +				 struct vfio_group *group,
> +				 struct vfio_mm *vfio_mm)
> +{
> +	int ret;
> +	bool enabled_sva = false;
> +	struct vfio_iommu_sva_bind_data data = {
> +		.vfio_mm	= vfio_mm,
> +		.iommu		= iommu,
> +		.count		= 0,
> +	};
> +
> +	if (!group->sva_enabled) {
> +		ret = iommu_group_for_each_dev(group->iommu_group, NULL,
> +					       vfio_iommu_sva_init);
> +		if (ret)
> +			return ret;
> +
> +		group->sva_enabled = enabled_sva = true;
> +	}
> +
> +	ret = iommu_group_for_each_dev(group->iommu_group, &data,
> +				       vfio_iommu_sva_bind_dev);
> +	if (ret && data.count > 1)

Are we safe to run this even if data.count == 1?  I assume that at
that point we would not have an iommu domain associated with the
device, so the initial check would error out.

It would just be nice to get rid of the special casing in this error path;
then we could do it all under if (ret) and make it visually clearer that these
are different aspects of the same error path.
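
For example, the flattened error path might look roughly like this (a sketch
only, reusing the patch's helpers; whether the unconditional unbind is safe
when data.count == 1 is exactly the question raised above):

    ret = iommu_group_for_each_dev(group->iommu_group, &data,
                                   vfio_iommu_sva_bind_dev);
    if (ret) {
        iommu_group_for_each_dev(group->iommu_group, vfio_mm,
                                 vfio_iommu_sva_unbind_dev);
        if (enabled_sva) {
            iommu_group_for_each_dev(group->iommu_group, NULL,
                                     vfio_iommu_sva_shutdown);
            group->sva_enabled = false;
        }
    }

    return ret;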


> +		iommu_group_for_each_dev(group->iommu_group, vfio_mm,
> +					 vfio_iommu_sva_unbind_dev);
> +	if (ret && enabled_sva) {
> +		iommu_group_for_each_dev(group->iommu_group, NULL,
> +					 vfio_iommu_sva_shutdown);
> +		group->sva_enabled = false;
> +	}
> +
> +	return ret;
> +}
> +
> +static void vfio_iommu_unbind_group(struct vfio_group *group,
> +				    struct vfio_mm *vfio_mm)
> +{
> +	iommu_group_for_each_dev(group->iommu_group, vfio_mm,
> +				 vfio_iommu_sva_unbind_dev);
> +}
> +
> +static void vfio_iommu_unbind(struct vfio_iommu *iommu,
> +			      struct vfio_mm *vfio_mm)
> +{
> +	struct vfio_group *group;
> +	struct vfio_domain *domain;
> +
> +	list_for_each_entry(domain, &iommu->domain_list, next)
> +		list_for_each_entry(group, &domain->group_list, next)
> +			vfio_iommu_unbind_group(group, vfio_mm);
> +}
> +
> +static int vfio_iommu_replay_bind(struct vfio_iommu *iommu,
> +				  struct vfio_group *group)
> +{
> +	int ret = 0;
> +	struct vfio_mm *vfio_mm;
> +
> +	list_for_each_entry(vfio_mm, &iommu->mm_list, next) {
> +		/*
> +		 * Ensure that mm doesn't exit while we're binding it to the new
> +		 * group. It may already have exited, and the mm_exit notifier
> +		 * might be waiting on the IOMMU mutex to remove this vfio_mm
> +		 * from the list.
> +		 */
> +		if (!mmget_not_zero(vfio_mm->mm))
> +			continue;
> +		ret = vfio_iommu_bind_group(iommu, group, vfio_mm);
> +		/*
> +		 * Use async to avoid triggering an mm_exit callback right away,
> +		 * which would block on the mutex that we're holding.
> +		 */
> +		mmput_async(vfio_mm->mm);
> +
> +		if (ret)
> +			goto out_unbind;
> +	}
> +
> +	return 0;
> +
> +out_unbind:
> +	list_for_each_entry_continue_reverse(vfio_mm, &iommu->mm_list, next)
> +		vfio_iommu_unbind_group(group, vfio_mm);
> +
> +	return ret;
> +}
> +
> +static void vfio_iommu_free_all_mm(struct vfio_iommu *iommu)
> +{
> +	struct vfio_mm *vfio_mm, *next;
> +
> +	/*
> +	 * No need for unbind() here. Since all groups are detached from this
> +	 * iommu, bonds have been removed.
> +	 */
> +	list_for_each_entry_safe(vfio_mm, next, &iommu->mm_list, next)
> +		kfree(vfio_mm);
> +	INIT_LIST_HEAD(&iommu->mm_list);
> +}
> +
>  /*
>   * We change our unmap behavior slightly depending on whether the IOMMU
>   * supports fine-grained superpages.  IOMMUs like AMD-Vi will use a superpage
> @@ -1313,6 +1481,44 @@ static bool vfio_iommu_has_sw_msi(struct iommu_group *group, phys_addr_t *base)
>  	return ret;
>  }
>  
> +static int vfio_iommu_try_attach_group(struct vfio_iommu *iommu,
> +				       struct vfio_group *group,
> +				       struct vfio_domain *cur_domain,
> +				       struct vfio_domain *new_domain)
> +{
> +	struct iommu_group *iommu_group = group->iommu_group;
> +
> +	/*
> +	 * Try to match an existing compatible domain.  We don't want to
> +	 * preclude an IOMMU driver supporting multiple bus_types and being
> +	 * able to include different bus_types in the same IOMMU domain, so
> +	 * we test whether the domains use the same iommu_ops rather than
> +	 * testing if they're on the same bus_type.
> +	 */
> +	if (new_domain->domain->ops != cur_domain->domain->ops ||
> +	    new_domain->prot != cur_domain->prot)
> +		return 1;
> +
> +	iommu_detach_group(cur_domain->domain, iommu_group);
> +
> +	if (iommu_attach_group(new_domain->domain, iommu_group))
> +		goto out_reattach;
> +
> +	if (vfio_iommu_replay_bind(iommu, group))
> +		goto out_detach;
> +
> +	return 0;
> +
> +out_detach:
> +	iommu_detach_group(new_domain->domain, iommu_group);
> +
> +out_reattach:
> +	if (iommu_attach_group(cur_domain->domain, iommu_group))
> +		return -EINVAL;
> +
> +	return 1;
> +}
> +
>  static int vfio_iommu_type1_attach_group(void *iommu_data,
>  					 struct iommu_group *iommu_group)
>  {
> @@ -1410,28 +1616,16 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	if (iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY))
>  		domain->prot |= IOMMU_CACHE;
>  
> -	/*
> -	 * Try to match an existing compatible domain.  We don't want to
> -	 * preclude an IOMMU driver supporting multiple bus_types and being
> -	 * able to include different bus_types in the same IOMMU domain, so
> -	 * we test whether the domains use the same iommu_ops rather than
> -	 * testing if they're on the same bus_type.
> -	 */
>  	list_for_each_entry(d, &iommu->domain_list, next) {
> -		if (d->domain->ops == domain->domain->ops &&
> -		    d->prot == domain->prot) {
> -			iommu_detach_group(domain->domain, iommu_group);
> -			if (!iommu_attach_group(d->domain, iommu_group)) {
> -				list_add(&group->next, &d->group_list);
> -				iommu_domain_free(domain->domain);
> -				kfree(domain);
> -				mutex_unlock(&iommu->lock);
> -				return 0;
> -			}
> -
> -			ret = iommu_attach_group(domain->domain, iommu_group);
> -			if (ret)
> -				goto out_domain;
> +		ret = vfio_iommu_try_attach_group(iommu, group, domain, d);
> +		if (ret < 0) {
> +			goto out_domain;
> +		} else if (!ret) {
> +			list_add(&group->next, &d->group_list);
> +			iommu_domain_free(domain->domain);
> +			kfree(domain);
> +			mutex_unlock(&iommu->lock);
> +			return 0;
>  		}
>  	}
>  
> @@ -1442,6 +1636,10 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	if (ret)
>  		goto out_detach;
>  
> +	ret = vfio_iommu_replay_bind(iommu, group);
> +	if (ret)
> +		goto out_detach;
> +
>  	if (resv_msi) {
>  		ret = iommu_get_msi_cookie(domain->domain, resv_msi_base);
>  		if (ret)
> @@ -1547,6 +1745,11 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
>  			continue;
>  
>  		iommu_detach_group(domain->domain, iommu_group);
> +		if (group->sva_enabled) {
> +			iommu_group_for_each_dev(iommu_group, NULL,
> +						 vfio_iommu_sva_shutdown);
> +			group->sva_enabled = false;
> +		}
>  		list_del(&group->next);
>  		kfree(group);
>  		/*
> @@ -1562,6 +1765,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
>  					vfio_iommu_unmap_unpin_all(iommu);
>  				else
>  					vfio_iommu_unmap_unpin_reaccount(iommu);
> +				vfio_iommu_free_all_mm(iommu);
>  			}
>  			iommu_domain_free(domain->domain);
>  			list_del(&domain->next);
> @@ -1596,6 +1800,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)
>  	}
>  
>  	INIT_LIST_HEAD(&iommu->domain_list);
> +	INIT_LIST_HEAD(&iommu->mm_list);
>  	iommu->dma_list = RB_ROOT;
>  	mutex_init(&iommu->lock);
>  	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
> @@ -1630,6 +1835,7 @@ static void vfio_iommu_type1_release(void *iommu_data)
>  		kfree(iommu->external_domain);
>  	}
>  
> +	vfio_iommu_free_all_mm(iommu);
>  	vfio_iommu_unmap_unpin_all(iommu);
>  
>  	list_for_each_entry_safe(domain, domain_tmp,
> @@ -1658,6 +1864,169 @@ static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
>  	return ret;
>  }
>  
> +static struct mm_struct *vfio_iommu_get_mm_by_vpid(pid_t vpid)
> +{
> +	struct mm_struct *mm;
> +	struct task_struct *task;
> +
> +	task = find_get_task_by_vpid(vpid);
> +	if (!task)
> +		return ERR_PTR(-ESRCH);
> +
> +	/* Ensure that current has RW access on the mm */
> +	mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS);
> +	put_task_struct(task);
> +
> +	if (!mm)
> +		return ERR_PTR(-ESRCH);
> +
> +	return mm;
> +}
> +
> +static long vfio_iommu_type1_bind_process(struct vfio_iommu *iommu,
> +					  void __user *arg,
> +					  struct vfio_iommu_type1_bind *bind)
> +{
> +	struct vfio_iommu_type1_bind_process params;
> +	struct vfio_domain *domain;
> +	struct vfio_group *group;
> +	struct vfio_mm *vfio_mm;
> +	struct mm_struct *mm;
> +	unsigned long minsz;
> +	int ret = 0;
> +
> +	minsz = sizeof(*bind) + sizeof(params);
> +	if (bind->argsz < minsz)
> +		return -EINVAL;
> +
> +	arg += sizeof(*bind);
> +	if (copy_from_user(&params, arg, sizeof(params)))
> +		return -EFAULT;
> +
> +	if (params.flags & ~VFIO_IOMMU_BIND_PID)
> +		return -EINVAL;
> +
> +	if (params.flags & VFIO_IOMMU_BIND_PID) {
> +		mm = vfio_iommu_get_mm_by_vpid(params.pid);
> +		if (IS_ERR(mm))
> +			return PTR_ERR(mm);
> +	} else {
> +		mm = get_task_mm(current);
> +		if (!mm)
> +			return -EINVAL;
> +	}
> +
> +	mutex_lock(&iommu->lock);
> +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> +		ret = -EINVAL;
> +		goto out_unlock;
> +	}
> +
> +	list_for_each_entry(vfio_mm, &iommu->mm_list, next) {
> +		if (vfio_mm->mm == mm) {
> +			params.pasid = vfio_mm->pasid;
> +			ret = copy_to_user(arg, &params, sizeof(params)) ?
> +				-EFAULT : 0;
> +			goto out_unlock;
> +		}
> +	}
> +
> +	vfio_mm = kzalloc(sizeof(*vfio_mm), GFP_KERNEL);
> +	if (!vfio_mm) {
> +		ret = -ENOMEM;
> +		goto out_unlock;
> +	}
> +
> +	vfio_mm->mm = mm;
> +	vfio_mm->pasid = VFIO_PASID_INVALID;
> +
> +	list_for_each_entry(domain, &iommu->domain_list, next) {
> +		list_for_each_entry(group, &domain->group_list, next) {
> +			ret = vfio_iommu_bind_group(iommu, group, vfio_mm);
> +			if (ret)
> +				goto out_unbind;
> +		}
> +	}
> +
> +	list_add(&vfio_mm->next, &iommu->mm_list);
> +
> +	params.pasid = vfio_mm->pasid;
> +	ret = copy_to_user(arg, &params, sizeof(params)) ? -EFAULT : 0;
> +	if (ret)
> +		goto out_delete;
> +
> +	mutex_unlock(&iommu->lock);
> +	mmput(mm);
> +	return 0;
> +
> +out_delete:
> +	list_del(&vfio_mm->next);
> +
> +out_unbind:
> +	/* Undo all binds that already succeeded */
> +	vfio_iommu_unbind(iommu, vfio_mm);
> +	kfree(vfio_mm);
> +
> +out_unlock:
> +	mutex_unlock(&iommu->lock);
> +	mmput(mm);
> +	return ret;
> +}
> +
> +static long vfio_iommu_type1_unbind_process(struct vfio_iommu *iommu,
> +					    void __user *arg,
> +					    struct vfio_iommu_type1_bind *bind)
> +{
> +	int ret = -EINVAL;
> +	unsigned long minsz;
> +	struct mm_struct *mm;
> +	struct vfio_mm *vfio_mm;
> +	struct vfio_iommu_type1_bind_process params;
> +
> +	minsz = sizeof(*bind) + sizeof(params);
> +	if (bind->argsz < minsz)
> +		return -EINVAL;
> +
> +	arg += sizeof(*bind);
> +	if (copy_from_user(&params, arg, sizeof(params)))
> +		return -EFAULT;
> +
> +	if (params.flags & ~VFIO_IOMMU_BIND_PID)
> +		return -EINVAL;
> +
> +	/*
> +	 * We can't simply call unbind with the PASID, because the process might
> +	 * have died and the PASID might have been reallocated to another
> +	 * process. Instead we need to fetch that process mm by PID again to
> +	 * make sure we remove the right vfio_mm.
> +	 */
> +	if (params.flags & VFIO_IOMMU_BIND_PID) {
> +		mm = vfio_iommu_get_mm_by_vpid(params.pid);
> +		if (IS_ERR(mm))
> +			return PTR_ERR(mm);
> +	} else {
> +		mm = get_task_mm(current);
> +		if (!mm)
> +			return -EINVAL;
> +	}
> +
> +	ret = -ESRCH;
> +	mutex_lock(&iommu->lock);
> +	list_for_each_entry(vfio_mm, &iommu->mm_list, next) {
> +		if (vfio_mm->mm == mm) {
> +			vfio_iommu_unbind(iommu, vfio_mm);
> +			list_del(&vfio_mm->next);
> +			kfree(vfio_mm);
> +			ret = 0;
> +			break;
> +		}
> +	}
> +	mutex_unlock(&iommu->lock);
> +	mmput(mm);
> +
> +	return ret;
> +}
> +
>  static long vfio_iommu_type1_ioctl(void *iommu_data,
>  				   unsigned int cmd, unsigned long arg)
>  {
> @@ -1728,6 +2097,44 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>  
>  		return copy_to_user((void __user *)arg, &unmap, minsz) ?
>  			-EFAULT : 0;
> +
> +	} else if (cmd == VFIO_IOMMU_BIND) {
> +		struct vfio_iommu_type1_bind bind;
> +
> +		minsz = offsetofend(struct vfio_iommu_type1_bind, flags);
> +
> +		if (copy_from_user(&bind, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (bind.argsz < minsz)
> +			return -EINVAL;
> +
> +		switch (bind.flags) {
> +		case VFIO_IOMMU_BIND_PROCESS:
> +			return vfio_iommu_type1_bind_process(iommu, (void *)arg,
> +							     &bind);

Can we combine these two blocks given it is only this case statement that is different?
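
For illustration, a combined handler (sketch only, not part of the posted
patch) might look like:

    } else if (cmd == VFIO_IOMMU_BIND || cmd == VFIO_IOMMU_UNBIND) {
        struct vfio_iommu_type1_bind bind;

        minsz = offsetofend(struct vfio_iommu_type1_bind, flags);

        if (copy_from_user(&bind, (void __user *)arg, minsz))
            return -EFAULT;

        if (bind.argsz < minsz)
            return -EINVAL;

        switch (bind.flags) {
        case VFIO_IOMMU_BIND_PROCESS:
            if (cmd == VFIO_IOMMU_BIND)
                return vfio_iommu_type1_bind_process(iommu, (void *)arg,
                                                     &bind);
            return vfio_iommu_type1_unbind_process(iommu, (void *)arg,
                                                   &bind);
        default:
            return -EINVAL;
        }
    }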

> +		default:
> +			return -EINVAL;
> +		}
> +
> +	} else if (cmd == VFIO_IOMMU_UNBIND) {
> +		struct vfio_iommu_type1_bind bind;
> +
> +		minsz = offsetofend(struct vfio_iommu_type1_bind, flags);
> +
> +		if (copy_from_user(&bind, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (bind.argsz < minsz)
> +			return -EINVAL;
> +
> +		switch (bind.flags) {
> +		case VFIO_IOMMU_BIND_PROCESS:
> +			return vfio_iommu_type1_unbind_process(iommu, (void *)arg,
> +							       &bind);
> +		default:
> +			return -EINVAL;
> +		}
>  	}
>  
>  	return -ENOTTY;
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 1aa7b82e8169..dc07752c8fe8 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -665,6 +665,82 @@ struct vfio_iommu_type1_dma_unmap {
>  #define VFIO_IOMMU_ENABLE	_IO(VFIO_TYPE, VFIO_BASE + 15)
>  #define VFIO_IOMMU_DISABLE	_IO(VFIO_TYPE, VFIO_BASE + 16)
>  
> +/*
> + * VFIO_IOMMU_BIND_PROCESS
> + *
> + * Allocate a PASID for a process address space, and use it to attach this
> + * process to all devices in the container. Devices can then tag their DMA
> + * traffic with the returned @pasid to perform transactions on the associated
> + * virtual address space. Mapping and unmapping buffers is performed by standard
> + * functions such as mmap and malloc.
> + *
> + * If flag is VFIO_IOMMU_BIND_PID, @pid contains the pid of a foreign process to
> + * bind. Otherwise the current task is bound. Given that the caller owns the
> + * device, setting this flag grants the caller read and write permissions on the
> + * entire address space of foreign process described by @pid. Therefore,
> + * permission to perform the bind operation on a foreign process is governed by
> + * the ptrace access mode PTRACE_MODE_ATTACH_REALCREDS check. See man ptrace(2)
> + * for more information.
> + *
> + * On success, VFIO writes a Process Address Space ID (PASID) into @pasid. This
> + * ID is unique to a process and can be used on all devices in the container.
> + *
> + * On fork, the child inherits the device fd and can use the bonds setup by its
> + * parent. Consequently, the child has R/W access on the address spaces bound by
> + * its parent. After an execv, the device fd is closed and the child doesn't
> + * have access to the address space anymore.
> + *
> + * To remove a bond between process and container, VFIO_IOMMU_UNBIND ioctl is
> + * issued with the same parameters. If a pid was specified in VFIO_IOMMU_BIND,
> + * it should also be present for VFIO_IOMMU_UNBIND. Otherwise unbind the current
> + * task from the container.
> + */
> +struct vfio_iommu_type1_bind_process {
> +	__u32	flags;
> +#define VFIO_IOMMU_BIND_PID		(1 << 0)
> +	__u32	pasid;
> +	__s32	pid;
> +};
> +
> +/*
> + * Only mode supported at the moment is VFIO_IOMMU_BIND_PROCESS, which takes
> + * vfio_iommu_type1_bind_process in data.
> + */
> +struct vfio_iommu_type1_bind {
> +	__u32	argsz;
> +	__u32	flags;
> +#define VFIO_IOMMU_BIND_PROCESS		(1 << 0)
> +	__u8	data[];
> +};
> +
> +/*
> + * VFIO_IOMMU_BIND - _IOWR(VFIO_TYPE, VFIO_BASE + 22, struct vfio_iommu_bind)
> + *
> + * Manage address spaces of devices in this container. Initially a TYPE1
> + * container can only have one address space, managed with
> + * VFIO_IOMMU_MAP/UNMAP_DMA.
> + *
> + * An IOMMU of type VFIO_TYPE1_NESTING_IOMMU can be managed by both MAP/UNMAP
> + * and BIND ioctls at the same time. MAP/UNMAP acts on the stage-2 (host) page
> + * tables, and BIND manages the stage-1 (guest) page tables. Other types of
> + * IOMMU may allow MAP/UNMAP and BIND to coexist, where MAP/UNMAP controls
> + * non-PASID traffic and BIND controls PASID traffic. But this depends on the
> + * underlying IOMMU architecture and isn't guaranteed.
> + *
> + * Availability of this feature depends on the device, its bus, the underlying
> + * IOMMU and the CPU architecture.
> + *
> + * returns: 0 on success, -errno on failure.
> + */
> +#define VFIO_IOMMU_BIND		_IO(VFIO_TYPE, VFIO_BASE + 22)
> +
> +/*
> + * VFIO_IOMMU_UNBIND - _IOWR(VFIO_TYPE, VFIO_BASE + 23, struct vfio_iommu_bind)
> + *
> + * Undo what was done by the corresponding VFIO_IOMMU_BIND ioctl.
> + */
> +#define VFIO_IOMMU_UNBIND	_IO(VFIO_TYPE, VFIO_BASE + 23)
> +
>  /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
>  
>  /*
Jean-Philippe Brucker May 21, 2018, 2:51 p.m. UTC | #2
On 17/05/18 16:58, Jonathan Cameron wrote:
>> +static int vfio_iommu_bind_group(struct vfio_iommu *iommu,
>> +				 struct vfio_group *group,
>> +				 struct vfio_mm *vfio_mm)
>> +{
>> +	int ret;
>> +	bool enabled_sva = false;
>> +	struct vfio_iommu_sva_bind_data data = {
>> +		.vfio_mm	= vfio_mm,
>> +		.iommu		= iommu,
>> +		.count		= 0,
>> +	};
>> +
>> +	if (!group->sva_enabled) {
>> +		ret = iommu_group_for_each_dev(group->iommu_group, NULL,
>> +					       vfio_iommu_sva_init);
>> +		if (ret)
>> +			return ret;
>> +
>> +		group->sva_enabled = enabled_sva = true;
>> +	}
>> +
>> +	ret = iommu_group_for_each_dev(group->iommu_group, &data,
>> +				       vfio_iommu_sva_bind_dev);
>> +	if (ret && data.count > 1)
> 
> Are we safe to run this even if data.count == 1?  I assume that at
> that point we would always not have an iommu domain associated with the
> device so the initial check would error out.

If data.count == 1, then the first bind didn't succeed. But that's not
necessarily because a domain is missing; failure to bind may come from
various places. If this vfio_mm was already bound to another device then it
contains a valid PASID and it's safe to call unbind(). Otherwise we call
it with PASID -1 (VFIO_PASID_INVALID) and that's a bit of a grey area.
-1 is currently invalid everywhere, but in the future someone might
implement 32 bits of PASIDs, in which case a bond between this dev and
PASID -1 might exist. But I think it's safe for now, and whoever
redefines VFIO_PASID_INVALID when such an implementation appears will also
fix this case.

> Just be nice to get rid of the special casing in this error path as then
> could just do it all under if (ret) and make it visually clearer these
> are different aspects of the same error path.

[...]
>>  static long vfio_iommu_type1_ioctl(void *iommu_data,
>>  				   unsigned int cmd, unsigned long arg)
>>  {
>> @@ -1728,6 +2097,44 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>>  
>>  		return copy_to_user((void __user *)arg, &unmap, minsz) ?
>>  			-EFAULT : 0;
>> +
>> +	} else if (cmd == VFIO_IOMMU_BIND) {
>> +		struct vfio_iommu_type1_bind bind;
>> +
>> +		minsz = offsetofend(struct vfio_iommu_type1_bind, flags);
>> +
>> +		if (copy_from_user(&bind, (void __user *)arg, minsz))
>> +			return -EFAULT;
>> +
>> +		if (bind.argsz < minsz)
>> +			return -EINVAL;
>> +
>> +		switch (bind.flags) {
>> +		case VFIO_IOMMU_BIND_PROCESS:
>> +			return vfio_iommu_type1_bind_process(iommu, (void *)arg,
>> +							     &bind);
> 
> Can we combine these two blocks given it is only this case statement that is different?

That would be nicer, though I don't know yet what's needed for vSVA (by
Yi Liu on Cc), which will add statements to the switches.

Thanks,
Jean

> 
>> +		default:
>> +			return -EINVAL;
>> +		}
>> +
>> +	} else if (cmd == VFIO_IOMMU_UNBIND) {
>> +		struct vfio_iommu_type1_bind bind;
>> +
>> +		minsz = offsetofend(struct vfio_iommu_type1_bind, flags);
>> +
>> +		if (copy_from_user(&bind, (void __user *)arg, minsz))
>> +			return -EFAULT;
>> +
>> +		if (bind.argsz < minsz)
>> +			return -EINVAL;
>> +
>> +		switch (bind.flags) {
>> +		case VFIO_IOMMU_BIND_PROCESS:
>> +			return vfio_iommu_type1_unbind_process(iommu, (void *)arg,
>> +							       &bind);
>> +		default:
>> +			return -EINVAL;
>> +		}
>>  	}
Xu Zaibo May 23, 2018, 9:38 a.m. UTC | #3
Hi,

On 2018/5/12 3:06, Jean-Philippe Brucker wrote:
> Add two new ioctls for VFIO containers. VFIO_IOMMU_BIND_PROCESS creates a
> bond between a container and a process address space, identified by a
> Process Address Space ID (PASID). Devices in the container append this
> PASID to DMA transactions in order to access the process' address space.
> The process page tables are shared with the IOMMU, and mechanisms such as
> PCI ATS/PRI are used to handle faults. VFIO_IOMMU_UNBIND_PROCESS removes a
> bond created with VFIO_IOMMU_BIND_PROCESS.
>
> Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
>
> +static int vfio_iommu_bind_group(struct vfio_iommu *iommu,
> +				 struct vfio_group *group,
> +				 struct vfio_mm *vfio_mm)
> +{
> +	int ret;
> +	bool enabled_sva = false;
> +	struct vfio_iommu_sva_bind_data data = {
> +		.vfio_mm	= vfio_mm,
> +		.iommu		= iommu,
> +		.count		= 0,
> +	};
> +
> +	if (!group->sva_enabled) {
> +		ret = iommu_group_for_each_dev(group->iommu_group, NULL,
> +					       vfio_iommu_sva_init);
Do we need to do *sva_init here, or do anything to avoid repeated
initialization? If another process has already done the initialization on
this device, I think the current process will get an EEXIST.

Thanks.
> +		if (ret)
> +			return ret;
> +
> +		group->sva_enabled = enabled_sva = true;
> +	}
> +
> +	ret = iommu_group_for_each_dev(group->iommu_group, &data,
> +				       vfio_iommu_sva_bind_dev);
> +	if (ret && data.count > 1)
> +		iommu_group_for_each_dev(group->iommu_group, vfio_mm,
> +					 vfio_iommu_sva_unbind_dev);
> +	if (ret && enabled_sva) {
> +		iommu_group_for_each_dev(group->iommu_group, NULL,
> +					 vfio_iommu_sva_shutdown);
> +		group->sva_enabled = false;
> +	}
> +
> +	return ret;
> +}
>
>
>   
> @@ -1442,6 +1636,10 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>   	if (ret)
>   		goto out_detach;
>   
> +	ret = vfio_iommu_replay_bind(iommu, group);
> +	if (ret)
> +		goto out_detach;
> +
>   	if (resv_msi) {
>   		ret = iommu_get_msi_cookie(domain->domain, resv_msi_base);
>   		if (ret)
> @@ -1547,6 +1745,11 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
>   			continue;
>   
>   		iommu_detach_group(domain->domain, iommu_group);
> +		if (group->sva_enabled) {
> +			iommu_group_for_each_dev(iommu_group, NULL,
> +						 vfio_iommu_sva_shutdown);
> +			group->sva_enabled = false;
> +		}
Why shut down here? If another process is still working on the device,
couldn't there be a crash?

Thanks.
>   		list_del(&group->next);
>   		kfree(group);
>   		/*
> @@ -1562,6 +1765,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
>
Jean-Philippe Brucker May 24, 2018, 11:44 a.m. UTC | #4
Hi,

On 23/05/18 10:38, Xu Zaibo wrote:
>> +static int vfio_iommu_bind_group(struct vfio_iommu *iommu,
>> +                 struct vfio_group *group,
>> +                 struct vfio_mm *vfio_mm)
>> +{
>> +    int ret;
>> +    bool enabled_sva = false;
>> +    struct vfio_iommu_sva_bind_data data = {
>> +        .vfio_mm    = vfio_mm,
>> +        .iommu        = iommu,
>> +        .count        = 0,
>> +    };
>> +
>> +    if (!group->sva_enabled) {
>> +        ret = iommu_group_for_each_dev(group->iommu_group, NULL,
>> +                           vfio_iommu_sva_init);
> Do we need to do *sva_init here or do anything to avoid repeated
> initiation?
> while another process already did initiation at this device, I think
> that current process will get an EEXIST.

Right, sva_init() must be called once for any device that intends to use
bind(). For the second process though, group->sva_enabled will be true
so we won't call sva_init() again, only bind().

Thanks,
Jean
Jean-Philippe Brucker May 24, 2018, 3:04 p.m. UTC | #5
On 24/05/18 13:35, Xu Zaibo wrote:
>> Right, sva_init() must be called once for any device that intends to use
>> bind(). For the second process though, group->sva_enabled will be true
>> so we won't call sva_init() again, only bind().
> 
> Well, while I create mediated devices based on one parent device to support multiple
> processes(a new process will create a new 'vfio_group' for the corresponding mediated device,
> and 'sva_enabled' cannot work any more), in fact, *sva_init and *sva_shutdown are basically
> working on parent device, so, as a result, I just only need sva initiation and shutdown on the
> parent device only once. So I change the two as following:
> 
> @@ -551,8 +565,18 @@ int iommu_sva_device_init(struct device *dev, unsigned long features,
>       if (features & ~IOMMU_SVA_FEAT_IOPF)
>          return -EINVAL;
> 
> +    /* If already exists, do nothing  */
> +    mutex_lock(&dev->iommu_param->lock);
> +    if (dev->iommu_param->sva_param) {
> +        mutex_unlock(&dev->iommu_param->lock);
> +        return 0;
> +    }
> +    mutex_unlock(&dev->iommu_param->lock);
> 
>      if (features & IOMMU_SVA_FEAT_IOPF) {
>          ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf,
> 
> 
> @@ -621,6 +646,14 @@ int iommu_sva_device_shutdown(struct device *dev)
>      if (!domain)
>          return -ENODEV;
> 
> +    /* If any other process is working on the device, shut down does nothing. */
> +    mutex_lock(&dev->iommu_param->lock);
> +    if (!list_empty(&dev->iommu_param->sva_param->mm_list)) {
> +        mutex_unlock(&dev->iommu_param->lock);
> +        return 0;
> +    }
> +    mutex_unlock(&dev->iommu_param->lock);

I don't think iommu-sva.c is the best place for this, it's probably
better to implement an intermediate layer (the mediating driver), that
calls iommu_sva_device_init() and iommu_sva_device_shutdown() once. Then
vfio-pci would still call these functions itself, but for mdev the
mediating driver keeps a refcount of groups, and calls device_shutdown()
only when freeing the last mdev.
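
For illustration, the refcounting in such a mediating driver might look
roughly like this (a sketch only; my_parent and my_mm_exit are hypothetical
names, not from this series):

    struct my_parent {
        struct device   *dev;       /* the parent (physical) device */
        struct mutex    lock;
        unsigned int    sva_users;  /* mdevs currently using SVA */
    };

    static int my_parent_sva_get(struct my_parent *p)
    {
        int ret = 0;

        mutex_lock(&p->lock);
        /* Only the first mdev initializes SVA on the parent device */
        if (!p->sva_users)
            ret = iommu_sva_device_init(p->dev, IOMMU_SVA_FEAT_IOPF, 0,
                                        my_mm_exit);
        if (!ret)
            p->sva_users++;
        mutex_unlock(&p->lock);
        return ret;
    }

    static void my_parent_sva_put(struct my_parent *p)
    {
        mutex_lock(&p->lock);
        /* Only the last mdev shuts SVA down on the parent device */
        if (!--p->sva_users)
            iommu_sva_device_shutdown(p->dev);
        mutex_unlock(&p->lock);
    }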

A device driver (non mdev in this example) expects to be able to free
all its resources after sva_device_shutdown() returns. Imagine the
mm_list isn't empty (mm_exit() is running late), and instead of waiting
in unbind_dev_all() below, we return 0 immediately. Then the calling
driver frees its resources, and the mm_exit callback along with private
data passed to bind() disappear. If a mm_exit() is still running in
parallel, then it will try to access freed data and corrupt memory. So
in this function if mm_list isn't empty, the only thing we can do is wait.

Thanks,
Jean

> +
>      __iommu_sva_unbind_dev_all(dev);
> 
>      mutex_lock(&dev->iommu_param->lock);
> 
> I add the above two checkings in both *sva_init and *sva_shutdown, it is working now,
> but i don't know if it will cause any new problems. What's more, i doubt if it is
> reasonable to check this to avoid repeating operation in VFIO, maybe it is better to check
> in IOMMU. :)
Xu Zaibo May 25, 2018, 2:39 a.m. UTC | #6
Hi,

On 2018/5/24 23:04, Jean-Philippe Brucker wrote:
> On 24/05/18 13:35, Xu Zaibo wrote:
>>> Right, sva_init() must be called once for any device that intends to use
>>> bind(). For the second process though, group->sva_enabled will be true
>>> so we won't call sva_init() again, only bind().
>> Well, while I create mediated devices based on one parent device to support multiple
>> processes(a new process will create a new 'vfio_group' for the corresponding mediated device,
>> and 'sva_enabled' cannot work any more), in fact, *sva_init and *sva_shutdown are basically
>> working on parent device, so, as a result, I just only need sva initiation and shutdown on the
>> parent device only once. So I change the two as following:
>>
>> @@ -551,8 +565,18 @@ int iommu_sva_device_init(struct device *dev, unsigned long features,
>>        if (features & ~IOMMU_SVA_FEAT_IOPF)
>>           return -EINVAL;
>>
>> +    /* If already exists, do nothing  */
>> +    mutex_lock(&dev->iommu_param->lock);
>> +    if (dev->iommu_param->sva_param) {
>> +        mutex_unlock(&dev->iommu_param->lock);
>> +        return 0;
>> +    }
>> +    mutex_unlock(&dev->iommu_param->lock);
>>
>>       if (features & IOMMU_SVA_FEAT_IOPF) {
>>           ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf,
>>
>>
>> @@ -621,6 +646,14 @@ int iommu_sva_device_shutdown(struct device *dev)
>>       if (!domain)
>>           return -ENODEV;
>>
>> +    /* If any other process is working on the device, shut down does nothing. */
>> +    mutex_lock(&dev->iommu_param->lock);
>> +    if (!list_empty(&dev->iommu_param->sva_param->mm_list)) {
>> +        mutex_unlock(&dev->iommu_param->lock);
>> +        return 0;
>> +    }
>> +    mutex_unlock(&dev->iommu_param->lock);
> I don't think iommu-sva.c is the best place for this, it's probably
> better to implement an intermediate layer (the mediating driver), that
> calls iommu_sva_device_init() and iommu_sva_device_shutdown() once. Then
> vfio-pci would still call these functions itself, but for mdev the
> mediating driver keeps a refcount of groups, and calls device_shutdown()
> only when freeing the last mdev.
>
> A device driver (non mdev in this example) expects to be able to free
> all its resources after sva_device_shutdown() returns. Imagine the
> mm_list isn't empty (mm_exit() is running late), and instead of waiting
> in unbind_dev_all() below, we return 0 immediately. Then the calling
> driver frees its resources, and the mm_exit callback along with private
> data passed to bind() disappear. If a mm_exit() is still running in
> parallel, then it will try to access freed data and corrupt memory. So
> in this function if mm_list isn't empty, the only thing we can do is wait.
>
I still don't understand why we should 'unbind_dev_all'. Is it possible
to do an 'unbind_dev_pasid'?
Then we could do other things instead of waiting, which users may not like. :)

Thanks
Zaibo
>
>> +
>>       __iommu_sva_unbind_dev_all(dev);
>>
>>       mutex_lock(&dev->iommu_param->lock);
>>
>> I add the above two checkings in both *sva_init and *sva_shutdown, it is working now,
>> but i don't know if it will cause any new problems. What's more, i doubt if it is
>> reasonable to check this to avoid repeating operation in VFIO, maybe it is better to check
>> in IOMMU. :)
Jean-Philippe Brucker May 25, 2018, 9:47 a.m. UTC | #7
On 25/05/18 03:39, Xu Zaibo wrote:
> Hi,
> 
> On 2018/5/24 23:04, Jean-Philippe Brucker wrote:
>> On 24/05/18 13:35, Xu Zaibo wrote:
>>>> Right, sva_init() must be called once for any device that intends to use
>>>> bind(). For the second process though, group->sva_enabled will be true
>>>> so we won't call sva_init() again, only bind().
>>> Well, while I create mediated devices based on one parent device to support multiple
>>> processes(a new process will create a new 'vfio_group' for the corresponding mediated device,
>>> and 'sva_enabled' cannot work any more), in fact, *sva_init and *sva_shutdown are basically
>>> working on parent device, so, as a result, I just only need sva initiation and shutdown on the
>>> parent device only once. So I change the two as following:
>>>
>>> @@ -551,8 +565,18 @@ int iommu_sva_device_init(struct device *dev, unsigned long features,
>>>        if (features & ~IOMMU_SVA_FEAT_IOPF)
>>>           return -EINVAL;
>>>
>>> +    /* If already exists, do nothing  */
>>> +    mutex_lock(&dev->iommu_param->lock);
>>> +    if (dev->iommu_param->sva_param) {
>>> +        mutex_unlock(&dev->iommu_param->lock);
>>> +        return 0;
>>> +    }
>>> +    mutex_unlock(&dev->iommu_param->lock);
>>>
>>>       if (features & IOMMU_SVA_FEAT_IOPF) {
>>>           ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf,
>>>
>>>
>>> @@ -621,6 +646,14 @@ int iommu_sva_device_shutdown(struct device *dev)
>>>       if (!domain)
>>>           return -ENODEV;
>>>
>>> +    /* If any other process is working on the device, shut down does nothing. */
>>> +    mutex_lock(&dev->iommu_param->lock);
>>> +    if (!list_empty(&dev->iommu_param->sva_param->mm_list)) {
>>> +        mutex_unlock(&dev->iommu_param->lock);
>>> +        return 0;
>>> +    }
>>> +    mutex_unlock(&dev->iommu_param->lock);
>> I don't think iommu-sva.c is the best place for this, it's probably
>> better to implement an intermediate layer (the mediating driver), that
>> calls iommu_sva_device_init() and iommu_sva_device_shutdown() once. Then
>> vfio-pci would still call these functions itself, but for mdev the
>> mediating driver keeps a refcount of groups, and calls device_shutdown()
>> only when freeing the last mdev.
>>
>> A device driver (non mdev in this example) expects to be able to free
>> all its resources after sva_device_shutdown() returns. Imagine the
>> mm_list isn't empty (mm_exit() is running late), and instead of waiting
>> in unbind_dev_all() below, we return 0 immediately. Then the calling
>> driver frees its resources, and the mm_exit callback along with private
>> data passed to bind() disappear. If a mm_exit() is still running in
>> parallel, then it will try to access freed data and corrupt memory. So
>> in this function if mm_list isn't empty, the only thing we can do is wait.
>>
> I still don't understand why we should 'unbind_dev_all', is it possible 
> to do a 'unbind_dev_pasid'?

Not in sva_device_shutdown(), it needs to clean up everything. For
example you want to physically unplug the device, or assign it to a VM.
To prevent any leak sva_device_shutdown() needs to remove all bonds. In
theory there shouldn't be any, since either the driver did unbind_dev(),
or all process exited. This is a safety net.

> Then we can do other things instead of waiting that user may not like. :)

They may not like it, but it's for their own good :) At the moment we're
waiting for:

* All mm_exit() callbacks for this device to have finished. If we don't wait
  then the caller will free the private data passed to bind and the
  mm_exit() callback while they are still being used.

* All page requests targeting this device to be dealt with. If we don't
  wait then some requests that are lingering in the IOMMU PRI queue
  may hit the next contexts bound to this device, possibly in a
  different VM. It may not be too risky (though probably exploitable in
  some way), but it is incredibly messy.

All of this is bounded in time, and normally should be over pretty fast
unless the device driver's mm_exit() does something strange. If the
driver did the right thing, there shouldn't be any wait here (although
there may be one in unbind_dev() for the same reasons - to prevent use
after free).

Thanks,
Jean
Jean-Philippe Brucker May 29, 2018, 11:55 a.m. UTC | #8
(If possible, please reply in plain text to the list. Reading this in a
text-only reader is confusing, because all quotes have the same level)

On 26/05/18 04:53, Xu Zaibo wrote:
> I guess there may be some misunderstandings :).
> 
> From the current patches, 'iommu_sva_device_shutdown' is called by 'vfio_iommu_sva_shutdown', which
> is mainly used by 'vfio_iommu_type1_detach_group' that is finally called by processes' release of vfio facilitiy
> automatically or called by 'VFIO_GROUP_UNSET_CONTAINER' manually in the processes.
> 
> So, just image that 2 processes is working on the device with IOPF feature, and the 2 do the following to enable SVA:
> 
>     1.open("/dev/vfio/vfio") ;
> 
>    2.open the group of the devcie by calling open("/dev/vfio/x"), but now,
>      I think the second processes will fail to open the group because current VFIO thinks that one group only can be used by one process/vm,
>      at present, I use mediated device to create more groups on the parent device to support multiple processes;
> 
>     3.VFIO_GROUP_SET_CONTAINER;
> 
>     4.VFIO_SET_IOMMU;
> 
>     5.VFIO_IOMMU_BIND;

I have a concern regarding your driver. With mdev you can't allow
processes to program the PASID themselves, since the parent device has a
single PASID table. You lose all isolation since processes could write
any value in the PASID field and access address spaces of other
processes bound to this parent device (even if the BIND call was for
other mdevs).

The wrap driver has to mediate calls to bind(), and either program the
PASID into the device itself, or at least check that, when receiving a
SET_PASID ioctl from a process, the given PASID was actually allocated
to the process.
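
For illustration, such a check in the mediating driver could be as simple as
the following sketch (my_mdev, my_bond and the SET_PASID ioctl are
hypothetical names, not defined by this series):

    /*
     * Called from the mdev's SET_PASID handler: only accept a PASID that a
     * previous bind() on this mdev actually returned.
     */
    static bool my_mdev_pasid_is_valid(struct my_mdev *mdev, int pasid)
    {
        struct my_bond *bond;
        bool valid = false;

        mutex_lock(&mdev->lock);
        list_for_each_entry(bond, &mdev->bonds, list) {
            if (bond->pasid == pasid) {
                valid = true;
                break;
            }
        }
        mutex_unlock(&mdev->lock);

        return valid;
    }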

>     6.Do some works with the hardware working unit filled by PASID on the device;
> 
>    7.VFIO_IOMMU_UNBIND;
> 
>     8.VFIO_GROUP_UNSET_CONTAINER;---here, have to sleep to wait another process to finish works of the step 6;
> 
>     9. close(group); close(containner);
> 
> 
> So, my idea is: If it is possible to just release the params or facilities that only belong to the current process while the process shutdown the device,
> and while the last process releases them all. Then, as in the above step 8, we
> don't need to wait, or maybe wait for a very long time if the other process is doing lots of work on the device.
Given that you need to notify the mediating driver of IOMMU_BIND calls
as explained above, you could do something similar for shutdown: from
the mdev driver, call iommu_sva_shutdown_device() only for the last mdev.

Thanks,
Jean
Xu Zaibo May 29, 2018, 12:24 p.m. UTC | #9
Hi,

On 2018/5/29 19:55, Jean-Philippe Brucker wrote:
> (If possible, please reply in plain text to the list. Reading this in a
> text-only reader is confusing, because all quotes have the same level)
Sorry for that, I have reset Thunderbird. :) Thanks.
> On 26/05/18 04:53, Xu Zaibo wrote:
>> I guess there may be some misunderstandings :).
>>
>>  From the current patches, 'iommu_sva_device_shutdown' is called by 'vfio_iommu_sva_shutdown', which
>> is mainly used by 'vfio_iommu_type1_detach_group' that is finally called by processes' release of vfio facilitiy
>> automatically or called by 'VFIO_GROUP_UNSET_CONTAINER' manually in the processes.
>>
>> So, just image that 2 processes is working on the device with IOPF feature, and the 2 do the following to enable SVA:
>>
>>      1.open("/dev/vfio/vfio") ;
>>
>>     2.open the group of the devcie by calling open("/dev/vfio/x"), but now,
>>       I think the second processes will fail to open the group because current VFIO thinks that one group only can be used by one process/vm,
>>       at present, I use mediated device to create more groups on the parent device to support multiple processes;
>>
>>      3.VFIO_GROUP_SET_CONTAINER;
>>
>>      4.VFIO_SET_IOMMU;
>>
>>      5.VFIO_IOMMU_BIND;
> I have a concern regarding your driver. With mdev you can't allow
> processes to program the PASID themselves, since the parent device has a
> single PASID table. You lose all isolation since processes could write
> any value in the PASID field and access address spaces of other
> processes bound to this parent device (even if the BIND call was for
> other mdevs).
Yes, if wrapdrive does nothing about this PASID setting procedure in kernel
space, I think this security risk definitely exists.
> The wrap driver has to mediate calls to bind(), and either program the
> PASID into the device itself, or at least check that, when receiving a
> SET_PASID ioctl from a process, the given PASID was actually allocated
> to the process.
Yes, good advice, thanks.
>
>>      6.Do some works with the hardware working unit filled by PASID on the device;
>>
>>     7.VFIO_IOMMU_UNBIND;
>>
>>      8.VFIO_GROUP_UNSET_CONTAINER;---here, have to sleep to wait another process to finish works of the step 6;
>>
>>      9. close(group); close(containner);
>>
>>
>> So, my idea is: If it is possible to just release the params or facilities that only belong to the current process while the process shutdown the device,
>> and while the last process releases them all. Then, as in the above step 8, we
>> don't need to wait, or maybe wait for a very long time if the other process is doing lots of work on the device.
> Given that you need to notify the mediating driver of IOMMU_BIND calls
> as explained above, you could do something similar for shutdown: from
> the mdev driver, call iommu_sva_shutdown_device() only for the last mdev.
Currently, I added an API in wrapdrive to check whether it is the last mdev;
when VFIO shuts down the device, it calls the API to do the check first.

Thanks
Zaibo
Xu Zaibo Aug. 27, 2018, 8:06 a.m. UTC | #10
Hi Jean,

On 2018/5/12 3:06, Jean-Philippe Brucker wrote:
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 1aa7b82e8169..dc07752c8fe8 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -665,6 +665,82 @@ struct vfio_iommu_type1_dma_unmap {
>   #define VFIO_IOMMU_ENABLE	_IO(VFIO_TYPE, VFIO_BASE + 15)
>   #define VFIO_IOMMU_DISABLE	_IO(VFIO_TYPE, VFIO_BASE + 16)
>   
> +/*
> + * VFIO_IOMMU_BIND_PROCESS
> + *
> + * Allocate a PASID for a process address space, and use it to attach this
> + * process to all devices in the container. Devices can then tag their DMA
> + * traffic with the returned @pasid to perform transactions on the associated
> + * virtual address space. Mapping and unmapping buffers is performed by standard
> + * functions such as mmap and malloc.
> + *
> + * If flag is VFIO_IOMMU_BIND_PID, @pid contains the pid of a foreign process to
> + * bind. Otherwise the current task is bound. Given that the caller owns the
> + * device, setting this flag grants the caller read and write permissions on the
> + * entire address space of foreign process described by @pid. Therefore,
> + * permission to perform the bind operation on a foreign process is governed by
> + * the ptrace access mode PTRACE_MODE_ATTACH_REALCREDS check. See man ptrace(2)
> + * for more information.
> + *
> + * On success, VFIO writes a Process Address Space ID (PASID) into @pasid. This
> + * ID is unique to a process and can be used on all devices in the container.
> + *
> + * On fork, the child inherits the device fd and can use the bonds setup by its
> + * parent. Consequently, the child has R/W access on the address spaces bound by
> + * its parent. After an execv, the device fd is closed and the child doesn't
> + * have access to the address space anymore.
> + *
> + * To remove a bond between process and container, VFIO_IOMMU_UNBIND ioctl is
> + * issued with the same parameters. If a pid was specified in VFIO_IOMMU_BIND,
> + * it should also be present for VFIO_IOMMU_UNBIND. Otherwise unbind the current
> + * task from the container.
> + */
> +struct vfio_iommu_type1_bind_process {
> +	__u32	flags;
> +#define VFIO_IOMMU_BIND_PID		(1 << 0)
> +	__u32	pasid;
As I am doing some work on the SVA patch set, I am wondering why user space
needs this pasid. Wouldn't it be more reasonable to set the pasid into all
devices under the vfio container through a callback function from
'vfio_devices' while the 'VFIO_IOMMU_BIND_PROCESS' command is executed in
kernel land? I am not sure, because there is no suitable callback in
'vfio_device' at present.

Thanks,
Zaibo
> +	__s32	pid;
> +};
> +
> +/*
> + * Only mode supported at the moment is VFIO_IOMMU_BIND_PROCESS, which takes
> + * vfio_iommu_type1_bind_process in data.
> + */
> +struct vfio_iommu_type1_bind {
> +	__u32	argsz;
> +	__u32	flags;
> +#define VFIO_IOMMU_BIND_PROCESS		(1 << 0)
> +	__u8	data[];
> +};
> +
> +/*
> + * VFIO_IOMMU_BIND - _IOWR(VFIO_TYPE, VFIO_BASE + 22, struct vfio_iommu_bind)
> + *
> + * Manage address spaces of devices in this container. Initially a TYPE1
> + * container can only have one address space, managed with
> + * VFIO_IOMMU_MAP/UNMAP_DMA.
> + *
> + * An IOMMU of type VFIO_TYPE1_NESTING_IOMMU can be managed by both MAP/UNMAP
> + * and BIND ioctls at the same time. MAP/UNMAP acts on the stage-2 (host) page
> + * tables, and BIND manages the stage-1 (guest) page tables. Other types of
> + * IOMMU may allow MAP/UNMAP and BIND to coexist, where MAP/UNMAP controls
> + * non-PASID traffic and BIND controls PASID traffic. But this depends on the
> + * underlying IOMMU architecture and isn't guaranteed.
> + *
> + * Availability of this feature depends on the device, its bus, the underlying
> + * IOMMU and the CPU architecture.
> + *
> + * returns: 0 on success, -errno on failure.
> + */
> +#define VFIO_IOMMU_BIND		_IO(VFIO_TYPE, VFIO_BASE + 22)
> +
> +/*
> + * VFIO_IOMMU_UNBIND - _IOWR(VFIO_TYPE, VFIO_BASE + 23, struct vfio_iommu_bind)
> + *
> + * Undo what was done by the corresponding VFIO_IOMMU_BIND ioctl.
> + */
> +#define VFIO_IOMMU_UNBIND	_IO(VFIO_TYPE, VFIO_BASE + 23)
> +
>   /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
>   
>   /*
Jean-Philippe Brucker Aug. 31, 2018, 1:34 p.m. UTC | #11
Hi Zaibo,

On 27/08/18 09:06, Xu Zaibo wrote:
>> +struct vfio_iommu_type1_bind_process {
>> +    __u32    flags;
>> +#define VFIO_IOMMU_BIND_PID        (1 << 0)
>> +    __u32    pasid;
> As I am doing some works on the SVA patch set. I just consider why the
> user space need this pasid.
> Maybe, is it much more reasonable to set the pasid into all devices
> under the vfio container by
> a call back function from 'vfio_devices'  while
> 'VFIO_IOMMU_BIND_PROCESS' CMD is executed
> in kernel land? I am not sure because there exists no suitable call back
> in 'vfio_device' at present.

When using vfio-pci, the kernel doesn't know how to program the PASID
into the device because the only kernel driver for the device is the
generic vfio-pci module. The PCI specification doesn't describe a way of
setting up the PASID, it's vendor-specific. Only the userspace
application owning the device (and calling VFIO_IOMMU_BIND) knows how to
do it, so we return the allocated PASID.

Note that unlike vfio-mdev where applications share slices of a
function, with vfio-pci one application owns the whole function so it's
safe to let userspace set the PASID in hardware. With vfio-mdev it's the
kernel driver that should be in charge of setting the PASID as you
described, and we wouldn't have a reason to return the PASID in the
vfio_iommu_type1_bind_process structure.

Thanks,
Jean
Xu Zaibo Sept. 1, 2018, 2:23 a.m. UTC | #12
Hi Jean,

On 2018/8/31 21:34, Jean-Philippe Brucker wrote:
> On 27/08/18 09:06, Xu Zaibo wrote:
>>> +struct vfio_iommu_type1_bind_process {
>>> +    __u32    flags;
>>> +#define VFIO_IOMMU_BIND_PID        (1 << 0)
>>> +    __u32    pasid;
>> As I am doing some works on the SVA patch set. I just consider why the
>> user space need this pasid.
>> Maybe, is it much more reasonable to set the pasid into all devices
>> under the vfio container by
>> a call back function from 'vfio_devices'  while
>> 'VFIO_IOMMU_BIND_PROCESS' CMD is executed
>> in kernel land? I am not sure because there exists no suitable call back
>> in 'vfio_device' at present.
> When using vfio-pci, the kernel doesn't know how to program the PASID
> into the device because the only kernel driver for the device is the
> generic vfio-pci module. The PCI specification doesn't describe a way of
> setting up the PASID, it's vendor-specific. Only the userspace
> application owning the device (and calling VFIO_IOMMU_BIND) knows how to
> do it, so we return the allocated PASID.
>
> Note that unlike vfio-mdev where applications share slices of a
> function, with vfio-pci one application owns the whole function so it's
> safe to let userspace set the PASID in hardware. With vfio-mdev it's the
> kernel driver that should be in charge of setting the PASID as you
> described, and we wouldn't have a reason to return the PASID in the
> vfio_iommu_type1_bind_process structure.
Understood. But I still have the following confusion:

As one application takes the whole function while using VFIO-PCI, why do the
application and the function need to enable the PASID capability? (Since just
one I/O page table is enough for them.)

Maybe I misunderstood; I hope you can help me clear it up. Thank you very much.

Thanks,
Zaibo

Jean-Philippe Brucker Sept. 3, 2018, 10:34 a.m. UTC | #13
On 01/09/18 03:23, Xu Zaibo wrote:
> As one application takes a whole function while using VFIO-PCI, why do 
> the application and the
> function need to enable PASID capability? (Since just one I/O page table 
> is enough for them.)

At the moment the series doesn't provide support for SVA without PASID
(on the I/O page fault path, 08/40). In addition the BIND ioctl could be
used by the owner application to bind other processes (slaves) and
perform sub-assignment. But that feature is incomplete because we don't
send stop_pasid notification to the owner when a slave dies.

Thanks,
Jean
Xu Zaibo Sept. 4, 2018, 2:12 a.m. UTC | #14
On 2018/9/3 18:34, Jean-Philippe Brucker wrote:
> On 01/09/18 03:23, Xu Zaibo wrote:
>> As one application takes a whole function while using VFIO-PCI, why do
>> the application and the
>> function need to enable PASID capability? (Since just one I/O page table
>> is enough for them.)
> At the moment the series doesn't provide support for SVA without PASID
> (on the I/O page fault path, 08/40). In addition the BIND ioctl could be
> used by the owner application to bind other processes (slaves) and
> perform sub-assignment. But that feature is incomplete because we don't
> send stop_pasid notification to the owner when a slave dies.
>
So, could I understand it like this?

     1. Once the series is finished, a VFIO-PCI device can be held by only one
        process through the bind IOCTL command without a PASID (i.e. without
        the PASID being exposed to user space).

     2. When using a VFIO-PCI device to support multiple processes with the SVA
        series, a primary process with multiple secondary processes must be
        deployed, just like DPDK (https://www.dpdk.org/), and the PASID still
        has to be exposed to user land.


Thanks,
Zaibo

Jean-Philippe Brucker Sept. 4, 2018, 10:57 a.m. UTC | #15
On 04/09/2018 03:12, Xu Zaibo wrote:
> On 2018/9/3 18:34, Jean-Philippe Brucker wrote:
>> On 01/09/18 03:23, Xu Zaibo wrote:
>>> As one application takes a whole function while using VFIO-PCI, why do
>>> the application and the
>>> function need to enable PASID capability? (Since just one I/O page table
>>> is enough for them.)
>> At the moment the series doesn't provide support for SVA without PASID
>> (on the I/O page fault path, 08/40). In addition the BIND ioctl could be
>> used by the owner application to bind other processes (slaves) and
>> perform sub-assignment. But that feature is incomplete because we don't
>> send stop_pasid notification to the owner when a slave dies.
>>
> So, Could I understand like this?
> 
>      1. While the series are finished well, VFIO-PCI device can be held 
> by only one process
>          through binding IOCTL command without PASID (without PASID 
> being exposed user space).

It could, but isn't supported at the moment. In addition to adding
support in the I/O page fault code, we'd also need to update the VFIO
API. Currently a VFIO_TYPE1 domain always supports the MAP/UNMAP ioctl.
The case you describe isn't compatible with MAP/UNMAP, since the process
manages the shared address space with mmap or malloc. We'd probably need
to introduce a new VFIO IOMMU type, in which case the bind could be
performed implicitly when the process does VFIO_SET_IOMMU. Then the
process wouldn't need to send an additional BIND IOCTL.

>      2. While using VFIO-PCI device to support multiple processes with 
> SVA series, a primary
>          process with multiple secondary processes must be deployed just 
> like DPDK(https://www.dpdk.org/).
>          And, the PASID still has to be exposed to user land.

Right. A third case, also implemented by this patch (and complete), is
the primary process simply doing a BIND for itself, and using the
returned PASID to share its own address space with the device.

Thanks,
Jean
Xu Zaibo Sept. 5, 2018, 3:15 a.m. UTC | #16
Hi,

On 2018/9/4 18:57, Jean-Philippe Brucker wrote:
> On 04/09/2018 03:12, Xu Zaibo wrote:
>> On 2018/9/3 18:34, Jean-Philippe Brucker wrote:
>>> On 01/09/18 03:23, Xu Zaibo wrote:
>>>> As one application takes a whole function while using VFIO-PCI, why do
>>>> the application and the
>>>> function need to enable PASID capability? (Since just one I/O page table
>>>> is enough for them.)
>>> At the moment the series doesn't provide support for SVA without PASID
>>> (on the I/O page fault path, 08/40). In addition the BIND ioctl could be
>>> used by the owner application to bind other processes (slaves) and
>>> perform sub-assignment. But that feature is incomplete because we don't
>>> send stop_pasid notification to the owner when a slave dies.
>>>
>> So, Could I understand like this?
>>
>>       1. While the series are finished well, VFIO-PCI device can be held
>> by only one process
>>           through binding IOCTL command without PASID (without PASID
>> being exposed user space).
> It could, but isn't supported at the moment. In addition to adding
> support in the I/O page fault code, we'd also need to update the VFIO
> API. Currently a VFIO_TYPE1 domain always supports the MAP/UNMAP ioctl.
> The case you describe isn't compatible with MAP/UNMAP, since the process
> manages the shared address space with mmap or malloc. We'd probably need
> to introduce a new VFIO IOMMU type, in which case the bind could be
> performed implicitly when the process does VFIO_SET_IOMMU. Then the
> process wouldn't need to send an additional BIND IOCTL.
OK, got it. This is the legacy mode, so all the VFIO APIs are kept unchanged?
>>       2. While using VFIO-PCI device to support multiple processes with
>> SVA series, a primary
>>           process with multiple secondary processes must be deployed just
>> like DPDK(https://www.dpdk.org/).
>>           And, the PASID still has to be exposed to user land.
> Right. A third case, also implemented by this patch (and complete), is
> the primary process simply doing a BIND for itself, and using the
> returned PASID to share its own address space with the device.
>
OK. But I am worried that the solution of one primary process with several
secondary ones is a little bit limited. Maybe users don't want to depend on
the primary process. :)



Thanks,
Zaibo

Jean-Philippe Brucker Sept. 5, 2018, 11:02 a.m. UTC | #17
On 05/09/2018 04:15, Xu Zaibo wrote:
>>>       1. While the series are finished well, VFIO-PCI device can be held
>>> by only one process
>>>           through binding IOCTL command without PASID (without PASID
>>> being exposed user space).
>> It could, but isn't supported at the moment. In addition to adding
>> support in the I/O page fault code, we'd also need to update the VFIO
>> API. Currently a VFIO_TYPE1 domain always supports the MAP/UNMAP ioctl.
>> The case you describe isn't compatible with MAP/UNMAP, since the process
>> manages the shared address space with mmap or malloc. We'd probably need
>> to introduce a new VFIO IOMMU type, in which case the bind could be
>> performed implicitly when the process does VFIO_SET_IOMMU. Then the
>> process wouldn't need to send an additional BIND IOCTL.
> ok. got it.  This is the legacy mode, so all the VFIO APIs are kept 
> unchanged?

Yes, existing VFIO semantics are preserved.

>>>       2. While using VFIO-PCI device to support multiple processes with
>>> SVA series, a primary
>>>           process with multiple secondary processes must be deployed just
>>> like DPDK(https://www.dpdk.org/).
>>>           And, the PASID still has to be exposed to user land.
>> Right. A third case, also implemented by this patch (and complete), is
>> the primary process simply doing a BIND for itself, and using the
>> returned PASID to share its own address space with the device.
>>
> ok. But I am worried that the sulotion of one primary processes with 
> several secondary ones
> 
> is a little bit limited. Maybe, users don't want to depend on the 
> primary process. :)

I don't see a better way for vfio-pci, though. But more importantly, I
don't know of any users :) While the feature is great for testing new
hardware, and I've been using it for all kinds of stress testing, I
haven't received feedback from possible users in production settings
(DPDK etc) and can't speculate about what they'd prefer.

Thanks,
Jean
Xu Zaibo Sept. 6, 2018, 7:26 a.m. UTC | #18
On 2018/9/5 19:02, Jean-Philippe Brucker wrote:
> On 05/09/2018 04:15, Xu Zaibo wrote:
>>>>        1. While the series are finished well, VFIO-PCI device can be held
>>>> by only one process
>>>>            through binding IOCTL command without PASID (without PASID
>>>> being exposed user space).
>>> It could, but isn't supported at the moment. In addition to adding
>>> support in the I/O page fault code, we'd also need to update the VFIO
>>> API. Currently a VFIO_TYPE1 domain always supports the MAP/UNMAP ioctl.
>>> The case you describe isn't compatible with MAP/UNMAP, since the process
>>> manages the shared address space with mmap or malloc. We'd probably need
>>> to introduce a new VFIO IOMMU type, in which case the bind could be
>>> performed implicitly when the process does VFIO_SET_IOMMU. Then the
>>> process wouldn't need to send an additional BIND IOCTL.
>> ok. got it.  This is the legacy mode, so all the VFIO APIs are kept
>> unchanged?
> Yes, existing VFIO semantics are preserved
>
>>>>        2. While using VFIO-PCI device to support multiple processes with
>>>> SVA series, a primary
>>>>            process with multiple secondary processes must be deployed just
>>>> like DPDK(https://www.dpdk.org/).
>>>>            And, the PASID still has to be exposed to user land.
>>> Right. A third case, also implemented by this patch (and complete), is
>>> the primary process simply doing a BIND for itself, and using the
>>> returned PASID to share its own address space with the device.
>>>
>> ok. But I am worried that the sulotion of one primary processes with
>> several secondary ones
>>
>> is a little bit limited. Maybe, users don't want to depend on the
>> primary process. :)
> I don't see a better way for vfio-pci, though. But more importantly, I
> don't know of any users :) While the feature is great for testing new
> hardware, and I've been using it for all kinds of stress testing, I
> haven't received feedback from possible users in production settings
> (DPDK etc) and can't speculate about what they'd prefer.
>
At present, there seems to be no other way that stays compatible with the
current logic. Thank you.

Thanks,
Zaibo

Patch

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 5c212bf29640..2902774062b8 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -30,6 +30,7 @@ 
 #include <linux/iommu.h>
 #include <linux/module.h>
 #include <linux/mm.h>
+#include <linux/ptrace.h>
 #include <linux/rbtree.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
@@ -60,6 +61,7 @@  MODULE_PARM_DESC(disable_hugepages,
 
 struct vfio_iommu {
 	struct list_head	domain_list;
+	struct list_head	mm_list;
 	struct vfio_domain	*external_domain; /* domain for external user */
 	struct mutex		lock;
 	struct rb_root		dma_list;
@@ -90,6 +92,14 @@  struct vfio_dma {
 struct vfio_group {
 	struct iommu_group	*iommu_group;
 	struct list_head	next;
+	bool			sva_enabled;
+};
+
+struct vfio_mm {
+#define VFIO_PASID_INVALID	(-1)
+	int			pasid;
+	struct mm_struct	*mm;
+	struct list_head	next;
 };
 
 /*
@@ -1238,6 +1248,164 @@  static int vfio_iommu_replay(struct vfio_iommu *iommu,
 	return 0;
 }
 
+static int vfio_iommu_mm_exit(struct device *dev, int pasid, void *data)
+{
+	struct vfio_mm *vfio_mm;
+	struct vfio_iommu *iommu = data;
+
+	mutex_lock(&iommu->lock);
+	list_for_each_entry(vfio_mm, &iommu->mm_list, next) {
+		if (vfio_mm->pasid == pasid) {
+			list_del(&vfio_mm->next);
+			kfree(vfio_mm);
+			break;
+		}
+	}
+	mutex_unlock(&iommu->lock);
+
+	return 0;
+}
+
+static int vfio_iommu_sva_init(struct device *dev, void *data)
+{
+	return iommu_sva_device_init(dev, IOMMU_SVA_FEAT_IOPF, 0,
+				     vfio_iommu_mm_exit);
+}
+
+static int vfio_iommu_sva_shutdown(struct device *dev, void *data)
+{
+	iommu_sva_device_shutdown(dev);
+
+	return 0;
+}
+
+struct vfio_iommu_sva_bind_data {
+	struct vfio_mm *vfio_mm;
+	struct vfio_iommu *iommu;
+	int count;
+};
+
+static int vfio_iommu_sva_bind_dev(struct device *dev, void *data)
+{
+	struct vfio_iommu_sva_bind_data *bind_data = data;
+
+	/* Multi-device groups aren't supported for SVA */
+	if (bind_data->count++)
+		return -EINVAL;
+
+	return __iommu_sva_bind_device(dev, bind_data->vfio_mm->mm,
+				       &bind_data->vfio_mm->pasid,
+				       IOMMU_SVA_FEAT_IOPF, bind_data->iommu);
+}
+
+static int vfio_iommu_sva_unbind_dev(struct device *dev, void *data)
+{
+	struct vfio_mm *vfio_mm = data;
+
+	return __iommu_sva_unbind_device(dev, vfio_mm->pasid);
+}
+
+static int vfio_iommu_bind_group(struct vfio_iommu *iommu,
+				 struct vfio_group *group,
+				 struct vfio_mm *vfio_mm)
+{
+	int ret;
+	bool enabled_sva = false;
+	struct vfio_iommu_sva_bind_data data = {
+		.vfio_mm	= vfio_mm,
+		.iommu		= iommu,
+		.count		= 0,
+	};
+
+	if (!group->sva_enabled) {
+		ret = iommu_group_for_each_dev(group->iommu_group, NULL,
+					       vfio_iommu_sva_init);
+		if (ret)
+			return ret;
+
+		group->sva_enabled = enabled_sva = true;
+	}
+
+	ret = iommu_group_for_each_dev(group->iommu_group, &data,
+				       vfio_iommu_sva_bind_dev);
+	if (ret && data.count > 1)
+		iommu_group_for_each_dev(group->iommu_group, vfio_mm,
+					 vfio_iommu_sva_unbind_dev);
+	if (ret && enabled_sva) {
+		iommu_group_for_each_dev(group->iommu_group, NULL,
+					 vfio_iommu_sva_shutdown);
+		group->sva_enabled = false;
+	}
+
+	return ret;
+}
+
+static void vfio_iommu_unbind_group(struct vfio_group *group,
+				    struct vfio_mm *vfio_mm)
+{
+	iommu_group_for_each_dev(group->iommu_group, vfio_mm,
+				 vfio_iommu_sva_unbind_dev);
+}
+
+static void vfio_iommu_unbind(struct vfio_iommu *iommu,
+			      struct vfio_mm *vfio_mm)
+{
+	struct vfio_group *group;
+	struct vfio_domain *domain;
+
+	list_for_each_entry(domain, &iommu->domain_list, next)
+		list_for_each_entry(group, &domain->group_list, next)
+			vfio_iommu_unbind_group(group, vfio_mm);
+}
+
+static int vfio_iommu_replay_bind(struct vfio_iommu *iommu,
+				  struct vfio_group *group)
+{
+	int ret = 0;
+	struct vfio_mm *vfio_mm;
+
+	list_for_each_entry(vfio_mm, &iommu->mm_list, next) {
+		/*
+		 * Ensure that mm doesn't exit while we're binding it to the new
+		 * group. It may already have exited, and the mm_exit notifier
+		 * might be waiting on the IOMMU mutex to remove this vfio_mm
+		 * from the list.
+		 */
+		if (!mmget_not_zero(vfio_mm->mm))
+			continue;
+		ret = vfio_iommu_bind_group(iommu, group, vfio_mm);
+		/*
+		 * Use async to avoid triggering an mm_exit callback right away,
+		 * which would block on the mutex that we're holding.
+		 */
+		mmput_async(vfio_mm->mm);
+
+		if (ret)
+			goto out_unbind;
+	}
+
+	return 0;
+
+out_unbind:
+	list_for_each_entry_continue_reverse(vfio_mm, &iommu->mm_list, next)
+		vfio_iommu_unbind_group(group, vfio_mm);
+
+	return ret;
+}
+
+static void vfio_iommu_free_all_mm(struct vfio_iommu *iommu)
+{
+	struct vfio_mm *vfio_mm, *next;
+
+	/*
+	 * No need for unbind() here. Since all groups are detached from this
+	 * iommu, bonds have been removed.
+	 */
+	list_for_each_entry_safe(vfio_mm, next, &iommu->mm_list, next)
+		kfree(vfio_mm);
+	INIT_LIST_HEAD(&iommu->mm_list);
+}
+
 /*
  * We change our unmap behavior slightly depending on whether the IOMMU
  * supports fine-grained superpages.  IOMMUs like AMD-Vi will use a superpage
@@ -1313,6 +1481,44 @@  static bool vfio_iommu_has_sw_msi(struct iommu_group *group, phys_addr_t *base)
 	return ret;
 }
 
+static int vfio_iommu_try_attach_group(struct vfio_iommu *iommu,
+				       struct vfio_group *group,
+				       struct vfio_domain *cur_domain,
+				       struct vfio_domain *new_domain)
+{
+	struct iommu_group *iommu_group = group->iommu_group;
+
+	/*
+	 * Try to match an existing compatible domain.  We don't want to
+	 * preclude an IOMMU driver supporting multiple bus_types and being
+	 * able to include different bus_types in the same IOMMU domain, so
+	 * we test whether the domains use the same iommu_ops rather than
+	 * testing if they're on the same bus_type.
+	 */
+	if (new_domain->domain->ops != cur_domain->domain->ops ||
+	    new_domain->prot != cur_domain->prot)
+		return 1;
+
+	iommu_detach_group(cur_domain->domain, iommu_group);
+
+	if (iommu_attach_group(new_domain->domain, iommu_group))
+		goto out_reattach;
+
+	if (vfio_iommu_replay_bind(iommu, group))
+		goto out_detach;
+
+	return 0;
+
+out_detach:
+	iommu_detach_group(new_domain->domain, iommu_group);
+
+out_reattach:
+	if (iommu_attach_group(cur_domain->domain, iommu_group))
+		return -EINVAL;
+
+	return 1;
+}
+
 static int vfio_iommu_type1_attach_group(void *iommu_data,
 					 struct iommu_group *iommu_group)
 {
@@ -1410,28 +1616,16 @@  static int vfio_iommu_type1_attach_group(void *iommu_data,
 	if (iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY))
 		domain->prot |= IOMMU_CACHE;
 
-	/*
-	 * Try to match an existing compatible domain.  We don't want to
-	 * preclude an IOMMU driver supporting multiple bus_types and being
-	 * able to include different bus_types in the same IOMMU domain, so
-	 * we test whether the domains use the same iommu_ops rather than
-	 * testing if they're on the same bus_type.
-	 */
 	list_for_each_entry(d, &iommu->domain_list, next) {
-		if (d->domain->ops == domain->domain->ops &&
-		    d->prot == domain->prot) {
-			iommu_detach_group(domain->domain, iommu_group);
-			if (!iommu_attach_group(d->domain, iommu_group)) {
-				list_add(&group->next, &d->group_list);
-				iommu_domain_free(domain->domain);
-				kfree(domain);
-				mutex_unlock(&iommu->lock);
-				return 0;
-			}
-
-			ret = iommu_attach_group(domain->domain, iommu_group);
-			if (ret)
-				goto out_domain;
+		ret = vfio_iommu_try_attach_group(iommu, group, domain, d);
+		if (ret < 0) {
+			goto out_domain;
+		} else if (!ret) {
+			list_add(&group->next, &d->group_list);
+			iommu_domain_free(domain->domain);
+			kfree(domain);
+			mutex_unlock(&iommu->lock);
+			return 0;
 		}
 	}
 
@@ -1442,6 +1636,10 @@  static int vfio_iommu_type1_attach_group(void *iommu_data,
 	if (ret)
 		goto out_detach;
 
+	ret = vfio_iommu_replay_bind(iommu, group);
+	if (ret)
+		goto out_detach;
+
 	if (resv_msi) {
 		ret = iommu_get_msi_cookie(domain->domain, resv_msi_base);
 		if (ret)
@@ -1547,6 +1745,11 @@  static void vfio_iommu_type1_detach_group(void *iommu_data,
 			continue;
 
 		iommu_detach_group(domain->domain, iommu_group);
+		if (group->sva_enabled) {
+			iommu_group_for_each_dev(iommu_group, NULL,
+						 vfio_iommu_sva_shutdown);
+			group->sva_enabled = false;
+		}
 		list_del(&group->next);
 		kfree(group);
 		/*
@@ -1562,6 +1765,7 @@  static void vfio_iommu_type1_detach_group(void *iommu_data,
 					vfio_iommu_unmap_unpin_all(iommu);
 				else
 					vfio_iommu_unmap_unpin_reaccount(iommu);
+				vfio_iommu_free_all_mm(iommu);
 			}
 			iommu_domain_free(domain->domain);
 			list_del(&domain->next);
@@ -1596,6 +1800,7 @@  static void *vfio_iommu_type1_open(unsigned long arg)
 	}
 
 	INIT_LIST_HEAD(&iommu->domain_list);
+	INIT_LIST_HEAD(&iommu->mm_list);
 	iommu->dma_list = RB_ROOT;
 	mutex_init(&iommu->lock);
 	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
@@ -1630,6 +1835,7 @@  static void vfio_iommu_type1_release(void *iommu_data)
 		kfree(iommu->external_domain);
 	}
 
+	vfio_iommu_free_all_mm(iommu);
 	vfio_iommu_unmap_unpin_all(iommu);
 
 	list_for_each_entry_safe(domain, domain_tmp,
@@ -1658,6 +1864,169 @@  static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
 	return ret;
 }
 
+static struct mm_struct *vfio_iommu_get_mm_by_vpid(pid_t vpid)
+{
+	struct mm_struct *mm;
+	struct task_struct *task;
+
+	task = find_get_task_by_vpid(vpid);
+	if (!task)
+		return ERR_PTR(-ESRCH);
+
+	/* Ensure that current has RW access on the mm */
+	mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS);
+	put_task_struct(task);
+
+	if (!mm)
+		return ERR_PTR(-ESRCH);
+
+	return mm;
+}
+
+static long vfio_iommu_type1_bind_process(struct vfio_iommu *iommu,
+					  void __user *arg,
+					  struct vfio_iommu_type1_bind *bind)
+{
+	struct vfio_iommu_type1_bind_process params;
+	struct vfio_domain *domain;
+	struct vfio_group *group;
+	struct vfio_mm *vfio_mm;
+	struct mm_struct *mm;
+	unsigned long minsz;
+	int ret = 0;
+
+	minsz = sizeof(*bind) + sizeof(params);
+	if (bind->argsz < minsz)
+		return -EINVAL;
+
+	arg += sizeof(*bind);
+	if (copy_from_user(&params, arg, sizeof(params)))
+		return -EFAULT;
+
+	if (params.flags & ~VFIO_IOMMU_BIND_PID)
+		return -EINVAL;
+
+	if (params.flags & VFIO_IOMMU_BIND_PID) {
+		mm = vfio_iommu_get_mm_by_vpid(params.pid);
+		if (IS_ERR(mm))
+			return PTR_ERR(mm);
+	} else {
+		mm = get_task_mm(current);
+		if (!mm)
+			return -EINVAL;
+	}
+
+	mutex_lock(&iommu->lock);
+	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	list_for_each_entry(vfio_mm, &iommu->mm_list, next) {
+		if (vfio_mm->mm == mm) {
+			params.pasid = vfio_mm->pasid;
+			ret = copy_to_user(arg, &params, sizeof(params)) ?
+				-EFAULT : 0;
+			goto out_unlock;
+		}
+	}
+
+	vfio_mm = kzalloc(sizeof(*vfio_mm), GFP_KERNEL);
+	if (!vfio_mm) {
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
+
+	vfio_mm->mm = mm;
+	vfio_mm->pasid = VFIO_PASID_INVALID;
+
+	list_for_each_entry(domain, &iommu->domain_list, next) {
+		list_for_each_entry(group, &domain->group_list, next) {
+			ret = vfio_iommu_bind_group(iommu, group, vfio_mm);
+			if (ret)
+				goto out_unbind;
+		}
+	}
+
+	list_add(&vfio_mm->next, &iommu->mm_list);
+
+	params.pasid = vfio_mm->pasid;
+	ret = copy_to_user(arg, &params, sizeof(params)) ? -EFAULT : 0;
+	if (ret)
+		goto out_delete;
+
+	mutex_unlock(&iommu->lock);
+	mmput(mm);
+	return 0;
+
+out_delete:
+	list_del(&vfio_mm->next);
+
+out_unbind:
+	/* Undo all binds that already succeeded */
+	vfio_iommu_unbind(iommu, vfio_mm);
+	kfree(vfio_mm);
+
+out_unlock:
+	mutex_unlock(&iommu->lock);
+	mmput(mm);
+	return ret;
+}
+
+static long vfio_iommu_type1_unbind_process(struct vfio_iommu *iommu,
+					    void __user *arg,
+					    struct vfio_iommu_type1_bind *bind)
+{
+	int ret = -EINVAL;
+	unsigned long minsz;
+	struct mm_struct *mm;
+	struct vfio_mm *vfio_mm;
+	struct vfio_iommu_type1_bind_process params;
+
+	minsz = sizeof(*bind) + sizeof(params);
+	if (bind->argsz < minsz)
+		return -EINVAL;
+
+	arg += sizeof(*bind);
+	if (copy_from_user(&params, arg, sizeof(params)))
+		return -EFAULT;
+
+	if (params.flags & ~VFIO_IOMMU_BIND_PID)
+		return -EINVAL;
+
+	/*
+	 * We can't simply call unbind with the PASID, because the process might
+	 * have died and the PASID might have been reallocated to another
+	 * process. Instead we need to fetch that process mm by PID again to
+	 * make sure we remove the right vfio_mm.
+	 */
+	if (params.flags & VFIO_IOMMU_BIND_PID) {
+		mm = vfio_iommu_get_mm_by_vpid(params.pid);
+		if (IS_ERR(mm))
+			return PTR_ERR(mm);
+	} else {
+		mm = get_task_mm(current);
+		if (!mm)
+			return -EINVAL;
+	}
+
+	ret = -ESRCH;
+	mutex_lock(&iommu->lock);
+	list_for_each_entry(vfio_mm, &iommu->mm_list, next) {
+		if (vfio_mm->mm == mm) {
+			vfio_iommu_unbind(iommu, vfio_mm);
+			list_del(&vfio_mm->next);
+			kfree(vfio_mm);
+			ret = 0;
+			break;
+		}
+	}
+	mutex_unlock(&iommu->lock);
+	mmput(mm);
+
+	return ret;
+}
+
 static long vfio_iommu_type1_ioctl(void *iommu_data,
 				   unsigned int cmd, unsigned long arg)
 {
@@ -1728,6 +2097,44 @@  static long vfio_iommu_type1_ioctl(void *iommu_data,
 
 		return copy_to_user((void __user *)arg, &unmap, minsz) ?
 			-EFAULT : 0;
+
+	} else if (cmd == VFIO_IOMMU_BIND) {
+		struct vfio_iommu_type1_bind bind;
+
+		minsz = offsetofend(struct vfio_iommu_type1_bind, flags);
+
+		if (copy_from_user(&bind, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (bind.argsz < minsz)
+			return -EINVAL;
+
+		switch (bind.flags) {
+		case VFIO_IOMMU_BIND_PROCESS:
+			return vfio_iommu_type1_bind_process(iommu, (void *)arg,
+							     &bind);
+		default:
+			return -EINVAL;
+		}
+
+	} else if (cmd == VFIO_IOMMU_UNBIND) {
+		struct vfio_iommu_type1_bind bind;
+
+		minsz = offsetofend(struct vfio_iommu_type1_bind, flags);
+
+		if (copy_from_user(&bind, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (bind.argsz < minsz)
+			return -EINVAL;
+
+		switch (bind.flags) {
+		case VFIO_IOMMU_BIND_PROCESS:
+			return vfio_iommu_type1_unbind_process(iommu, (void *)arg,
+							       &bind);
+		default:
+			return -EINVAL;
+		}
 	}
 
 	return -ENOTTY;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 1aa7b82e8169..dc07752c8fe8 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -665,6 +665,82 @@  struct vfio_iommu_type1_dma_unmap {
 #define VFIO_IOMMU_ENABLE	_IO(VFIO_TYPE, VFIO_BASE + 15)
 #define VFIO_IOMMU_DISABLE	_IO(VFIO_TYPE, VFIO_BASE + 16)
 
+/*
+ * VFIO_IOMMU_BIND_PROCESS
+ *
+ * Allocate a PASID for a process address space, and use it to attach this
+ * process to all devices in the container. Devices can then tag their DMA
+ * traffic with the returned @pasid to perform transactions on the associated
+ * virtual address space. Mapping and unmapping buffers is performed by standard
+ * functions such as mmap and malloc.
+ *
+ * If flag is VFIO_IOMMU_BIND_PID, @pid contains the pid of a foreign process to
+ * bind. Otherwise the current task is bound. Given that the caller owns the
+ * device, setting this flag grants the caller read and write permissions on the
+ * entire address space of foreign process described by @pid. Therefore,
+ * permission to perform the bind operation on a foreign process is governed by
+ * the ptrace access mode PTRACE_MODE_ATTACH_REALCREDS check. See man ptrace(2)
+ * for more information.
+ *
+ * On success, VFIO writes a Process Address Space ID (PASID) into @pasid. This
+ * ID is unique to a process and can be used on all devices in the container.
+ *
+ * On fork, the child inherits the device fd and can use the bonds setup by its
+ * parent. Consequently, the child has R/W access on the address spaces bound by
+ * its parent. After an execv, the device fd is closed and the child doesn't
+ * have access to the address space anymore.
+ *
+ * To remove a bond between process and container, VFIO_IOMMU_UNBIND ioctl is
+ * issued with the same parameters. If a pid was specified in VFIO_IOMMU_BIND,
+ * it should also be present for VFIO_IOMMU_UNBIND. Otherwise unbind the current
+ * task from the container.
+ */
+struct vfio_iommu_type1_bind_process {
+	__u32	flags;
+#define VFIO_IOMMU_BIND_PID		(1 << 0)
+	__u32	pasid;
+	__s32	pid;
+};
+
+/*
+ * Only mode supported at the moment is VFIO_IOMMU_BIND_PROCESS, which takes
+ * vfio_iommu_type1_bind_process in data.
+ */
+struct vfio_iommu_type1_bind {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_IOMMU_BIND_PROCESS		(1 << 0)
+	__u8	data[];
+};
+
+/*
+ * VFIO_IOMMU_BIND - _IOWR(VFIO_TYPE, VFIO_BASE + 22, struct vfio_iommu_bind)
+ *
+ * Manage address spaces of devices in this container. Initially a TYPE1
+ * container can only have one address space, managed with
+ * VFIO_IOMMU_MAP/UNMAP_DMA.
+ *
+ * An IOMMU of type VFIO_TYPE1_NESTING_IOMMU can be managed by both MAP/UNMAP
+ * and BIND ioctls at the same time. MAP/UNMAP acts on the stage-2 (host) page
+ * tables, and BIND manages the stage-1 (guest) page tables. Other types of
+ * IOMMU may allow MAP/UNMAP and BIND to coexist, where MAP/UNMAP controls
+ * non-PASID traffic and BIND controls PASID traffic. But this depends on the
+ * underlying IOMMU architecture and isn't guaranteed.
+ *
+ * Availability of this feature depends on the device, its bus, the underlying
+ * IOMMU and the CPU architecture.
+ *
+ * returns: 0 on success, -errno on failure.
+ */
+#define VFIO_IOMMU_BIND		_IO(VFIO_TYPE, VFIO_BASE + 22)
+
+/*
+ * VFIO_IOMMU_UNBIND - _IOWR(VFIO_TYPE, VFIO_BASE + 23, struct vfio_iommu_bind)
+ *
+ * Undo what was done by the corresponding VFIO_IOMMU_BIND ioctl.
+ */
+#define VFIO_IOMMU_UNBIND	_IO(VFIO_TYPE, VFIO_BASE + 23)
+
 /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
 
 /*