
[v13,09/22] vfio iommu type1: Add task structure to vfio_dma

Message ID 1479223805-22895-10-git-send-email-kwankhede@nvidia.com
State New

Commit Message

Kirti Wankhede Nov. 15, 2016, 3:29 p.m. UTC
Add a task structure to the vfio_dma structure. The task structure is used for:
- Restricting DMA_UNMAP: the task that mapped an iova range, or another task
  sharing the same address space, is allowed to unmap it; otherwise the unmap
  fails. QEMU maps a few iova ranges initially, then spawns threads, and a
  child thread calls DMA_UNMAP on a previously mapped iova; since the child
  shares the same address space, DMA_UNMAP succeeds (see the userspace sketch
  after this list).
- Avoiding access to struct mm while the process is exiting, by taking a
  reference on the task's mm during page accounting.
- Getting the task's mlock capability and the mlock rlimit.
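
For context, a minimal userspace sketch (not part of this patch) of the
QEMU-like pattern described above: map an iova range from the main thread and
unmap it from a child thread that shares the same address space. container_fd,
the iova and the size are hypothetical, and error handling is omitted.

#include <pthread.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/vfio.h>

static int container_fd;	/* assumed: opened and configured elsewhere */

static void *unmap_thread(void *arg)
{
	/* The thread shares the creator's mm (CLONE_VM), so the new
	 * dma->task->mm != current->mm check does not reject it. */
	struct vfio_iommu_type1_dma_unmap unmap = {
		.argsz = sizeof(unmap),
		.iova  = 0x100000,		/* hypothetical iova */
		.size  = 2 * 1024 * 1024,
	};

	ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
	return NULL;
}

int main(void)
{
	struct vfio_iommu_type1_dma_map map = {
		.argsz = sizeof(map),
		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
		.iova  = 0x100000,
		.size  = 2 * 1024 * 1024,
	};
	pthread_t tid;

	map.vaddr = (__u64)(unsigned long)mmap(NULL, map.size,
					       PROT_READ | PROT_WRITE,
					       MAP_PRIVATE | MAP_ANONYMOUS,
					       -1, 0);

	ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);	/* map from parent */

	pthread_create(&tid, NULL, unmap_thread, NULL);	/* unmap from child */
	pthread_join(tid, NULL);
	return 0;
}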

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Signed-off-by: Neo Jia <cjia@nvidia.com>
Change-Id: I7600f1bea6b384fd589fa72421ccf031bcfd9ac5
---
 drivers/vfio/vfio_iommu_type1.c | 137 +++++++++++++++++++++++++---------------
 1 file changed, 86 insertions(+), 51 deletions(-)

Comments

Dong Jia Shi Nov. 16, 2016, 6:06 a.m. UTC | #1
* Kirti Wankhede <kwankhede@nvidia.com> [2016-11-15 20:59:52 +0530]:

Hi Kirti,

[...]
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c

> @@ -331,13 +338,16 @@ static long vfio_pin_pages_remote(unsigned long vaddr, long npage,
>  	}
> 
>  	if (!rsvd)
> -		vfio_lock_acct(current, i);
> +		vfio_lock_acct(dma->task, i);
> +	ret = i;
> 
> -	return i;
> +pin_pg_remote_exit:
out_mmput sounds like a better name to me.

> +	mmput(mm);
> +	return ret;
>  }
> 
[...]

> @@ -510,6 +521,12 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>  	while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
>  		if (!iommu->v2 && unmap->iova > dma->iova)
>  			break;
> +		/*
> +		 * Task with same address space who mapped this iova range is
> +		 * allowed to unmap the iova range.
> +		 */
> +		if (dma->task->mm != current->mm)
How about:
		if (dma->task != current)

> +			break;
>  		unmapped += dma->size;
>  		vfio_remove_dma(iommu, dma);
>  	}
> @@ -576,17 +593,55 @@ unwind:
>  	return ret;
>  }
> 
> +static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
> +			    size_t map_size)
> Did you factor out this function for future use?
> I didn't find any other callers.

> +{
> +	dma_addr_t iova = dma->iova;
> +	unsigned long vaddr = dma->vaddr;
> +	size_t size = map_size;
> +	long npage;
> +	unsigned long pfn;
> +	int ret = 0;
> +
> +	while (size) {
> +		/* Pin a contiguous chunk of memory */
> +		npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
> +					      size >> PAGE_SHIFT, dma->prot,
> +					      &pfn);
> +		if (npage <= 0) {
> +			WARN_ON(!npage);
> +			ret = (int)npage;
> +			break;
> +		}
> +
> +		/* Map it! */
> +		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
> +				     dma->prot);
> +		if (ret) {
> +			vfio_unpin_pages_remote(dma, pfn, npage,
> +						 dma->prot, true);
> +			break;
> +		}
> +
> +		size -= npage << PAGE_SHIFT;
> +		dma->size += npage << PAGE_SHIFT;
> +	}
> +
> +	if (ret)
> +		vfio_remove_dma(iommu, dma);
> +
> +	return ret;
> +}
> +
>  static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  			   struct vfio_iommu_type1_dma_map *map)
>  {
>  	dma_addr_t iova = map->iova;
>  	unsigned long vaddr = map->vaddr;
>  	size_t size = map->size;
> -	long npage;
>  	int ret = 0, prot = 0;
>  	uint64_t mask;
>  	struct vfio_dma *dma;
> -	unsigned long pfn;
> 
>  	/* Verify that none of our __u64 fields overflow */
>  	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
> @@ -612,47 +667,27 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  	mutex_lock(&iommu->lock);
> 
>  	if (vfio_find_dma(iommu, iova, size)) {
> -		mutex_unlock(&iommu->lock);
> -		return -EEXIST;
> +		ret = -EEXIST;
> +		goto do_map_err;
>  	}
> 
>  	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
>  	if (!dma) {
> -		mutex_unlock(&iommu->lock);
> -		return -ENOMEM;
> +		ret = -ENOMEM;
> +		goto do_map_err;
>  	}
> 
>  	dma->iova = iova;
>  	dma->vaddr = vaddr;
>  	dma->prot = prot;
> +	get_task_struct(current);
> +	dma->task = current;
> 
>  	/* Insert zero-sized and grow as we map chunks of it */
>  	vfio_link_dma(iommu, dma);
> 
> -	while (size) {
> -		/* Pin a contiguous chunk of memory */
> -		npage = vfio_pin_pages_remote(vaddr + dma->size,
> -					      size >> PAGE_SHIFT, prot, &pfn);
> -		if (npage <= 0) {
> -			WARN_ON(!npage);
> -			ret = (int)npage;
> -			break;
> -		}
> -
> -		/* Map it! */
> -		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
> -		if (ret) {
> -			vfio_unpin_pages_remote(pfn, npage, prot, true);
> -			break;
> -		}
> -
> -		size -= npage << PAGE_SHIFT;
> -		dma->size += npage << PAGE_SHIFT;
> -	}
> -
> -	if (ret)
> -		vfio_remove_dma(iommu, dma);
> -
> +	ret = vfio_pin_map_dma(iommu, dma, size);
> +do_map_err:
Rename to out_unlock?

>  	mutex_unlock(&iommu->lock);
>  	return ret;
>  }
> -- 
> 2.7.0
> 

Otherwise, LGTM!
Kirti Wankhede Nov. 16, 2016, 3:11 p.m. UTC | #2
On 11/16/2016 11:36 AM, Dong Jia Shi wrote:
> * Kirti Wankhede <kwankhede@nvidia.com> [2016-11-15 20:59:52 +0530]:
> 
> Hi Kirti,
> 
> [...]
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> 
>> @@ -331,13 +338,16 @@ static long vfio_pin_pages_remote(unsigned long vaddr, long npage,
>>  	}
>>
>>  	if (!rsvd)
>> -		vfio_lock_acct(current, i);
>> +		vfio_lock_acct(dma->task, i);
>> +	ret = i;
>>
>> -	return i;
>> +pin_pg_remote_exit:
> out_mmput sounds like a better name to me.
> 
>> +	mmput(mm);
>> +	return ret;
>>  }
>>
> [...]
> 
>> @@ -510,6 +521,12 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>>  	while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
>>  		if (!iommu->v2 && unmap->iova > dma->iova)
>>  			break;
>> +		/*
>> +		 * Task with same address space who mapped this iova range is
>> +		 * allowed to unmap the iova range.
>> +		 */
>> +		if (dma->task->mm != current->mm)
> How about:
> 		if (dma->task != current)
> 

As I mentioned in the comment above this check and in the commit description,
if a process calls DMA_MAP, spawns a thread, and the child thread then calls
DMA_UNMAP, this should be allowed since the address space is the same for the
parent process and the child thread. QEMU also works that way.
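
To illustrate the contrast, here is a hedged sketch (not from the patch;
container_fd and the iova range are hypothetical and assumed to have been
mapped by the parent): a pthread shares the parent's mm and may unmap as
above, whereas a fork()ed child gets its own mm, so with this check the unmap
loop breaks and the range stays mapped.

#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/wait.h>
#include <linux/vfio.h>

extern int container_fd;	/* assumed: configured VFIO container */

void try_unmap_from_forked_child(void)
{
	struct vfio_iommu_type1_dma_unmap unmap = {
		.argsz = sizeof(unmap),
		.iova  = 0x100000,	/* hypothetical, mapped by the parent */
		.size  = 2 * 1024 * 1024,
	};

	if (fork() == 0) {
		/* The forked child has its own mm, so dma->task->mm !=
		 * current->mm: the loop breaks and nothing is removed. */
		ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
		_exit(0);
	}
	wait(NULL);
}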

>> +			break;
>>  		unmapped += dma->size;
>>  		vfio_remove_dma(iommu, dma);
>>  	}
>> @@ -576,17 +593,55 @@ unwind:
>>  	return ret;
>>  }
>>
>> +static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
>> +			    size_t map_size)
> Did you factor out this function for future use?
> I didn't find any other callers.
>

This is pulled out to keep the caller simple and short. Otherwise
vfio_dma_do_map() would have become a long function.


>> +{
>> +	dma_addr_t iova = dma->iova;
>> +	unsigned long vaddr = dma->vaddr;
>> +	size_t size = map_size;
>> +	long npage;
>> +	unsigned long pfn;
>> +	int ret = 0;
>> +
>> +	while (size) {
>> +		/* Pin a contiguous chunk of memory */
>> +		npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
>> +					      size >> PAGE_SHIFT, dma->prot,
>> +					      &pfn);
>> +		if (npage <= 0) {
>> +			WARN_ON(!npage);
>> +			ret = (int)npage;
>> +			break;
>> +		}
>> +
>> +		/* Map it! */
>> +		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
>> +				     dma->prot);
>> +		if (ret) {
>> +			vfio_unpin_pages_remote(dma, pfn, npage,
>> +						 dma->prot, true);
>> +			break;
>> +		}
>> +
>> +		size -= npage << PAGE_SHIFT;
>> +		dma->size += npage << PAGE_SHIFT;
>> +	}
>> +
>> +	if (ret)
>> +		vfio_remove_dma(iommu, dma);
>> +
>> +	return ret;
>> +}
>> +
>>  static int vfio_dma_do_map(struct vfio_iommu *iommu,
>>  			   struct vfio_iommu_type1_dma_map *map)
>>  {
>>  	dma_addr_t iova = map->iova;
>>  	unsigned long vaddr = map->vaddr;
>>  	size_t size = map->size;
>> -	long npage;
>>  	int ret = 0, prot = 0;
>>  	uint64_t mask;
>>  	struct vfio_dma *dma;
>> -	unsigned long pfn;
>>
>>  	/* Verify that none of our __u64 fields overflow */
>>  	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
>> @@ -612,47 +667,27 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>>  	mutex_lock(&iommu->lock);
>>
>>  	if (vfio_find_dma(iommu, iova, size)) {
>> -		mutex_unlock(&iommu->lock);
>> -		return -EEXIST;
>> +		ret = -EEXIST;
>> +		goto do_map_err;
>>  	}
>>
>>  	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
>>  	if (!dma) {
>> -		mutex_unlock(&iommu->lock);
>> -		return -ENOMEM;
>> +		ret = -ENOMEM;
>> +		goto do_map_err;
>>  	}
>>
>>  	dma->iova = iova;
>>  	dma->vaddr = vaddr;
>>  	dma->prot = prot;
>> +	get_task_struct(current);
>> +	dma->task = current;
>>
>>  	/* Insert zero-sized and grow as we map chunks of it */
>>  	vfio_link_dma(iommu, dma);
>>
>> -	while (size) {
>> -		/* Pin a contiguous chunk of memory */
>> -		npage = vfio_pin_pages_remote(vaddr + dma->size,
>> -					      size >> PAGE_SHIFT, prot, &pfn);
>> -		if (npage <= 0) {
>> -			WARN_ON(!npage);
>> -			ret = (int)npage;
>> -			break;
>> -		}
>> -
>> -		/* Map it! */
>> -		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
>> -		if (ret) {
>> -			vfio_unpin_pages_remote(pfn, npage, prot, true);
>> -			break;
>> -		}
>> -
>> -		size -= npage << PAGE_SHIFT;
>> -		dma->size += npage << PAGE_SHIFT;
>> -	}
>> -
>> -	if (ret)
>> -		vfio_remove_dma(iommu, dma);
>> -
>> +	ret = vfio_pin_map_dma(iommu, dma, size);
>> +do_map_err:
> Rename to out_unlock?
> 
>>  	mutex_unlock(&iommu->lock);
>>  	return ret;
>>  }
>> -- 
>> 2.7.0
>>
> 
> Otherwise, LGTM!
> 

Thanks.

Kirti.

Patch

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index ffe2026f1341..50aca95cf61e 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -36,6 +36,7 @@ 
 #include <linux/uaccess.h>
 #include <linux/vfio.h>
 #include <linux/workqueue.h>
+#include <linux/pid_namespace.h>
 
 #define DRIVER_VERSION  "0.2"
 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
@@ -75,6 +76,7 @@  struct vfio_dma {
 	unsigned long		vaddr;		/* Process virtual addr */
 	size_t			size;		/* Map size (bytes) */
 	int			prot;		/* IOMMU_READ/WRITE */
+	struct task_struct	*task;
 };
 
 struct vfio_group {
@@ -277,41 +279,47 @@  static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
  * the iommu can only map chunks of consecutive pfns anyway, so get the
  * first page and all consecutive pages with the same locking.
  */
-static long vfio_pin_pages_remote(unsigned long vaddr, long npage,
-				  int prot, unsigned long *pfn_base)
+static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
+				  long npage, int prot, unsigned long *pfn_base)
 {
-	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-	bool lock_cap = capable(CAP_IPC_LOCK);
+	unsigned long limit;
+	bool lock_cap = ns_capable(task_active_pid_ns(dma->task)->user_ns,
+				   CAP_IPC_LOCK);
+	struct mm_struct *mm;
 	long ret, i;
 	bool rsvd;
 
-	if (!current->mm)
+	mm = get_task_mm(dma->task);
+	if (!mm)
 		return -ENODEV;
 
-	ret = vaddr_get_pfn(current->mm, vaddr, prot, pfn_base);
+	ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
 	if (ret)
-		return ret;
+		goto pin_pg_remote_exit;
 
 	rsvd = is_invalid_reserved_pfn(*pfn_base);
+	limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 
-	if (!rsvd && !lock_cap && current->mm->locked_vm + 1 > limit) {
+	if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
 		put_pfn(*pfn_base, prot);
 		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
 			limit << PAGE_SHIFT);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto pin_pg_remote_exit;
 	}
 
 	if (unlikely(disable_hugepages)) {
 		if (!rsvd)
-			vfio_lock_acct(current, 1);
-		return 1;
+			vfio_lock_acct(dma->task, 1);
+		ret = 1;
+		goto pin_pg_remote_exit;
 	}
 
 	/* Lock all the consecutive pages from pfn_base */
 	for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
 		unsigned long pfn = 0;
 
-		ret = vaddr_get_pfn(current->mm, vaddr, prot, &pfn);
+		ret = vaddr_get_pfn(mm, vaddr, prot, &pfn);
 		if (ret)
 			break;
 
@@ -321,8 +329,7 @@  static long vfio_pin_pages_remote(unsigned long vaddr, long npage,
 			break;
 		}
 
-		if (!rsvd && !lock_cap &&
-		    current->mm->locked_vm + i + 1 > limit) {
+		if (!rsvd && !lock_cap && mm->locked_vm + i + 1 > limit) {
 			put_pfn(pfn, prot);
 			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
 				__func__, limit << PAGE_SHIFT);
@@ -331,13 +338,16 @@  static long vfio_pin_pages_remote(unsigned long vaddr, long npage,
 	}
 
 	if (!rsvd)
-		vfio_lock_acct(current, i);
+		vfio_lock_acct(dma->task, i);
+	ret = i;
 
-	return i;
+pin_pg_remote_exit:
+	mmput(mm);
+	return ret;
 }
 
-static long vfio_unpin_pages_remote(unsigned long pfn, long npage,
-				    int prot, bool do_accounting)
+static long vfio_unpin_pages_remote(struct vfio_dma *dma, unsigned long pfn,
+				    long npage, int prot, bool do_accounting)
 {
 	unsigned long unlocked = 0;
 	long i;
@@ -346,7 +356,7 @@  static long vfio_unpin_pages_remote(unsigned long pfn, long npage,
 		unlocked += put_pfn(pfn++, prot);
 
 	if (do_accounting)
-		vfio_lock_acct(current, -unlocked);
+		vfio_lock_acct(dma->task, -unlocked);
 
 	return unlocked;
 }
@@ -400,7 +410,7 @@  static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 		if (WARN_ON(!unmapped))
 			break;
 
-		unlocked += vfio_unpin_pages_remote(phys >> PAGE_SHIFT,
+		unlocked += vfio_unpin_pages_remote(dma, phys >> PAGE_SHIFT,
 						    unmapped >> PAGE_SHIFT,
 						    dma->prot, false);
 		iova += unmapped;
@@ -408,13 +418,14 @@  static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 		cond_resched();
 	}
 
-	vfio_lock_acct(current, -unlocked);
+	vfio_lock_acct(dma->task, -unlocked);
 }
 
 static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
 {
 	vfio_unmap_unpin(iommu, dma);
 	vfio_unlink_dma(iommu, dma);
+	put_task_struct(dma->task);
 	kfree(dma);
 }
 
@@ -510,6 +521,12 @@  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 	while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
 		if (!iommu->v2 && unmap->iova > dma->iova)
 			break;
+		/*
+		 * Task with same address space who mapped this iova range is
+		 * allowed to unmap the iova range.
+		 */
+		if (dma->task->mm != current->mm)
+			break;
 		unmapped += dma->size;
 		vfio_remove_dma(iommu, dma);
 	}
@@ -576,17 +593,55 @@  unwind:
 	return ret;
 }
 
+static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
+			    size_t map_size)
+{
+	dma_addr_t iova = dma->iova;
+	unsigned long vaddr = dma->vaddr;
+	size_t size = map_size;
+	long npage;
+	unsigned long pfn;
+	int ret = 0;
+
+	while (size) {
+		/* Pin a contiguous chunk of memory */
+		npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
+					      size >> PAGE_SHIFT, dma->prot,
+					      &pfn);
+		if (npage <= 0) {
+			WARN_ON(!npage);
+			ret = (int)npage;
+			break;
+		}
+
+		/* Map it! */
+		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
+				     dma->prot);
+		if (ret) {
+			vfio_unpin_pages_remote(dma, pfn, npage,
+						 dma->prot, true);
+			break;
+		}
+
+		size -= npage << PAGE_SHIFT;
+		dma->size += npage << PAGE_SHIFT;
+	}
+
+	if (ret)
+		vfio_remove_dma(iommu, dma);
+
+	return ret;
+}
+
 static int vfio_dma_do_map(struct vfio_iommu *iommu,
 			   struct vfio_iommu_type1_dma_map *map)
 {
 	dma_addr_t iova = map->iova;
 	unsigned long vaddr = map->vaddr;
 	size_t size = map->size;
-	long npage;
 	int ret = 0, prot = 0;
 	uint64_t mask;
 	struct vfio_dma *dma;
-	unsigned long pfn;
 
 	/* Verify that none of our __u64 fields overflow */
 	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
@@ -612,47 +667,27 @@  static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	mutex_lock(&iommu->lock);
 
 	if (vfio_find_dma(iommu, iova, size)) {
-		mutex_unlock(&iommu->lock);
-		return -EEXIST;
+		ret = -EEXIST;
+		goto do_map_err;
 	}
 
 	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
 	if (!dma) {
-		mutex_unlock(&iommu->lock);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto do_map_err;
 	}
 
 	dma->iova = iova;
 	dma->vaddr = vaddr;
 	dma->prot = prot;
+	get_task_struct(current);
+	dma->task = current;
 
 	/* Insert zero-sized and grow as we map chunks of it */
 	vfio_link_dma(iommu, dma);
 
-	while (size) {
-		/* Pin a contiguous chunk of memory */
-		npage = vfio_pin_pages_remote(vaddr + dma->size,
-					      size >> PAGE_SHIFT, prot, &pfn);
-		if (npage <= 0) {
-			WARN_ON(!npage);
-			ret = (int)npage;
-			break;
-		}
-
-		/* Map it! */
-		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
-		if (ret) {
-			vfio_unpin_pages_remote(pfn, npage, prot, true);
-			break;
-		}
-
-		size -= npage << PAGE_SHIFT;
-		dma->size += npage << PAGE_SHIFT;
-	}
-
-	if (ret)
-		vfio_remove_dma(iommu, dma);
-
+	ret = vfio_pin_map_dma(iommu, dma, size);
+do_map_err:
 	mutex_unlock(&iommu->lock);
 	return ret;
 }