
[v2] cxl: Fix possible deadlock when processing page faults from cxllib

Message ID 20180403135402.30374-1-fbarrat@linux.vnet.ibm.com (mailing list archive)
State Accepted
Commit ad7b4e8022b9864c075fe71e1328b1d25cad82f6
Series [v2] cxl: Fix possible deadlock when processing page faults from cxllib

Commit Message

Frederic Barrat April 3, 2018, 1:54 p.m. UTC
cxllib_handle_fault() is called by an external driver when it needs to
have the host resolve page faults for a buffer. The buffer can cover
several pages and VMAs. The function iterates over all the pages used
by the buffer, based on the page size of the VMA.

To ensure some stability while processing the faults, the thread T1
grabs the mm->mmap_sem semaphore with read access (R1). However, when
processing a page fault for a single page, one of the underlying
functions, copro_handle_mm_fault(), also grabs the same semaphore with
read access (R2). So the thread T1 takes the semaphore twice.

If another thread T2 tries to access the semaphore in write mode W1
(say, because it wants to allocate memory and calls 'brk'), then that
thread T2 will have to wait because there's a reader (R1). If the
thread T1 is processing a new page at that time, it won't get an
automatic grant at R2, because there's now a writer thread
waiting (T2). And we have a deadlock.

The timeline is:
1. thread T1 owns the semaphore with read access R1
2. thread T2 requests write access W1 and waits
3. thread T1 requests read access R2 and waits

The fix is for the thread T1 to release the semaphore R1 once it has
retrieved the information it needs from the current VMA. The address space/VMAs
could evolve while T1 iterates over the full buffer, but in the
unlikely case where T1 misses a page, the external driver will raise a
new page fault when retrying the memory access.

Fixes: 3ced8d730063 ("cxl: Export library to support IBM XSL")
Cc: stable@vger.kernel.org # 4.13+
Signed-off-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
---
 drivers/misc/cxl/cxllib.c | 85 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 55 insertions(+), 30 deletions(-)
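
For readers less familiar with kernel rwsem behaviour, here is a simplified,
illustrative sketch of the pre-fix lock ordering (not a verbatim copy of the
driver code). A kernel rwsem queues new readers behind a waiting writer, so
the nested read acquisition is no longer granted once a writer is pending:

	/* Illustrative sketch only, condensed from the pre-fix flow. */
	down_read(&mm->mmap_sem);	/* T1 takes R1, held across the whole loop */
	for (dar = (addr & ~(page_size - 1)); dar < (addr + size); dar += page_size) {
		/* ... per-page work ... */
		cxl_handle_mm_fault(mm, flags, dar);
		/* eventually reaches copro_handle_mm_fault(), which does
		 * down_read(&mm->mmap_sem) again: this is R2 */
	}
	up_read(&mm->mmap_sem);

If another task queues down_write(&mm->mmap_sem) (W1) after R1 is taken but
before R2 is requested, W1 waits behind R1, R2 is queued behind W1, and T1
never reaches up_read(): the three requests deadlock exactly as in the
timeline above.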

Comments

Laurent Dufour April 3, 2018, 2:06 p.m. UTC | #1
On 03/04/2018 15:54, Frederic Barrat wrote:
> cxllib_handle_fault() is called by an external driver when it needs to
> have the host resolve page faults for a buffer. The buffer can cover
> several pages and VMAs. The function iterates over all the pages used
> by the buffer, based on the page size of the VMA.
> 
> To ensure some stability while processing the faults, the thread T1
> grabs the mm->mmap_sem semaphore with read access (R1). However, when
> processing a page fault for a single page, one of the underlying
> functions, copro_handle_mm_fault(), also grabs the same semaphore with
> read access (R2). So the thread T1 takes the semaphore twice.
> 
> If another thread T2 tries to access the semaphore in write mode W1
> (say, because it wants to allocate memory and calls 'brk'), then that
> thread T2 will have to wait because there's a reader (R1). If the
> thread T1 is processing a new page at that time, it won't get an
> automatic grant at R2, because there's now a writer thread
> waiting (T2). And we have a deadlock.
> 
> The timeline is:
> 1. thread T1 owns the semaphore with read access R1
> 2. thread T2 requests write access W1 and waits
> 3. thread T1 requests read access R2 and waits
> 
> The fix is for the thread T1 to release the semaphore R1 once it got
> the information it needs from the current VMA. The address space/VMAs
> could evolve while T1 iterates over the full buffer, but in the
> unlikely case where T1 misses a page, the external driver will raise a
> new page fault when retrying the memory access.
> 
> Fixes: 3ced8d730063 ("cxl: Export library to support IBM XSL")
> Cc: stable@vger.kernel.org # 4.13+
> Signed-off-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com>

What are the changes introduced in this v2?

> ---
>  drivers/misc/cxl/cxllib.c | 85 ++++++++++++++++++++++++++++++-----------------
>  1 file changed, 55 insertions(+), 30 deletions(-)
> 
> diff --git a/drivers/misc/cxl/cxllib.c b/drivers/misc/cxl/cxllib.c
> index 30ccba436b3b..55cd35d1a9cc 100644
> --- a/drivers/misc/cxl/cxllib.c
> +++ b/drivers/misc/cxl/cxllib.c
> @@ -208,49 +208,74 @@ int cxllib_get_PE_attributes(struct task_struct *task,
>  }
>  EXPORT_SYMBOL_GPL(cxllib_get_PE_attributes);
> 
> -int cxllib_handle_fault(struct mm_struct *mm, u64 addr, u64 size, u64 flags)
> +static int get_vma_info(struct mm_struct *mm, u64 addr,
> +			u64 *vma_start, u64 *vma_end,
> +			unsigned long *page_size)
>  {
> -	int rc;
> -	u64 dar;
>  	struct vm_area_struct *vma = NULL;
> -	unsigned long page_size;
> -
> -	if (mm == NULL)
> -		return -EFAULT;
> +	int rc = 0;
> 
>  	down_read(&mm->mmap_sem);
> 
>  	vma = find_vma(mm, addr);
>  	if (!vma) {
> -		pr_err("Can't find vma for addr %016llx\n", addr);
>  		rc = -EFAULT;
>  		goto out;
>  	}
> -	/* get the size of the pages allocated */
> -	page_size = vma_kernel_pagesize(vma);
> -
> -	for (dar = (addr & ~(page_size - 1)); dar < (addr + size); dar += page_size) {
> -		if (dar < vma->vm_start || dar >= vma->vm_end) {
> -			vma = find_vma(mm, addr);
> -			if (!vma) {
> -				pr_err("Can't find vma for addr %016llx\n", addr);
> -				rc = -EFAULT;
> -				goto out;
> -			}
> -			/* get the size of the pages allocated */
> -			page_size = vma_kernel_pagesize(vma);
> +	*page_size = vma_kernel_pagesize(vma);
> +	*vma_start = vma->vm_start;
> +	*vma_end = vma->vm_end;
> +out:
> +	up_read(&mm->mmap_sem);
> +	return rc;
> +}
> +
> +int cxllib_handle_fault(struct mm_struct *mm, u64 addr, u64 size, u64 flags)
> +{
> +	int rc;
> +	u64 dar, vma_start, vma_end;
> +	unsigned long page_size;
> +
> +	if (mm == NULL)
> +		return -EFAULT;
> +
> +	/*
> +	 * The buffer we have to process can extend over several pages
> +	 * and may also cover several VMAs.
> +	 * We iterate over all the pages. The page size could vary
> +	 * between VMAs.
> +	 */
> +	rc = get_vma_info(mm, addr, &vma_start, &vma_end, &page_size);
> +	if (rc)
> +		return rc;
> +
> +	for (dar = (addr & ~(page_size - 1)); dar < (addr + size);
> +	     dar += page_size) {
> +		if (dar < vma_start || dar >= vma_end) {
> +			/*
> +			 * We don't hold the mm->mmap_sem semaphore
> +			 * while iterating, since the semaphore is
> +			 * required by one of the lower-level page
> +			 * fault processing functions and it could
> +			 * create a deadlock.
> +			 *
> +			 * It means the VMAs can be altered between 2
> +			 * loop iterations and we could theoretically
> +			 * miss a page (however unlikely). But that's
> +			 * not really a problem, as the driver will
> +			 * retry access, get another page fault on the
> +			 * missing page and call us again.
> +			 */
> +			rc = get_vma_info(mm, dar, &vma_start, &vma_end,
> +					&page_size);
> +			if (rc)
> +				return rc;
>  		}
> 
>  		rc = cxl_handle_mm_fault(mm, flags, dar);
> -		if (rc) {
> -			pr_err("cxl_handle_mm_fault failed %d", rc);
> -			rc = -EFAULT;
> -			goto out;
> -		}
> +		if (rc)
> +			return -EFAULT;
>  	}
> -	rc = 0;
> -out:
> -	up_read(&mm->mmap_sem);
> -	return rc;
> +	return 0;
>  }
>  EXPORT_SYMBOL_GPL(cxllib_handle_fault);
>
Frederic Barrat April 3, 2018, 2:15 p.m. UTC | #2
On 03/04/2018 16:06, Laurent Dufour wrote:
> On 03/04/2018 15:54, Frederic Barrat wrote:
>> cxllib_handle_fault() is called by an external driver when it needs to
>> have the host resolve page faults for a buffer. The buffer can cover
>> several pages and VMAs. The function iterates over all the pages used
>> by the buffer, based on the page size of the VMA.
>>
>> To ensure some stability while processing the faults, the thread T1
>> grabs the mm->mmap_sem semaphore with read access (R1). However, when
>> processing a page fault for a single page, one of the underlying
>> functions, copro_handle_mm_fault(), also grabs the same semaphore with
>> read access (R2). So the thread T1 takes the semaphore twice.
>>
>> If another thread T2 tries to access the semaphore in write mode W1
>> (say, because it wants to allocate memory and calls 'brk'), then that
>> thread T2 will have to wait because there's a reader (R1). If the
>> thread T1 is processing a new page at that time, it won't get an
>> automatic grant at R2, because there's now a writer thread
>> waiting (T2). And we have a deadlock.
>>
>> The timeline is:
>> 1. thread T1 owns the semaphore with read access R1
>> 2. thread T2 requests write access W1 and waits
>> 3. thread T1 requests read access R2 and waits
>>
>> The fix is for the thread T1 to release the semaphore R1 once it got
>> the information it needs from the current VMA. The address space/VMAs
>> could evolve while T1 iterates over the full buffer, but in the
>> unlikely case where T1 misses a page, the external driver will raise a
>> new page fault when retrying the memory access.
>>
>> Fixes: 3ced8d730063 ("cxl: Export library to support IBM XSL")
>> Cc: stable@vger.kernel.org # 4.13+
>> Signed-off-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
> 
> What are the changes introduced in this v2?

How did that happen! I would have sworn I had added the changelog!

The code is the same. I rewrote the commit message to hopefully make it more 
readable.

   Fred



>> ---
>>   drivers/misc/cxl/cxllib.c | 85 ++++++++++++++++++++++++++++++-----------------
>>   1 file changed, 55 insertions(+), 30 deletions(-)
>>
>> diff --git a/drivers/misc/cxl/cxllib.c b/drivers/misc/cxl/cxllib.c
>> index 30ccba436b3b..55cd35d1a9cc 100644
>> --- a/drivers/misc/cxl/cxllib.c
>> +++ b/drivers/misc/cxl/cxllib.c
>> @@ -208,49 +208,74 @@ int cxllib_get_PE_attributes(struct task_struct *task,
>>   }
>>   EXPORT_SYMBOL_GPL(cxllib_get_PE_attributes);
>>
>> -int cxllib_handle_fault(struct mm_struct *mm, u64 addr, u64 size, u64 flags)
>> +static int get_vma_info(struct mm_struct *mm, u64 addr,
>> +			u64 *vma_start, u64 *vma_end,
>> +			unsigned long *page_size)
>>   {
>> -	int rc;
>> -	u64 dar;
>>   	struct vm_area_struct *vma = NULL;
>> -	unsigned long page_size;
>> -
>> -	if (mm == NULL)
>> -		return -EFAULT;
>> +	int rc = 0;
>>
>>   	down_read(&mm->mmap_sem);
>>
>>   	vma = find_vma(mm, addr);
>>   	if (!vma) {
>> -		pr_err("Can't find vma for addr %016llx\n", addr);
>>   		rc = -EFAULT;
>>   		goto out;
>>   	}
>> -	/* get the size of the pages allocated */
>> -	page_size = vma_kernel_pagesize(vma);
>> -
>> -	for (dar = (addr & ~(page_size - 1)); dar < (addr + size); dar += page_size) {
>> -		if (dar < vma->vm_start || dar >= vma->vm_end) {
>> -			vma = find_vma(mm, addr);
>> -			if (!vma) {
>> -				pr_err("Can't find vma for addr %016llx\n", addr);
>> -				rc = -EFAULT;
>> -				goto out;
>> -			}
>> -			/* get the size of the pages allocated */
>> -			page_size = vma_kernel_pagesize(vma);
>> +	*page_size = vma_kernel_pagesize(vma);
>> +	*vma_start = vma->vm_start;
>> +	*vma_end = vma->vm_end;
>> +out:
>> +	up_read(&mm->mmap_sem);
>> +	return rc;
>> +}
>> +
>> +int cxllib_handle_fault(struct mm_struct *mm, u64 addr, u64 size, u64 flags)
>> +{
>> +	int rc;
>> +	u64 dar, vma_start, vma_end;
>> +	unsigned long page_size;
>> +
>> +	if (mm == NULL)
>> +		return -EFAULT;
>> +
>> +	/*
>> +	 * The buffer we have to process can extend over several pages
>> +	 * and may also cover several VMAs.
>> +	 * We iterate over all the pages. The page size could vary
>> +	 * between VMAs.
>> +	 */
>> +	rc = get_vma_info(mm, addr, &vma_start, &vma_end, &page_size);
>> +	if (rc)
>> +		return rc;
>> +
>> +	for (dar = (addr & ~(page_size - 1)); dar < (addr + size);
>> +	     dar += page_size) {
>> +		if (dar < vma_start || dar >= vma_end) {
>> +			/*
>> +			 * We don't hold the mm->mmap_sem semaphore
>> +			 * while iterating, since the semaphore is
>> +			 * required by one of the lower-level page
>> +			 * fault processing functions and it could
>> +			 * create a deadlock.
>> +			 *
>> +			 * It means the VMAs can be altered between 2
>> +			 * loop iterations and we could theoretically
>> +			 * miss a page (however unlikely). But that's
>> +			 * not really a problem, as the driver will
>> +			 * retry access, get another page fault on the
>> +			 * missing page and call us again.
>> +			 */
>> +			rc = get_vma_info(mm, dar, &vma_start, &vma_end,
>> +					&page_size);
>> +			if (rc)
>> +				return rc;
>>   		}
>>
>>   		rc = cxl_handle_mm_fault(mm, flags, dar);
>> -		if (rc) {
>> -			pr_err("cxl_handle_mm_fault failed %d", rc);
>> -			rc = -EFAULT;
>> -			goto out;
>> -		}
>> +		if (rc)
>> +			return -EFAULT;
>>   	}
>> -	rc = 0;
>> -out:
>> -	up_read(&mm->mmap_sem);
>> -	return rc;
>> +	return 0;
>>   }
>>   EXPORT_SYMBOL_GPL(cxllib_handle_fault);
>>
Michael Ellerman April 4, 2018, 2:39 p.m. UTC | #3
On Tue, 2018-04-03 at 13:54:02 UTC, Frederic Barrat wrote:
> cxllib_handle_fault() is called by an external driver when it needs to
> have the host resolve page faults for a buffer. The buffer can cover
> several pages and VMAs. The function iterates over all the pages used
> by the buffer, based on the page size of the VMA.
> 
> To ensure some stability while processing the faults, the thread T1
> grabs the mm->mmap_sem semaphore with read access (R1). However, when
> processing a page fault for a single page, one of the underlying
> functions, copro_handle_mm_fault(), also grabs the same semaphore with
> read access (R2). So the thread T1 takes the semaphore twice.
> 
> If another thread T2 tries to access the semaphore in write mode W1
> (say, because it wants to allocate memory and calls 'brk'), then that
> thread T2 will have to wait because there's a reader (R1). If the
> thread T1 is processing a new page at that time, it won't get an
> automatic grant at R2, because there's now a writer thread
> waiting (T2). And we have a deadlock.
> 
> The timeline is:
> 1. thread T1 owns the semaphore with read access R1
> 2. thread T2 requests write access W1 and waits
> 3. thread T1 requests read access R2 and waits
> 
> The fix is for the thread T1 to release the semaphore R1 once it got
> the information it needs from the current VMA. The address space/VMAs
> could evolve while T1 iterates over the full buffer, but in the
> unlikely case where T1 misses a page, the external driver will raise a
> new page fault when retrying the memory access.
> 
> Fixes: 3ced8d730063 ("cxl: Export library to support IBM XSL")
> Cc: stable@vger.kernel.org # 4.13+
> Signed-off-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com>

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/ad7b4e8022b9864c075fe71e1328b1

cheers

Patch

diff --git a/drivers/misc/cxl/cxllib.c b/drivers/misc/cxl/cxllib.c
index 30ccba436b3b..55cd35d1a9cc 100644
--- a/drivers/misc/cxl/cxllib.c
+++ b/drivers/misc/cxl/cxllib.c
@@ -208,49 +208,74 @@  int cxllib_get_PE_attributes(struct task_struct *task,
 }
 EXPORT_SYMBOL_GPL(cxllib_get_PE_attributes);
 
-int cxllib_handle_fault(struct mm_struct *mm, u64 addr, u64 size, u64 flags)
+static int get_vma_info(struct mm_struct *mm, u64 addr,
+			u64 *vma_start, u64 *vma_end,
+			unsigned long *page_size)
 {
-	int rc;
-	u64 dar;
 	struct vm_area_struct *vma = NULL;
-	unsigned long page_size;
-
-	if (mm == NULL)
-		return -EFAULT;
+	int rc = 0;
 
 	down_read(&mm->mmap_sem);
 
 	vma = find_vma(mm, addr);
 	if (!vma) {
-		pr_err("Can't find vma for addr %016llx\n", addr);
 		rc = -EFAULT;
 		goto out;
 	}
-	/* get the size of the pages allocated */
-	page_size = vma_kernel_pagesize(vma);
-
-	for (dar = (addr & ~(page_size - 1)); dar < (addr + size); dar += page_size) {
-		if (dar < vma->vm_start || dar >= vma->vm_end) {
-			vma = find_vma(mm, addr);
-			if (!vma) {
-				pr_err("Can't find vma for addr %016llx\n", addr);
-				rc = -EFAULT;
-				goto out;
-			}
-			/* get the size of the pages allocated */
-			page_size = vma_kernel_pagesize(vma);
+	*page_size = vma_kernel_pagesize(vma);
+	*vma_start = vma->vm_start;
+	*vma_end = vma->vm_end;
+out:
+	up_read(&mm->mmap_sem);
+	return rc;
+}
+
+int cxllib_handle_fault(struct mm_struct *mm, u64 addr, u64 size, u64 flags)
+{
+	int rc;
+	u64 dar, vma_start, vma_end;
+	unsigned long page_size;
+
+	if (mm == NULL)
+		return -EFAULT;
+
+	/*
+	 * The buffer we have to process can extend over several pages
+	 * and may also cover several VMAs.
+	 * We iterate over all the pages. The page size could vary
+	 * between VMAs.
+	 */
+	rc = get_vma_info(mm, addr, &vma_start, &vma_end, &page_size);
+	if (rc)
+		return rc;
+
+	for (dar = (addr & ~(page_size - 1)); dar < (addr + size);
+	     dar += page_size) {
+		if (dar < vma_start || dar >= vma_end) {
+			/*
+			 * We don't hold the mm->mmap_sem semaphore
+			 * while iterating, since the semaphore is
+			 * required by one of the lower-level page
+			 * fault processing functions and it could
+			 * create a deadlock.
+			 *
+			 * It means the VMAs can be altered between 2
+			 * loop iterations and we could theoretically
+			 * miss a page (however unlikely). But that's
+			 * not really a problem, as the driver will
+			 * retry access, get another page fault on the
+			 * missing page and call us again.
+			 */
+			rc = get_vma_info(mm, dar, &vma_start, &vma_end,
+					&page_size);
+			if (rc)
+				return rc;
 		}
 
 		rc = cxl_handle_mm_fault(mm, flags, dar);
-		if (rc) {
-			pr_err("cxl_handle_mm_fault failed %d", rc);
-			rc = -EFAULT;
-			goto out;
-		}
+		if (rc)
+			return -EFAULT;
 	}
-	rc = 0;
-out:
-	up_read(&mm->mmap_sem);
-	return rc;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(cxllib_handle_fault);
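
As a closing illustration of the loop above, here is a small worked example
of the start-address computation, using hypothetical values (the 64 KiB VMA
page size is assumed purely for the arithmetic):

	/* Hypothetical values, for illustration only. */
	u64 addr = 0x12345678;
	u64 size = 0x30000;
	unsigned long page_size = 0x10000;	/* 64 KiB */
	u64 dar = addr & ~(page_size - 1);	/* 0x12340000 */
	/*
	 * The loop then visits 0x12340000, 0x12350000, 0x12360000 and
	 * 0x12370000 (the last page still starts below addr + size =
	 * 0x12375678), calling get_vma_info() again whenever dar falls
	 * outside the previously snapshotted [vma_start, vma_end) range.
	 */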