diff mbox series

[1/5] powerpc/64s/hash: Fix 128TB-512TB virtual address boundary case allocation

Message ID 20171106100315.29720-2-npiggin@gmail.com (mailing list archive)
State Superseded
Headers show
Series VA allocator fixes | expand

Commit Message

Nicholas Piggin Nov. 6, 2017, 10:03 a.m. UTC
When allocating VA space with a hint that crosses 128TB, the SLB addr_limit
variable is not expanded if addr is not > 128TB, but the slice allocation
looks at task_size, which is 512TB. This results in slice_check_fit()
incorrectly succeeding because the slice_count truncates off bit 128 of the
requested mask, so the comparison to the available mask succeeds.

Fix this by using mm->context.addr_limit instead of mm->task_size for
testing allocation limits. This causes such allocations to fail.

Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Fixes: f4ea6dcb08 ("powerpc/mm: Enable mappings above 128TB")
Reported-by: Florian Weimer <fweimer@redhat.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/mm/slice.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

Comments

Aneesh Kumar K.V Nov. 6, 2017, 10:38 a.m. UTC | #1
Nicholas Piggin <npiggin@gmail.com> writes:

> When allocating VA space with a hint that crosses 128TB, the SLB addr_limit
> variable is not expanded if addr is not > 128TB, but the slice allocation
> looks at task_size, which is 512TB. This results in slice_check_fit()
> incorrectly succeeding because the slice_count truncates off bit 128 of the
> requested mask, so the comparison to the available mask succeeds.


But then the mask passed to slice_check_fit() is generated using
context.addr_limit as max value. So how did that return success? ie,
we get the request mask via

slice_range_to_mask(addr, len, &mask);

And the potential/possible mask using

slice_mask_for_size(mm, psize, &good_mask);

So how did slice_check_fit() return success with

slice_check_fit(mm, mask, good_mask);


>
> Fix this by using mm->context.addr_limit instead of mm->task_size for
> testing allocation limits. This causes such allocations to fail.
>
> Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
> Fixes: f4ea6dcb08 ("powerpc/mm: Enable mappings above 128TB")
> Reported-by: Florian Weimer <fweimer@redhat.com>
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>  arch/powerpc/mm/slice.c | 11 ++++++-----
>  1 file changed, 6 insertions(+), 5 deletions(-)
>
> diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
> index 45f6740dd407..567db541c0a1 100644
> --- a/arch/powerpc/mm/slice.c
> +++ b/arch/powerpc/mm/slice.c
> @@ -96,7 +96,7 @@ static int slice_area_is_free(struct mm_struct *mm, unsigned long addr,
>  {
>  	struct vm_area_struct *vma;
>
> -	if ((mm->task_size - len) < addr)
> +	if ((mm->context.addr_limit - len) < addr)

I was looking at these as generic boundary check against task size and
for specific range check we should have created mask always using
context.addr_limit. That should keep the boundary condition check same
across radix/hash.

>  		return 0;
>  	vma = find_vma(mm, addr);
>  	return (!vma || (addr + len) <= vm_start_gap(vma));
> @@ -133,7 +133,7 @@ static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret)
>  		if (!slice_low_has_vma(mm, i))
>  			ret->low_slices |= 1u << i;
>
> -	if (mm->task_size <= SLICE_LOW_TOP)
> +	if (mm->context.addr_limit <= SLICE_LOW_TOP)
>  		return;
>
>  	for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++)
> @@ -446,19 +446,20 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
>
>  	/* Sanity checks */
>  	BUG_ON(mm->task_size == 0);
> +	BUG_ON(mm->context.addr_limit == 0);
>  	VM_BUG_ON(radix_enabled());
>
>  	slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize);
>  	slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n",
>  		  addr, len, flags, topdown);
>
> -	if (len > mm->task_size)
> +	if (len > mm->context.addr_limit)
>  		return -ENOMEM;
>  	if (len & ((1ul << pshift) - 1))
>  		return -EINVAL;
>  	if (fixed && (addr & ((1ul << pshift) - 1)))
>  		return -EINVAL;
> -	if (fixed && addr > (mm->task_size - len))
> +	if (fixed && addr > (mm->context.addr_limit - len))
>  		return -ENOMEM;
>
>  	/* If hint, make sure it matches our alignment restrictions */
> @@ -466,7 +467,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
>  		addr = _ALIGN_UP(addr, 1ul << pshift);
>  		slice_dbg(" aligned addr=%lx\n", addr);
>  		/* Ignore hint if it's too large or overlaps a VMA */
> -		if (addr > mm->task_size - len ||
> +		if (addr > mm->context.addr_limit - len ||
>  		    !slice_area_is_free(mm, addr, len))
>  			addr = 0;
>  	}
> -- 
> 2.15.0
Nicholas Piggin Nov. 6, 2017, 10:54 a.m. UTC | #2
On Mon, 06 Nov 2017 16:08:06 +0530
"Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> wrote:

> Nicholas Piggin <npiggin@gmail.com> writes:
> 
> > When allocating VA space with a hint that crosses 128TB, the SLB addr_limit
> > variable is not expanded if addr is not > 128TB, but the slice allocation
> > looks at task_size, which is 512TB. This results in slice_check_fit()
> > incorrectly succeeding because the slice_count truncates off bit 128 of the
> > requested mask, so the comparison to the available mask succeeds.  
> 
> 
> But then the mask passed to slice_check_fit() is generated using
> context.addr_limit as max value. So how did that return succcess? ie,
> we get the request mask via
> 
> slice_range_to_mask(addr, len, &mask);
> 
> And the potential/possible mask using
> 
> slice_mask_for_size(mm, psize, &good_mask);
> 
> So how did slice_check_fit() return sucess with
> 
> slice_check_fit(mm, mask, good_mask);

Because the addr_limit check is used to *limit* the comparison.

The available mask had bits up to 127 set, and the mask had 127 and
128 set. However the 128T addr_limit causes only bits 0-127 to be
compared.

> > Fix this by using mm->context.addr_limit instead of mm->task_size for
> > testing allocation limits. This causes such allocations to fail.
> >
> > Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
> > Fixes: f4ea6dcb08 ("powerpc/mm: Enable mappings above 128TB")
> > Reported-by: Florian Weimer <fweimer@redhat.com>
> > Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> > ---
> >  arch/powerpc/mm/slice.c | 11 ++++++-----
> >  1 file changed, 6 insertions(+), 5 deletions(-)
> >
> > diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
> > index 45f6740dd407..567db541c0a1 100644
> > --- a/arch/powerpc/mm/slice.c
> > +++ b/arch/powerpc/mm/slice.c
> > @@ -96,7 +96,7 @@ static int slice_area_is_free(struct mm_struct *mm, unsigned long addr,
> >  {
> >  	struct vm_area_struct *vma;
> >
> > -	if ((mm->task_size - len) < addr)
> > +	if ((mm->context.addr_limit - len) < addr)  
> 
> I was looking at these as generic boundary check against task size and
> for specific range check we should have created mask always using
> context.addr_limit. That should keep the boundary condition check same
> across radix/hash.

We need to actually fix the radix case too for other-but-similar reasons,
so fixing it like this does end up with the same tests for both. See
the later radix patch.

Thanks,
Nick
Aneesh Kumar K.V Nov. 6, 2017, 11:05 a.m. UTC | #3
On 11/06/2017 04:24 PM, Nicholas Piggin wrote:
> On Mon, 06 Nov 2017 16:08:06 +0530
> "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> wrote:
> 
>> Nicholas Piggin <npiggin@gmail.com> writes:
>>
>>> When allocating VA space with a hint that crosses 128TB, the SLB addr_limit
>>> variable is not expanded if addr is not > 128TB, but the slice allocation
>>> looks at task_size, which is 512TB. This results in slice_check_fit()
>>> incorrectly succeeding because the slice_count truncates off bit 128 of the
>>> requested mask, so the comparison to the available mask succeeds.
>>
>>
>> But then the mask passed to slice_check_fit() is generated using
>> context.addr_limit as max value. So how did that return succcess? ie,
>> we get the request mask via
>>
>> slice_range_to_mask(addr, len, &mask);
>>
>> And the potential/possible mask using
>>
>> slice_mask_for_size(mm, psize, &good_mask);
>>
>> So how did slice_check_fit() return sucess with
>>
>> slice_check_fit(mm, mask, good_mask);
> 
> Because the addr_limit check is used to *limit* the comparison.
> 
> The available mask had bit up to 127 set, and the mask had 127 and
> 128 set. However the 128T addr_limit causes only bits 0-127 to be
> compared.
>

Should we fix it then via ? I haven't tested this yet. Also this results 
in us comparing more bits?

modified   arch/powerpc/mm/slice.c
@@ -169,13 +169,12 @@ static int slice_check_fit(struct mm_struct *mm,
  			   struct slice_mask mask, struct slice_mask available)
  {
  	DECLARE_BITMAP(result, SLICE_NUM_HIGH);
-	unsigned long slice_count = GET_HIGH_SLICE_INDEX(mm->context.addr_limit);

  	bitmap_and(result, mask.high_slices,
-		   available.high_slices, slice_count);
+		   available.high_slices, SLICE_NUM_HIGH);

  	return (mask.low_slices & available.low_slices) == mask.low_slices &&
-		bitmap_equal(result, mask.high_slices, slice_count);
+		bitmap_equal(result, mask.high_slices, SLICE_NUM_HIGH);


-aneesh
Nicholas Piggin Nov. 6, 2017, 11:21 a.m. UTC | #4
On Mon, 6 Nov 2017 16:35:43 +0530
"Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> wrote:

> On 11/06/2017 04:24 PM, Nicholas Piggin wrote:
> > On Mon, 06 Nov 2017 16:08:06 +0530
> > "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> wrote:
> >   
> >> Nicholas Piggin <npiggin@gmail.com> writes:
> >>  
> >>> When allocating VA space with a hint that crosses 128TB, the SLB addr_limit
> >>> variable is not expanded if addr is not > 128TB, but the slice allocation
> >>> looks at task_size, which is 512TB. This results in slice_check_fit()
> >>> incorrectly succeeding because the slice_count truncates off bit 128 of the
> >>> requested mask, so the comparison to the available mask succeeds.  
> >>
> >>
> >> But then the mask passed to slice_check_fit() is generated using
> >> context.addr_limit as max value. So how did that return succcess? ie,
> >> we get the request mask via
> >>
> >> slice_range_to_mask(addr, len, &mask);
> >>
> >> And the potential/possible mask using
> >>
> >> slice_mask_for_size(mm, psize, &good_mask);
> >>
> >> So how did slice_check_fit() return sucess with
> >>
> >> slice_check_fit(mm, mask, good_mask);  
> > 
> > Because the addr_limit check is used to *limit* the comparison.
> > 
> > The available mask had bit up to 127 set, and the mask had 127 and
> > 128 set. However the 128T addr_limit causes only bits 0-127 to be
> > compared.
> >  
> 
> Should we fix it then via ? I haven't tested this yet. Also this result 
> in us comparing more bits?

I prefer not to rely on that as the fix because we should not be calling
into the slice code with an address beyond addr_limit IMO. There's quite
a few other places that use addr_limit. So I prefer my patch.

You could add this as an extra check, but yes it does result in more bitmap
to test. So if anything I would prefer to go the other way and actually
reduce the scope of *other* bitmap operations that are now using
SLICE_NUM_HIGH by similarly using addr_limit (if there are other
performance critical ones).

We could add some VM_BUG_ON checks to ensure tail bits are zero if
that's a concern.

> 
> modified   arch/powerpc/mm/slice.c
> @@ -169,13 +169,12 @@ static int slice_check_fit(struct mm_struct *mm,
>   			   struct slice_mask mask, struct slice_mask available)
>   {
>   	DECLARE_BITMAP(result, SLICE_NUM_HIGH);
> -	unsigned long slice_count = GET_HIGH_SLICE_INDEX(mm->context.addr_limit);
> 
>   	bitmap_and(result, mask.high_slices,
> -		   available.high_slices, slice_count);
> +		   available.high_slices, SLICE_NUM_HIGH);
> 
>   	return (mask.low_slices & available.low_slices) == mask.low_slices &&
> -		bitmap_equal(result, mask.high_slices, slice_count);
> +		bitmap_equal(result, mask.high_slices, SLICE_NUM_HIGH)
> 
> 
> -aneesh
>
Aneesh Kumar K.V Nov. 7, 2017, 2 a.m. UTC | #5
On 11/06/2017 04:35 PM, Aneesh Kumar K.V wrote:
> 
> 
> On 11/06/2017 04:24 PM, Nicholas Piggin wrote:
>> On Mon, 06 Nov 2017 16:08:06 +0530
>> "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> wrote:
>>
>>> Nicholas Piggin <npiggin@gmail.com> writes:
>>>
>>>> When allocating VA space with a hint that crosses 128TB, the SLB 
>>>> addr_limit
>>>> variable is not expanded if addr is not > 128TB, but the slice 
>>>> allocation
>>>> looks at task_size, which is 512TB. This results in slice_check_fit()
>>>> incorrectly succeeding because the slice_count truncates off bit 128 
>>>> of the
>>>> requested mask, so the comparison to the available mask succeeds.
>>>
>>>
>>> But then the mask passed to slice_check_fit() is generated using
>>> context.addr_limit as max value. So how did that return succcess? ie,
>>> we get the request mask via
>>>
>>> slice_range_to_mask(addr, len, &mask);
>>>
>>> And the potential/possible mask using
>>>
>>> slice_mask_for_size(mm, psize, &good_mask);
>>>
>>> So how did slice_check_fit() return sucess with
>>>
>>> slice_check_fit(mm, mask, good_mask);
>>
>> Because the addr_limit check is used to *limit* the comparison.
>>
>> The available mask had bit up to 127 set, and the mask had 127 and
>> 128 set. However the 128T addr_limit causes only bits 0-127 to be
>> compared.
>>
> 
> Should we fix it then via ? I haven't tested this yet. Also this result 
> in us comparing more bits?
> 
> modified   arch/powerpc/mm/slice.c
> @@ -169,13 +169,12 @@ static int slice_check_fit(struct mm_struct *mm,
>                  struct slice_mask mask, struct slice_mask available)
>   {
>       DECLARE_BITMAP(result, SLICE_NUM_HIGH);
> -    unsigned long slice_count = 
> GET_HIGH_SLICE_INDEX(mm->context.addr_limit);
> 
>       bitmap_and(result, mask.high_slices,
> -           available.high_slices, slice_count);
> +           available.high_slices, SLICE_NUM_HIGH);
> 
>       return (mask.low_slices & available.low_slices) == mask.low_slices &&
> -        bitmap_equal(result, mask.high_slices, slice_count);
> +        bitmap_equal(result, mask.high_slices, SLICE_NUM_HIGH)
> 
> 

Florian, will you be able to test this patch ? We may not really want to 
push this. But it will confirm that we end up getting >128TB address 
because of this.

-aneesh
Nicholas Piggin Nov. 7, 2017, 2:03 a.m. UTC | #6
On Tue, 7 Nov 2017 07:30:51 +0530
"Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> wrote:

> On 11/06/2017 04:35 PM, Aneesh Kumar K.V wrote:
> > 
> > 
> > On 11/06/2017 04:24 PM, Nicholas Piggin wrote:  
> >> On Mon, 06 Nov 2017 16:08:06 +0530
> >> "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> wrote:
> >>  
> >>> Nicholas Piggin <npiggin@gmail.com> writes:
> >>>  
> >>>> When allocating VA space with a hint that crosses 128TB, the SLB 
> >>>> addr_limit
> >>>> variable is not expanded if addr is not > 128TB, but the slice 
> >>>> allocation
> >>>> looks at task_size, which is 512TB. This results in slice_check_fit()
> >>>> incorrectly succeeding because the slice_count truncates off bit 128 
> >>>> of the
> >>>> requested mask, so the comparison to the available mask succeeds.  
> >>>
> >>>
> >>> But then the mask passed to slice_check_fit() is generated using
> >>> context.addr_limit as max value. So how did that return succcess? ie,
> >>> we get the request mask via
> >>>
> >>> slice_range_to_mask(addr, len, &mask);
> >>>
> >>> And the potential/possible mask using
> >>>
> >>> slice_mask_for_size(mm, psize, &good_mask);
> >>>
> >>> So how did slice_check_fit() return sucess with
> >>>
> >>> slice_check_fit(mm, mask, good_mask);  
> >>
> >> Because the addr_limit check is used to *limit* the comparison.
> >>
> >> The available mask had bit up to 127 set, and the mask had 127 and
> >> 128 set. However the 128T addr_limit causes only bits 0-127 to be
> >> compared.
> >>  
> > 
> > Should we fix it then via ? I haven't tested this yet. Also this result 
> > in us comparing more bits?
> > 
> > modified   arch/powerpc/mm/slice.c
> > @@ -169,13 +169,12 @@ static int slice_check_fit(struct mm_struct *mm,
> >                  struct slice_mask mask, struct slice_mask available)
> >   {
> >       DECLARE_BITMAP(result, SLICE_NUM_HIGH);
> > -    unsigned long slice_count = 
> > GET_HIGH_SLICE_INDEX(mm->context.addr_limit);
> > 
> >       bitmap_and(result, mask.high_slices,
> > -           available.high_slices, slice_count);
> > +           available.high_slices, SLICE_NUM_HIGH);
> > 
> >       return (mask.low_slices & available.low_slices) == mask.low_slices &&
> > -        bitmap_equal(result, mask.high_slices, slice_count);
> > +        bitmap_equal(result, mask.high_slices, SLICE_NUM_HIGH)
> > 
> >   
> 
> Florian, will you be able to test this patch ? We may not really want to 
> push this. But it will confirm that we end up getting >128TB address 
> because of this.

Oh we are, I went through and traced it, and this is the reason hash's
get_unmapped_area gives out > 128TB addresses if addr + len crosses
128TB.

Thanks,
Nick
diff mbox series

Patch

diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 45f6740dd407..567db541c0a1 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -96,7 +96,7 @@  static int slice_area_is_free(struct mm_struct *mm, unsigned long addr,
 {
 	struct vm_area_struct *vma;
 
-	if ((mm->task_size - len) < addr)
+	if ((mm->context.addr_limit - len) < addr)
 		return 0;
 	vma = find_vma(mm, addr);
 	return (!vma || (addr + len) <= vm_start_gap(vma));
@@ -133,7 +133,7 @@  static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret)
 		if (!slice_low_has_vma(mm, i))
 			ret->low_slices |= 1u << i;
 
-	if (mm->task_size <= SLICE_LOW_TOP)
+	if (mm->context.addr_limit <= SLICE_LOW_TOP)
 		return;
 
 	for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++)
@@ -446,19 +446,20 @@  unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 
 	/* Sanity checks */
 	BUG_ON(mm->task_size == 0);
+	BUG_ON(mm->context.addr_limit == 0);
 	VM_BUG_ON(radix_enabled());
 
 	slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize);
 	slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n",
 		  addr, len, flags, topdown);
 
-	if (len > mm->task_size)
+	if (len > mm->context.addr_limit)
 		return -ENOMEM;
 	if (len & ((1ul << pshift) - 1))
 		return -EINVAL;
 	if (fixed && (addr & ((1ul << pshift) - 1)))
 		return -EINVAL;
-	if (fixed && addr > (mm->task_size - len))
+	if (fixed && addr > (mm->context.addr_limit - len))
 		return -ENOMEM;
 
 	/* If hint, make sure it matches our alignment restrictions */
@@ -466,7 +467,7 @@  unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 		addr = _ALIGN_UP(addr, 1ul << pshift);
 		slice_dbg(" aligned addr=%lx\n", addr);
 		/* Ignore hint if it's too large or overlaps a VMA */
-		if (addr > mm->task_size - len ||
+		if (addr > mm->context.addr_limit - len ||
 		    !slice_area_is_free(mm, addr, len))
 			addr = 0;
 	}