[v2,2/3] powerpc: get hugetlbpage handling more generic

Message ID dab1fde59f711dd7d3f1996dee72f61c46033f47.1474009019.git.christophe.leroy@c-s.fr (mailing list archive)
State Superseded

Commit Message

Christophe Leroy Sept. 16, 2016, 7:40 a.m. UTC
Today there are two implementations of hugetlbpages which are managed
by exclusive #ifdefs:
* FSL_BOOKE: several directory entries point to the same single hugepage
* BOOK3S: one upper level directory entry points to a table of hugepages

In preparation for implementing hugepage support on the 8xx, we need a
mix of the two above solutions, because the 8xx needs both cases
depending on the page size:
* In 4k page size mode, each PGD entry covers a 4M area. This means
that 2 PGD entries are necessary to cover an 8M hugepage, while a
single PGD entry covers 8x 512k hugepages.
* In 16k page size mode, each PGD entry covers a 64M area. This means
that 8x 8M hugepages are covered by one PGD entry and 64x 512k
hugepages are covered by one PGD entry.
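
For illustration, here is how the merged if/else plays out in the 4k
mode case (the shift values below are assumptions for the 8xx, not
taken from this patch):

	/* 4k mode sketch: one PGD entry covers 1 << 22 = 4M, so pdshift = 22 */
	if (pshift >= pdshift)			/* 8M hugepage: pshift = 23 */
		/* 1 << (23 - 22) = 2 PGD entries point to the same hugepage */
		num_hugepd = 1 << (pshift - pdshift);
	else					/* 512k hugepage: pshift = 19 */
		/* one PGD entry points to a table of 8 hugepte entries,
		 * allocated from PGT_CACHE(22 - 19) */
		num_hugepd = 1;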

This patch:
* removes #ifdefs in favor of if/else based on the range sizes
* merges the two huge_pte_alloc() functions as they are pretty similar
* merges the two hugetlbpage_init() functions as they are pretty similar

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
v2: This part is new and results from splitting the last patch of the
v1 series into two parts

 arch/powerpc/mm/hugetlbpage.c | 189 +++++++++++++++++-------------------------
 1 file changed, 77 insertions(+), 112 deletions(-)

Comments

Aneesh Kumar K.V Sept. 19, 2016, 5:45 a.m. UTC | #1
Christophe Leroy <christophe.leroy@c-s.fr> writes:

> Today there are two implementations of hugetlbpages which are managed
> by exclusive #ifdefs:
> * FSL_BOOKE: several directory entries points to the same single hugepage
> * BOOK3S: one upper level directory entry points to a table of hugepages
>
> In preparation of implementation of hugepage support on the 8xx, we
> need a mix of the two above solutions, because the 8xx needs both cases
> depending on the size of pages:
> * In 4k page size mode, each PGD entry covers a 4M bytes area. It means
> that 2 PGD entries will be necessary to cover an 8M hugepage while a
> single PGD entry will cover 8x 512k hugepages.
> * In 16 page size mode, each PGD entry covers a 64M bytes area. It means
> that 8x 8M hugepages will be covered by one PGD entry and 64x 512k
> hugepages will be covers by one PGD entry.
>
> This patch:
> * removes #ifdefs in favor of if/else based on the range sizes
> * merges the two huge_pte_alloc() functions as they are pretty similar
> * merges the two hugetlbpage_init() functions as they are pretty similar
>
> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
> ---
> v2: This part is new and results from a split of last patch of v1 serie in
> two parts
>
>  arch/powerpc/mm/hugetlbpage.c | 189 +++++++++++++++++-------------------------
>  1 file changed, 77 insertions(+), 112 deletions(-)
>
> diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
> index 8a512b1..2119f00 100644
> --- a/arch/powerpc/mm/hugetlbpage.c
> +++ b/arch/powerpc/mm/hugetlbpage.c
> @@ -64,14 +64,16 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
>  {
>  	struct kmem_cache *cachep;
>  	pte_t *new;
> -
> -#ifdef CONFIG_PPC_FSL_BOOK3E
>  	int i;
> -	int num_hugepd = 1 << (pshift - pdshift);
> -	cachep = hugepte_cache;
> -#else
> -	cachep = PGT_CACHE(pdshift - pshift);
> -#endif
> +	int num_hugepd;
> +
> +	if (pshift >= pdshift) {
> +		cachep = hugepte_cache;
> +		num_hugepd = 1 << (pshift - pdshift);
> +	} else {
> +		cachep = PGT_CACHE(pdshift - pshift);
> +		num_hugepd = 1;
> +	}

Is there a way to hint the likely/unlikely branch based on the page
size selected at build time?
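
For illustration, one way such a build-time hint might look — a sketch
only, assuming CONFIG_PPC_FSL_BOOK3E is the option that decides the
common case, and not code from the patch:

	/* Sketch: bias the branch when the Kconfig selection makes one
	 * side the common case at build time. */
	if (IS_ENABLED(CONFIG_PPC_FSL_BOOK3E) ? likely(pshift >= pdshift)
					      : unlikely(pshift >= pdshift)) {
		cachep = hugepte_cache;
		num_hugepd = 1 << (pshift - pdshift);
	} else {
		cachep = PGT_CACHE(pdshift - pshift);
		num_hugepd = 1;
	}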



>  
>  	new = kmem_cache_zalloc(cachep, GFP_KERNEL);
>  
> @@ -89,7 +91,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
>  	smp_wmb();
>  
>  	spin_lock(&mm->page_table_lock);
> -#ifdef CONFIG_PPC_FSL_BOOK3E
> +
>  	/*
>  	 * We have multiple higher-level entries that point to the same
>  	 * actual pte location.  Fill in each as we go and backtrack on error.
> @@ -100,8 +102,13 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
>  		if (unlikely(!hugepd_none(*hpdp)))
>  			break;
>  		else

....

> -#ifdef CONFIG_PPC_FSL_BOOK3E
>  struct kmem_cache *hugepte_cache;
>  static int __init hugetlbpage_init(void)
>  {
>  	int psize;
>  
> -	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
> -		unsigned shift;
> -
> -		if (!mmu_psize_defs[psize].shift)
> -			continue;
> -
> -		shift = mmu_psize_to_shift(psize);
> -
> -		/* Don't treat normal page sizes as huge... */
> -		if (shift != PAGE_SHIFT)
> -			if (add_huge_page_size(1ULL << shift) < 0)
> -				continue;
> -	}
> -
> -	/*
> -	 * Create a kmem cache for hugeptes.  The bottom bits in the pte have
> -	 * size information encoded in them, so align them to allow this
> -	 */
> -	hugepte_cache =  kmem_cache_create("hugepte-cache", sizeof(pte_t),
> -					   HUGEPD_SHIFT_MASK + 1, 0, NULL);
> -	if (hugepte_cache == NULL)
> -		panic("%s: Unable to create kmem cache for hugeptes\n",
> -		      __func__);
> -
> -	/* Default hpage size = 4M */
> -	if (mmu_psize_defs[MMU_PAGE_4M].shift)
> -		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
> -	else
> -		panic("%s: Unable to set default huge page size\n", __func__);
> -
> -
> -	return 0;
> -}
> -#else
> -static int __init hugetlbpage_init(void)
> -{
> -	int psize;
> -
> +#if !defined(CONFIG_PPC_FSL_BOOK3E)
>  	if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
>  		return -ENODEV;
> -
> +#endif

Do we need that #if? radix_enabled() should become 0 and that if
condition should be removed at compile time, shouldn't it? Or are you
finding errors with that?


>  	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
>  		unsigned shift;
>  		unsigned pdshift;
> @@ -860,16 +807,31 @@ static int __init hugetlbpage_init(void)
>  		 * if we have pdshift and shift value same, we don't
>  		 * use pgt cache for hugepd.
>  		 */
> -		if (pdshift != shift) {
> +		if (pdshift > shift) {
>  			pgtable_cache_add(pdshift - shift, NULL);
>  			if (!PGT_CACHE(pdshift - shift))
>  				panic("hugetlbpage_init(): could not create "
>  				      "pgtable cache for %d bit pagesize\n", shift);
> +		} else if (!hugepte_cache) {
> +			/*
> +			 * Create a kmem cache for hugeptes.  The bottom bits in
> +			 * the pte have size information encoded in them, so
> +			 * align them to allow this
> +			 */
> +			hugepte_cache = kmem_cache_create("hugepte-cache",
> +							  sizeof(pte_t),
> +							  HUGEPD_SHIFT_MASK + 1,
> +							  0, NULL);
> +			if (hugepte_cache == NULL)
> +				panic("%s: Unable to create kmem cache "
> +				      "for hugeptes\n", __func__);
> +


We don't need hugepte_cache for book3s 64K. I guess we will end up
creating one here?
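
For clarity, a sketch of the control flow in the hunk above, with shift
values assumed for book3s 64k (see the later replies in this thread):

	/* assumed book3s 64k with a 16M hugepage: pdshift == shift == 24 */
	if (pdshift > shift) {
		/* not taken, since pdshift == shift */
	} else if (!hugepte_cache) {
		/* taken for the first such page size, so hugepte_cache gets
		 * created even though book3s 64k does not use it */
	}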

>  		}
>  	}
>  
>  	/* Set default large page size. Currently, we pick 16M or 1M
>  	 * depending on what is available
> +	 * We select 4M on other ones.
>  	 */
>  	if (mmu_psize_defs[MMU_PAGE_16M].shift)
>  		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
> @@ -877,11 +839,14 @@ static int __init hugetlbpage_init(void)
>  		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
>  	else if (mmu_psize_defs[MMU_PAGE_2M].shift)
>  		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;
> -
> +	else if (mmu_psize_defs[MMU_PAGE_4M].shift)
> +		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
> +	else
> +		panic("%s: Unable to set default huge page size\n", __func__);
>  
>  	return 0;
>  }
> -#endif
> +
>  arch_initcall(hugetlbpage_init);
>  
>  void flush_dcache_icache_hugepage(struct page *page)
> -- 
> 2.1.0
Aneesh Kumar K.V Sept. 19, 2016, 5:50 a.m. UTC | #2
Christophe Leroy <christophe.leroy@c-s.fr> writes:
> +#else
> +static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
> +{
> +	BUG();
> +}
> +
>  #endif


I was expecting that BUG() would get removed in the next patch, but I
don't see it there. Considering

@@ -475,11 +453,10 @@ static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshif
        for (i = 0; i < num_hugepd; i++, hpdp++)
                hpdp->pd = 0;

-#ifdef CONFIG_PPC_FSL_BOOK3E
-	hugepd_free(tlb, hugepte);
-#else
-	pgtable_free_tlb(tlb, hugepte, pdshift - shift);
-#endif
+	if (shift >= pdshift)
+		hugepd_free(tlb, hugepte);
+	else
+		pgtable_free_tlb(tlb, hugepte, pdshift - shift);
 }

What is it that I am missing?

-aneesh
Christophe Leroy Sept. 19, 2016, 6:32 p.m. UTC | #3
On 19/09/2016 at 07:45, Aneesh Kumar K.V wrote:
> Christophe Leroy <christophe.leroy@c-s.fr> writes:
>
>> Today there are two implementations of hugetlbpages which are managed
>> by exclusive #ifdefs:
>> * FSL_BOOKE: several directory entries points to the same single hugepage
>> * BOOK3S: one upper level directory entry points to a table of hugepages
>>
>> In preparation of implementation of hugepage support on the 8xx, we
>> need a mix of the two above solutions, because the 8xx needs both cases
>> depending on the size of pages:
>> * In 4k page size mode, each PGD entry covers a 4M bytes area. It means
>> that 2 PGD entries will be necessary to cover an 8M hugepage while a
>> single PGD entry will cover 8x 512k hugepages.
>> * In 16 page size mode, each PGD entry covers a 64M bytes area. It means
>> that 8x 8M hugepages will be covered by one PGD entry and 64x 512k
>> hugepages will be covers by one PGD entry.
>>
>> This patch:
>> * removes #ifdefs in favor of if/else based on the range sizes
>> * merges the two huge_pte_alloc() functions as they are pretty similar
>> * merges the two hugetlbpage_init() functions as they are pretty similar
>>
>> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
>> ---
>> v2: This part is new and results from a split of last patch of v1 serie in
>> two parts
>>
>>  arch/powerpc/mm/hugetlbpage.c | 189 +++++++++++++++++-------------------------
>>  1 file changed, 77 insertions(+), 112 deletions(-)
>>
>> diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
>> index 8a512b1..2119f00 100644
>> --- a/arch/powerpc/mm/hugetlbpage.c
>> +++ b/arch/powerpc/mm/hugetlbpage.c

[...]

>
>> -#ifdef CONFIG_PPC_FSL_BOOK3E
>>  struct kmem_cache *hugepte_cache;
>>  static int __init hugetlbpage_init(void)
>>  {
>>  	int psize;
>>
>> -	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
>> -		unsigned shift;
>> -
>> -		if (!mmu_psize_defs[psize].shift)
>> -			continue;
>> -
>> -		shift = mmu_psize_to_shift(psize);
>> -
>> -		/* Don't treat normal page sizes as huge... */
>> -		if (shift != PAGE_SHIFT)
>> -			if (add_huge_page_size(1ULL << shift) < 0)
>> -				continue;
>> -	}
>> -
>> -	/*
>> -	 * Create a kmem cache for hugeptes.  The bottom bits in the pte have
>> -	 * size information encoded in them, so align them to allow this
>> -	 */
>> -	hugepte_cache =  kmem_cache_create("hugepte-cache", sizeof(pte_t),
>> -					   HUGEPD_SHIFT_MASK + 1, 0, NULL);
>> -	if (hugepte_cache == NULL)
>> -		panic("%s: Unable to create kmem cache for hugeptes\n",
>> -		      __func__);
>> -
>> -	/* Default hpage size = 4M */
>> -	if (mmu_psize_defs[MMU_PAGE_4M].shift)
>> -		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
>> -	else
>> -		panic("%s: Unable to set default huge page size\n", __func__);
>> -
>> -
>> -	return 0;
>> -}
>> -#else
>> -static int __init hugetlbpage_init(void)
>> -{
>> -	int psize;
>> -
>> +#if !defined(CONFIG_PPC_FSL_BOOK3E)
>>  	if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
>>  		return -ENODEV;
>> -
>> +#endif
>
> Do we need that #if ? radix_enabled() should become 0 and that if
> condition should be removed at compile time isn't it ? or are you
> finding errors with that ?

With radix_enabled() being 0, it becomes:

if (!mmu_has_feature(MMU_FTR_16M_PAGE))
	return -ENODEV;

Which means hugepages would only be handled by CPUs that have 16M
pages. That's the issue.

>
>
>>  	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
>>  		unsigned shift;
>>  		unsigned pdshift;
>> @@ -860,16 +807,31 @@ static int __init hugetlbpage_init(void)
>>  		 * if we have pdshift and shift value same, we don't
>>  		 * use pgt cache for hugepd.
>>  		 */
>> -		if (pdshift != shift) {
>> +		if (pdshift > shift) {
>>  			pgtable_cache_add(pdshift - shift, NULL);
>>  			if (!PGT_CACHE(pdshift - shift))
>>  				panic("hugetlbpage_init(): could not create "
>>  				      "pgtable cache for %d bit pagesize\n", shift);
>> +		} else if (!hugepte_cache) {
>> +			/*
>> +			 * Create a kmem cache for hugeptes.  The bottom bits in
>> +			 * the pte have size information encoded in them, so
>> +			 * align them to allow this
>> +			 */
>> +			hugepte_cache = kmem_cache_create("hugepte-cache",
>> +							  sizeof(pte_t),
>> +							  HUGEPD_SHIFT_MASK + 1,
>> +							  0, NULL);
>> +			if (hugepte_cache == NULL)
>> +				panic("%s: Unable to create kmem cache "
>> +				      "for hugeptes\n", __func__);
>> +
>
>
> We don't need hugepte_cache for book3s 64K. I guess we will endup
> creating one here ?

It should not, because on book3s 64k we will have pdshift > shift,
won't we?

>
>>  		}
>>  	}
>>
>>  	/* Set default large page size. Currently, we pick 16M or 1M
>>  	 * depending on what is available
>> +	 * We select 4M on other ones.
>>  	 */
>>  	if (mmu_psize_defs[MMU_PAGE_16M].shift)
>>  		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
>> @@ -877,11 +839,14 @@ static int __init hugetlbpage_init(void)
>>  		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
>>  	else if (mmu_psize_defs[MMU_PAGE_2M].shift)
>>  		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;
>> -
>> +	else if (mmu_psize_defs[MMU_PAGE_4M].shift)
>> +		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
>> +	else
>> +		panic("%s: Unable to set default huge page size\n", __func__);
>>
>>  	return 0;
>>  }
>> -#endif
>> +
>>  arch_initcall(hugetlbpage_init);
>>
>>  void flush_dcache_icache_hugepage(struct page *page)
>> --
>> 2.1.0

Christophe Leroy Sept. 19, 2016, 6:36 p.m. UTC | #4
On 19/09/2016 at 07:50, Aneesh Kumar K.V wrote:
>
> Christophe Leroy <christophe.leroy@c-s.fr> writes:
>> +#else
>> +static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
>> +{
>> +	BUG();
>> +}
>> +
>>  #endif
>
>
> I was expecting that BUG will get removed in the next patch. But I don't
> see it in the next patch. Considering
>
> @@ -475,11 +453,10 @@ static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshif
>         for (i = 0; i < num_hugepd; i++, hpdp++)
>                 hpdp->pd = 0;
>
> -#ifdef CONFIG_PPC_FSL_BOOK3E
> -	hugepd_free(tlb, hugepte);
> -#else
> -	pgtable_free_tlb(tlb, hugepte, pdshift - shift);
> -#endif
> +	if (shift >= pdshift)
> +		hugepd_free(tlb, hugepte);
> +	else
> +		pgtable_free_tlb(tlb, hugepte, pdshift - shift);
>  }
>
> What is that I am missing ?
>

Previously, the call to hugepd_free() was only compiled when
CONFIG_PPC_FSL_BOOK3E was defined.
Now it is compiled all the time, but it should never be called without
CONFIG_PPC_FSL_BOOK3E because we always have shift < pdshift in that case.
So the function needs to be defined anyway, but should never be called.
Should I just define it as an empty static inline?
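
For reference, the empty stub in question could look like this (a
sketch of the suggestion, not code from the posted patch):

#else
/* Not expected to be called on these configurations, since shift <
 * pdshift always holds there; an empty stub instead of BUG(). */
static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte)
{
}
#endif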

Christophe

Aneesh Kumar K.V Sept. 20, 2016, 2:28 a.m. UTC | #5
christophe leroy <christophe.leroy@c-s.fr> writes:

> On 19/09/2016 at 07:50, Aneesh Kumar K.V wrote:
>>
>> Christophe Leroy <christophe.leroy@c-s.fr> writes:
>>> +#else
>>> +static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
>>> +{
>>> +	BUG();
>>> +}
>>> +
>>>  #endif
>>
>>
>> I was expecting that BUG will get removed in the next patch. But I don't
>> see it in the next patch. Considering
>>
>> @@ -475,11 +453,10 @@ static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshif
>>         for (i = 0; i < num_hugepd; i++, hpdp++)
>>                 hpdp->pd = 0;
>>
>> -#ifdef CONFIG_PPC_FSL_BOOK3E
>> -	hugepd_free(tlb, hugepte);
>> -#else
>> -	pgtable_free_tlb(tlb, hugepte, pdshift - shift);
>> -#endif
>> +	if (shift >= pdshift)
>> +		hugepd_free(tlb, hugepte);
>> +	else
>> +		pgtable_free_tlb(tlb, hugepte, pdshift - shift);
>>  }
>>
>> What is that I am missing ?
>>
>
> Previously, call to hugepd_free() was compiled only when #ifdef 
> CONFIG_PPC_FSL_BOOK3E
> Now, it is compiled at all time, but it should never be called if not 
> CONFIG_PPC_FSL_BOOK3E because we always have shift < pdshift in that case.
> Then the function needs to be defined anyway but should never be called. 
> Should I just define it static inline {} ?
>

For 8M with 4K mode, we have shift >= pdshift, right?

-aneesh
Aneesh Kumar K.V Sept. 20, 2016, 2:45 a.m. UTC | #6
christophe leroy <christophe.leroy@c-s.fr> writes:

>>
>>
>>>  	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
>>>  		unsigned shift;
>>>  		unsigned pdshift;
>>> @@ -860,16 +807,31 @@ static int __init hugetlbpage_init(void)
>>>  		 * if we have pdshift and shift value same, we don't
>>>  		 * use pgt cache for hugepd.
>>>  		 */
>>> -		if (pdshift != shift) {
>>> +		if (pdshift > shift) {
>>>  			pgtable_cache_add(pdshift - shift, NULL);
>>>  			if (!PGT_CACHE(pdshift - shift))
>>>  				panic("hugetlbpage_init(): could not create "
>>>  				      "pgtable cache for %d bit pagesize\n", shift);
>>> +		} else if (!hugepte_cache) {
>>> +			/*
>>> +			 * Create a kmem cache for hugeptes.  The bottom bits in
>>> +			 * the pte have size information encoded in them, so
>>> +			 * align them to allow this
>>> +			 */
>>> +			hugepte_cache = kmem_cache_create("hugepte-cache",
>>> +							  sizeof(pte_t),
>>> +							  HUGEPD_SHIFT_MASK + 1,
>>> +							  0, NULL);
>>> +			if (hugepte_cache == NULL)
>>> +				panic("%s: Unable to create kmem cache "
>>> +				      "for hugeptes\n", __func__);
>>> +
>>
>>
>> We don't need hugepte_cache for book3s 64K. I guess we will endup
>> creating one here ?
>
> Should not, because on book3s 64k, we will have pdshift > shift
> won't we ?
>

On 64k book3s we have pdshift == shift, and we don't need to create a
hugepd cache on book3s 64k.

-aneesh
Christophe Leroy Sept. 20, 2016, 5:22 a.m. UTC | #7
On 20/09/2016 at 04:28, Aneesh Kumar K.V wrote:
> christophe leroy <christophe.leroy@c-s.fr> writes:
>
>> On 19/09/2016 at 07:50, Aneesh Kumar K.V wrote:
>>>
>>> Christophe Leroy <christophe.leroy@c-s.fr> writes:
>>>> +#else
>>>> +static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
>>>> +{
>>>> +	BUG();
>>>> +}
>>>> +
>>>>  #endif
>>>
>>>
>>> I was expecting that BUG will get removed in the next patch. But I don't
>>> see it in the next patch. Considering
>>>
>>> @@ -475,11 +453,10 @@ static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshif
>>>         for (i = 0; i < num_hugepd; i++, hpdp++)
>>>                 hpdp->pd = 0;
>>>
>>> -#ifdef CONFIG_PPC_FSL_BOOK3E
>>> -	hugepd_free(tlb, hugepte);
>>> -#else
>>> -	pgtable_free_tlb(tlb, hugepte, pdshift - shift);
>>> -#endif
>>> +	if (shift >= pdshift)
>>> +		hugepd_free(tlb, hugepte);
>>> +	else
>>> +		pgtable_free_tlb(tlb, hugepte, pdshift - shift);
>>>  }
>>>
>>> What is that I am missing ?
>>>
>>
>> Previously, call to hugepd_free() was compiled only when #ifdef
>> CONFIG_PPC_FSL_BOOK3E
>> Now, it is compiled at all time, but it should never be called if not
>> CONFIG_PPC_FSL_BOOK3E because we always have shift < pdshift in that case.
>> Then the function needs to be defined anyway but should never be called.
>> Should I just define it static inline {} ?
>>
>
> For 8M with 4K mode, we have shift >= pdshift right ?
>

Yes, that's the reason for the hunk below in the following patch. That
way we also get a real hugepd_free() for the 8xx.

@@ -366,7 +373,7 @@ int alloc_bootmem_huge_page(struct hstate *hstate)
  }
  #endif

-#ifdef CONFIG_PPC_FSL_BOOK3E
+#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
  #define HUGEPD_FREELIST_SIZE \
  	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))



Christophe
Christophe Leroy Sept. 21, 2016, 6:13 a.m. UTC | #8
On 19/09/2016 at 07:45, Aneesh Kumar K.V wrote:
> Christophe Leroy <christophe.leroy@c-s.fr> writes:
>
>> Today there are two implementations of hugetlbpages which are managed
>> by exclusive #ifdefs:
>> * FSL_BOOKE: several directory entries points to the same single hugepage
>> * BOOK3S: one upper level directory entry points to a table of hugepages
>>
>> In preparation of implementation of hugepage support on the 8xx, we
>> need a mix of the two above solutions, because the 8xx needs both cases
>> depending on the size of pages:
>> * In 4k page size mode, each PGD entry covers a 4M bytes area. It means
>> that 2 PGD entries will be necessary to cover an 8M hugepage while a
>> single PGD entry will cover 8x 512k hugepages.
>> * In 16 page size mode, each PGD entry covers a 64M bytes area. It means
>> that 8x 8M hugepages will be covered by one PGD entry and 64x 512k
>> hugepages will be covers by one PGD entry.
>>
>> This patch:
>> * removes #ifdefs in favor of if/else based on the range sizes
>> * merges the two huge_pte_alloc() functions as they are pretty similar
>> * merges the two hugetlbpage_init() functions as they are pretty similar
>>
>> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
>> ---
>> v2: This part is new and results from a split of last patch of v1 serie in
>> two parts
>>
>>  arch/powerpc/mm/hugetlbpage.c | 189 +++++++++++++++++-------------------------
>>  1 file changed, 77 insertions(+), 112 deletions(-)
>>
>> diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
>> index 8a512b1..2119f00 100644
>> --- a/arch/powerpc/mm/hugetlbpage.c
>> +++ b/arch/powerpc/mm/hugetlbpage.c
>> @@ -64,14 +64,16 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
>>  {
>>  	struct kmem_cache *cachep;
>>  	pte_t *new;
>> -
>> -#ifdef CONFIG_PPC_FSL_BOOK3E
>>  	int i;
>> -	int num_hugepd = 1 << (pshift - pdshift);
>> -	cachep = hugepte_cache;
>> -#else
>> -	cachep = PGT_CACHE(pdshift - pshift);
>> -#endif
>> +	int num_hugepd;
>> +
>> +	if (pshift >= pdshift) {
>> +		cachep = hugepte_cache;
>> +		num_hugepd = 1 << (pshift - pdshift);
>> +	} else {
>> +		cachep = PGT_CACHE(pdshift - pshift);
>> +		num_hugepd = 1;
>> +	}
>
> Is there a way to hint likely/unlikely branch based on the page size
> selected at build time ?

Is that really worth it? Won't it be negligible compared to other
actions in that function (like kmem_cache_zalloc(), for instance)?
Can't we just trust GCC on that one?

Christophe

Patch

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 8a512b1..2119f00 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -64,14 +64,16 @@  static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 {
 	struct kmem_cache *cachep;
 	pte_t *new;
-
-#ifdef CONFIG_PPC_FSL_BOOK3E
 	int i;
-	int num_hugepd = 1 << (pshift - pdshift);
-	cachep = hugepte_cache;
-#else
-	cachep = PGT_CACHE(pdshift - pshift);
-#endif
+	int num_hugepd;
+
+	if (pshift >= pdshift) {
+		cachep = hugepte_cache;
+		num_hugepd = 1 << (pshift - pdshift);
+	} else {
+		cachep = PGT_CACHE(pdshift - pshift);
+		num_hugepd = 1;
+	}
 
 	new = kmem_cache_zalloc(cachep, GFP_KERNEL);
 
@@ -89,7 +91,7 @@  static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 	smp_wmb();
 
 	spin_lock(&mm->page_table_lock);
-#ifdef CONFIG_PPC_FSL_BOOK3E
+
 	/*
 	 * We have multiple higher-level entries that point to the same
 	 * actual pte location.  Fill in each as we go and backtrack on error.
@@ -100,8 +102,13 @@  static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 		if (unlikely(!hugepd_none(*hpdp)))
 			break;
 		else
+#ifdef CONFIG_PPC_BOOK3S_64
+			hpdp->pd = __pa(new) |
+				   (shift_to_mmu_psize(pshift) << 2);
+#else
 			/* We use the old format for PPC_FSL_BOOK3E */
 			hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
+#endif
 	}
 	/* If we bailed from the for loop early, an error occurred, clean up */
 	if (i < num_hugepd) {
@@ -109,17 +116,6 @@  static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 			hpdp->pd = 0;
 		kmem_cache_free(cachep, new);
 	}
-#else
-	if (!hugepd_none(*hpdp))
-		kmem_cache_free(cachep, new);
-	else {
-#ifdef CONFIG_PPC_BOOK3S_64
-		hpdp->pd = __pa(new) | (shift_to_mmu_psize(pshift) << 2);
-#else
-		hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
-#endif
-	}
-#endif
 	spin_unlock(&mm->page_table_lock);
 	return 0;
 }
@@ -136,7 +132,6 @@  static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 #define HUGEPD_PUD_SHIFT PMD_SHIFT
 #endif
 
-#ifdef CONFIG_PPC_BOOK3S_64
 /*
  * At this point we do the placement change only for BOOK3S 64. This would
  * possibly work on other subarchs.
@@ -153,6 +148,7 @@  pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
 	addr &= ~(sz-1);
 	pg = pgd_offset(mm, addr);
 
+#ifdef CONFIG_PPC_BOOK3S_64
 	if (pshift == PGDIR_SHIFT)
 		/* 16GB huge page */
 		return (pte_t *) pg;
@@ -178,32 +174,7 @@  pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
 				hpdp = (hugepd_t *)pm;
 		}
 	}
-	if (!hpdp)
-		return NULL;
-
-	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
-
-	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
-		return NULL;
-
-	return hugepte_offset(*hpdp, addr, pdshift);
-}
-
 #else
-
-pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
-{
-	pgd_t *pg;
-	pud_t *pu;
-	pmd_t *pm;
-	hugepd_t *hpdp = NULL;
-	unsigned pshift = __ffs(sz);
-	unsigned pdshift = PGDIR_SHIFT;
-
-	addr &= ~(sz-1);
-
-	pg = pgd_offset(mm, addr);
-
 	if (pshift >= HUGEPD_PGD_SHIFT) {
 		hpdp = (hugepd_t *)pg;
 	} else {
@@ -217,7 +188,7 @@  pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
 			hpdp = (hugepd_t *)pm;
 		}
 	}
-
+#endif
 	if (!hpdp)
 		return NULL;
 
@@ -228,7 +199,6 @@  pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
 
 	return hugepte_offset(*hpdp, addr, pdshift);
 }
-#endif
 
 #ifdef CONFIG_PPC_FSL_BOOK3E
 /* Build list of addresses of gigantic pages.  This function is used in early
@@ -310,7 +280,11 @@  static int __init do_gpage_early_setup(char *param, char *val,
 				npages = 0;
 			if (npages > MAX_NUMBER_GPAGES) {
 				pr_warn("MMU: %lu pages requested for page "
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
 					"size %llu KB, limiting to "
+#else
+					"size %u KB, limiting to "
+#endif
 					__stringify(MAX_NUMBER_GPAGES) "\n",
 					npages, size / 1024);
 				npages = MAX_NUMBER_GPAGES;
@@ -442,6 +416,12 @@  static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
 	}
 	put_cpu_var(hugepd_freelist_cur);
 }
+#else
+static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
+{
+	BUG();
+}
+
 #endif
 
 static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
@@ -453,13 +433,11 @@  static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshif
 
 	unsigned long pdmask = ~((1UL << pdshift) - 1);
 	unsigned int num_hugepd = 1;
+	unsigned int shift = hugepd_shift(*hpdp);
 
-#ifdef CONFIG_PPC_FSL_BOOK3E
 	/* Note: On fsl the hpdp may be the first of several */
-	num_hugepd = (1 << (hugepd_shift(*hpdp) - pdshift));
-#else
-	unsigned int shift = hugepd_shift(*hpdp);
-#endif
+	if (shift > pdshift)
+		num_hugepd = 1 << (shift - pdshift);
 
 	start &= pdmask;
 	if (start < floor)
@@ -475,11 +453,10 @@  static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshif
 	for (i = 0; i < num_hugepd; i++, hpdp++)
 		hpdp->pd = 0;
 
-#ifdef CONFIG_PPC_FSL_BOOK3E
-	hugepd_free(tlb, hugepte);
-#else
-	pgtable_free_tlb(tlb, hugepte, pdshift - shift);
-#endif
+	if (shift >= pdshift)
+		hugepd_free(tlb, hugepte);
+	else
+		pgtable_free_tlb(tlb, hugepte, pdshift - shift);
 }
 
 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -492,6 +469,8 @@  static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 
 	start = addr;
 	do {
+		unsigned long more;
+
 		pmd = pmd_offset(pud, addr);
 		next = pmd_addr_end(addr, end);
 		if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
@@ -502,15 +481,16 @@  static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 			WARN_ON(!pmd_none_or_clear_bad(pmd));
 			continue;
 		}
-#ifdef CONFIG_PPC_FSL_BOOK3E
 		/*
 		 * Increment next by the size of the huge mapping since
 		 * there may be more than one entry at this level for a
 		 * single hugepage, but all of them point to
 		 * the same kmem cache that holds the hugepte.
 		 */
-		next = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
-#endif
+		more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
+		if (more > next)
+			next = more;
+
 		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
 				  addr, next, floor, ceiling);
 	} while (addr = next, addr != end);
@@ -550,15 +530,17 @@  static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
 					       ceiling);
 		} else {
-#ifdef CONFIG_PPC_FSL_BOOK3E
+			unsigned long more;
 			/*
 			 * Increment next by the size of the huge mapping since
 			 * there may be more than one entry at this level for a
 			 * single hugepage, but all of them point to
 			 * the same kmem cache that holds the hugepte.
 			 */
-			next = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
-#endif
+			more = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
+			if (more > next)
+				next = more;
+
 			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
 					  addr, next, floor, ceiling);
 		}
@@ -615,15 +597,17 @@  void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 				continue;
 			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
 		} else {
-#ifdef CONFIG_PPC_FSL_BOOK3E
+			unsigned long more;
 			/*
 			 * Increment next by the size of the huge mapping since
 			 * there may be more than one entry at the pgd level
 			 * for a single hugepage, but all of them point to the
 			 * same kmem cache that holds the hugepte.
 			 */
-			next = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
-#endif
+			more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
+			if (more > next)
+				next = more;
+
 			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
 					  addr, next, floor, ceiling);
 		}
@@ -753,12 +737,13 @@  static int __init add_huge_page_size(unsigned long long size)
 
 	/* Check that it is a page size supported by the hardware and
 	 * that it fits within pagetable and slice limits. */
+	if (size <= PAGE_SIZE)
+		return -EINVAL;
 #ifdef CONFIG_PPC_FSL_BOOK3E
-	if ((size < PAGE_SIZE) || !is_power_of_4(size))
+	if (!is_power_of_4(size))
 		return -EINVAL;
 #else
-	if (!is_power_of_2(size)
-	    || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
+	if (!is_power_of_2(size) || (shift > SLICE_HIGH_SHIFT))
 		return -EINVAL;
 #endif
 
@@ -791,53 +776,15 @@  static int __init hugepage_setup_sz(char *str)
 }
 __setup("hugepagesz=", hugepage_setup_sz);
 
-#ifdef CONFIG_PPC_FSL_BOOK3E
 struct kmem_cache *hugepte_cache;
 static int __init hugetlbpage_init(void)
 {
 	int psize;
 
-	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
-		unsigned shift;
-
-		if (!mmu_psize_defs[psize].shift)
-			continue;
-
-		shift = mmu_psize_to_shift(psize);
-
-		/* Don't treat normal page sizes as huge... */
-		if (shift != PAGE_SHIFT)
-			if (add_huge_page_size(1ULL << shift) < 0)
-				continue;
-	}
-
-	/*
-	 * Create a kmem cache for hugeptes.  The bottom bits in the pte have
-	 * size information encoded in them, so align them to allow this
-	 */
-	hugepte_cache =  kmem_cache_create("hugepte-cache", sizeof(pte_t),
-					   HUGEPD_SHIFT_MASK + 1, 0, NULL);
-	if (hugepte_cache == NULL)
-		panic("%s: Unable to create kmem cache for hugeptes\n",
-		      __func__);
-
-	/* Default hpage size = 4M */
-	if (mmu_psize_defs[MMU_PAGE_4M].shift)
-		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
-	else
-		panic("%s: Unable to set default huge page size\n", __func__);
-
-
-	return 0;
-}
-#else
-static int __init hugetlbpage_init(void)
-{
-	int psize;
-
+#if !defined(CONFIG_PPC_FSL_BOOK3E)
 	if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
 		return -ENODEV;
-
+#endif
 	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
 		unsigned shift;
 		unsigned pdshift;
@@ -860,16 +807,31 @@  static int __init hugetlbpage_init(void)
 		 * if we have pdshift and shift value same, we don't
 		 * use pgt cache for hugepd.
 		 */
-		if (pdshift != shift) {
+		if (pdshift > shift) {
 			pgtable_cache_add(pdshift - shift, NULL);
 			if (!PGT_CACHE(pdshift - shift))
 				panic("hugetlbpage_init(): could not create "
 				      "pgtable cache for %d bit pagesize\n", shift);
+		} else if (!hugepte_cache) {
+			/*
+			 * Create a kmem cache for hugeptes.  The bottom bits in
+			 * the pte have size information encoded in them, so
+			 * align them to allow this
+			 */
+			hugepte_cache = kmem_cache_create("hugepte-cache",
+							  sizeof(pte_t),
+							  HUGEPD_SHIFT_MASK + 1,
+							  0, NULL);
+			if (hugepte_cache == NULL)
+				panic("%s: Unable to create kmem cache "
+				      "for hugeptes\n", __func__);
+
 		}
 	}
 
 	/* Set default large page size. Currently, we pick 16M or 1M
 	 * depending on what is available
+	 * We select 4M on other ones.
 	 */
 	if (mmu_psize_defs[MMU_PAGE_16M].shift)
 		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
@@ -877,11 +839,14 @@  static int __init hugetlbpage_init(void)
 		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
 	else if (mmu_psize_defs[MMU_PAGE_2M].shift)
 		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;
-
+	else if (mmu_psize_defs[MMU_PAGE_4M].shift)
+		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
+	else
+		panic("%s: Unable to set default huge page size\n", __func__);
 
 	return 0;
 }
-#endif
+
 arch_initcall(hugetlbpage_init);
 
 void flush_dcache_icache_hugepage(struct page *page)