diff mbox

[RFCv2,5/9] arch/powerpc: Split hash page table sizing heuristic into a helper

Message ID 1454045043-25545-6-git-send-email-david@gibson.dropbear.id.au (mailing list archive)
State Superseded
Headers show

Commit Message

David Gibson Jan. 29, 2016, 5:23 a.m. UTC
htab_get_table_size() either retrieves the size of the hash page table (HPT)
from the device tree - if the HPT size is determined by firmware - or
uses a heuristic to determine a good size based on RAM size if the kernel
is responsible for allocating the HPT.

To support a PAPR extension allowing resizing of the HPT, we're going to
want the memory size -> HPT size logic elsewhere, so split it out into a
helper function.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 arch/powerpc/include/asm/mmu-hash64.h |  3 +++
 arch/powerpc/mm/hash_utils_64.c       | 30 +++++++++++++++++-------------
 2 files changed, 20 insertions(+), 13 deletions(-)

Comments

Anshuman Khandual Feb. 1, 2016, 7:04 a.m. UTC | #1
On 01/29/2016 10:53 AM, David Gibson wrote:
> htab_get_table_size() either retrieve the size of the hash page table (HPT)
> from the device tree - if the HPT size is determined by firmware - or
> uses a heuristic to determine a good size based on RAM size if the kernel
> is responsible for allocating the HPT.
> 
> To support a PAPR extension allowing resizing of the HPT, we're going to
> want the memory size -> HPT size logic elsewhere, so split it out into a
> helper function.
> 
> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> ---
>  arch/powerpc/include/asm/mmu-hash64.h |  3 +++
>  arch/powerpc/mm/hash_utils_64.c       | 30 +++++++++++++++++-------------
>  2 files changed, 20 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
> index 7352d3f..cf070fd 100644
> --- a/arch/powerpc/include/asm/mmu-hash64.h
> +++ b/arch/powerpc/include/asm/mmu-hash64.h
> @@ -607,6 +607,9 @@ static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize)
>  	context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1;
>  	return get_vsid(context, ea, ssize);
>  }
> +
> +unsigned htab_shift_for_mem_size(unsigned long mem_size);
> +
>  #endif /* __ASSEMBLY__ */
>  
>  #endif /* _ASM_POWERPC_MMU_HASH64_H_ */
> diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
> index e88a86e..d63f7dc 100644
> --- a/arch/powerpc/mm/hash_utils_64.c
> +++ b/arch/powerpc/mm/hash_utils_64.c
> @@ -606,10 +606,24 @@ static int __init htab_dt_scan_pftsize(unsigned long node,
>  	return 0;
>  }
>  
> -static unsigned long __init htab_get_table_size(void)
> +unsigned htab_shift_for_mem_size(unsigned long mem_size)
>  {
> -	unsigned long mem_size, rnd_mem_size, pteg_count, psize;
> +	unsigned memshift = __ilog2(mem_size);
> +	unsigned pshift = mmu_psize_defs[mmu_virtual_psize].shift;
> +	unsigned pteg_shift;
> +
> +	/* round mem_size up to next power of 2 */
> +	if ((1UL << memshift) < mem_size)
> +		memshift += 1;
> +
> +	/* aim for 2 pages / pteg */

While we are here, I guess it's a good opportunity to write a couple of lines
about why there is one PTE group for every two physical pages on the system,
why a minimum (1UL << 11 = 2048) number of PTE groups is required, why
there are (1U << 7 = 128) entries per PTE group, and also to remove the
existing confusing comments above? Just a suggestion.

> +	pteg_shift = memshift - (pshift + 1);
> +
> +	return max(pteg_shift + 7, 18U);
> +}
>  
> +static unsigned long __init htab_get_table_size(void)
> +{
>  	/* If hash size isn't already provided by the platform, we try to
>  	 * retrieve it from the device-tree. If it's not there neither, we
>  	 * calculate it now based on the total RAM size
> @@ -619,17 +633,7 @@ static unsigned long __init htab_get_table_size(void)
>  	if (ppc64_pft_size)
>  		return 1UL << ppc64_pft_size;
>  
> -	/* round mem_size up to next power of 2 */
> -	mem_size = memblock_phys_mem_size();
> -	rnd_mem_size = 1UL << __ilog2(mem_size);
> -	if (rnd_mem_size < mem_size)
> -		rnd_mem_size <<= 1;
> -
> -	/* # pages / 2 */
> -	psize = mmu_psize_defs[mmu_virtual_psize].shift;
> -	pteg_count = max(rnd_mem_size >> (psize + 1), 1UL << 11);
> -
> -	return pteg_count << 7;
> +	return htab_shift_for_mem_size(memblock_phys_mem_size());

Should it be 1UL << htab_shift_for_mem_size(memblock_phys_mem_size())
instead? The function originally returned the size of the HPT, not the
shift of the HPT — or am I missing something here?
David Gibson Feb. 2, 2016, 1:04 a.m. UTC | #2
On Mon, Feb 01, 2016 at 12:34:32PM +0530, Anshuman Khandual wrote:
> On 01/29/2016 10:53 AM, David Gibson wrote:
> > htab_get_table_size() either retrieve the size of the hash page table (HPT)
> > from the device tree - if the HPT size is determined by firmware - or
> > uses a heuristic to determine a good size based on RAM size if the kernel
> > is responsible for allocating the HPT.
> > 
> > To support a PAPR extension allowing resizing of the HPT, we're going to
> > want the memory size -> HPT size logic elsewhere, so split it out into a
> > helper function.
> > 
> > Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> > ---
> >  arch/powerpc/include/asm/mmu-hash64.h |  3 +++
> >  arch/powerpc/mm/hash_utils_64.c       | 30 +++++++++++++++++-------------
> >  2 files changed, 20 insertions(+), 13 deletions(-)
> > 
> > diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
> > index 7352d3f..cf070fd 100644
> > --- a/arch/powerpc/include/asm/mmu-hash64.h
> > +++ b/arch/powerpc/include/asm/mmu-hash64.h
> > @@ -607,6 +607,9 @@ static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize)
> >  	context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1;
> >  	return get_vsid(context, ea, ssize);
> >  }
> > +
> > +unsigned htab_shift_for_mem_size(unsigned long mem_size);
> > +
> >  #endif /* __ASSEMBLY__ */
> >  
> >  #endif /* _ASM_POWERPC_MMU_HASH64_H_ */
> > diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
> > index e88a86e..d63f7dc 100644
> > --- a/arch/powerpc/mm/hash_utils_64.c
> > +++ b/arch/powerpc/mm/hash_utils_64.c
> > @@ -606,10 +606,24 @@ static int __init htab_dt_scan_pftsize(unsigned long node,
> >  	return 0;
> >  }
> >  
> > -static unsigned long __init htab_get_table_size(void)
> > +unsigned htab_shift_for_mem_size(unsigned long mem_size)
> >  {
> > -	unsigned long mem_size, rnd_mem_size, pteg_count, psize;
> > +	unsigned memshift = __ilog2(mem_size);
> > +	unsigned pshift = mmu_psize_defs[mmu_virtual_psize].shift;
> > +	unsigned pteg_shift;
> > +
> > +	/* round mem_size up to next power of 2 */
> > +	if ((1UL << memshift) < mem_size)
> > +		memshift += 1;
> > +
> > +	/* aim for 2 pages / pteg */
> 
> While here I guess its a good opportunity to write couple of lines
> about why one PTE group for every two physical pages on the system,

Well, I don't really know; it's just copied from the existing code.

> why minimum (1UL << 11 = 2048) number of PTE groups required,

Ok.

> why
> (1U << 7 = 128) entries per PTE group

Um.. what?  Because that's how big a PTEG is, I don't think
re-explaining the HPT structure here is useful.

> and also remove the existing
> confusing comments above ? Just a suggestion.

Not sure which comment you mean.

> 
> > +	pteg_shift = memshift - (pshift + 1);
> > +
> > +	return max(pteg_shift + 7, 18U);
> > +}
> >  
> > +static unsigned long __init htab_get_table_size(void)
> > +{
> >  	/* If hash size isn't already provided by the platform, we try to
> >  	 * retrieve it from the device-tree. If it's not there neither, we
> >  	 * calculate it now based on the total RAM size
> > @@ -619,17 +633,7 @@ static unsigned long __init htab_get_table_size(void)
> >  	if (ppc64_pft_size)
> >  		return 1UL << ppc64_pft_size;
> >  
> > -	/* round mem_size up to next power of 2 */
> > -	mem_size = memblock_phys_mem_size();
> > -	rnd_mem_size = 1UL << __ilog2(mem_size);
> > -	if (rnd_mem_size < mem_size)
> > -		rnd_mem_size <<= 1;
> > -
> > -	/* # pages / 2 */
> > -	psize = mmu_psize_defs[mmu_virtual_psize].shift;
> > -	pteg_count = max(rnd_mem_size >> (psize + 1), 1UL << 11);
> > -
> > -	return pteg_count << 7;
> > +	return htab_shift_for_mem_size(memblock_phys_mem_size());
> 
> Would it be 1UL << htab_shift_for_mem_size(memblock_phys_mem_size())
> instead ? It was returning the size of the HPT not the shift of HPT
> originally or I am missing something here.

Oops, yes.  That would have broken all non-LPAR platforms.
Anshuman Khandual Feb. 4, 2016, 10:56 a.m. UTC | #3
On 02/02/2016 06:34 AM, David Gibson wrote:
> On Mon, Feb 01, 2016 at 12:34:32PM +0530, Anshuman Khandual wrote:
>> On 01/29/2016 10:53 AM, David Gibson wrote:
>>> htab_get_table_size() either retrieve the size of the hash page table (HPT)
>>> from the device tree - if the HPT size is determined by firmware - or
>>> uses a heuristic to determine a good size based on RAM size if the kernel
>>> is responsible for allocating the HPT.
>>>
>>> To support a PAPR extension allowing resizing of the HPT, we're going to
>>> want the memory size -> HPT size logic elsewhere, so split it out into a
>>> helper function.
>>>
>>> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
>>> ---
>>>  arch/powerpc/include/asm/mmu-hash64.h |  3 +++
>>>  arch/powerpc/mm/hash_utils_64.c       | 30 +++++++++++++++++-------------
>>>  2 files changed, 20 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
>>> index 7352d3f..cf070fd 100644
>>> --- a/arch/powerpc/include/asm/mmu-hash64.h
>>> +++ b/arch/powerpc/include/asm/mmu-hash64.h
>>> @@ -607,6 +607,9 @@ static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize)
>>>  	context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1;
>>>  	return get_vsid(context, ea, ssize);
>>>  }
>>> +
>>> +unsigned htab_shift_for_mem_size(unsigned long mem_size);
>>> +
>>>  #endif /* __ASSEMBLY__ */
>>>  
>>>  #endif /* _ASM_POWERPC_MMU_HASH64_H_ */
>>> diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
>>> index e88a86e..d63f7dc 100644
>>> --- a/arch/powerpc/mm/hash_utils_64.c
>>> +++ b/arch/powerpc/mm/hash_utils_64.c
>>> @@ -606,10 +606,24 @@ static int __init htab_dt_scan_pftsize(unsigned long node,
>>>  	return 0;
>>>  }
>>>  
>>> -static unsigned long __init htab_get_table_size(void)
>>> +unsigned htab_shift_for_mem_size(unsigned long mem_size)
>>>  {
>>> -	unsigned long mem_size, rnd_mem_size, pteg_count, psize;
>>> +	unsigned memshift = __ilog2(mem_size);
>>> +	unsigned pshift = mmu_psize_defs[mmu_virtual_psize].shift;
>>> +	unsigned pteg_shift;
>>> +
>>> +	/* round mem_size up to next power of 2 */
>>> +	if ((1UL << memshift) < mem_size)
>>> +		memshift += 1;
>>> +
>>> +	/* aim for 2 pages / pteg */
>>
>> While here I guess its a good opportunity to write couple of lines
>> about why one PTE group for every two physical pages on the system,
> 
> Well, that don't really know, it's just copied from the existing code.

Aneesh, would you know why ?

> 
>> why minimum (1UL << 11 = 2048) number of PTE groups required,

Aneesh, would you know why ?

> 
> Ok.
> 
>> why
>> (1U << 7 = 128) entries per PTE group
> 
> Um.. what?  Because that's how big a PTEG is, I don't think
> re-explaining the HPT structure here is useful.

Agreed, though I think these things should be defined as macros somewhere,
rather than used as hard-coded numbers like this.
Paul Mackerras Feb. 8, 2016, 5:57 a.m. UTC | #4
On Thu, Feb 04, 2016 at 04:26:20PM +0530, Anshuman Khandual wrote:
> On 02/02/2016 06:34 AM, David Gibson wrote:
> > On Mon, Feb 01, 2016 at 12:34:32PM +0530, Anshuman Khandual wrote:
> >> On 01/29/2016 10:53 AM, David Gibson wrote:
> >>> htab_get_table_size() either retrieve the size of the hash page table (HPT)
> >>> from the device tree - if the HPT size is determined by firmware - or
> >>> uses a heuristic to determine a good size based on RAM size if the kernel
> >>> is responsible for allocating the HPT.
> >>>
> >>> To support a PAPR extension allowing resizing of the HPT, we're going to
> >>> want the memory size -> HPT size logic elsewhere, so split it out into a
> >>> helper function.
> >>>
> >>> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> >>> ---
> >>>  arch/powerpc/include/asm/mmu-hash64.h |  3 +++
> >>>  arch/powerpc/mm/hash_utils_64.c       | 30 +++++++++++++++++-------------
> >>>  2 files changed, 20 insertions(+), 13 deletions(-)
> >>>
> >>> diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
> >>> index 7352d3f..cf070fd 100644
> >>> --- a/arch/powerpc/include/asm/mmu-hash64.h
> >>> +++ b/arch/powerpc/include/asm/mmu-hash64.h
> >>> @@ -607,6 +607,9 @@ static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize)
> >>>  	context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1;
> >>>  	return get_vsid(context, ea, ssize);
> >>>  }
> >>> +
> >>> +unsigned htab_shift_for_mem_size(unsigned long mem_size);
> >>> +
> >>>  #endif /* __ASSEMBLY__ */
> >>>  
> >>>  #endif /* _ASM_POWERPC_MMU_HASH64_H_ */
> >>> diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
> >>> index e88a86e..d63f7dc 100644
> >>> --- a/arch/powerpc/mm/hash_utils_64.c
> >>> +++ b/arch/powerpc/mm/hash_utils_64.c
> >>> @@ -606,10 +606,24 @@ static int __init htab_dt_scan_pftsize(unsigned long node,
> >>>  	return 0;
> >>>  }
> >>>  
> >>> -static unsigned long __init htab_get_table_size(void)
> >>> +unsigned htab_shift_for_mem_size(unsigned long mem_size)
> >>>  {
> >>> -	unsigned long mem_size, rnd_mem_size, pteg_count, psize;
> >>> +	unsigned memshift = __ilog2(mem_size);
> >>> +	unsigned pshift = mmu_psize_defs[mmu_virtual_psize].shift;
> >>> +	unsigned pteg_shift;
> >>> +
> >>> +	/* round mem_size up to next power of 2 */
> >>> +	if ((1UL << memshift) < mem_size)
> >>> +		memshift += 1;
> >>> +
> >>> +	/* aim for 2 pages / pteg */
> >>
> >> While here I guess its a good opportunity to write couple of lines
> >> about why one PTE group for every two physical pages on the system,
> > 
> > Well, that don't really know, it's just copied from the existing code.
> 
> Aneesh, would you know why ?

1 PTEG per 2 pages means 4 HPTEs per page, which means you can map
each page to an average of 4 different virtual addresses.  It's a
heuristic that has been around for a long time and dates back to the
early days of AIX.  For Linux, running on machines which typically
have quite a lot of memory, it's probably overkill.

> > 
> >> why minimum (1UL << 11 = 2048) number of PTE groups required,
> 
> Aneesh, would you know why ?

It's in the architecture, which specifies the minimum size of the HPT
as 256kB.  The reason is because not all of the virtual address bits
are present in the HPT.  That's OK because some of the virtual address
bits are implied by the HPTEG index in the hash table.  If the HPT was
less than 256kB (2048 HPTEGs) there would be the possibility of
collisions where two different virtual addresses could hash to the
same HPTEG and their HPTEs would be impossible to tell apart.

> 
> > 
> > Ok.
> > 
> >> why
> >> (1U << 7 = 128) entries per PTE group
> > 
> > Um.. what?  Because that's how big a PTEG is, I don't think
> > re-explaining the HPT structure here is useful.
> 
> Agreed, though think some where these things should be macros not used
> as hard coded numbers like this.

Using symbols instead of constant numbers is not always clearer.  The
symbol name can give some context (but so can a suitable comment) but
has the cost of obscuring the actual numeric value.

Paul.
diff mbox

Patch

diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index 7352d3f..cf070fd 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -607,6 +607,9 @@  static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize)
 	context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1;
 	return get_vsid(context, ea, ssize);
 }
+
+unsigned htab_shift_for_mem_size(unsigned long mem_size);
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_MMU_HASH64_H_ */
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index e88a86e..d63f7dc 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -606,10 +606,24 @@  static int __init htab_dt_scan_pftsize(unsigned long node,
 	return 0;
 }
 
-static unsigned long __init htab_get_table_size(void)
+unsigned htab_shift_for_mem_size(unsigned long mem_size)
 {
-	unsigned long mem_size, rnd_mem_size, pteg_count, psize;
+	unsigned memshift = __ilog2(mem_size);
+	unsigned pshift = mmu_psize_defs[mmu_virtual_psize].shift;
+	unsigned pteg_shift;
+
+	/* round mem_size up to next power of 2 */
+	if ((1UL << memshift) < mem_size)
+		memshift += 1;
+
+	/* aim for 2 pages / pteg */
+	pteg_shift = memshift - (pshift + 1);
+
+	return max(pteg_shift + 7, 18U);
+}
 
+static unsigned long __init htab_get_table_size(void)
+{
 	/* If hash size isn't already provided by the platform, we try to
 	 * retrieve it from the device-tree. If it's not there neither, we
 	 * calculate it now based on the total RAM size
@@ -619,17 +633,7 @@  static unsigned long __init htab_get_table_size(void)
 	if (ppc64_pft_size)
 		return 1UL << ppc64_pft_size;
 
-	/* round mem_size up to next power of 2 */
-	mem_size = memblock_phys_mem_size();
-	rnd_mem_size = 1UL << __ilog2(mem_size);
-	if (rnd_mem_size < mem_size)
-		rnd_mem_size <<= 1;
-
-	/* # pages / 2 */
-	psize = mmu_psize_defs[mmu_virtual_psize].shift;
-	pteg_count = max(rnd_mem_size >> (psize + 1), 1UL << 11);
-
-	return pteg_count << 7;
+	return htab_shift_for_mem_size(memblock_phys_mem_size());
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG