diff mbox series

[09/13] mm/powerpc: Redefine pXd_huge() with pXd_leaf()

Message ID 20240313214719.253873-10-peterx@redhat.com
State New
Headers show
Series mm/treewide: Remove pXd_huge() API | expand

Commit Message

Peter Xu March 13, 2024, 9:47 p.m. UTC
From: Peter Xu <peterx@redhat.com>

PowerPC book3s 4K mostly has the same definition on both, except pXd_huge()
constantly returns 0 for hash MMUs.  As Michael Ellerman pointed out [1],
it is safe to check _PAGE_PTE on hash MMUs, as the bit will never be set so
it will keep returning false.

As a reference, __p[mu]d_mkhuge() will trigger a BUG_ON trying to create
such huge mappings for 4K hash MMUs.  Meanwhile, the major powerpc hugetlb
pgtable walker __find_linux_pte() already used pXd_leaf() to check hugetlb
mappings.

The goal should be that we will have one API pXd_leaf() to detect all kinds
of huge mappings.  AFAICT we need to use the pXd_leaf() impl (rather than
pXd_huge() ones) to make sure ie. THPs on hash MMU will also return true.

This helps to simplify a follow up patch to drop pXd_huge() treewide.

NOTE: *_leaf() definition need to be moved before the inclusion of
asm/book3s/64/pgtable-4k.h, which defines pXd_huge() with it.

[1] https://lore.kernel.org/r/87v85zo6w7.fsf@mail.lhotse

Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@kernel.org>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Peter Xu <peterx@redhat.com>
---
 .../include/asm/book3s/64/pgtable-4k.h        | 14 ++--------
 arch/powerpc/include/asm/book3s/64/pgtable.h  | 27 +++++++++----------
 2 files changed, 14 insertions(+), 27 deletions(-)

Comments

Christophe Leroy March 14, 2024, 8:45 a.m. UTC | #1
Le 13/03/2024 à 22:47, peterx@redhat.com a écrit :
> From: Peter Xu <peterx@redhat.com>
> 
> PowerPC book3s 4K mostly has the same definition on both, except pXd_huge()
> constantly returns 0 for hash MMUs.  As Michael Ellerman pointed out [1],
> it is safe to check _PAGE_PTE on hash MMUs, as the bit will never be set so
> it will keep returning false.
> 
> As a reference, __p[mu]d_mkhuge() will trigger a BUG_ON trying to create
> such huge mappings for 4K hash MMUs.  Meanwhile, the major powerpc hugetlb
> pgtable walker __find_linux_pte() already used pXd_leaf() to check hugetlb
> mappings.
> 
> The goal should be that we will have one API pXd_leaf() to detect all kinds
> of huge mappings.  AFAICT we need to use the pXd_leaf() impl (rather than
> pXd_huge() ones) to make sure ie. THPs on hash MMU will also return true.

All kinds of huge mappings ?

pXd_leaf() will detect only leaf mappings (like pXd_huge() ). There are 
also huge mappings through hugepd. On powerpc 8xx we have 8M huge pages 
and 512k huge pages. A PGD entry covers 4M so pgd_leaf() won't report 
those huge pages.

> 
> This helps to simplify a follow up patch to drop pXd_huge() treewide.
> 
> NOTE: *_leaf() definition need to be moved before the inclusion of
> asm/book3s/64/pgtable-4k.h, which defines pXd_huge() with it.
> 
> [1] https://lore.kernel.org/r/87v85zo6w7.fsf@mail.lhotse
> 
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: Nicholas Piggin <npiggin@gmail.com>
> Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
> Cc: "Aneesh Kumar K.V" <aneesh.kumar@kernel.org>
> Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
> Cc: linuxppc-dev@lists.ozlabs.org
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>   .../include/asm/book3s/64/pgtable-4k.h        | 14 ++--------
>   arch/powerpc/include/asm/book3s/64/pgtable.h  | 27 +++++++++----------
>   2 files changed, 14 insertions(+), 27 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h
> index 48f21820afe2..92545981bb49 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h
> @@ -8,22 +8,12 @@
>   #ifdef CONFIG_HUGETLB_PAGE
>   static inline int pmd_huge(pmd_t pmd)
>   {
> -	/*
> -	 * leaf pte for huge page
> -	 */
> -	if (radix_enabled())
> -		return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE));
> -	return 0;
> +	return pmd_leaf(pmd);
>   }
>   
>   static inline int pud_huge(pud_t pud)
>   {
> -	/*
> -	 * leaf pte for huge page
> -	 */
> -	if (radix_enabled())
> -		return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE));
> -	return 0;
> +	return pud_leaf(pud);
>   }
>   
>   /*
> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
> index df66dce8306f..fd7180fded75 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> @@ -262,6 +262,18 @@ extern unsigned long __kernel_io_end;
>   
>   extern struct page *vmemmap;
>   extern unsigned long pci_io_base;
> +
> +#define pmd_leaf pmd_leaf
> +static inline bool pmd_leaf(pmd_t pmd)
> +{
> +	return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE));
> +}
> +
> +#define pud_leaf pud_leaf
> +static inline bool pud_leaf(pud_t pud)
> +{
> +	return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE));
> +}
>   #endif /* __ASSEMBLY__ */
>   
>   #include <asm/book3s/64/hash.h>
> @@ -1436,20 +1448,5 @@ static inline bool is_pte_rw_upgrade(unsigned long old_val, unsigned long new_va
>   	return false;
>   }
>   
> -/*
> - * Like pmd_huge(), but works regardless of config options
> - */
> -#define pmd_leaf pmd_leaf
> -static inline bool pmd_leaf(pmd_t pmd)
> -{
> -	return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE));
> -}
> -
> -#define pud_leaf pud_leaf
> -static inline bool pud_leaf(pud_t pud)
> -{
> -	return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE));
> -}
> -
>   #endif /* __ASSEMBLY__ */
>   #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */
Peter Xu March 14, 2024, 12:53 p.m. UTC | #2
On Thu, Mar 14, 2024 at 08:45:34AM +0000, Christophe Leroy wrote:
> 
> 
> Le 13/03/2024 à 22:47, peterx@redhat.com a écrit :
> > From: Peter Xu <peterx@redhat.com>
> > 
> > PowerPC book3s 4K mostly has the same definition on both, except pXd_huge()
> > constantly returns 0 for hash MMUs.  As Michael Ellerman pointed out [1],
> > it is safe to check _PAGE_PTE on hash MMUs, as the bit will never be set so
> > it will keep returning false.
> > 
> > As a reference, __p[mu]d_mkhuge() will trigger a BUG_ON trying to create
> > such huge mappings for 4K hash MMUs.  Meanwhile, the major powerpc hugetlb
> > pgtable walker __find_linux_pte() already used pXd_leaf() to check hugetlb
> > mappings.
> > 
> > The goal should be that we will have one API pXd_leaf() to detect all kinds
> > of huge mappings.  AFAICT we need to use the pXd_leaf() impl (rather than
> > pXd_huge() ones) to make sure ie. THPs on hash MMU will also return true.
> 
> All kinds of huge mappings ?
> 
> pXd_leaf() will detect only leaf mappings (like pXd_huge() ). There are 
> also huge mappings through hugepd. On powerpc 8xx we have 8M huge pages 
> and 512k huge pages. A PGD entry covers 4M so pgd_leaf() won't report 
> those huge pages.

Ah yes, I should always mention this is in the context of leaf huge pages
only.  Are the examples you provided all fall into hugepd category?  If so
I can reword the commit message, as:

        As a reference, __p[mu]d_mkhuge() will trigger a BUG_ON trying to
        create such huge mappings for 4K hash MMUs.  Meanwhile, the major
        powerpc hugetlb pgtable walker __find_linux_pte() already used
        pXd_leaf() to check leaf hugetlb mappings.

        The goal should be that we will have one API pXd_leaf() to detect
        all kinds of huge mappings except hugepd.  AFAICT we need to use
        the pXd_leaf() impl (rather than pXd_huge() ones) to make sure
        ie. THPs on hash MMU will also return true.

Does this look good to you?

Thanks,
Christophe Leroy March 14, 2024, 1:11 p.m. UTC | #3
Le 14/03/2024 à 13:53, Peter Xu a écrit :
> On Thu, Mar 14, 2024 at 08:45:34AM +0000, Christophe Leroy wrote:
>>
>>
>> Le 13/03/2024 à 22:47, peterx@redhat.com a écrit :
>>> From: Peter Xu <peterx@redhat.com>
>>>
>>> PowerPC book3s 4K mostly has the same definition on both, except pXd_huge()
>>> constantly returns 0 for hash MMUs.  As Michael Ellerman pointed out [1],
>>> it is safe to check _PAGE_PTE on hash MMUs, as the bit will never be set so
>>> it will keep returning false.
>>>
>>> As a reference, __p[mu]d_mkhuge() will trigger a BUG_ON trying to create
>>> such huge mappings for 4K hash MMUs.  Meanwhile, the major powerpc hugetlb
>>> pgtable walker __find_linux_pte() already used pXd_leaf() to check hugetlb
>>> mappings.
>>>
>>> The goal should be that we will have one API pXd_leaf() to detect all kinds
>>> of huge mappings.  AFAICT we need to use the pXd_leaf() impl (rather than
>>> pXd_huge() ones) to make sure ie. THPs on hash MMU will also return true.
>>
>> All kinds of huge mappings ?
>>
>> pXd_leaf() will detect only leaf mappings (like pXd_huge() ). There are
>> also huge mappings through hugepd. On powerpc 8xx we have 8M huge pages
>> and 512k huge pages. A PGD entry covers 4M so pgd_leaf() won't report
>> those huge pages.
> 
> Ah yes, I should always mention this is in the context of leaf huge pages
> only.  Are the examples you provided all fall into hugepd category?  If so
> I can reword the commit message, as:

On powerpc 8xx, only the 8M huge pages fall into the hugepd case.

The 512k hugepages are at PTE level, they are handled more or less like 
CONT_PTE on ARM. see function set_huge_pte_at() for more context.

You can also look at pte_leaf_size() and pgd_leaf_size().

By the way pgd_leaf_size() looks odd because it is called only when 
pgd_leaf_size() returns true, which never happens for 8M pages.

> 
>          As a reference, __p[mu]d_mkhuge() will trigger a BUG_ON trying to
>          create such huge mappings for 4K hash MMUs.  Meanwhile, the major
>          powerpc hugetlb pgtable walker __find_linux_pte() already used
>          pXd_leaf() to check leaf hugetlb mappings.
> 
>          The goal should be that we will have one API pXd_leaf() to detect
>          all kinds of huge mappings except hugepd.  AFAICT we need to use
>          the pXd_leaf() impl (rather than pXd_huge() ones) to make sure
>          ie. THPs on hash MMU will also return true.
> 
> Does this look good to you?
> 
> Thanks,
>
Jason Gunthorpe March 18, 2024, 4:15 p.m. UTC | #4
On Thu, Mar 14, 2024 at 01:11:59PM +0000, Christophe Leroy wrote:
> 
> 
> Le 14/03/2024 à 13:53, Peter Xu a écrit :
> > On Thu, Mar 14, 2024 at 08:45:34AM +0000, Christophe Leroy wrote:
> >>
> >>
> >> Le 13/03/2024 à 22:47, peterx@redhat.com a écrit :
> >>> From: Peter Xu <peterx@redhat.com>
> >>>
> >>> PowerPC book3s 4K mostly has the same definition on both, except pXd_huge()
> >>> constantly returns 0 for hash MMUs.  As Michael Ellerman pointed out [1],
> >>> it is safe to check _PAGE_PTE on hash MMUs, as the bit will never be set so
> >>> it will keep returning false.
> >>>
> >>> As a reference, __p[mu]d_mkhuge() will trigger a BUG_ON trying to create
> >>> such huge mappings for 4K hash MMUs.  Meanwhile, the major powerpc hugetlb
> >>> pgtable walker __find_linux_pte() already used pXd_leaf() to check hugetlb
> >>> mappings.
> >>>
> >>> The goal should be that we will have one API pXd_leaf() to detect all kinds
> >>> of huge mappings.  AFAICT we need to use the pXd_leaf() impl (rather than
> >>> pXd_huge() ones) to make sure ie. THPs on hash MMU will also return true.
> >>
> >> All kinds of huge mappings ?
> >>
> >> pXd_leaf() will detect only leaf mappings (like pXd_huge() ). There are
> >> also huge mappings through hugepd. On powerpc 8xx we have 8M huge pages
> >> and 512k huge pages. A PGD entry covers 4M so pgd_leaf() won't report
> >> those huge pages.
> > 
> > Ah yes, I should always mention this is in the context of leaf huge pages
> > only.  Are the examples you provided all fall into hugepd category?  If so
> > I can reword the commit message, as:
> 
> On powerpc 8xx, only the 8M huge pages fall into the hugepd case.
> 
> The 512k hugepages are at PTE level, they are handled more or less like 
> CONT_PTE on ARM. see function set_huge_pte_at() for more context.
> 
> You can also look at pte_leaf_size() and pgd_leaf_size().

IMHO leaf should return false if the thing is pointing to a next level
page table, even if that next level is fully populated with contiguous
pages.

This seems more aligned with the contig page direction that hugepd
should be moved over to..

> By the way pgd_leaf_size() looks odd because it is called only when 
> pgd_leaf_size() returns true, which never happens for 8M pages.

Like this, you should reach the actual final leaf that the HW will
load and leaf_size() should say it is greater size than the current
table level. Other levels should return 0.

If necessary the core MM code should deal with this by iterating over
adjacent tables.

Jason
Christophe Leroy March 19, 2024, 11:07 p.m. UTC | #5
Le 18/03/2024 à 17:15, Jason Gunthorpe a écrit :
> On Thu, Mar 14, 2024 at 01:11:59PM +0000, Christophe Leroy wrote:
>>
>>
>> Le 14/03/2024 à 13:53, Peter Xu a écrit :
>>> On Thu, Mar 14, 2024 at 08:45:34AM +0000, Christophe Leroy wrote:
>>>>
>>>>
>>>> Le 13/03/2024 à 22:47, peterx@redhat.com a écrit :
>>>>> From: Peter Xu <peterx@redhat.com>
>>>>>
>>>>> PowerPC book3s 4K mostly has the same definition on both, except pXd_huge()
>>>>> constantly returns 0 for hash MMUs.  As Michael Ellerman pointed out [1],
>>>>> it is safe to check _PAGE_PTE on hash MMUs, as the bit will never be set so
>>>>> it will keep returning false.
>>>>>
>>>>> As a reference, __p[mu]d_mkhuge() will trigger a BUG_ON trying to create
>>>>> such huge mappings for 4K hash MMUs.  Meanwhile, the major powerpc hugetlb
>>>>> pgtable walker __find_linux_pte() already used pXd_leaf() to check hugetlb
>>>>> mappings.
>>>>>
>>>>> The goal should be that we will have one API pXd_leaf() to detect all kinds
>>>>> of huge mappings.  AFAICT we need to use the pXd_leaf() impl (rather than
>>>>> pXd_huge() ones) to make sure ie. THPs on hash MMU will also return true.
>>>>
>>>> All kinds of huge mappings ?
>>>>
>>>> pXd_leaf() will detect only leaf mappings (like pXd_huge() ). There are
>>>> also huge mappings through hugepd. On powerpc 8xx we have 8M huge pages
>>>> and 512k huge pages. A PGD entry covers 4M so pgd_leaf() won't report
>>>> those huge pages.
>>>
>>> Ah yes, I should always mention this is in the context of leaf huge pages
>>> only.  Are the examples you provided all fall into hugepd category?  If so
>>> I can reword the commit message, as:
>>
>> On powerpc 8xx, only the 8M huge pages fall into the hugepd case.
>>
>> The 512k hugepages are at PTE level, they are handled more or less like
>> CONT_PTE on ARM. see function set_huge_pte_at() for more context.
>>
>> You can also look at pte_leaf_size() and pgd_leaf_size().
> 
> IMHO leaf should return false if the thing is pointing to a next level
> page table, even if that next level is fully populated with contiguous
> pages.
> 
> This seems more aligned with the contig page direction that hugepd
> should be moved over to..

Should hugepd be moved to the contig page direction, really ?

Would it be acceptable that a 8M hugepage requires 2048 contig entries 
in 2 page tables, when the hugepd allows a single entry ? Would it be 
acceptable performancewise ?

> 
>> By the way pgd_leaf_size() looks odd because it is called only when
>> pgd_leaf_size() returns true, which never happens for 8M pages.
> 
> Like this, you should reach the actual final leaf that the HW will
> load and leaf_size() should say it is greater size than the current
> table level. Other levels should return 0.
> 
> If necessary the core MM code should deal with this by iterating over
> adjacent tables.
> 
> Jason
Jason Gunthorpe March 19, 2024, 11:26 p.m. UTC | #6
On Tue, Mar 19, 2024 at 11:07:08PM +0000, Christophe Leroy wrote:
> 
> 
> Le 18/03/2024 à 17:15, Jason Gunthorpe a écrit :
> > On Thu, Mar 14, 2024 at 01:11:59PM +0000, Christophe Leroy wrote:
> >>
> >>
> >> Le 14/03/2024 à 13:53, Peter Xu a écrit :
> >>> On Thu, Mar 14, 2024 at 08:45:34AM +0000, Christophe Leroy wrote:
> >>>>
> >>>>
> >>>> Le 13/03/2024 à 22:47, peterx@redhat.com a écrit :
> >>>>> From: Peter Xu <peterx@redhat.com>
> >>>>>
> >>>>> PowerPC book3s 4K mostly has the same definition on both, except pXd_huge()
> >>>>> constantly returns 0 for hash MMUs.  As Michael Ellerman pointed out [1],
> >>>>> it is safe to check _PAGE_PTE on hash MMUs, as the bit will never be set so
> >>>>> it will keep returning false.
> >>>>>
> >>>>> As a reference, __p[mu]d_mkhuge() will trigger a BUG_ON trying to create
> >>>>> such huge mappings for 4K hash MMUs.  Meanwhile, the major powerpc hugetlb
> >>>>> pgtable walker __find_linux_pte() already used pXd_leaf() to check hugetlb
> >>>>> mappings.
> >>>>>
> >>>>> The goal should be that we will have one API pXd_leaf() to detect all kinds
> >>>>> of huge mappings.  AFAICT we need to use the pXd_leaf() impl (rather than
> >>>>> pXd_huge() ones) to make sure ie. THPs on hash MMU will also return true.
> >>>>
> >>>> All kinds of huge mappings ?
> >>>>
> >>>> pXd_leaf() will detect only leaf mappings (like pXd_huge() ). There are
> >>>> also huge mappings through hugepd. On powerpc 8xx we have 8M huge pages
> >>>> and 512k huge pages. A PGD entry covers 4M so pgd_leaf() won't report
> >>>> those huge pages.
> >>>
> >>> Ah yes, I should always mention this is in the context of leaf huge pages
> >>> only.  Are the examples you provided all fall into hugepd category?  If so
> >>> I can reword the commit message, as:
> >>
> >> On powerpc 8xx, only the 8M huge pages fall into the hugepd case.
> >>
> >> The 512k hugepages are at PTE level, they are handled more or less like
> >> CONT_PTE on ARM. see function set_huge_pte_at() for more context.
> >>
> >> You can also look at pte_leaf_size() and pgd_leaf_size().
> > 
> > IMHO leaf should return false if the thing is pointing to a next level
> > page table, even if that next level is fully populated with contiguous
> > pages.
> > 
> > This seems more aligned with the contig page direction that hugepd
> > should be moved over to..
> 
> Should hugepd be moved to the contig page direction, really ?

Sure? Is there any downside for the reading side to do so?

> Would it be acceptable that a 8M hugepage requires 2048 contig entries
> in 2 page tables, when the hugepd allows a single entry ? 

? I thought we agreed the only difference would be that something new
is needed to merge the two identical sibling page tables into one, ie
you pay 2x the page table memory if that isn't fixed. That is write
side only change and I imagine it could be done with a single PPC
special API.

Honestly not totally sure that is a big deal, it is already really
memory inefficient compared to every other arch's huge page by needing
the child page table in the first place.

> Would it be acceptable performancewise ?

Isn't this particular PPC sub platform ancient? Are there current real
users that are going to have hugetlbfs special code and care about
this performance detail on a 6.20 era kernel?

In today's world wouldn't it be performance better if these platforms
could support THP by aligning to the contig API instead of being
special?

Am I wrong to question why we are polluting the core code for this
special optimization?

Jason
Christophe Leroy March 20, 2024, 6:16 a.m. UTC | #7
Le 20/03/2024 à 00:26, Jason Gunthorpe a écrit :
> On Tue, Mar 19, 2024 at 11:07:08PM +0000, Christophe Leroy wrote:
>>
>>
>> Le 18/03/2024 à 17:15, Jason Gunthorpe a écrit :
>>> On Thu, Mar 14, 2024 at 01:11:59PM +0000, Christophe Leroy wrote:
>>>>
>>>>
>>>> Le 14/03/2024 à 13:53, Peter Xu a écrit :
>>>>> On Thu, Mar 14, 2024 at 08:45:34AM +0000, Christophe Leroy wrote:
>>>>>>
>>>>>>
>>>>>> Le 13/03/2024 à 22:47, peterx@redhat.com a écrit :
>>>>>>> From: Peter Xu <peterx@redhat.com>
>>>>>>>
>>>>>>> PowerPC book3s 4K mostly has the same definition on both, except pXd_huge()
>>>>>>> constantly returns 0 for hash MMUs.  As Michael Ellerman pointed out [1],
>>>>>>> it is safe to check _PAGE_PTE on hash MMUs, as the bit will never be set so
>>>>>>> it will keep returning false.
>>>>>>>
>>>>>>> As a reference, __p[mu]d_mkhuge() will trigger a BUG_ON trying to create
>>>>>>> such huge mappings for 4K hash MMUs.  Meanwhile, the major powerpc hugetlb
>>>>>>> pgtable walker __find_linux_pte() already used pXd_leaf() to check hugetlb
>>>>>>> mappings.
>>>>>>>
>>>>>>> The goal should be that we will have one API pXd_leaf() to detect all kinds
>>>>>>> of huge mappings.  AFAICT we need to use the pXd_leaf() impl (rather than
>>>>>>> pXd_huge() ones) to make sure ie. THPs on hash MMU will also return true.
>>>>>>
>>>>>> All kinds of huge mappings ?
>>>>>>
>>>>>> pXd_leaf() will detect only leaf mappings (like pXd_huge() ). There are
>>>>>> also huge mappings through hugepd. On powerpc 8xx we have 8M huge pages
>>>>>> and 512k huge pages. A PGD entry covers 4M so pgd_leaf() won't report
>>>>>> those huge pages.
>>>>>
>>>>> Ah yes, I should always mention this is in the context of leaf huge pages
>>>>> only.  Are the examples you provided all fall into hugepd category?  If so
>>>>> I can reword the commit message, as:
>>>>
>>>> On powerpc 8xx, only the 8M huge pages fall into the hugepd case.
>>>>
>>>> The 512k hugepages are at PTE level, they are handled more or less like
>>>> CONT_PTE on ARM. see function set_huge_pte_at() for more context.
>>>>
>>>> You can also look at pte_leaf_size() and pgd_leaf_size().
>>>
>>> IMHO leaf should return false if the thing is pointing to a next level
>>> page table, even if that next level is fully populated with contiguous
>>> pages.
>>>
>>> This seems more aligned with the contig page direction that hugepd
>>> should be moved over to..
>>
>> Should hugepd be moved to the contig page direction, really ?
> 
> Sure? Is there any downside for the reading side to do so?

Probably not.

> 
>> Would it be acceptable that a 8M hugepage requires 2048 contig entries
>> in 2 page tables, when the hugepd allows a single entry ?
> 
> ? I thought we agreed the only difference would be that something new
> is needed to merge the two identical sibling page tables into one, ie
> you pay 2x the page table memory if that isn't fixed. That is write
> side only change and I imagine it could be done with a single PPC
> special API.
> 
> Honestly not totally sure that is a big deal, it is already really
> memory inefficient compared to every other arch's huge page by needing
> the child page table in the first place.
> 
>> Would it be acceptable performancewise ?
> 
> Isn't this particular PPC sub platform ancient? Are there current real
> users that are going to have hugetlbfs special code and care about
> this performance detail on a 6.20 era kernel?

Ancient yes but still widely in use and with the emergence of voice over 
IP in Air Trafic Control, performance becomes more and more challenge 
with those old boards that have another 10 years in front of them.

> 
> In today's world wouldn't it be performance better if these platforms
> could support THP by aligning to the contig API instead of being
> special?

Indeed, if we can promote THP that'd be even better.

> 
> Am I wrong to question why we are polluting the core code for this
> special optimization?

At the first place that was to get a close fit between hardware 
pagetable topology and linux pagetable topology. But obviously we 
already stepped back for 512k pages, so let's go one more step aside and 
do similar with 8M pages.

I'll give it a try and see how it goes.

Christophe
Peter Xu March 20, 2024, 4:09 p.m. UTC | #8
On Wed, Mar 20, 2024 at 06:16:43AM +0000, Christophe Leroy wrote:
> At the first place that was to get a close fit between hardware 
> pagetable topology and linux pagetable topology. But obviously we 
> already stepped back for 512k pages, so let's go one more step aside and 
> do similar with 8M pages.
> 
> I'll give it a try and see how it goes.

So you're talking about 8M only for 8xx, am I right?

There seem to be other PowerPC systems use hugepd.  Is it possible that we
convert all hugepd into cont_pte form?

Thanks,
Christophe Leroy March 20, 2024, 5:40 p.m. UTC | #9
Le 20/03/2024 à 17:09, Peter Xu a écrit :
> On Wed, Mar 20, 2024 at 06:16:43AM +0000, Christophe Leroy wrote:
>> At the first place that was to get a close fit between hardware
>> pagetable topology and linux pagetable topology. But obviously we
>> already stepped back for 512k pages, so let's go one more step aside and
>> do similar with 8M pages.
>>
>> I'll give it a try and see how it goes.
> 
> So you're talking about 8M only for 8xx, am I right?

Yes I am.

> 
> There seem to be other PowerPC systems use hugepd.  Is it possible that we
> convert all hugepd into cont_pte form?

Indeed.

Seems like we have hugepd for book3s/64 and for nohash.

For book3s I don't know, may Aneesh can answer.

For nohash I think it should be possible because TLB misses are handled 
by software. Even the e6500 which has a hardware tablewalk falls back on 
software walk when it is a hugepage IIUC.

Christophe
Peter Xu March 20, 2024, 8:24 p.m. UTC | #10
On Wed, Mar 20, 2024 at 05:40:39PM +0000, Christophe Leroy wrote:
> 
> 
> Le 20/03/2024 à 17:09, Peter Xu a écrit :
> > On Wed, Mar 20, 2024 at 06:16:43AM +0000, Christophe Leroy wrote:
> >> At the first place that was to get a close fit between hardware
> >> pagetable topology and linux pagetable topology. But obviously we
> >> already stepped back for 512k pages, so let's go one more step aside and
> >> do similar with 8M pages.
> >>
> >> I'll give it a try and see how it goes.
> > 
> > So you're talking about 8M only for 8xx, am I right?
> 
> Yes I am.
> 
> > 
> > There seem to be other PowerPC systems use hugepd.  Is it possible that we
> > convert all hugepd into cont_pte form?
> 
> Indeed.
> 
> Seems like we have hugepd for book3s/64 and for nohash.
> 
> For book3s I don't know, may Aneesh can answer.
> 
> For nohash I think it should be possible because TLB misses are handled 
> by software. Even the e6500 which has a hardware tablewalk falls back on 
> software walk when it is a hugepage IIUC.

It'll be great if I can get some answer here, and then I know the path for
hugepd in general.  I don't want to add any new code into core mm to
something destined to fade away soon.

One option for me is I can check a macro of hugepd existance, so all new
code will only work when hugepd is not supported on such arch.  However
that'll start to make some PowerPC systems special (which I still tried
hard to avoid, if that wasn't proved in the past..), meanwhile we'll also
need to keep some generic-mm paths (that I can already remove along with
the new code) only for these hugepd systems.  But it's still okay to me,
it'll be just a matter of when to drop those codes, sooner or later.

Thanks,
diff mbox series

Patch

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h
index 48f21820afe2..92545981bb49 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h
@@ -8,22 +8,12 @@ 
 #ifdef CONFIG_HUGETLB_PAGE
 static inline int pmd_huge(pmd_t pmd)
 {
-	/*
-	 * leaf pte for huge page
-	 */
-	if (radix_enabled())
-		return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE));
-	return 0;
+	return pmd_leaf(pmd);
 }
 
 static inline int pud_huge(pud_t pud)
 {
-	/*
-	 * leaf pte for huge page
-	 */
-	if (radix_enabled())
-		return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE));
-	return 0;
+	return pud_leaf(pud);
 }
 
 /*
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index df66dce8306f..fd7180fded75 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -262,6 +262,18 @@  extern unsigned long __kernel_io_end;
 
 extern struct page *vmemmap;
 extern unsigned long pci_io_base;
+
+#define pmd_leaf pmd_leaf
+static inline bool pmd_leaf(pmd_t pmd)
+{
+	return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE));
+}
+
+#define pud_leaf pud_leaf
+static inline bool pud_leaf(pud_t pud)
+{
+	return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE));
+}
 #endif /* __ASSEMBLY__ */
 
 #include <asm/book3s/64/hash.h>
@@ -1436,20 +1448,5 @@  static inline bool is_pte_rw_upgrade(unsigned long old_val, unsigned long new_va
 	return false;
 }
 
-/*
- * Like pmd_huge(), but works regardless of config options
- */
-#define pmd_leaf pmd_leaf
-static inline bool pmd_leaf(pmd_t pmd)
-{
-	return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE));
-}
-
-#define pud_leaf pud_leaf
-static inline bool pud_leaf(pud_t pud)
-{
-	return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE));
-}
-
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */