diff mbox

[RFC,V1,03/33] powerpc/mm: Switch book3s 64 with 64K page size to 4 level page table

Message ID 1452582968-22669-4-git-send-email-aneesh.kumar@linux.vnet.ibm.com (mailing list archive)
State Superseded
Headers show

Commit Message

Aneesh Kumar K.V Jan. 12, 2016, 7:15 a.m. UTC
This is needed so that we can support both hash and radix page table
using single kernel. Radix kernel uses a 4 level table.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/Kconfig                          |  1 +
 arch/powerpc/include/asm/book3s/64/hash-4k.h  | 33 +--------------------------
 arch/powerpc/include/asm/book3s/64/hash-64k.h | 20 +++++++++-------
 arch/powerpc/include/asm/book3s/64/hash.h     |  8 +++++++
 arch/powerpc/include/asm/book3s/64/pgtable.h  | 25 +++++++++++++++++++-
 arch/powerpc/include/asm/pgalloc-64.h         | 24 ++++++++++++++++---
 arch/powerpc/include/asm/pgtable-types.h      | 13 +++++++----
 arch/powerpc/mm/init_64.c                     | 21 ++++++++++++-----
 8 files changed, 90 insertions(+), 55 deletions(-)

Comments

Balbir Singh Jan. 13, 2016, 8:52 a.m. UTC | #1
On Tue, 12 Jan 2016 12:45:38 +0530
"Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> wrote:

> This is needed so that we can support both hash and radix page table
> using single kernel. Radix kernel uses a 4 level table.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> ---
>  arch/powerpc/Kconfig                          |  1 +
>  arch/powerpc/include/asm/book3s/64/hash-4k.h  | 33
> +--------------------------
> arch/powerpc/include/asm/book3s/64/hash-64k.h | 20 +++++++++-------
> arch/powerpc/include/asm/book3s/64/hash.h     |  8 +++++++
> arch/powerpc/include/asm/book3s/64/pgtable.h  | 25
> +++++++++++++++++++- arch/powerpc/include/asm/pgalloc-64.h         |
> 24 ++++++++++++++++--- arch/powerpc/include/asm/pgtable-types.h
> | 13 +++++++---- arch/powerpc/mm/init_64.c                     | 21
> ++++++++++++----- 8 files changed, 90 insertions(+), 55 deletions(-)
> 
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 378f1127ca98..618afea4c9fc 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -303,6 +303,7 @@ config ZONE_DMA32

snip
> -
>  #define PTE_INDEX_SIZE  8
> -#define PMD_INDEX_SIZE  10
> -#define PUD_INDEX_SIZE	0
> +#define PMD_INDEX_SIZE  5
> +#define PUD_INDEX_SIZE	5
>  #define PGD_INDEX_SIZE  12
>  

OK, so the 10-bit PMD index is now split into 5 bits each for PMD and PUD?
What is the plan for huge pages? I saw you mentioned it was a TODO.

>  #define PTRS_PER_PTE	(1 << PTE_INDEX_SIZE)
>  #define PTRS_PER_PMD	(1 << PMD_INDEX_SIZE)
> +#define PTRS_PER_PUD	(1 << PUD_INDEX_SIZE)
>  #define PTRS_PER_PGD	(1 << PGD_INDEX_SIZE)
>  
>  /* With 4k base page size, hugepage PTEs go at the PMD level */
> @@ -20,8 +19,13 @@
>  #define PMD_SIZE	(1UL << PMD_SHIFT)
>  #define PMD_MASK	(~(PMD_SIZE-1))
>  
> +/* PUD_SHIFT determines what a third-level page table entry can map
> */ +#define PUD_SHIFT	(PMD_SHIFT + PMD_INDEX_SIZE)
> +#define PUD_SIZE	(1UL << PUD_SHIFT)
> +#define PUD_MASK	(~(PUD_SIZE-1))
> +
>  /* PGDIR_SHIFT determines what a third-level page table entry can
> map */ -#define PGDIR_SHIFT	(PMD_SHIFT + PMD_INDEX_SIZE)
> +#define PGDIR_SHIFT	(PUD_SHIFT + PUD_INDEX_SIZE)
>  #define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
>  #define PGDIR_MASK	(~(PGDIR_SIZE-1))
>  
> @@ -61,6 +65,8 @@
>  #define PMD_MASKED_BITS		(PTE_FRAG_SIZE - 1)
>  /* Bits to mask out from a PGD/PUD to get to the PMD page */

The comment looks like it applied to PMD and not PUD.
>  #define PUD_MASKED_BITS		0x1ff

Given that PUD is now 5 bits, this should be 0x1f?

> +/* FIXME!! check this */
> +#define PGD_MASKED_BITS		0
>  

PGD_MASKED_BITS is 0? Shouldn't it be 0xfe?

>  #ifndef __ASSEMBLY__
>  
> @@ -130,11 +136,9 @@ extern bool __rpte_sub_valid(real_pte_t rpte,
> unsigned long index); #else
>  #define PMD_TABLE_SIZE	(sizeof(pmd_t) << PMD_INDEX_SIZE)
>  #endif
> +#define PUD_TABLE_SIZE	(sizeof(pud_t) << PUD_INDEX_SIZE)
>  #define PGD_TABLE_SIZE	(sizeof(pgd_t) << PGD_INDEX_SIZE)
>  
> -#define pgd_pte(pgd)	(pud_pte(((pud_t){ pgd })))
> -#define pte_pgd(pte)	((pgd_t)pte_pud(pte))
> -
>  #ifdef CONFIG_HUGETLB_PAGE
>  /*
>   * We have PGD_INDEX_SIZ = 12 and PTE_INDEX_SIZE = 8, so that we can
> have diff --git a/arch/powerpc/include/asm/book3s/64/hash.h
> b/arch/powerpc/include/asm/book3s/64/hash.h index
> f46974d0134a..9ff1e056acef 100644 ---
> a/arch/powerpc/include/asm/book3s/64/hash.h +++
> b/arch/powerpc/include/asm/book3s/64/hash.h @@ -226,6 +226,7 @@
>  #define pud_page_vaddr(pud)	(pud_val(pud) & ~PUD_MASKED_BITS)
>  
>  #define pgd_index(address) (((address) >> (PGDIR_SHIFT)) &
> (PTRS_PER_PGD - 1)) +#define pud_index(address) (((address) >>
> (PUD_SHIFT)) & (PTRS_PER_PUD - 1)) #define pmd_index(address)
> (((address) >> (PMD_SHIFT)) & (PTRS_PER_PMD - 1)) #define
> pte_index(address) (((address) >> (PAGE_SHIFT)) & (PTRS_PER_PTE - 1)) 
> @@ -354,8 +355,15 @@ static inline void __ptep_set_access_flags(pte_t
> *ptep, pte_t entry) :"cc");
>  }
>  
> +static inline int pgd_bad(pgd_t pgd)
> +{
> +	return (pgd_val(pgd) == 0);
> +}
> +
>  #define __HAVE_ARCH_PTE_SAME
>  #define pte_same(A,B)	(((pte_val(A) ^ pte_val(B)) &
> ~_PAGE_HPTEFLAGS) == 0) +#define pgd_page_vaddr(pgd)
> (pgd_val(pgd) & ~PGD_MASKED_BITS) +
>  
>  /* Generic accessors to PTE bits */
>  static inline int pte_write(pte_t pte)
> { return !!(pte_val(pte) & _PAGE_RW);} diff --git
> a/arch/powerpc/include/asm/book3s/64/pgtable.h
> b/arch/powerpc/include/asm/book3s/64/pgtable.h index
> e7162dba987e..8f639401c7ba 100644 ---
> a/arch/powerpc/include/asm/book3s/64/pgtable.h +++
> b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -111,6 +111,26 @@
> static inline void pgd_set(pgd_t *pgdp, unsigned long val) *pgdp =
> __pgd(val); } 
> +static inline void pgd_clear(pgd_t *pgdp)
> +{
> +	*pgdp = __pgd(0);
> +}
> +
> +#define pgd_none(pgd)		(!pgd_val(pgd))
> +#define pgd_present(pgd)	(!pgd_none(pgd))
> +
> +static inline pte_t pgd_pte(pgd_t pgd)
> +{
> +	return __pte(pgd_val(pgd));
> +}
> +
> +static inline pgd_t pte_pgd(pte_t pte)
> +{
> +	return __pgd(pte_val(pte));
> +}
> +
> +extern struct page *pgd_page(pgd_t pgd);
> +
>  /*
>   * Find an entry in a page-table-directory.  We combine the address
> region
>   * (the high order N bits) and the pgd portion of the address.
> @@ -118,9 +138,10 @@ static inline void pgd_set(pgd_t *pgdp, unsigned
> long val) 
>  #define pgd_offset(mm, address)	 ((mm)->pgd +
> pgd_index(address)) 
> +#define pud_offset(pgdp, addr)	\
> +	(((pud_t *) pgd_page_vaddr(*(pgdp))) + pud_index(addr))
>  #define pmd_offset(pudp,addr) \
>  	(((pmd_t *) pud_page_vaddr(*(pudp))) + pmd_index(addr))
> -
>  #define pte_offset_kernel(dir,addr) \
>  	(((pte_t *) pmd_page_vaddr(*(dir))) + pte_index(addr))
>  
> @@ -135,6 +156,8 @@ static inline void pgd_set(pgd_t *pgdp, unsigned
> long val) pr_err("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__,
> pte_val(e)) #define pmd_ERROR(e) \
>  	pr_err("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__,
> pmd_val(e)) +#define pud_ERROR(e) \
> +	pr_err("%s:%d: bad pud %08lx.\n", __FILE__, __LINE__,
> pud_val(e)) #define pgd_ERROR(e) \
>  	pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__,
> pgd_val(e)) 
> diff --git a/arch/powerpc/include/asm/pgalloc-64.h
> b/arch/powerpc/include/asm/pgalloc-64.h index
> 69ef28a81733..014489a619d0 100644 ---
> a/arch/powerpc/include/asm/pgalloc-64.h +++
> b/arch/powerpc/include/asm/pgalloc-64.h @@ -171,7 +171,25 @@ extern
> void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int
> shift); extern void __tlb_remove_table(void *_table); #endif
>  
> -#define pud_populate(mm, pud, pmd)	pud_set(pud, (unsigned
> long)pmd) +#ifndef __PAGETABLE_PUD_FOLDED
> +/* book3s 64 is 4 level page table */
> +#define pgd_populate(MM, PGD, PUD)	pgd_set(PGD, PUD)
> +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned
> long addr) +{
> +	return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE),
> +				GFP_KERNEL|__GFP_REPEAT);
> +}
> +
> +static inline void pud_free(struct mm_struct *mm, pud_t *pud)
> +{
> +	kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud);
> +}
> +#endif
> +
> +static inline void pud_populate(struct mm_struct *mm, pud_t *pud,
> pmd_t *pmd) +{
> +	pud_set(pud, (unsigned long)pmd);
> +}
>  
>  static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t
> *pmd, pte_t *pte)
> @@ -233,11 +251,11 @@ static inline void pmd_free(struct mm_struct
> *mm, pmd_t *pmd) 
>  #define __pmd_free_tlb(tlb, pmd, addr)		      \
>  	pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX)
> -#ifndef CONFIG_PPC_64K_PAGES
> +#ifndef __PAGETABLE_PUD_FOLDED
>  #define __pud_free_tlb(tlb, pud, addr)		      \
>  	pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE)
>  
> -#endif /* CONFIG_PPC_64K_PAGES */
> +#endif /* __PAGETABLE_PUD_FOLDED */
>  
>  #define check_pgt_cache()	do { } while (0)
>  
> diff --git a/arch/powerpc/include/asm/pgtable-types.h
> b/arch/powerpc/include/asm/pgtable-types.h index
> 71487e1ca638..43140f8b0592 100644 ---
> a/arch/powerpc/include/asm/pgtable-types.h +++
> b/arch/powerpc/include/asm/pgtable-types.h @@ -21,15 +21,18 @@ static
> inline unsigned long pmd_val(pmd_t x) return x.pmd;
>  }
>  
> -/* PUD level exusts only on 4k pages */
> -#ifndef CONFIG_PPC_64K_PAGES
> +/*
> + * 64 bit hash always use 4 level table. Everybody else use 4 level
> + * only for 4K page size.
> + */
> +#if defined(CONFIG_PPC_BOOK3S_64) || !defined(CONFIG_PPC_64K_PAGES)
>  typedef struct { unsigned long pud; } pud_t;

>  #define __pud(x)	((pud_t) { (x) })
>  static inline unsigned long pud_val(pud_t x)
>  {
>  	return x.pud;
>  }
> -#endif /* !CONFIG_PPC_64K_PAGES */
> +#endif /* CONFIG_PPC_BOOK3S_64 || !CONFIG_PPC_64K_PAGES */
>  #endif /* CONFIG_PPC64 */
>  
>  /* PGD level */
> @@ -66,14 +69,14 @@ static inline unsigned long pmd_val(pmd_t pmd)
>  	return pmd;
>  }
>  
> -#ifndef CONFIG_PPC_64K_PAGES
> +#if defined(CONFIG_PPC_BOOK3S_64) || !defined(CONFIG_PPC_64K_PAGES)
>  typedef unsigned long pud_t;



>  #define __pud(x)	(x)
>  static inline unsigned long pud_val(pud_t pud)
>  {
>  	return pud;
>  }
> -#endif /* !CONFIG_PPC_64K_PAGES */
> +#endif /* CONFIG_PPC_BOOK3S_64 || !CONFIG_PPC_64K_PAGES */
>  #endif /* CONFIG_PPC64 */
>  
>  typedef unsigned long pgd_t;
> diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
> index 379a6a90644b..8ce1ec24d573 100644
> --- a/arch/powerpc/mm/init_64.c
> +++ b/arch/powerpc/mm/init_64.c
> @@ -85,6 +85,11 @@ static void pgd_ctor(void *addr)
>  	memset(addr, 0, PGD_TABLE_SIZE);
>  }
>  
> +static void pud_ctor(void *addr)
> +{
> +	memset(addr, 0, PUD_TABLE_SIZE);
> +}
> +
>  static void pmd_ctor(void *addr)
>  {
>  	memset(addr, 0, PMD_TABLE_SIZE);
> @@ -138,14 +143,18 @@ void pgtable_cache_init(void)
>  {
>  	pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
>  	pgtable_cache_add(PMD_CACHE_INDEX, pmd_ctor);
> +	/*
> +	 * In all current configs, when the PUD index exists it's the
> +	 * same size as either the pgd or pmd index except with THP
> enabled
> +	 * on book3s 64
> +	 */
> +	if (PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE))
> +		pgtable_cache_add(PUD_INDEX_SIZE, pud_ctor);
> +
>  	if (!PGT_CACHE(PGD_INDEX_SIZE)
> || !PGT_CACHE(PMD_CACHE_INDEX)) panic("Couldn't allocate pgtable
> caches");
> -	/* In all current configs, when the PUD index exists it's the
> -	 * same size as either the pgd or pmd index.  Verify that the
> -	 * initialization above has also created a PUD cache.  This
> -	 * will need re-examiniation if we add new possibilities for
> -	 * the pagetable layout. */
> -	BUG_ON(PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE));
> +	if (PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE))
> +		panic("Couldn't allocate pud pgtable caches");
>  }
>  
>  #ifdef CONFIG_SPARSEMEM_VMEMMAP
Balbir Singh Jan. 15, 2016, 12:25 a.m. UTC | #2
On 12/01/16 18:15, Aneesh Kumar K.V wrote:
> This is needed so that we can support both hash and radix page table
> using single kernel. Radix kernel uses a 4 level table.
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> ---
>  arch/powerpc/Kconfig                          |  1 +
>  arch/powerpc/include/asm/book3s/64/hash-4k.h  | 33 +--------------------------
>  arch/powerpc/include/asm/book3s/64/hash-64k.h | 20 +++++++++-------
>  arch/powerpc/include/asm/book3s/64/hash.h     |  8 +++++++
>  arch/powerpc/include/asm/book3s/64/pgtable.h  | 25 +++++++++++++++++++-
>  arch/powerpc/include/asm/pgalloc-64.h         | 24 ++++++++++++++++---
>  arch/powerpc/include/asm/pgtable-types.h      | 13 +++++++----
>  arch/powerpc/mm/init_64.c                     | 21 ++++++++++++-----
>  8 files changed, 90 insertions(+), 55 deletions(-)
>
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 378f1127ca98..618afea4c9fc 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -303,6 +303,7 @@ config ZONE_DMA32
>  config PGTABLE_LEVELS
>  	int
>  	default 2 if !PPC64
> +	default 4 if PPC_BOOK3S_64
>  	default 3 if PPC_64K_PAGES
>  	default 4
>  
> diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
> index ea0414d6659e..c78f5928001b 100644
> --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
> +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
> @@ -57,39 +57,8 @@
>  #define _PAGE_4K_PFN		0
>  #ifndef __ASSEMBLY__
>  /*
> - * 4-level page tables related bits
> + * On all 4K setups, remap_4k_pfn() equates to remap_pfn_range()
>   */
> -
> -#define pgd_none(pgd)		(!pgd_val(pgd))
> -#define pgd_bad(pgd)		(pgd_val(pgd) == 0)
> -#define pgd_present(pgd)	(pgd_val(pgd) != 0)
> -#define pgd_page_vaddr(pgd)	(pgd_val(pgd) & ~PGD_MASKED_BITS)
> -
> -static inline void pgd_clear(pgd_t *pgdp)
> -{
> -	*pgdp = __pgd(0);
> -}
> -
> -static inline pte_t pgd_pte(pgd_t pgd)
> -{
> -	return __pte(pgd_val(pgd));
> -}
> -
> -static inline pgd_t pte_pgd(pte_t pte)
> -{
> -	return __pgd(pte_val(pte));
> -}
> -extern struct page *pgd_page(pgd_t pgd);
> -
> -#define pud_offset(pgdp, addr)	\
> -  (((pud_t *) pgd_page_vaddr(*(pgdp))) + \
> -    (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)))
> -
> -#define pud_ERROR(e) \
> -	pr_err("%s:%d: bad pud %08lx.\n", __FILE__, __LINE__, pud_val(e))
> -
> -/*
> - * On all 4K setups, remap_4k_pfn() equates to remap_pfn_range() */
>  #define remap_4k_pfn(vma, addr, pfn, prot)	\
>  	remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE, (prot))
>  
> diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
> index 849bbec80f7b..5c9392b71a6b 100644
> --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
> +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
> @@ -1,15 +1,14 @@
>  #ifndef _ASM_POWERPC_BOOK3S_64_HASH_64K_H
>  #define _ASM_POWERPC_BOOK3S_64_HASH_64K_H
>  
> -#include <asm-generic/pgtable-nopud.h>
> -
>  #define PTE_INDEX_SIZE  8
> -#define PMD_INDEX_SIZE  10
> -#define PUD_INDEX_SIZE	0
> +#define PMD_INDEX_SIZE  5
> +#define PUD_INDEX_SIZE	5
>  #define PGD_INDEX_SIZE  12


10 splits to 5 and 5 for PMD/PUD? Does this impact huge page?

>  
>  #define PTRS_PER_PTE	(1 << PTE_INDEX_SIZE)
>  #define PTRS_PER_PMD	(1 << PMD_INDEX_SIZE)
> +#define PTRS_PER_PUD	(1 << PUD_INDEX_SIZE)
>  #define PTRS_PER_PGD	(1 << PGD_INDEX_SIZE)
>  
>  /* With 4k base page size, hugepage PTEs go at the PMD level */
> @@ -20,8 +19,13 @@
>  #define PMD_SIZE	(1UL << PMD_SHIFT)
>  #define PMD_MASK	(~(PMD_SIZE-1))
>  
> +/* PUD_SHIFT determines what a third-level page table entry can map */
> +#define PUD_SHIFT	(PMD_SHIFT + PMD_INDEX_SIZE)
> +#define PUD_SIZE	(1UL << PUD_SHIFT)
> +#define PUD_MASK	(~(PUD_SIZE-1))
> +
>  /* PGDIR_SHIFT determines what a third-level page table entry can map */
> -#define PGDIR_SHIFT	(PMD_SHIFT + PMD_INDEX_SIZE)
> +#define PGDIR_SHIFT	(PUD_SHIFT + PUD_INDEX_SIZE)
>  #define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
>  #define PGDIR_MASK	(~(PGDIR_SIZE-1))
>  
> @@ -61,6 +65,8 @@
>  #define PMD_MASKED_BITS		(PTE_FRAG_SIZE - 1)
>  /* Bits to mask out from a PGD/PUD to get to the PMD page */
>  #define PUD_MASKED_BITS		0x1ff
> +/* FIXME!! check this */

Shouldn't PUD_MASKED_BITS be 0x1f?

> +#define PGD_MASKED_BITS		0
>  
0?

>  #ifndef __ASSEMBLY__
>  
> @@ -130,11 +136,9 @@ extern bool __rpte_sub_valid(real_pte_t rpte, unsigned long index);
>  #else
>  #define PMD_TABLE_SIZE	(sizeof(pmd_t) << PMD_INDEX_SIZE)
>  #endif
> +#define PUD_TABLE_SIZE	(sizeof(pud_t) << PUD_INDEX_SIZE)
>  #define PGD_TABLE_SIZE	(sizeof(pgd_t) << PGD_INDEX_SIZE)
>  
> -#define pgd_pte(pgd)	(pud_pte(((pud_t){ pgd })))
> -#define pte_pgd(pte)	((pgd_t)pte_pud(pte))
> -
>  #ifdef CONFIG_HUGETLB_PAGE
>  /*
>   * We have PGD_INDEX_SIZ = 12 and PTE_INDEX_SIZE = 8, so that we can have
> diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
> index f46974d0134a..9ff1e056acef 100644
> --- a/arch/powerpc/include/asm/book3s/64/hash.h
> +++ b/arch/powerpc/include/asm/book3s/64/hash.h
> @@ -226,6 +226,7 @@
>  #define pud_page_vaddr(pud)	(pud_val(pud) & ~PUD_MASKED_BITS)
>  
>  #define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & (PTRS_PER_PGD - 1))
> +#define pud_index(address) (((address) >> (PUD_SHIFT)) & (PTRS_PER_PUD - 1))
>  #define pmd_index(address) (((address) >> (PMD_SHIFT)) & (PTRS_PER_PMD - 1))
>  #define pte_index(address) (((address) >> (PAGE_SHIFT)) & (PTRS_PER_PTE - 1))
>  
> @@ -354,8 +355,15 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
>  	:"cc");
>  }
>  
> +static inline int pgd_bad(pgd_t pgd)
> +{
> +	return (pgd_val(pgd) == 0);
> +}
> +
>  #define __HAVE_ARCH_PTE_SAME
>  #define pte_same(A,B)	(((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0)
> +#define pgd_page_vaddr(pgd)	(pgd_val(pgd) & ~PGD_MASKED_BITS)
> +
>  
>  /* Generic accessors to PTE bits */
>  static inline int pte_write(pte_t pte)		{ return !!(pte_val(pte) & _PAGE_RW);}
> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
> index e7162dba987e..8f639401c7ba 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> @@ -111,6 +111,26 @@ static inline void pgd_set(pgd_t *pgdp, unsigned long val)
>  	*pgdp = __pgd(val);
>  }
>  
> +static inline void pgd_clear(pgd_t *pgdp)
> +{
> +	*pgdp = __pgd(0);
> +}
> +
> +#define pgd_none(pgd)		(!pgd_val(pgd))
> +#define pgd_present(pgd)	(!pgd_none(pgd))
> +
> +static inline pte_t pgd_pte(pgd_t pgd)
> +{
> +	return __pte(pgd_val(pgd));
> +}
> +
> +static inline pgd_t pte_pgd(pte_t pte)
> +{
> +	return __pgd(pte_val(pte));
> +}
> +
> +extern struct page *pgd_page(pgd_t pgd);
> +
>  /*
>   * Find an entry in a page-table-directory.  We combine the address region
>   * (the high order N bits) and the pgd portion of the address.
> @@ -118,9 +138,10 @@ static inline void pgd_set(pgd_t *pgdp, unsigned long val)
>  
>  #define pgd_offset(mm, address)	 ((mm)->pgd + pgd_index(address))
>  
> +#define pud_offset(pgdp, addr)	\
> +	(((pud_t *) pgd_page_vaddr(*(pgdp))) + pud_index(addr))
>  #define pmd_offset(pudp,addr) \
>  	(((pmd_t *) pud_page_vaddr(*(pudp))) + pmd_index(addr))
> -
>  #define pte_offset_kernel(dir,addr) \
>  	(((pte_t *) pmd_page_vaddr(*(dir))) + pte_index(addr))
>  
> @@ -135,6 +156,8 @@ static inline void pgd_set(pgd_t *pgdp, unsigned long val)
>  	pr_err("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e))
>  #define pmd_ERROR(e) \
>  	pr_err("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e))
> +#define pud_ERROR(e) \
> +	pr_err("%s:%d: bad pud %08lx.\n", __FILE__, __LINE__, pud_val(e))
>  #define pgd_ERROR(e) \
>  	pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
>  
> diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h
> index 69ef28a81733..014489a619d0 100644
> --- a/arch/powerpc/include/asm/pgalloc-64.h
> +++ b/arch/powerpc/include/asm/pgalloc-64.h
> @@ -171,7 +171,25 @@ extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift);
>  extern void __tlb_remove_table(void *_table);
>  #endif
>  
> -#define pud_populate(mm, pud, pmd)	pud_set(pud, (unsigned long)pmd)
> +#ifndef __PAGETABLE_PUD_FOLDED
> +/* book3s 64 is 4 level page table */
> +#define pgd_populate(MM, PGD, PUD)	pgd_set(PGD, PUD)
> +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
> +{
> +	return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE),
> +				GFP_KERNEL|__GFP_REPEAT);
> +}
> +
> +static inline void pud_free(struct mm_struct *mm, pud_t *pud)
> +{
> +	kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud);
> +}
> +#endif
> +
> +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
> +{
> +	pud_set(pud, (unsigned long)pmd);
> +}
>  
>  static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
>  				       pte_t *pte)
> @@ -233,11 +251,11 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
>  
>  #define __pmd_free_tlb(tlb, pmd, addr)		      \
>  	pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX)
> -#ifndef CONFIG_PPC_64K_PAGES
> +#ifndef __PAGETABLE_PUD_FOLDED
>  #define __pud_free_tlb(tlb, pud, addr)		      \
>  	pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE)
>  
> -#endif /* CONFIG_PPC_64K_PAGES */
> +#endif /* __PAGETABLE_PUD_FOLDED */
>  
>  #define check_pgt_cache()	do { } while (0)
>  
> diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h
> index 71487e1ca638..43140f8b0592 100644
> --- a/arch/powerpc/include/asm/pgtable-types.h
> +++ b/arch/powerpc/include/asm/pgtable-types.h
> @@ -21,15 +21,18 @@ static inline unsigned long pmd_val(pmd_t x)
>  	return x.pmd;
>  }
>  
> -/* PUD level exusts only on 4k pages */
> -#ifndef CONFIG_PPC_64K_PAGES
> +/*
> + * 64 bit hash always use 4 level table. Everybody else use 4 level
> + * only for 4K page size.
> + */
> +#if defined(CONFIG_PPC_BOOK3S_64) || !defined(CONFIG_PPC_64K_PAGES)
>  typedef struct { unsigned long pud; } pud_t;
>  #define __pud(x)	((pud_t) { (x) })
>  static inline unsigned long pud_val(pud_t x)
>  {
>  	return x.pud;
>  }
> -#endif /* !CONFIG_PPC_64K_PAGES */
> +#endif /* CONFIG_PPC_BOOK3S_64 || !CONFIG_PPC_64K_PAGES */
>  #endif /* CONFIG_PPC64 */
>  
>  /* PGD level */
> @@ -66,14 +69,14 @@ static inline unsigned long pmd_val(pmd_t pmd)
>  	return pmd;
>  }
>  
> -#ifndef CONFIG_PPC_64K_PAGES
> +#if defined(CONFIG_PPC_BOOK3S_64) || !defined(CONFIG_PPC_64K_PAGES)
>  typedef unsigned long pud_t;
>  #define __pud(x)	(x)
>  static inline unsigned long pud_val(pud_t pud)
>  {
>  	return pud;
>  }
> -#endif /* !CONFIG_PPC_64K_PAGES */
> +#endif /* CONFIG_PPC_BOOK3S_64 || !CONFIG_PPC_64K_PAGES */
>  #endif /* CONFIG_PPC64 */
>  
>  typedef unsigned long pgd_t;
> diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
> index 379a6a90644b..8ce1ec24d573 100644
> --- a/arch/powerpc/mm/init_64.c
> +++ b/arch/powerpc/mm/init_64.c
> @@ -85,6 +85,11 @@ static void pgd_ctor(void *addr)
>  	memset(addr, 0, PGD_TABLE_SIZE);
>  }
>  
> +static void pud_ctor(void *addr)
> +{
> +	memset(addr, 0, PUD_TABLE_SIZE);
> +}
> +
>  static void pmd_ctor(void *addr)
>  {
>  	memset(addr, 0, PMD_TABLE_SIZE);
> @@ -138,14 +143,18 @@ void pgtable_cache_init(void)
>  {
>  	pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
>  	pgtable_cache_add(PMD_CACHE_INDEX, pmd_ctor);
> +	/*
> +	 * In all current configs, when the PUD index exists it's the
> +	 * same size as either the pgd or pmd index except with THP enabled
> +	 * on book3s 64
> +	 */
> +	if (PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE))
> +		pgtable_cache_add(PUD_INDEX_SIZE, pud_ctor);
> +
>  	if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_CACHE_INDEX))
>  		panic("Couldn't allocate pgtable caches");
> -	/* In all current configs, when the PUD index exists it's the
> -	 * same size as either the pgd or pmd index.  Verify that the
> -	 * initialization above has also created a PUD cache.  This
> -	 * will need re-examiniation if we add new possibilities for
> -	 * the pagetable layout. */
> -	BUG_ON(PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE));
> +	if (PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE))
> +		panic("Couldn't allocate pud pgtable caches");
>  }
>  
>  #ifdef CONFIG_SPARSEMEM_VMEMMAP
Aneesh Kumar K.V Jan. 18, 2016, 7:32 a.m. UTC | #3
Balbir Singh <bsingharora@gmail.com> writes:

> On 12/01/16 18:15, Aneesh Kumar K.V wrote:
>> This is needed so that we can support both hash and radix page table
>> using single kernel. Radix kernel uses a 4 level table.
>>

.....

> diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
>> index 849bbec80f7b..5c9392b71a6b 100644
>> --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
>> +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
>> @@ -1,15 +1,14 @@
>>  #ifndef _ASM_POWERPC_BOOK3S_64_HASH_64K_H
>>  #define _ASM_POWERPC_BOOK3S_64_HASH_64K_H
>>  
>> -#include <asm-generic/pgtable-nopud.h>
>> -
>>  #define PTE_INDEX_SIZE  8
>> -#define PMD_INDEX_SIZE  10
>> -#define PUD_INDEX_SIZE	0
>> +#define PMD_INDEX_SIZE  5
>> +#define PUD_INDEX_SIZE	5
>>  #define PGD_INDEX_SIZE  12
>
>
> 10 splits to 5 and 5 for PMD/PUD? Does this impact huge page?


Nope. We have huge pages at the top level and at the PMD level (16G and 16M).

>
>>  
>>  #define PTRS_PER_PTE	(1 << PTE_INDEX_SIZE)
>>  #define PTRS_PER_PMD	(1 << PMD_INDEX_SIZE)
>> +#define PTRS_PER_PUD	(1 << PUD_INDEX_SIZE)
>>  #define PTRS_PER_PGD	(1 << PGD_INDEX_SIZE)
>>  
>>  /* With 4k base page size, hugepage PTEs go at the PMD level */
>> @@ -20,8 +19,13 @@
>>  #define PMD_SIZE	(1UL << PMD_SHIFT)
>>  #define PMD_MASK	(~(PMD_SIZE-1))
>>  
>> +/* PUD_SHIFT determines what a third-level page table entry can map */
>> +#define PUD_SHIFT	(PMD_SHIFT + PMD_INDEX_SIZE)
>> +#define PUD_SIZE	(1UL << PUD_SHIFT)
>> +#define PUD_MASK	(~(PUD_SIZE-1))
>> +
>>  /* PGDIR_SHIFT determines what a third-level page table entry can map */
>> -#define PGDIR_SHIFT	(PMD_SHIFT + PMD_INDEX_SIZE)
>> +#define PGDIR_SHIFT	(PUD_SHIFT + PUD_INDEX_SIZE)
>>  #define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
>>  #define PGDIR_MASK	(~(PGDIR_SIZE-1))
>>  
>> @@ -61,6 +65,8 @@
>>  #define PMD_MASKED_BITS		(PTE_FRAG_SIZE - 1)
>>  /* Bits to mask out from a PGD/PUD to get to the PMD page */
>>  #define PUD_MASKED_BITS		0x1ff
>> +/* FIXME!! check this */
>
> Shouldn't PUD_MASKED_BITS be 0x1f?
>
>> +#define PGD_MASKED_BITS		0
>>  
> 0?
>


The MASKED_BITS definitions need to be cleaned up, hence the FIXME!! Linux
page tables are aligned differently and I didn't want to clean that up in
this series. IMHO using #defines like the ones above instead of deriving them
from the pmd table alignment value is wrong. Will get to that later.



>>  #ifndef __ASSEMBLY__
>>  
>> @@ -130,11 +136,9 @@ extern bool __rpte_sub_valid(real_pte_t rpte, unsigned long index);
>>  #else
>>  #define PMD_TABLE_SIZE	(sizeof(pmd_t) << PMD_INDEX_SIZE)
>>  #endif
>> +#define PUD_TABLE_SIZE	(sizeof(pud_t) << PUD_INDEX_SIZE)
>>  #define PGD_TABLE_SIZE	(sizeof(pgd_t) << PGD_INDEX_SIZE)
>>  
>> -#define pgd_pte(pgd)	(pud_pte(((pud_t){ pgd })))
>> -#define pte_pgd(pte)	((pgd_t)pte_pud(pte))
>> -
>>  #ifdef CONFIG_HUGETLB_PAGE

-aneesh
diff mbox

Patch

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 378f1127ca98..618afea4c9fc 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -303,6 +303,7 @@  config ZONE_DMA32
 config PGTABLE_LEVELS
 	int
 	default 2 if !PPC64
+	default 4 if PPC_BOOK3S_64
 	default 3 if PPC_64K_PAGES
 	default 4
 
diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index ea0414d6659e..c78f5928001b 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -57,39 +57,8 @@ 
 #define _PAGE_4K_PFN		0
 #ifndef __ASSEMBLY__
 /*
- * 4-level page tables related bits
+ * On all 4K setups, remap_4k_pfn() equates to remap_pfn_range()
  */
-
-#define pgd_none(pgd)		(!pgd_val(pgd))
-#define pgd_bad(pgd)		(pgd_val(pgd) == 0)
-#define pgd_present(pgd)	(pgd_val(pgd) != 0)
-#define pgd_page_vaddr(pgd)	(pgd_val(pgd) & ~PGD_MASKED_BITS)
-
-static inline void pgd_clear(pgd_t *pgdp)
-{
-	*pgdp = __pgd(0);
-}
-
-static inline pte_t pgd_pte(pgd_t pgd)
-{
-	return __pte(pgd_val(pgd));
-}
-
-static inline pgd_t pte_pgd(pte_t pte)
-{
-	return __pgd(pte_val(pte));
-}
-extern struct page *pgd_page(pgd_t pgd);
-
-#define pud_offset(pgdp, addr)	\
-  (((pud_t *) pgd_page_vaddr(*(pgdp))) + \
-    (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)))
-
-#define pud_ERROR(e) \
-	pr_err("%s:%d: bad pud %08lx.\n", __FILE__, __LINE__, pud_val(e))
-
-/*
- * On all 4K setups, remap_4k_pfn() equates to remap_pfn_range() */
 #define remap_4k_pfn(vma, addr, pfn, prot)	\
 	remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE, (prot))
 
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index 849bbec80f7b..5c9392b71a6b 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -1,15 +1,14 @@ 
 #ifndef _ASM_POWERPC_BOOK3S_64_HASH_64K_H
 #define _ASM_POWERPC_BOOK3S_64_HASH_64K_H
 
-#include <asm-generic/pgtable-nopud.h>
-
 #define PTE_INDEX_SIZE  8
-#define PMD_INDEX_SIZE  10
-#define PUD_INDEX_SIZE	0
+#define PMD_INDEX_SIZE  5
+#define PUD_INDEX_SIZE	5
 #define PGD_INDEX_SIZE  12
 
 #define PTRS_PER_PTE	(1 << PTE_INDEX_SIZE)
 #define PTRS_PER_PMD	(1 << PMD_INDEX_SIZE)
+#define PTRS_PER_PUD	(1 << PUD_INDEX_SIZE)
 #define PTRS_PER_PGD	(1 << PGD_INDEX_SIZE)
 
 /* With 4k base page size, hugepage PTEs go at the PMD level */
@@ -20,8 +19,13 @@ 
 #define PMD_SIZE	(1UL << PMD_SHIFT)
 #define PMD_MASK	(~(PMD_SIZE-1))
 
+/* PUD_SHIFT determines what a third-level page table entry can map */
+#define PUD_SHIFT	(PMD_SHIFT + PMD_INDEX_SIZE)
+#define PUD_SIZE	(1UL << PUD_SHIFT)
+#define PUD_MASK	(~(PUD_SIZE-1))
+
 /* PGDIR_SHIFT determines what a third-level page table entry can map */
-#define PGDIR_SHIFT	(PMD_SHIFT + PMD_INDEX_SIZE)
+#define PGDIR_SHIFT	(PUD_SHIFT + PUD_INDEX_SIZE)
 #define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
 
@@ -61,6 +65,8 @@ 
 #define PMD_MASKED_BITS		(PTE_FRAG_SIZE - 1)
 /* Bits to mask out from a PGD/PUD to get to the PMD page */
 #define PUD_MASKED_BITS		0x1ff
+/* FIXME!! check this */
+#define PGD_MASKED_BITS		0
 
 #ifndef __ASSEMBLY__
 
@@ -130,11 +136,9 @@  extern bool __rpte_sub_valid(real_pte_t rpte, unsigned long index);
 #else
 #define PMD_TABLE_SIZE	(sizeof(pmd_t) << PMD_INDEX_SIZE)
 #endif
+#define PUD_TABLE_SIZE	(sizeof(pud_t) << PUD_INDEX_SIZE)
 #define PGD_TABLE_SIZE	(sizeof(pgd_t) << PGD_INDEX_SIZE)
 
-#define pgd_pte(pgd)	(pud_pte(((pud_t){ pgd })))
-#define pte_pgd(pte)	((pgd_t)pte_pud(pte))
-
 #ifdef CONFIG_HUGETLB_PAGE
 /*
  * We have PGD_INDEX_SIZ = 12 and PTE_INDEX_SIZE = 8, so that we can have
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index f46974d0134a..9ff1e056acef 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -226,6 +226,7 @@ 
 #define pud_page_vaddr(pud)	(pud_val(pud) & ~PUD_MASKED_BITS)
 
 #define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & (PTRS_PER_PGD - 1))
+#define pud_index(address) (((address) >> (PUD_SHIFT)) & (PTRS_PER_PUD - 1))
 #define pmd_index(address) (((address) >> (PMD_SHIFT)) & (PTRS_PER_PMD - 1))
 #define pte_index(address) (((address) >> (PAGE_SHIFT)) & (PTRS_PER_PTE - 1))
 
@@ -354,8 +355,15 @@  static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
 	:"cc");
 }
 
+static inline int pgd_bad(pgd_t pgd)
+{
+	return (pgd_val(pgd) == 0);
+}
+
 #define __HAVE_ARCH_PTE_SAME
 #define pte_same(A,B)	(((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0)
+#define pgd_page_vaddr(pgd)	(pgd_val(pgd) & ~PGD_MASKED_BITS)
+
 
 /* Generic accessors to PTE bits */
 static inline int pte_write(pte_t pte)		{ return !!(pte_val(pte) & _PAGE_RW);}
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index e7162dba987e..8f639401c7ba 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -111,6 +111,26 @@  static inline void pgd_set(pgd_t *pgdp, unsigned long val)
 	*pgdp = __pgd(val);
 }
 
+static inline void pgd_clear(pgd_t *pgdp)
+{
+	*pgdp = __pgd(0);
+}
+
+#define pgd_none(pgd)		(!pgd_val(pgd))
+#define pgd_present(pgd)	(!pgd_none(pgd))
+
+static inline pte_t pgd_pte(pgd_t pgd)
+{
+	return __pte(pgd_val(pgd));
+}
+
+static inline pgd_t pte_pgd(pte_t pte)
+{
+	return __pgd(pte_val(pte));
+}
+
+extern struct page *pgd_page(pgd_t pgd);
+
 /*
  * Find an entry in a page-table-directory.  We combine the address region
  * (the high order N bits) and the pgd portion of the address.
@@ -118,9 +138,10 @@  static inline void pgd_set(pgd_t *pgdp, unsigned long val)
 
 #define pgd_offset(mm, address)	 ((mm)->pgd + pgd_index(address))
 
+#define pud_offset(pgdp, addr)	\
+	(((pud_t *) pgd_page_vaddr(*(pgdp))) + pud_index(addr))
 #define pmd_offset(pudp,addr) \
 	(((pmd_t *) pud_page_vaddr(*(pudp))) + pmd_index(addr))
-
 #define pte_offset_kernel(dir,addr) \
 	(((pte_t *) pmd_page_vaddr(*(dir))) + pte_index(addr))
 
@@ -135,6 +156,8 @@  static inline void pgd_set(pgd_t *pgdp, unsigned long val)
 	pr_err("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e))
 #define pmd_ERROR(e) \
 	pr_err("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e))
+#define pud_ERROR(e) \
+	pr_err("%s:%d: bad pud %08lx.\n", __FILE__, __LINE__, pud_val(e))
 #define pgd_ERROR(e) \
 	pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
 
diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h
index 69ef28a81733..014489a619d0 100644
--- a/arch/powerpc/include/asm/pgalloc-64.h
+++ b/arch/powerpc/include/asm/pgalloc-64.h
@@ -171,7 +171,25 @@  extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift);
 extern void __tlb_remove_table(void *_table);
 #endif
 
-#define pud_populate(mm, pud, pmd)	pud_set(pud, (unsigned long)pmd)
+#ifndef __PAGETABLE_PUD_FOLDED
+/* book3s 64 is 4 level page table: a pgd entry holds the pud table address */
+#define pgd_populate(MM, PGD, PUD)	pgd_set(PGD, (unsigned long)(PUD))
+static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+	return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE),
+				GFP_KERNEL|__GFP_REPEAT);
+}
+
+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+{
+	kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud);
+}
+#endif
+
+static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
+{
+	pud_set(pud, (unsigned long)pmd);
+}
 
 static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
 				       pte_t *pte)
@@ -233,11 +251,11 @@  static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 
 #define __pmd_free_tlb(tlb, pmd, addr)		      \
 	pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX)
-#ifndef CONFIG_PPC_64K_PAGES
+#ifndef __PAGETABLE_PUD_FOLDED
 #define __pud_free_tlb(tlb, pud, addr)		      \
 	pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE)
 
-#endif /* CONFIG_PPC_64K_PAGES */
+#endif /* __PAGETABLE_PUD_FOLDED */
 
 #define check_pgt_cache()	do { } while (0)
 
diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h
index 71487e1ca638..43140f8b0592 100644
--- a/arch/powerpc/include/asm/pgtable-types.h
+++ b/arch/powerpc/include/asm/pgtable-types.h
@@ -21,15 +21,18 @@  static inline unsigned long pmd_val(pmd_t x)
 	return x.pmd;
 }
 
-/* PUD level exusts only on 4k pages */
-#ifndef CONFIG_PPC_64K_PAGES
+/*
+ * 64 bit hash always uses a 4 level table. Everybody else uses 4 levels
+ * only for 4K page size.
+ */
+#if defined(CONFIG_PPC_BOOK3S_64) || !defined(CONFIG_PPC_64K_PAGES)
 typedef struct { unsigned long pud; } pud_t;
 #define __pud(x)	((pud_t) { (x) })
 static inline unsigned long pud_val(pud_t x)
 {
 	return x.pud;
 }
-#endif /* !CONFIG_PPC_64K_PAGES */
+#endif /* CONFIG_PPC_BOOK3S_64 || !CONFIG_PPC_64K_PAGES */
 #endif /* CONFIG_PPC64 */
 
 /* PGD level */
@@ -66,14 +69,14 @@  static inline unsigned long pmd_val(pmd_t pmd)
 	return pmd;
 }
 
-#ifndef CONFIG_PPC_64K_PAGES
+#if defined(CONFIG_PPC_BOOK3S_64) || !defined(CONFIG_PPC_64K_PAGES)
 typedef unsigned long pud_t;
 #define __pud(x)	(x)
 static inline unsigned long pud_val(pud_t pud)
 {
 	return pud;
 }
-#endif /* !CONFIG_PPC_64K_PAGES */
+#endif /* CONFIG_PPC_BOOK3S_64 || !CONFIG_PPC_64K_PAGES */
 #endif /* CONFIG_PPC64 */
 
 typedef unsigned long pgd_t;
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 379a6a90644b..8ce1ec24d573 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -85,6 +85,11 @@  static void pgd_ctor(void *addr)
 	memset(addr, 0, PGD_TABLE_SIZE);
 }
 
+static void pud_ctor(void *addr)
+{
+	memset(addr, 0, PUD_TABLE_SIZE);
+}
+
 static void pmd_ctor(void *addr)
 {
 	memset(addr, 0, PMD_TABLE_SIZE);
@@ -138,14 +143,18 @@  void pgtable_cache_init(void)
 {
 	pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
 	pgtable_cache_add(PMD_CACHE_INDEX, pmd_ctor);
+	/*
+	 * In all current configs, when the PUD index exists it's the
+	 * same size as either the pgd or pmd index, except with THP enabled
+	 * on book3s 64; add a PUD cache only if none exists yet.
+	 */
+	if (PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE))
+		pgtable_cache_add(PUD_INDEX_SIZE, pud_ctor);
+
 	if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_CACHE_INDEX))
 		panic("Couldn't allocate pgtable caches");
-	/* In all current configs, when the PUD index exists it's the
-	 * same size as either the pgd or pmd index.  Verify that the
-	 * initialization above has also created a PUD cache.  This
-	 * will need re-examiniation if we add new possibilities for
-	 * the pagetable layout. */
-	BUG_ON(PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE));
+	if (PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE))
+		panic("Couldn't allocate pud pgtable caches");
 }
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP