
[1/6] powerpc: port 64 bits pgtable_cache to 32 bits

Message ID 794b4abd8caaad32edf4180ac485a2eb914fba49.1471020647.git.christophe.leroy@c-s.fr (mailing list archive)
State Superseded

Commit Message

Christophe Leroy Aug. 12, 2016, 4:55 p.m. UTC
Today, powerpc64 uses a set of pgtable_caches while powerpc32 uses
standard pages when using 4k pages and a single pgtable_cache
if using other page sizes. In addition, powerpc32 uses another cache
when handling huge pages.

In preparation for implementing huge pages on the 8xx, this patch
replaces the specific powerpc32 handling with the 64-bit approach.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 arch/powerpc/include/asm/book3s/32/pgalloc.h |  44 ++++++--
 arch/powerpc/include/asm/book3s/32/pgtable.h |  43 ++++----
 arch/powerpc/include/asm/book3s/64/pgtable.h |   3 -
 arch/powerpc/include/asm/hugetlb.h           |   2 -
 arch/powerpc/include/asm/nohash/32/pgalloc.h |  44 ++++++--
 arch/powerpc/include/asm/nohash/32/pgtable.h |  45 ++++----
 arch/powerpc/include/asm/nohash/64/pgtable.h |   2 -
 arch/powerpc/include/asm/pgtable.h           |   2 +
 arch/powerpc/mm/Makefile                     |   2 +-
 arch/powerpc/mm/hugetlbpage.c                |  12 +--
 arch/powerpc/mm/init-common.c                | 152 +++++++++++++++++++++++++++
 arch/powerpc/mm/init_32.c                    |   5 -
 arch/powerpc/mm/init_64.c                    |  82 ---------------
 arch/powerpc/mm/pgtable_32.c                 |  37 -------
 14 files changed, 282 insertions(+), 193 deletions(-)
 create mode 100644 arch/powerpc/mm/init-common.c

Comments

Aneesh Kumar K.V Aug. 14, 2016, 2:17 p.m. UTC | #1
Christophe Leroy <christophe.leroy@c-s.fr> writes:

> Today, powerpc64 uses a set of pgtable_caches while powerpc32 uses
> standard pages when using 4k pages and a single pgtable_cache
> if using other page sizes. In addition, powerpc32 uses another cache
> when handling huge pages.
>
> In preparation for implementing huge pages on the 8xx, this patch
> replaces the specific powerpc32 handling with the 64-bit approach.

Why is this needed? Can you also summarize the page size used and the
hugepage format you are planning to use? What are the page sizes
supported by the 8xx? Also, is the new code a copy of the existing
powerpc64 4k page size code?

>
> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
> ---
>  arch/powerpc/include/asm/book3s/32/pgalloc.h |  44 ++++++--
>  arch/powerpc/include/asm/book3s/32/pgtable.h |  43 ++++----
>  arch/powerpc/include/asm/book3s/64/pgtable.h |   3 -
>  arch/powerpc/include/asm/hugetlb.h           |   2 -
>  arch/powerpc/include/asm/nohash/32/pgalloc.h |  44 ++++++--
>  arch/powerpc/include/asm/nohash/32/pgtable.h |  45 ++++----
>  arch/powerpc/include/asm/nohash/64/pgtable.h |   2 -
>  arch/powerpc/include/asm/pgtable.h           |   2 +
>  arch/powerpc/mm/Makefile                     |   2 +-
>  arch/powerpc/mm/hugetlbpage.c                |  12 +--
>  arch/powerpc/mm/init-common.c                | 152 +++++++++++++++++++++++++++
>  arch/powerpc/mm/init_32.c                    |   5 -
>  arch/powerpc/mm/init_64.c                    |  82 ---------------
>  arch/powerpc/mm/pgtable_32.c                 |  37 -------
>  14 files changed, 282 insertions(+), 193 deletions(-)
>  create mode 100644 arch/powerpc/mm/init-common.c
>
> diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h
> index 8e21bb4..ab215fd 100644
> --- a/arch/powerpc/include/asm/book3s/32/pgalloc.h
> +++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h
> @@ -2,14 +2,42 @@
>  #define _ASM_POWERPC_BOOK3S_32_PGALLOC_H
>  
>  #include <linux/threads.h>
> +#include <linux/slab.h>
>  
> -/* For 32-bit, all levels of page tables are just drawn from get_free_page() */
> -#define MAX_PGTABLE_INDEX_SIZE	0
> +/*
> + * Functions that deal with pagetables that could be at any level of
> + * the table need to be passed an "index_size" so they know how to
> + * handle allocation.  For PTE pages (which are linked to a struct
> + * page for now, and drawn from the main get_free_pages() pool), the
> + * allocation size will be (2^index_size * sizeof(pointer)) and
> + * allocations are drawn from the kmem_cache in PGT_CACHE(index_size).
> + *
> + * The maximum index size needs to be big enough to allow any
> + * pagetable sizes we need, but small enough to fit in the low bits of
> + * any page table pointer.  In other words all pagetables, even tiny
> + * ones, must be aligned to allow at least enough low 0 bits to
> + * contain this value.  This value is also used as a mask, so it must
> + * be one less than a power of two.
> + */
> +#define MAX_PGTABLE_INDEX_SIZE	0xf
>  
>  extern void __bad_pte(pmd_t *pmd);
>  
> -extern pgd_t *pgd_alloc(struct mm_struct *mm);
> -extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
> +extern struct kmem_cache *pgtable_cache[];
> +#define PGT_CACHE(shift) ({				\
> +			BUG_ON(!(shift));		\
> +			pgtable_cache[(shift) - 1];	\
> +		})
> +
> +static inline pgd_t *pgd_alloc(struct mm_struct *mm)
> +{
> +	return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), GFP_KERNEL);
> +}
> +
> +static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
> +{
> +	kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd);
> +}
>  
>  /*
>   * We don't have any real pmd's, and this code never triggers because
> @@ -68,8 +96,12 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
>  
>  static inline void pgtable_free(void *table, unsigned index_size)
>  {
> -	BUG_ON(index_size); /* 32-bit doesn't use this */
> -	free_page((unsigned long)table);
> +	if (!index_size)
> +		free_page((unsigned long)table);
> +	else {
> +		BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE);
> +		kmem_cache_free(PGT_CACHE(index_size), table);
> +	}
>  }
>  
>  #define check_pgt_cache()	do { } while (0)
> diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h
> index 38b33dc..83a2159 100644
> --- a/arch/powerpc/include/asm/book3s/32/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
> @@ -8,6 +8,26 @@
>  /* And here we include common definitions */
>  #include <asm/pte-common.h>
>  
> +#define PTE_INDEX_SIZE	PTE_SHIFT
> +#define PMD_INDEX_SIZE	0
> +#define PUD_INDEX_SIZE	0
> +#define PGD_INDEX_SIZE	(32 - PGDIR_SHIFT)
> +
> +#define PMD_CACHE_INDEX	PMD_INDEX_SIZE
> +
> +#ifndef __ASSEMBLY__
> +#define PTE_TABLE_SIZE	(sizeof(pte_t) << PTE_INDEX_SIZE)
> +#define PMD_TABLE_SIZE	(sizeof(pmd_t) << PTE_INDEX_SIZE)
> +#define PUD_TABLE_SIZE	(sizeof(pud_t) << PTE_INDEX_SIZE)
> +#define PGD_TABLE_SIZE	(sizeof(pgd_t) << PGD_INDEX_SIZE)
> +#endif	/* __ASSEMBLY__ */
> +
> +#define PTRS_PER_PTE	(1 << PTE_INDEX_SIZE)
> +#define PTRS_PER_PGD	(1 << PGD_INDEX_SIZE)
> +
> +/* With 4k base page size, hugepage PTEs go at the PMD level */
> +#define MIN_HUGEPTE_SHIFT	PMD_SHIFT
> +
>  /*
>   * The normal case is that PTEs are 32-bits and we have a 1-page
>   * 1024-entry pgdir pointing to 1-page 1024-entry PTE pages.  -- paulus
> @@ -19,14 +39,10 @@
>   * -Matt
>   */
>  /* PGDIR_SHIFT determines what a top-level page table entry can map */
> -#define PGDIR_SHIFT	(PAGE_SHIFT + PTE_SHIFT)
> +#define PGDIR_SHIFT	(PAGE_SHIFT + PTE_INDEX_SIZE)
>  #define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
>  #define PGDIR_MASK	(~(PGDIR_SIZE-1))
>  
> -#define PTRS_PER_PTE	(1 << PTE_SHIFT)
> -#define PTRS_PER_PMD	1
> -#define PTRS_PER_PGD	(1 << (32 - PGDIR_SHIFT))
> -
>  #define USER_PTRS_PER_PGD	(TASK_SIZE / PGDIR_SIZE)
>  /*
>   * This is the bottom of the PKMAP area with HIGHMEM or an arbitrary
> @@ -82,12 +98,8 @@
>  
>  extern unsigned long ioremap_bot;
>  
> -/*
> - * entries per page directory level: our page-table tree is two-level, so
> - * we don't really have any PMD directory.
> - */
> -#define PTE_TABLE_SIZE	(sizeof(pte_t) << PTE_SHIFT)
> -#define PGD_TABLE_SIZE	(sizeof(pgd_t) << (32 - PGDIR_SHIFT))
> +/* Bits to mask out from a PGD to get to the PUD page */
> +#define PGD_MASKED_BITS		0
>  
>  #define pte_ERROR(e) \
>  	pr_err("%s:%d: bad pte %llx.\n", __FILE__, __LINE__, \
> @@ -282,15 +294,6 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
>  #define __pte_to_swp_entry(pte)		((swp_entry_t) { pte_val(pte) >> 3 })
>  #define __swp_entry_to_pte(x)		((pte_t) { (x).val << 3 })
>  
> -#ifndef CONFIG_PPC_4K_PAGES
> -void pgtable_cache_init(void);
> -#else
> -/*
> - * No page table caches to initialise
> - */
> -#define pgtable_cache_init()	do { } while (0)
> -#endif
> -
>  extern int get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep,
>  		      pmd_t **pmdp);
>  
> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
> index 263bf39..3f85d43 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> @@ -786,9 +786,6 @@ extern struct page *pgd_page(pgd_t pgd);
>  #define pgd_ERROR(e) \
>  	pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
>  
> -void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
> -void pgtable_cache_init(void);
> -
>  static inline int map_kernel_page(unsigned long ea, unsigned long pa,
>  				  unsigned long flags)
>  {
> diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
> index c5517f4..c201cd6 100644
> --- a/arch/powerpc/include/asm/hugetlb.h
> +++ b/arch/powerpc/include/asm/hugetlb.h
> @@ -5,8 +5,6 @@
>  #include <asm/page.h>
>  #include <asm-generic/hugetlb.h>
>  
> -extern struct kmem_cache *hugepte_cache;
> -
>  #ifdef CONFIG_PPC_BOOK3S_64
>  
>  #include <asm/book3s/64/hugetlb-radix.h>
> diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h
> index 76d6b9e..c2fe85c 100644
> --- a/arch/powerpc/include/asm/nohash/32/pgalloc.h
> +++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h
> @@ -2,14 +2,42 @@
>  #define _ASM_POWERPC_PGALLOC_32_H
>  
>  #include <linux/threads.h>
> +#include <linux/slab.h>
>  
> -/* For 32-bit, all levels of page tables are just drawn from get_free_page() */
> -#define MAX_PGTABLE_INDEX_SIZE	0
> +/*
> + * Functions that deal with pagetables that could be at any level of
> + * the table need to be passed an "index_size" so they know how to
> + * handle allocation.  For PTE pages (which are linked to a struct
> + * page for now, and drawn from the main get_free_pages() pool), the
> + * allocation size will be (2^index_size * sizeof(pointer)) and
> + * allocations are drawn from the kmem_cache in PGT_CACHE(index_size).
> + *
> + * The maximum index size needs to be big enough to allow any
> + * pagetable sizes we need, but small enough to fit in the low bits of
> + * any page table pointer.  In other words all pagetables, even tiny
> + * ones, must be aligned to allow at least enough low 0 bits to
> + * contain this value.  This value is also used as a mask, so it must
> + * be one less than a power of two.
> + */
> +#define MAX_PGTABLE_INDEX_SIZE	0xf
>  
>  extern void __bad_pte(pmd_t *pmd);
>  
> -extern pgd_t *pgd_alloc(struct mm_struct *mm);
> -extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
> +extern struct kmem_cache *pgtable_cache[];
> +#define PGT_CACHE(shift) ({				\
> +			BUG_ON(!(shift));		\
> +			pgtable_cache[(shift) - 1];	\
> +		})
> +
> +static inline pgd_t *pgd_alloc(struct mm_struct *mm)
> +{
> +	return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), GFP_KERNEL);
> +}
> +
> +static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
> +{
> +	kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd);
> +}
>  
>  /*
>   * We don't have any real pmd's, and this code never triggers because
> @@ -68,8 +96,12 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
>  
>  static inline void pgtable_free(void *table, unsigned index_size)
>  {
> -	BUG_ON(index_size); /* 32-bit doesn't use this */
> -	free_page((unsigned long)table);
> +	if (!index_size)
> +		free_page((unsigned long)table);
> +	else {
> +		BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE);
> +		kmem_cache_free(PGT_CACHE(index_size), table);
> +	}
>  }
>  
>  #define check_pgt_cache()	do { } while (0)
> diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h
> index 7808475..8a2937d 100644
> --- a/arch/powerpc/include/asm/nohash/32/pgtable.h
> +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
> @@ -16,6 +16,26 @@ extern int icache_44x_need_flush;
>  
>  #endif /* __ASSEMBLY__ */
>  
> +#define PTE_INDEX_SIZE	PTE_SHIFT
> +#define PMD_INDEX_SIZE	0
> +#define PUD_INDEX_SIZE	0
> +#define PGD_INDEX_SIZE	(32 - PGDIR_SHIFT)
> +
> +#define PMD_CACHE_INDEX	PMD_INDEX_SIZE
> +
> +#ifndef __ASSEMBLY__
> +#define PTE_TABLE_SIZE	(sizeof(pte_t) << PTE_INDEX_SIZE)
> +#define PMD_TABLE_SIZE	(sizeof(pmd_t) << PTE_INDEX_SIZE)
> +#define PUD_TABLE_SIZE	(sizeof(pud_t) << PTE_INDEX_SIZE)
> +#define PGD_TABLE_SIZE	(sizeof(pgd_t) << PGD_INDEX_SIZE)
> +#endif	/* __ASSEMBLY__ */
> +
> +#define PTRS_PER_PTE	(1 << PTE_INDEX_SIZE)
> +#define PTRS_PER_PGD	(1 << PGD_INDEX_SIZE)
> +
> +/* With 4k base page size, hugepage PTEs go at the PMD level */
> +#define MIN_HUGEPTE_SHIFT	PMD_SHIFT
> +
>  /*
>   * The normal case is that PTEs are 32-bits and we have a 1-page
>   * 1024-entry pgdir pointing to 1-page 1024-entry PTE pages.  -- paulus
> @@ -27,22 +47,12 @@ extern int icache_44x_need_flush;
>   * -Matt
>   */
>  /* PGDIR_SHIFT determines what a top-level page table entry can map */
> -#define PGDIR_SHIFT	(PAGE_SHIFT + PTE_SHIFT)
> +#define PGDIR_SHIFT	(PAGE_SHIFT + PTE_INDEX_SIZE)
>  #define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
>  #define PGDIR_MASK	(~(PGDIR_SIZE-1))
>  
> -/*
> - * entries per page directory level: our page-table tree is two-level, so
> - * we don't really have any PMD directory.
> - */
> -#ifndef __ASSEMBLY__
> -#define PTE_TABLE_SIZE	(sizeof(pte_t) << PTE_SHIFT)
> -#define PGD_TABLE_SIZE	(sizeof(pgd_t) << (32 - PGDIR_SHIFT))
> -#endif	/* __ASSEMBLY__ */
> -
> -#define PTRS_PER_PTE	(1 << PTE_SHIFT)
> -#define PTRS_PER_PMD	1
> -#define PTRS_PER_PGD	(1 << (32 - PGDIR_SHIFT))
> +/* Bits to mask out from a PGD to get to the PUD page */
> +#define PGD_MASKED_BITS		0
>  
>  #define USER_PTRS_PER_PGD	(TASK_SIZE / PGDIR_SIZE)
>  #define FIRST_USER_ADDRESS	0UL
> @@ -327,15 +337,6 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
>  #define __pte_to_swp_entry(pte)		((swp_entry_t) { pte_val(pte) >> 3 })
>  #define __swp_entry_to_pte(x)		((pte_t) { (x).val << 3 })
>  
> -#ifndef CONFIG_PPC_4K_PAGES
> -void pgtable_cache_init(void);
> -#else
> -/*
> - * No page table caches to initialise
> - */
> -#define pgtable_cache_init()	do { } while (0)
> -#endif
> -
>  extern int get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep,
>  		      pmd_t **pmdp);
>  
> diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
> index d4d808c..b0fc9e4 100644
> --- a/arch/powerpc/include/asm/nohash/64/pgtable.h
> +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
> @@ -357,8 +357,6 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
>  #define __pte_to_swp_entry(pte)		((swp_entry_t) { pte_val((pte)) })
>  #define __swp_entry_to_pte(x)		__pte((x).val)
>  
> -void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
> -void pgtable_cache_init(void);
>  extern int map_kernel_page(unsigned long ea, unsigned long pa,
>  			   unsigned long flags);
>  extern int __meminit vmemmap_create_mapping(unsigned long start,
> diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
> index 9bd87f2..dd01212 100644
> --- a/arch/powerpc/include/asm/pgtable.h
> +++ b/arch/powerpc/include/asm/pgtable.h
> @@ -78,6 +78,8 @@ static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
>  
>  unsigned long vmalloc_to_phys(void *vmalloc_addr);
>  
> +void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
> +void pgtable_cache_init(void);
>  #endif /* __ASSEMBLY__ */
>  
>  #endif /* _ASM_POWERPC_PGTABLE_H */
> diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
> index f2cea6d..08bb010 100644
> --- a/arch/powerpc/mm/Makefile
> +++ b/arch/powerpc/mm/Makefile
> @@ -7,7 +7,7 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
>  ccflags-$(CONFIG_PPC64)	:= $(NO_MINIMAL_TOC)
>  
>  obj-y				:= fault.o mem.o pgtable.o mmap.o \
> -				   init_$(CONFIG_WORD_SIZE).o \
> +				   init_$(CONFIG_WORD_SIZE).o init-common.o \
>  				   pgtable_$(CONFIG_WORD_SIZE).o
>  obj-$(CONFIG_PPC_MMU_NOHASH)	+= mmu_context_nohash.o tlb_nohash.o \
>  				   tlb_nohash_low.o
> diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
> index 7372ee1..9164a77 100644
> --- a/arch/powerpc/mm/hugetlbpage.c
> +++ b/arch/powerpc/mm/hugetlbpage.c
> @@ -68,7 +68,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
>  #ifdef CONFIG_PPC_FSL_BOOK3E
>  	int i;
>  	int num_hugepd = 1 << (pshift - pdshift);
> -	cachep = hugepte_cache;
> +	cachep = PGT_CACHE(1);
>  #else
>  	cachep = PGT_CACHE(pdshift - pshift);
>  #endif

Can you explain the usage of PGT_CACHE(1)?

> @@ -411,7 +411,7 @@ static void hugepd_free_rcu_callback(struct rcu_head *head)
>  	unsigned int i;
>  
>  	for (i = 0; i < batch->index; i++)
> -		kmem_cache_free(hugepte_cache, batch->ptes[i]);
> +		kmem_cache_free(PGT_CACHE(1), batch->ptes[i]);
>  
>  	free_page((unsigned long)batch);
>  }
> @@ -425,7 +425,7 @@ static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
>  	if (atomic_read(&tlb->mm->mm_users) < 2 ||
>  	    cpumask_equal(mm_cpumask(tlb->mm),
>  			  cpumask_of(smp_processor_id()))) {
> -		kmem_cache_free(hugepte_cache, hugepte);
> +		kmem_cache_free(PGT_CACHE(1), hugepte);
>  		put_cpu_var(hugepd_freelist_cur);
>  		return;
>  	}
> @@ -792,7 +792,6 @@ static int __init hugepage_setup_sz(char *str)
>  __setup("hugepagesz=", hugepage_setup_sz);
>  
>  #ifdef CONFIG_PPC_FSL_BOOK3E
> -struct kmem_cache *hugepte_cache;
>  static int __init hugetlbpage_init(void)
>  {
>  	int psize;
> @@ -815,9 +814,8 @@ static int __init hugetlbpage_init(void)
>  	 * Create a kmem cache for hugeptes.  The bottom bits in the pte have
>  	 * size information encoded in them, so align them to allow this
>  	 */
> -	hugepte_cache =  kmem_cache_create("hugepte-cache", sizeof(pte_t),
> -					   HUGEPD_SHIFT_MASK + 1, 0, NULL);
> -	if (hugepte_cache == NULL)
> +	pgtable_cache_add(1, NULL);
> +	if (!PGT_CACHE(1))
>  		panic("%s: Unable to create kmem cache for hugeptes\n",
>  		      __func__);
>  
> diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c
> new file mode 100644
> index 0000000..2632eab
> --- /dev/null
> +++ b/arch/powerpc/mm/init-common.c
> @@ -0,0 +1,152 @@
> +/*
> + *  PowerPC version
> + *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
> + *
> + *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
> + *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
> + *    Copyright (C) 1996 Paul Mackerras
> + *
> + *  Derived from "arch/i386/mm/init.c"
> + *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
> + *
> + *  Dave Engebretsen <engebret@us.ibm.com>
> + *      Rework for PPC64 port.
> + *
> + *  This program is free software; you can redistribute it and/or
> + *  modify it under the terms of the GNU General Public License
> + *  as published by the Free Software Foundation; either version
> + *  2 of the License, or (at your option) any later version.
> + *
> + */
> +
> +#undef DEBUG
> +
> +#include <linux/signal.h>
> +#include <linux/sched.h>
> +#include <linux/kernel.h>
> +#include <linux/errno.h>
> +#include <linux/string.h>
> +#include <linux/types.h>
> +#include <linux/mman.h>
> +#include <linux/mm.h>
> +#include <linux/swap.h>
> +#include <linux/stddef.h>
> +#include <linux/vmalloc.h>
> +#include <linux/init.h>
> +#include <linux/delay.h>
> +#include <linux/highmem.h>
> +#include <linux/idr.h>
> +#include <linux/nodemask.h>
> +#include <linux/module.h>
> +#include <linux/poison.h>
> +#include <linux/memblock.h>
> +#include <linux/hugetlb.h>
> +#include <linux/slab.h>
> +
> +#include <asm/pgalloc.h>
> +#include <asm/page.h>
> +#include <asm/prom.h>
> +#include <asm/rtas.h>
> +#include <asm/io.h>
> +#include <asm/mmu_context.h>
> +#include <asm/pgtable.h>
> +#include <asm/mmu.h>
> +#include <asm/uaccess.h>
> +#include <asm/smp.h>
> +#include <asm/machdep.h>
> +#include <asm/tlb.h>
> +#include <asm/eeh.h>
> +#include <asm/processor.h>
> +#include <asm/mmzone.h>
> +#include <asm/cputable.h>
> +#include <asm/sections.h>
> +#include <asm/iommu.h>
> +#include <asm/vdso.h>
> +
> +#include "mmu_decl.h"
> +
> +phys_addr_t memstart_addr = (phys_addr_t)~0ull;
> +EXPORT_SYMBOL_GPL(memstart_addr);
> +phys_addr_t kernstart_addr;
> +EXPORT_SYMBOL_GPL(kernstart_addr);
> +
> +static void pgd_ctor(void *addr)
> +{
> +	memset(addr, 0, PGD_TABLE_SIZE);
> +}
> +
> +static void pud_ctor(void *addr)
> +{
> +	memset(addr, 0, PUD_TABLE_SIZE);
> +}
> +
> +static void pmd_ctor(void *addr)
> +{
> +	memset(addr, 0, PMD_TABLE_SIZE);
> +}
> +
> +struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
> +
> +/*
> + * Create a kmem_cache() for pagetables.  This is not used for PTE
> + * pages - they're linked to struct page, come from the normal free
> + * pages pool and have a different entry size (see real_pte_t) to
> + * everything else.  Caches created by this function are used for all
> + * the higher level pagetables, and for hugepage pagetables.
> + */
> +void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
> +{
> +	char *name;
> +	unsigned long table_size = sizeof(void *) << shift;
> +	unsigned long align = table_size;
> +
> +	/* When batching pgtable pointers for RCU freeing, we store
> +	 * the index size in the low bits.  Table alignment must be
> +	 * big enough to fit it.
> +	 *
> +	 * Likewise, hugeapge pagetable pointers contain a (different)
> +	 * shift value in the low bits.  All tables must be aligned so
> +	 * as to leave enough 0 bits in the address to contain it. */
> +	unsigned long minalign = max(MAX_PGTABLE_INDEX_SIZE + 1,
> +				     HUGEPD_SHIFT_MASK + 1);
> +	struct kmem_cache *new;
> +
> +	/* It would be nice if this was a BUILD_BUG_ON(), but at the
> +	 * moment, gcc doesn't seem to recognize is_power_of_2 as a
> +	 * constant expression, so so much for that. */
> +	BUG_ON(!is_power_of_2(minalign));
> +	BUG_ON((shift < 1) || (shift > MAX_PGTABLE_INDEX_SIZE));
> +
> +	if (PGT_CACHE(shift))
> +		return; /* Already have a cache of this size */
> +
> +	align = max_t(unsigned long, align, minalign);
> +	name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift);
> +	new = kmem_cache_create(name, table_size, align, 0, ctor);
> +	kfree(name);
> +	pgtable_cache[shift - 1] = new;
> +	pr_debug("Allocated pgtable cache for order %d\n", shift);
> +}
> +
> +
> +void pgtable_cache_init(void)
> +{
> +	pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
> +
> +	if (PMD_INDEX_SIZE && !PGT_CACHE(PMD_INDEX_SIZE))
> +		pgtable_cache_add(PMD_CACHE_INDEX, pmd_ctor);
> +	/*
> +	 * In all current configs, when the PUD index exists it's the
> +	 * same size as either the pgd or pmd index except with THP enabled
> +	 * on book3s 64
> +	 */
> +	if (PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE))
> +		pgtable_cache_add(PUD_INDEX_SIZE, pud_ctor);
> +
> +	if (!PGT_CACHE(PGD_INDEX_SIZE))
> +		panic("Couldn't allocate pgd cache");
> +	if (PMD_INDEX_SIZE && !PGT_CACHE(PMD_INDEX_SIZE))
> +		panic("Couldn't allocate pmd pgtable caches");
> +	if (PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE))
> +		panic("Couldn't allocate pud pgtable caches");
> +}
> diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
> index 448685f..79c24d4 100644
> --- a/arch/powerpc/mm/init_32.c
> +++ b/arch/powerpc/mm/init_32.c
> @@ -59,11 +59,6 @@
>  phys_addr_t total_memory;
>  phys_addr_t total_lowmem;
>  
> -phys_addr_t memstart_addr = (phys_addr_t)~0ull;
> -EXPORT_SYMBOL(memstart_addr);
> -phys_addr_t kernstart_addr;
> -EXPORT_SYMBOL(kernstart_addr);
> -
>  #ifdef CONFIG_RELOCATABLE
>  /* Used in __va()/__pa() */
>  long long virt_phys_offset;
> diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
> index 16ada1e..4acd546 100644
> --- a/arch/powerpc/mm/init_64.c
> +++ b/arch/powerpc/mm/init_64.c
> @@ -75,88 +75,6 @@
>  #endif
>  #endif /* CONFIG_PPC_STD_MMU_64 */
>  
> -phys_addr_t memstart_addr = ~0;
> -EXPORT_SYMBOL_GPL(memstart_addr);
> -phys_addr_t kernstart_addr;
> -EXPORT_SYMBOL_GPL(kernstart_addr);
> -
> -static void pgd_ctor(void *addr)
> -{
> -	memset(addr, 0, PGD_TABLE_SIZE);
> -}
> -
> -static void pud_ctor(void *addr)
> -{
> -	memset(addr, 0, PUD_TABLE_SIZE);
> -}
> -
> -static void pmd_ctor(void *addr)
> -{
> -	memset(addr, 0, PMD_TABLE_SIZE);
> -}
> -
> -struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
> -
> -/*
> - * Create a kmem_cache() for pagetables.  This is not used for PTE
> - * pages - they're linked to struct page, come from the normal free
> - * pages pool and have a different entry size (see real_pte_t) to
> - * everything else.  Caches created by this function are used for all
> - * the higher level pagetables, and for hugepage pagetables.
> - */
> -void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
> -{
> -	char *name;
> -	unsigned long table_size = sizeof(void *) << shift;
> -	unsigned long align = table_size;
> -
> -	/* When batching pgtable pointers for RCU freeing, we store
> -	 * the index size in the low bits.  Table alignment must be
> -	 * big enough to fit it.
> -	 *
> -	 * Likewise, hugeapge pagetable pointers contain a (different)
> -	 * shift value in the low bits.  All tables must be aligned so
> -	 * as to leave enough 0 bits in the address to contain it. */
> -	unsigned long minalign = max(MAX_PGTABLE_INDEX_SIZE + 1,
> -				     HUGEPD_SHIFT_MASK + 1);
> -	struct kmem_cache *new;
> -
> -	/* It would be nice if this was a BUILD_BUG_ON(), but at the
> -	 * moment, gcc doesn't seem to recognize is_power_of_2 as a
> -	 * constant expression, so so much for that. */
> -	BUG_ON(!is_power_of_2(minalign));
> -	BUG_ON((shift < 1) || (shift > MAX_PGTABLE_INDEX_SIZE));
> -
> -	if (PGT_CACHE(shift))
> -		return; /* Already have a cache of this size */
> -
> -	align = max_t(unsigned long, align, minalign);
> -	name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift);
> -	new = kmem_cache_create(name, table_size, align, 0, ctor);
> -	kfree(name);
> -	pgtable_cache[shift - 1] = new;
> -	pr_debug("Allocated pgtable cache for order %d\n", shift);
> -}
> -
> -
> -void pgtable_cache_init(void)
> -{
> -	pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
> -	pgtable_cache_add(PMD_CACHE_INDEX, pmd_ctor);
> -	/*
> -	 * In all current configs, when the PUD index exists it's the
> -	 * same size as either the pgd or pmd index except with THP enabled
> -	 * on book3s 64
> -	 */
> -	if (PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE))
> -		pgtable_cache_add(PUD_INDEX_SIZE, pud_ctor);
> -
> -	if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_CACHE_INDEX))
> -		panic("Couldn't allocate pgtable caches");
> -	if (PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE))
> -		panic("Couldn't allocate pud pgtable caches");
> -}
> -
>  #ifdef CONFIG_SPARSEMEM_VMEMMAP
>  /*
>   * Given an address within the vmemmap, determine the pfn of the page that
> diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
> index 0ae0572..a65c0b4 100644
> --- a/arch/powerpc/mm/pgtable_32.c
> +++ b/arch/powerpc/mm/pgtable_32.c
> @@ -42,43 +42,6 @@ EXPORT_SYMBOL(ioremap_bot);	/* aka VMALLOC_END */
>  
>  extern char etext[], _stext[], _sinittext[], _einittext[];
>  
> -#define PGDIR_ORDER	(32 + PGD_T_LOG2 - PGDIR_SHIFT)
> -
> -#ifndef CONFIG_PPC_4K_PAGES
> -static struct kmem_cache *pgtable_cache;
> -
> -void pgtable_cache_init(void)
> -{
> -	pgtable_cache = kmem_cache_create("PGDIR cache", 1 << PGDIR_ORDER,
> -					  1 << PGDIR_ORDER, 0, NULL);
> -	if (pgtable_cache == NULL)
> -		panic("Couldn't allocate pgtable caches");
> -}
> -#endif
> -
> -pgd_t *pgd_alloc(struct mm_struct *mm)
> -{
> -	pgd_t *ret;
> -
> -	/* pgdir take page or two with 4K pages and a page fraction otherwise */
> -#ifndef CONFIG_PPC_4K_PAGES
> -	ret = kmem_cache_alloc(pgtable_cache, GFP_KERNEL | __GFP_ZERO);
> -#else
> -	ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
> -			PGDIR_ORDER - PAGE_SHIFT);
> -#endif
> -	return ret;
> -}
> -
> -void pgd_free(struct mm_struct *mm, pgd_t *pgd)
> -{
> -#ifndef CONFIG_PPC_4K_PAGES
> -	kmem_cache_free(pgtable_cache, (void *)pgd);
> -#else
> -	free_pages((unsigned long)pgd, PGDIR_ORDER - PAGE_SHIFT);
> -#endif
> -}
> -
>  __ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
>  {
>  	pte_t *pte;
> -- 
> 2.1.0

I still didn't quite follow why we are replacing

 -	hugepte_cache =  kmem_cache_create("hugepte-cache", sizeof(pte_t),
 -					   HUGEPD_SHIFT_MASK + 1, 0, NULL);
 +	pgtable_cache_add(1, NULL);

-aneesh
Christophe Leroy Aug. 14, 2016, 6:51 p.m. UTC | #2
On 14/08/2016 at 16:17, Aneesh Kumar K.V wrote:
> Christophe Leroy <christophe.leroy@c-s.fr> writes:
>
>> Today, powerpc64 uses a set of pgtable_caches while powerpc32 uses
>> standard pages when using 4k pages and a single pgtable_cache
>> if using other page sizes. In addition, powerpc32 uses another cache
>> when handling huge pages.
>>
>> In preparation for implementing huge pages on the 8xx, this patch
>> replaces the specific powerpc32 handling with the 64-bit approach.
>
> Why is this needed? Can you also summarize the page size used and the
> hugepage format you are planning to use? What are the page sizes
> supported by the 8xx? Also, is the new code a copy of the existing
> powerpc64 4k page size code?

The 8xx supports two huge page sizes: 8M and 512k.
As PGD entries point to 4M page tables, we are in a heterogeneous
situation:
1/ when using 8M huge pages, we are in the same situation as what is
done for the BOOK3S (which supports 16M, 256M and 1G), that is several
PGD entries pointing to a single PTE entry.
2/ when using 512k huge pages, we are in the same situation as what is
done for the BOOK3E: a PGD entry points to the hugepage table that
handles several huge pages (in our case 8 huge pages).
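
As a rough back-of-the-envelope illustration of those two cases, using
the same shift arithmetic as __hugepte_alloc() in
arch/powerpc/mm/hugetlbpage.c (the pdshift/pshift values below are
assumptions derived from the sizes above, not taken from actual 8xx
code):

#include <stdio.h>

int main(void)
{
	unsigned int pdshift = 22;	/* each PGD entry covers 4M */
	unsigned int pshift_8m = 23;	/* 8M huge page */
	unsigned int pshift_512k = 19;	/* 512k huge page */

	/* case 1/: huge page larger than a PGD slot, so several PGD
	 * entries point to the same huge PTE */
	printf("8M: %u PGD entries per huge page\n",
	       1u << (pshift_8m - pdshift));		/* -> 2 */

	/* case 2/: huge page smaller than a PGD slot, so one PGD entry
	 * points to a table holding several huge PTEs */
	printf("512k: %u huge pages per hugepage table\n",
	       1u << (pdshift - pshift_512k));		/* -> 8 */

	return 0;
}

So case 1/ needs two PGD entries per 8M page, while case 2/ packs eight
huge PTEs in a single table below one PGD entry.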

The code from init_64 has been moved to a new file named init-common in
order to be used by init_32 too.
The code from the 64-bit .h has been copied into the 32-bit .h (indeed
it has been copied twice, as the .h files are now duplicated into
nohash and book3s versions).

[...]

>> diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
>> index 7372ee1..9164a77 100644
>> --- a/arch/powerpc/mm/hugetlbpage.c
>> +++ b/arch/powerpc/mm/hugetlbpage.c
>> @@ -68,7 +68,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
>>  #ifdef CONFIG_PPC_FSL_BOOK3E
>>  	int i;
>>  	int num_hugepd = 1 << (pshift - pdshift);
>> -	cachep = hugepte_cache;
>> +	cachep = PGT_CACHE(1);
>>  #else
>>  	cachep = PGT_CACHE(pdshift - pshift);
>>  #endif
>
> Can you explain the usage of PGT_CACHE(1)?

[...]

>
> I still didn't quite follow why we are replacing
>
>  -	hugepte_cache =  kmem_cache_create("hugepte-cache", sizeof(pte_t),
>  -					   HUGEPD_SHIFT_MASK + 1, 0, NULL);
>  +	pgtable_cache_add(1, NULL);
>

Er... Indeed I wanted something to replace hugepte_cache. But it looks
like it should be something like PGT_CACHE(0) for 32-bit targets having
32-bit PTEs and PGT_CACHE(1) for 32-bit targets having 64-bit PTEs.
But PGT_CACHE(0) doesn't exist (yet).

Looking once more, that might not really be needed, I think. I'll rework
it and see what I can achieve.
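
For reference, a minimal sketch of the object size a given PGT_CACHE()
index corresponds to, following the table_size = sizeof(void *) << shift
computation in pgtable_cache_add() (the 4-byte pointer size below is an
assumption for a 32-bit target):

#include <stdio.h>

int main(void)
{
	unsigned long ptr_size = 4;	/* sizeof(void *) on a 32-bit target */

	/* shift 0: 4 bytes, i.e. one 32-bit PTE -- but PGT_CACHE(0) does
	 * not exist, the PGT_CACHE() macro BUG()s on a zero shift */
	printf("shift 0 -> %lu bytes\n", ptr_size << 0);

	/* shift 1: 8 bytes, i.e. one 64-bit PTE, which is what the old
	 * hugepte_cache (created with sizeof(pte_t)) gives when PTEs
	 * are 64 bits */
	printf("shift 1 -> %lu bytes\n", ptr_size << 1);

	return 0;
}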

Christophe

Aneesh Kumar K.V Aug. 15, 2016, 10:23 a.m. UTC | #3
christophe leroy <christophe.leroy@c-s.fr> writes:

> On 14/08/2016 at 16:17, Aneesh Kumar K.V wrote:
>> Christophe Leroy <christophe.leroy@c-s.fr> writes:
>>
>>> Today, powerpc64 uses a set of pgtable_caches while powerpc32 uses
>>> standard pages when using 4k pages and a single pgtable_cache
>>> if using other page sizes. In addition, powerpc32 uses another cache
>>> when handling huge pages.
>>>
>>> In preparation for implementing huge pages on the 8xx, this patch
>>> replaces the specific powerpc32 handling with the 64-bit approach.
>>
>> Why is this needed? Can you also summarize the page size used and the
>> hugepage format you are planning to use? What are the page sizes
>> supported by the 8xx? Also, is the new code a copy of the existing
>> powerpc64 4k page size code?
>
> The 8xx supports two huge page sizes: 8M and 512k.
> As PGD entries point to 4M page tables, we are in a heterogeneous
> situation:
> 1/ when using 8M huge pages, we are in the same situation as what is
> done for the BOOK3S (which supports 16M, 256M and 1G), that is several
> PGD entries pointing to a single PTE entry.

What is done for FSL BOOK3E?


> 2/ when using 512k huge pages, we are in the same situation as what is
> done for the BOOK3E: a PGD entry points to the hugepage table that
> handles several huge pages (in our case 8 huge pages).
>

What is done for Book3s with 4K Linux page size?


So the idea here is to allocate a different hugepte table based on the
hugepage size requested, and hence the need to switch from hugepte_cache
to a more generic PGT_CACHE?

> The code from init_64 has been moved to a new file named init-common in
> order to be used by init_32 too.
> The code from the 64-bit .h has been copied into the 32-bit .h (indeed
> it has been copied twice, as the .h files are now duplicated into
> nohash and book3s versions).


That explanation made it a lot easier to follow the patch. Can we capture
that in the commit message too? Also, do we support hugepages with both
4k and 16K Linux page sizes? I guess we do, because the 8xx only does a
two-level Linux page table?

>
> [...]
>
>>> diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
>>> index 7372ee1..9164a77 100644
>>> --- a/arch/powerpc/mm/hugetlbpage.c
>>> +++ b/arch/powerpc/mm/hugetlbpage.c
>>> @@ -68,7 +68,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
>>>  #ifdef CONFIG_PPC_FSL_BOOK3E
>>>  	int i;
>>>  	int num_hugepd = 1 << (pshift - pdshift);
>>> -	cachep = hugepte_cache;
>>> +	cachep = PGT_CACHE(1);
>>>  #else
>>>  	cachep = PGT_CACHE(pdshift - pshift);
>>>  #endif
>>
>> Can you explain the usage of PGT_CACHE(1)?
>
> [...]
>
>>
>> I still didn't quite follow why we are replacing
>>
>>  -	hugepte_cache =  kmem_cache_create("hugepte-cache", sizeof(pte_t),
>>  -					   HUGEPD_SHIFT_MASK + 1, 0, NULL);
>>  +	pgtable_cache_add(1, NULL);
>>
>
> Er... Indeed I wanted something to replace hugepte_cache. But it looks
> like it should be something like PGT_CACHE(0) for 32-bit targets having
> 32-bit PTEs and PGT_CACHE(1) for 32-bit targets having 64-bit PTEs.
> But PGT_CACHE(0) doesn't exist (yet).
>
> Looking once more, that might not really be needed, I think. I'll rework
> it and see what I can achieve.
>

Thanks
-aneesh

Patch

diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h
index 8e21bb4..ab215fd 100644
--- a/arch/powerpc/include/asm/book3s/32/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h
@@ -2,14 +2,42 @@ 
 #define _ASM_POWERPC_BOOK3S_32_PGALLOC_H
 
 #include <linux/threads.h>
+#include <linux/slab.h>
 
-/* For 32-bit, all levels of page tables are just drawn from get_free_page() */
-#define MAX_PGTABLE_INDEX_SIZE	0
+/*
+ * Functions that deal with pagetables that could be at any level of
+ * the table need to be passed an "index_size" so they know how to
+ * handle allocation.  For PTE pages (which are linked to a struct
+ * page for now, and drawn from the main get_free_pages() pool), the
+ * allocation size will be (2^index_size * sizeof(pointer)) and
+ * allocations are drawn from the kmem_cache in PGT_CACHE(index_size).
+ *
+ * The maximum index size needs to be big enough to allow any
+ * pagetable sizes we need, but small enough to fit in the low bits of
+ * any page table pointer.  In other words all pagetables, even tiny
+ * ones, must be aligned to allow at least enough low 0 bits to
+ * contain this value.  This value is also used as a mask, so it must
+ * be one less than a power of two.
+ */
+#define MAX_PGTABLE_INDEX_SIZE	0xf
 
 extern void __bad_pte(pmd_t *pmd);
 
-extern pgd_t *pgd_alloc(struct mm_struct *mm);
-extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
+extern struct kmem_cache *pgtable_cache[];
+#define PGT_CACHE(shift) ({				\
+			BUG_ON(!(shift));		\
+			pgtable_cache[(shift) - 1];	\
+		})
+
+static inline pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+	return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), GFP_KERNEL);
+}
+
+static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+	kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd);
+}
 
 /*
  * We don't have any real pmd's, and this code never triggers because
@@ -68,8 +96,12 @@  static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
 
 static inline void pgtable_free(void *table, unsigned index_size)
 {
-	BUG_ON(index_size); /* 32-bit doesn't use this */
-	free_page((unsigned long)table);
+	if (!index_size)
+		free_page((unsigned long)table);
+	else {
+		BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE);
+		kmem_cache_free(PGT_CACHE(index_size), table);
+	}
 }
 
 #define check_pgt_cache()	do { } while (0)
diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 38b33dc..83a2159 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -8,6 +8,26 @@ 
 /* And here we include common definitions */
 #include <asm/pte-common.h>
 
+#define PTE_INDEX_SIZE	PTE_SHIFT
+#define PMD_INDEX_SIZE	0
+#define PUD_INDEX_SIZE	0
+#define PGD_INDEX_SIZE	(32 - PGDIR_SHIFT)
+
+#define PMD_CACHE_INDEX	PMD_INDEX_SIZE
+
+#ifndef __ASSEMBLY__
+#define PTE_TABLE_SIZE	(sizeof(pte_t) << PTE_INDEX_SIZE)
+#define PMD_TABLE_SIZE	(sizeof(pmd_t) << PTE_INDEX_SIZE)
+#define PUD_TABLE_SIZE	(sizeof(pud_t) << PTE_INDEX_SIZE)
+#define PGD_TABLE_SIZE	(sizeof(pgd_t) << PGD_INDEX_SIZE)
+#endif	/* __ASSEMBLY__ */
+
+#define PTRS_PER_PTE	(1 << PTE_INDEX_SIZE)
+#define PTRS_PER_PGD	(1 << PGD_INDEX_SIZE)
+
+/* With 4k base page size, hugepage PTEs go at the PMD level */
+#define MIN_HUGEPTE_SHIFT	PMD_SHIFT
+
 /*
  * The normal case is that PTEs are 32-bits and we have a 1-page
  * 1024-entry pgdir pointing to 1-page 1024-entry PTE pages.  -- paulus
@@ -19,14 +39,10 @@ 
  * -Matt
  */
 /* PGDIR_SHIFT determines what a top-level page table entry can map */
-#define PGDIR_SHIFT	(PAGE_SHIFT + PTE_SHIFT)
+#define PGDIR_SHIFT	(PAGE_SHIFT + PTE_INDEX_SIZE)
 #define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
 
-#define PTRS_PER_PTE	(1 << PTE_SHIFT)
-#define PTRS_PER_PMD	1
-#define PTRS_PER_PGD	(1 << (32 - PGDIR_SHIFT))
-
 #define USER_PTRS_PER_PGD	(TASK_SIZE / PGDIR_SIZE)
 /*
  * This is the bottom of the PKMAP area with HIGHMEM or an arbitrary
@@ -82,12 +98,8 @@ 
 
 extern unsigned long ioremap_bot;
 
-/*
- * entries per page directory level: our page-table tree is two-level, so
- * we don't really have any PMD directory.
- */
-#define PTE_TABLE_SIZE	(sizeof(pte_t) << PTE_SHIFT)
-#define PGD_TABLE_SIZE	(sizeof(pgd_t) << (32 - PGDIR_SHIFT))
+/* Bits to mask out from a PGD to get to the PUD page */
+#define PGD_MASKED_BITS		0
 
 #define pte_ERROR(e) \
 	pr_err("%s:%d: bad pte %llx.\n", __FILE__, __LINE__, \
@@ -282,15 +294,6 @@  static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
 #define __pte_to_swp_entry(pte)		((swp_entry_t) { pte_val(pte) >> 3 })
 #define __swp_entry_to_pte(x)		((pte_t) { (x).val << 3 })
 
-#ifndef CONFIG_PPC_4K_PAGES
-void pgtable_cache_init(void);
-#else
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()	do { } while (0)
-#endif
-
 extern int get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep,
 		      pmd_t **pmdp);
 
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 263bf39..3f85d43 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -786,9 +786,6 @@  extern struct page *pgd_page(pgd_t pgd);
 #define pgd_ERROR(e) \
 	pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
 
-void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
-void pgtable_cache_init(void);
-
 static inline int map_kernel_page(unsigned long ea, unsigned long pa,
 				  unsigned long flags)
 {
diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index c5517f4..c201cd6 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -5,8 +5,6 @@ 
 #include <asm/page.h>
 #include <asm-generic/hugetlb.h>
 
-extern struct kmem_cache *hugepte_cache;
-
 #ifdef CONFIG_PPC_BOOK3S_64
 
 #include <asm/book3s/64/hugetlb-radix.h>
diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h
index 76d6b9e..c2fe85c 100644
--- a/arch/powerpc/include/asm/nohash/32/pgalloc.h
+++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h
@@ -2,14 +2,42 @@ 
 #define _ASM_POWERPC_PGALLOC_32_H
 
 #include <linux/threads.h>
+#include <linux/slab.h>
 
-/* For 32-bit, all levels of page tables are just drawn from get_free_page() */
-#define MAX_PGTABLE_INDEX_SIZE	0
+/*
+ * Functions that deal with pagetables that could be at any level of
+ * the table need to be passed an "index_size" so they know how to
+ * handle allocation.  For PTE pages (which are linked to a struct
+ * page for now, and drawn from the main get_free_pages() pool), the
+ * allocation size will be (2^index_size * sizeof(pointer)) and
+ * allocations are drawn from the kmem_cache in PGT_CACHE(index_size).
+ *
+ * The maximum index size needs to be big enough to allow any
+ * pagetable sizes we need, but small enough to fit in the low bits of
+ * any page table pointer.  In other words all pagetables, even tiny
+ * ones, must be aligned to allow at least enough low 0 bits to
+ * contain this value.  This value is also used as a mask, so it must
+ * be one less than a power of two.
+ */
+#define MAX_PGTABLE_INDEX_SIZE	0xf
 
 extern void __bad_pte(pmd_t *pmd);
 
-extern pgd_t *pgd_alloc(struct mm_struct *mm);
-extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
+extern struct kmem_cache *pgtable_cache[];
+#define PGT_CACHE(shift) ({				\
+			BUG_ON(!(shift));		\
+			pgtable_cache[(shift) - 1];	\
+		})
+
+static inline pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+	return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), GFP_KERNEL);
+}
+
+static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+	kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd);
+}
 
 /*
  * We don't have any real pmd's, and this code never triggers because
@@ -68,8 +96,12 @@  static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
 
 static inline void pgtable_free(void *table, unsigned index_size)
 {
-	BUG_ON(index_size); /* 32-bit doesn't use this */
-	free_page((unsigned long)table);
+	if (!index_size)
+		free_page((unsigned long)table);
+	else {
+		BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE);
+		kmem_cache_free(PGT_CACHE(index_size), table);
+	}
 }
 
 #define check_pgt_cache()	do { } while (0)
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h
index 7808475..8a2937d 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -16,6 +16,26 @@  extern int icache_44x_need_flush;
 
 #endif /* __ASSEMBLY__ */
 
+#define PTE_INDEX_SIZE	PTE_SHIFT
+#define PMD_INDEX_SIZE	0
+#define PUD_INDEX_SIZE	0
+#define PGD_INDEX_SIZE	(32 - PGDIR_SHIFT)
+
+#define PMD_CACHE_INDEX	PMD_INDEX_SIZE
+
+#ifndef __ASSEMBLY__
+#define PTE_TABLE_SIZE	(sizeof(pte_t) << PTE_INDEX_SIZE)
+#define PMD_TABLE_SIZE	(sizeof(pmd_t) << PTE_INDEX_SIZE)
+#define PUD_TABLE_SIZE	(sizeof(pud_t) << PTE_INDEX_SIZE)
+#define PGD_TABLE_SIZE	(sizeof(pgd_t) << PGD_INDEX_SIZE)
+#endif	/* __ASSEMBLY__ */
+
+#define PTRS_PER_PTE	(1 << PTE_INDEX_SIZE)
+#define PTRS_PER_PGD	(1 << PGD_INDEX_SIZE)
+
+/* With 4k base page size, hugepage PTEs go at the PMD level */
+#define MIN_HUGEPTE_SHIFT	PMD_SHIFT
+
 /*
  * The normal case is that PTEs are 32-bits and we have a 1-page
  * 1024-entry pgdir pointing to 1-page 1024-entry PTE pages.  -- paulus
@@ -27,22 +47,12 @@  extern int icache_44x_need_flush;
  * -Matt
  */
 /* PGDIR_SHIFT determines what a top-level page table entry can map */
-#define PGDIR_SHIFT	(PAGE_SHIFT + PTE_SHIFT)
+#define PGDIR_SHIFT	(PAGE_SHIFT + PTE_INDEX_SIZE)
 #define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
 
-/*
- * entries per page directory level: our page-table tree is two-level, so
- * we don't really have any PMD directory.
- */
-#ifndef __ASSEMBLY__
-#define PTE_TABLE_SIZE	(sizeof(pte_t) << PTE_SHIFT)
-#define PGD_TABLE_SIZE	(sizeof(pgd_t) << (32 - PGDIR_SHIFT))
-#endif	/* __ASSEMBLY__ */
-
-#define PTRS_PER_PTE	(1 << PTE_SHIFT)
-#define PTRS_PER_PMD	1
-#define PTRS_PER_PGD	(1 << (32 - PGDIR_SHIFT))
+/* Bits to mask out from a PGD to get to the PUD page */
+#define PGD_MASKED_BITS		0
 
 #define USER_PTRS_PER_PGD	(TASK_SIZE / PGDIR_SIZE)
 #define FIRST_USER_ADDRESS	0UL
@@ -327,15 +337,6 @@  static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
 #define __pte_to_swp_entry(pte)		((swp_entry_t) { pte_val(pte) >> 3 })
 #define __swp_entry_to_pte(x)		((pte_t) { (x).val << 3 })
 
-#ifndef CONFIG_PPC_4K_PAGES
-void pgtable_cache_init(void);
-#else
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()	do { } while (0)
-#endif
-
 extern int get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep,
 		      pmd_t **pmdp);
 
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index d4d808c..b0fc9e4 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -357,8 +357,6 @@  static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
 #define __pte_to_swp_entry(pte)		((swp_entry_t) { pte_val((pte)) })
 #define __swp_entry_to_pte(x)		__pte((x).val)
 
-void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
-void pgtable_cache_init(void);
 extern int map_kernel_page(unsigned long ea, unsigned long pa,
 			   unsigned long flags);
 extern int __meminit vmemmap_create_mapping(unsigned long start,
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 9bd87f2..dd01212 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -78,6 +78,8 @@  static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
 
 unsigned long vmalloc_to_phys(void *vmalloc_addr);
 
+void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
+void pgtable_cache_init(void);
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_PGTABLE_H */
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index f2cea6d..08bb010 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -7,7 +7,7 @@  subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
 ccflags-$(CONFIG_PPC64)	:= $(NO_MINIMAL_TOC)
 
 obj-y				:= fault.o mem.o pgtable.o mmap.o \
-				   init_$(CONFIG_WORD_SIZE).o \
+				   init_$(CONFIG_WORD_SIZE).o init-common.o \
 				   pgtable_$(CONFIG_WORD_SIZE).o
 obj-$(CONFIG_PPC_MMU_NOHASH)	+= mmu_context_nohash.o tlb_nohash.o \
 				   tlb_nohash_low.o
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 7372ee1..9164a77 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -68,7 +68,7 @@  static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 #ifdef CONFIG_PPC_FSL_BOOK3E
 	int i;
 	int num_hugepd = 1 << (pshift - pdshift);
-	cachep = hugepte_cache;
+	cachep = PGT_CACHE(1);
 #else
 	cachep = PGT_CACHE(pdshift - pshift);
 #endif
@@ -411,7 +411,7 @@  static void hugepd_free_rcu_callback(struct rcu_head *head)
 	unsigned int i;
 
 	for (i = 0; i < batch->index; i++)
-		kmem_cache_free(hugepte_cache, batch->ptes[i]);
+		kmem_cache_free(PGT_CACHE(1), batch->ptes[i]);
 
 	free_page((unsigned long)batch);
 }
@@ -425,7 +425,7 @@  static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
 	if (atomic_read(&tlb->mm->mm_users) < 2 ||
 	    cpumask_equal(mm_cpumask(tlb->mm),
 			  cpumask_of(smp_processor_id()))) {
-		kmem_cache_free(hugepte_cache, hugepte);
+		kmem_cache_free(PGT_CACHE(1), hugepte);
 		put_cpu_var(hugepd_freelist_cur);
 		return;
 	}
@@ -792,7 +792,6 @@  static int __init hugepage_setup_sz(char *str)
 __setup("hugepagesz=", hugepage_setup_sz);
 
 #ifdef CONFIG_PPC_FSL_BOOK3E
-struct kmem_cache *hugepte_cache;
 static int __init hugetlbpage_init(void)
 {
 	int psize;
@@ -815,9 +814,8 @@  static int __init hugetlbpage_init(void)
 	 * Create a kmem cache for hugeptes.  The bottom bits in the pte have
 	 * size information encoded in them, so align them to allow this
 	 */
-	hugepte_cache =  kmem_cache_create("hugepte-cache", sizeof(pte_t),
-					   HUGEPD_SHIFT_MASK + 1, 0, NULL);
-	if (hugepte_cache == NULL)
+	pgtable_cache_add(1, NULL);
+	if (!PGT_CACHE(1))
 		panic("%s: Unable to create kmem cache for hugeptes\n",
 		      __func__);
 
diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c
new file mode 100644
index 0000000..2632eab
--- /dev/null
+++ b/arch/powerpc/mm/init-common.c
@@ -0,0 +1,152 @@ 
+/*
+ *  PowerPC version
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *
+ *  Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  Dave Engebretsen <engebret@us.ibm.com>
+ *      Rework for PPC64 port.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#undef DEBUG
+
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/stddef.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/highmem.h>
+#include <linux/idr.h>
+#include <linux/nodemask.h>
+#include <linux/module.h>
+#include <linux/poison.h>
+#include <linux/memblock.h>
+#include <linux/hugetlb.h>
+#include <linux/slab.h>
+
+#include <asm/pgalloc.h>
+#include <asm/page.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/io.h>
+#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/uaccess.h>
+#include <asm/smp.h>
+#include <asm/machdep.h>
+#include <asm/tlb.h>
+#include <asm/eeh.h>
+#include <asm/processor.h>
+#include <asm/mmzone.h>
+#include <asm/cputable.h>
+#include <asm/sections.h>
+#include <asm/iommu.h>
+#include <asm/vdso.h>
+
+#include "mmu_decl.h"
+
+phys_addr_t memstart_addr = (phys_addr_t)~0ull;
+EXPORT_SYMBOL_GPL(memstart_addr);
+phys_addr_t kernstart_addr;
+EXPORT_SYMBOL_GPL(kernstart_addr);
+
+static void pgd_ctor(void *addr)
+{
+	memset(addr, 0, PGD_TABLE_SIZE);
+}
+
+static void pud_ctor(void *addr)
+{
+	memset(addr, 0, PUD_TABLE_SIZE);
+}
+
+static void pmd_ctor(void *addr)
+{
+	memset(addr, 0, PMD_TABLE_SIZE);
+}
+
+struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
+
+/*
+ * Create a kmem_cache() for pagetables.  This is not used for PTE
+ * pages - they're linked to struct page, come from the normal free
+ * pages pool and have a different entry size (see real_pte_t) to
+ * everything else.  Caches created by this function are used for all
+ * the higher level pagetables, and for hugepage pagetables.
+ */
+void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
+{
+	char *name;
+	unsigned long table_size = sizeof(void *) << shift;
+	unsigned long align = table_size;
+
+	/* When batching pgtable pointers for RCU freeing, we store
+	 * the index size in the low bits.  Table alignment must be
+	 * big enough to fit it.
+	 *
+	 * Likewise, hugeapge pagetable pointers contain a (different)
+	 * shift value in the low bits.  All tables must be aligned so
+	 * as to leave enough 0 bits in the address to contain it. */
+	unsigned long minalign = max(MAX_PGTABLE_INDEX_SIZE + 1,
+				     HUGEPD_SHIFT_MASK + 1);
+	struct kmem_cache *new;
+
+	/* It would be nice if this was a BUILD_BUG_ON(), but at the
+	 * moment, gcc doesn't seem to recognize is_power_of_2 as a
+	 * constant expression, so so much for that. */
+	BUG_ON(!is_power_of_2(minalign));
+	BUG_ON((shift < 1) || (shift > MAX_PGTABLE_INDEX_SIZE));
+
+	if (PGT_CACHE(shift))
+		return; /* Already have a cache of this size */
+
+	align = max_t(unsigned long, align, minalign);
+	name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift);
+	new = kmem_cache_create(name, table_size, align, 0, ctor);
+	kfree(name);
+	pgtable_cache[shift - 1] = new;
+	pr_debug("Allocated pgtable cache for order %d\n", shift);
+}
+
+
+void pgtable_cache_init(void)
+{
+	pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
+
+	if (PMD_INDEX_SIZE && !PGT_CACHE(PMD_INDEX_SIZE))
+		pgtable_cache_add(PMD_CACHE_INDEX, pmd_ctor);
+	/*
+	 * In all current configs, when the PUD index exists it's the
+	 * same size as either the pgd or pmd index except with THP enabled
+	 * on book3s 64
+	 */
+	if (PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE))
+		pgtable_cache_add(PUD_INDEX_SIZE, pud_ctor);
+
+	if (!PGT_CACHE(PGD_INDEX_SIZE))
+		panic("Couldn't allocate pgd cache");
+	if (PMD_INDEX_SIZE && !PGT_CACHE(PMD_INDEX_SIZE))
+		panic("Couldn't allocate pmd pgtable caches");
+	if (PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE))
+		panic("Couldn't allocate pud pgtable caches");
+}
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 448685f..79c24d4 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -59,11 +59,6 @@ 
 phys_addr_t total_memory;
 phys_addr_t total_lowmem;
 
-phys_addr_t memstart_addr = (phys_addr_t)~0ull;
-EXPORT_SYMBOL(memstart_addr);
-phys_addr_t kernstart_addr;
-EXPORT_SYMBOL(kernstart_addr);
-
 #ifdef CONFIG_RELOCATABLE
 /* Used in __va()/__pa() */
 long long virt_phys_offset;
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 16ada1e..4acd546 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -75,88 +75,6 @@ 
 #endif
 #endif /* CONFIG_PPC_STD_MMU_64 */
 
-phys_addr_t memstart_addr = ~0;
-EXPORT_SYMBOL_GPL(memstart_addr);
-phys_addr_t kernstart_addr;
-EXPORT_SYMBOL_GPL(kernstart_addr);
-
-static void pgd_ctor(void *addr)
-{
-	memset(addr, 0, PGD_TABLE_SIZE);
-}
-
-static void pud_ctor(void *addr)
-{
-	memset(addr, 0, PUD_TABLE_SIZE);
-}
-
-static void pmd_ctor(void *addr)
-{
-	memset(addr, 0, PMD_TABLE_SIZE);
-}
-
-struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
-
-/*
- * Create a kmem_cache() for pagetables.  This is not used for PTE
- * pages - they're linked to struct page, come from the normal free
- * pages pool and have a different entry size (see real_pte_t) to
- * everything else.  Caches created by this function are used for all
- * the higher level pagetables, and for hugepage pagetables.
- */
-void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
-{
-	char *name;
-	unsigned long table_size = sizeof(void *) << shift;
-	unsigned long align = table_size;
-
-	/* When batching pgtable pointers for RCU freeing, we store
-	 * the index size in the low bits.  Table alignment must be
-	 * big enough to fit it.
-	 *
-	 * Likewise, hugeapge pagetable pointers contain a (different)
-	 * shift value in the low bits.  All tables must be aligned so
-	 * as to leave enough 0 bits in the address to contain it. */
-	unsigned long minalign = max(MAX_PGTABLE_INDEX_SIZE + 1,
-				     HUGEPD_SHIFT_MASK + 1);
-	struct kmem_cache *new;
-
-	/* It would be nice if this was a BUILD_BUG_ON(), but at the
-	 * moment, gcc doesn't seem to recognize is_power_of_2 as a
-	 * constant expression, so so much for that. */
-	BUG_ON(!is_power_of_2(minalign));
-	BUG_ON((shift < 1) || (shift > MAX_PGTABLE_INDEX_SIZE));
-
-	if (PGT_CACHE(shift))
-		return; /* Already have a cache of this size */
-
-	align = max_t(unsigned long, align, minalign);
-	name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift);
-	new = kmem_cache_create(name, table_size, align, 0, ctor);
-	kfree(name);
-	pgtable_cache[shift - 1] = new;
-	pr_debug("Allocated pgtable cache for order %d\n", shift);
-}
-
-
-void pgtable_cache_init(void)
-{
-	pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
-	pgtable_cache_add(PMD_CACHE_INDEX, pmd_ctor);
-	/*
-	 * In all current configs, when the PUD index exists it's the
-	 * same size as either the pgd or pmd index except with THP enabled
-	 * on book3s 64
-	 */
-	if (PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE))
-		pgtable_cache_add(PUD_INDEX_SIZE, pud_ctor);
-
-	if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_CACHE_INDEX))
-		panic("Couldn't allocate pgtable caches");
-	if (PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE))
-		panic("Couldn't allocate pud pgtable caches");
-}
-
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 /*
  * Given an address within the vmemmap, determine the pfn of the page that
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 0ae0572..a65c0b4 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -42,43 +42,6 @@  EXPORT_SYMBOL(ioremap_bot);	/* aka VMALLOC_END */
 
 extern char etext[], _stext[], _sinittext[], _einittext[];
 
-#define PGDIR_ORDER	(32 + PGD_T_LOG2 - PGDIR_SHIFT)
-
-#ifndef CONFIG_PPC_4K_PAGES
-static struct kmem_cache *pgtable_cache;
-
-void pgtable_cache_init(void)
-{
-	pgtable_cache = kmem_cache_create("PGDIR cache", 1 << PGDIR_ORDER,
-					  1 << PGDIR_ORDER, 0, NULL);
-	if (pgtable_cache == NULL)
-		panic("Couldn't allocate pgtable caches");
-}
-#endif
-
-pgd_t *pgd_alloc(struct mm_struct *mm)
-{
-	pgd_t *ret;
-
-	/* pgdir take page or two with 4K pages and a page fraction otherwise */
-#ifndef CONFIG_PPC_4K_PAGES
-	ret = kmem_cache_alloc(pgtable_cache, GFP_KERNEL | __GFP_ZERO);
-#else
-	ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
-			PGDIR_ORDER - PAGE_SHIFT);
-#endif
-	return ret;
-}
-
-void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
-#ifndef CONFIG_PPC_4K_PAGES
-	kmem_cache_free(pgtable_cache, (void *)pgd);
-#else
-	free_pages((unsigned long)pgd, PGDIR_ORDER - PAGE_SHIFT);
-#endif
-}
-
 __ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
 {
 	pte_t *pte;