[1/2] powerpc/mm: Fix crashes with PUD level hugetlb config

Message ID 20180208103442.22045-1-aneesh.kumar@linux.vnet.ibm.com
State Superseded
Headers show
Series
  • [1/2] powerpc/mm: Fix crashes with PUD level hugetlb config
Related show

Commit Message

Aneesh Kumar K.V Feb. 8, 2018, 10:34 a.m.
To support memory keys, we moved the hash pte slot information to the second
half of the page table. This was ok with PTE entries at level 4 and level 3.
We already allocate larger page table pages at those level to accomodate extra
details. For level 4 we already have the extra space which was used to track
4k hash page table entry details and at pmd level the extra space was allocated
to track the THP details.

With hugetlbfs PTE, we used this extra space at the PMD level to store the
slot details. But we also support hugetlbfs PTE at PUD leve and PUD level page
didn't allocate extra space. This resulted in memory corruption.

Fix this by allocating extra space at PUD level when HUGETLB is enabled. We
may need further changes to allocate larger space at PMD level when we enable
HUGETLB. That will be done in next patch.

Fixes:bf9a95f9a6481bc6e(" powerpc: Free up four 64K PTE bits in 64K backed HPTE pages")

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
NOTE: In the long run we may want to look at my patch series to remove slot
tracking completely.

 arch/powerpc/include/asm/book3s/32/pgtable.h  |  1 +
 arch/powerpc/include/asm/book3s/64/hash-64k.h |  5 +++++
 arch/powerpc/include/asm/book3s/64/hash.h     | 10 ++++++++++
 arch/powerpc/include/asm/book3s/64/pgalloc.h  |  6 +++---
 arch/powerpc/include/asm/book3s/64/pgtable.h  |  2 ++
 arch/powerpc/include/asm/nohash/32/pgtable.h  |  1 +
 arch/powerpc/include/asm/nohash/64/pgtable.h  |  1 +
 arch/powerpc/mm/hash_utils_64.c               |  1 +
 arch/powerpc/mm/init-common.c                 |  4 ++--
 arch/powerpc/mm/pgtable-radix.c               |  1 +
 arch/powerpc/mm/pgtable_64.c                  |  2 ++
 11 files changed, 29 insertions(+), 5 deletions(-)

Comments

Aneesh Kumar K.V Feb. 8, 2018, 3:16 p.m. | #1
"Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> writes:

> To support memory keys, we moved the hash pte slot information to the second
> half of the page table. This was ok with PTE entries at level 4 and level 3.
> We already allocate larger page table pages at those level to accomodate extra
> details. For level 4 we already have the extra space which was used to track
> 4k hash page table entry details and at pmd level the extra space was allocated
> to track the THP details.
>
> With hugetlbfs PTE, we used this extra space at the PMD level to store the
> slot details. But we also support hugetlbfs PTE at PUD leve and PUD level page
> didn't allocate extra space. This resulted in memory corruption.
>
> Fix this by allocating extra space at PUD level when HUGETLB is enabled. We
> may need further changes to allocate larger space at PMD level when we enable
> HUGETLB. That will be done in next patch.
>
> Fixes:bf9a95f9a6481bc6e(" powerpc: Free up four 64K PTE bits in 64K backed HPTE pages")
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>

Another fix, I still get random memory corruption with hugetlb test with
16G hugepage config.

commit f9484ac6b06bfacfeb82f1116bfc95e396fe7453
Author: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Date:   Thu Feb 8 19:36:22 2018 +0530

    powerpc/mm/hash64: Store the slot information at the right offset.
    
    The hugetlb pte entries are at the PMD and PUD level. Use the right offset
    for them to get the second half of the table.
    
    Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>

diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index 949d691094a4..67c5475311ee 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -63,7 +63,8 @@ static inline int hash__hugepd_ok(hugepd_t hpd)
  * keeping the prototype consistent across the two formats.
  */
 static inline unsigned long pte_set_hidx(pte_t *ptep, real_pte_t rpte,
-			unsigned int subpg_index, unsigned long hidx)
+					 unsigned int subpg_index, unsigned long hidx,
+					 int offset)
 {
 	return (hidx << H_PAGE_F_GIX_SHIFT) &
 		(H_PAGE_F_SECOND | H_PAGE_F_GIX);
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index ee440fb3d240..3bcf269f8f55 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -45,7 +45,7 @@
  * generic accessors and iterators here
  */
 #define __real_pte __real_pte
-static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep)
+static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep, int offset)
 {
 	real_pte_t rpte;
 	unsigned long *hidxp;
@@ -59,7 +59,7 @@ static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep)
 	 */
 	smp_rmb();
 
-	hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);
+	hidxp = (unsigned long *)(ptep + offset);
 	rpte.hidx = *hidxp;
 	return rpte;
 }
@@ -86,9 +86,10 @@ static inline unsigned long __rpte_to_hidx(real_pte_t rpte, unsigned long index)
  * expected to modify the PTE bits accordingly and commit the PTE to memory.
  */
 static inline unsigned long pte_set_hidx(pte_t *ptep, real_pte_t rpte,
-		unsigned int subpg_index, unsigned long hidx)
+					 unsigned int subpg_index,
+					 unsigned long hidx, int offset)
 {
-	unsigned long *hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);
+	unsigned long *hidxp = (unsigned long *)(ptep + offset);
 
 	rpte.hidx &= ~HIDX_BITS(0xfUL, subpg_index);
 	*hidxp = rpte.hidx  | HIDX_BITS(HIDX_SHIFT_BY_ONE(hidx), subpg_index);
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 3c14663d457d..d92707aff762 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -350,7 +350,7 @@ extern unsigned long pci_io_base;
  */
 #ifndef __real_pte
 
-#define __real_pte(e,p)		((real_pte_t){(e)})
+#define __real_pte(e, p, o)		((real_pte_t){(e)})
 #define __rpte_to_pte(r)	((r).pte)
 #define __rpte_to_hidx(r,index)	(pte_val(__rpte_to_pte(r)) >> H_PAGE_F_GIX_SHIFT)
 
diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/hash64_4k.c
index 5a69b51d08a3..d573d7d07f25 100644
--- a/arch/powerpc/mm/hash64_4k.c
+++ b/arch/powerpc/mm/hash64_4k.c
@@ -55,7 +55,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
 	 * need to add in 0x1 if it's a read-only user page
 	 */
 	rflags = htab_convert_pte_flags(new_pte);
-	rpte = __real_pte(__pte(old_pte), ptep);
+	rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE);
 
 	if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
 	    !cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
@@ -117,7 +117,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
 			return -1;
 		}
 		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
-		new_pte |= pte_set_hidx(ptep, rpte, 0, slot);
+		new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE);
 	}
 	*ptep = __pte(new_pte & ~H_PAGE_BUSY);
 	return 0;
diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c
index 2253bbc6a599..e601d95c3b20 100644
--- a/arch/powerpc/mm/hash64_64k.c
+++ b/arch/powerpc/mm/hash64_64k.c
@@ -86,7 +86,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
 
 	subpg_index = (ea & (PAGE_SIZE - 1)) >> shift;
 	vpn  = hpt_vpn(ea, vsid, ssize);
-	rpte = __real_pte(__pte(old_pte), ptep);
+	rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE);
 	/*
 	 *None of the sub 4k page is hashed
 	 */
@@ -214,7 +214,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
 		return -1;
 	}
 
-	new_pte |= pte_set_hidx(ptep, rpte, subpg_index, slot);
+	new_pte |= pte_set_hidx(ptep, rpte, subpg_index, slot, PTRS_PER_PTE);
 	new_pte |= H_PAGE_HASHPTE;
 
 	*ptep = __pte(new_pte & ~H_PAGE_BUSY);
@@ -262,7 +262,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
 	} while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
 
 	rflags = htab_convert_pte_flags(new_pte);
-	rpte = __real_pte(__pte(old_pte), ptep);
+	rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE);
 
 	if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
 	    !cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
@@ -327,7 +327,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
 		}
 
 		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
-		new_pte |= pte_set_hidx(ptep, rpte, 0, slot);
+		new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE);
 	}
 	*ptep = __pte(new_pte & ~H_PAGE_BUSY);
 	return 0;
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index 12511f5a015f..b320f5097a06 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -27,7 +27,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 	unsigned long vpn;
 	unsigned long old_pte, new_pte;
 	unsigned long rflags, pa, sz;
-	long slot;
+	long slot, offset;
 
 	BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
 
@@ -63,7 +63,11 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 	} while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
 
 	rflags = htab_convert_pte_flags(new_pte);
-	rpte = __real_pte(__pte(old_pte), ptep);
+	if (unlikely(mmu_psize == MMU_PAGE_16G))
+		offset = PTRS_PER_PUD;
+	else
+		offset = PTRS_PER_PMD;
+	rpte = __real_pte(__pte(old_pte), ptep, offset);
 
 	sz = ((1UL) << shift);
 	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
@@ -104,7 +108,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 			return -1;
 		}
 
-		new_pte |= pte_set_hidx(ptep, rpte, 0, slot);
+		new_pte |= pte_set_hidx(ptep, rpte, 0, slot, offset);
 	}
 
 	/*
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index 881ebd53ffc2..9b23f12e863c 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -51,7 +51,7 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
 	unsigned int psize;
 	int ssize;
 	real_pte_t rpte;
-	int i;
+	int i, offset;
 
 	i = batch->index;
 
@@ -67,6 +67,10 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
 		psize = get_slice_psize(mm, addr);
 		/* Mask the address for the correct page size */
 		addr &= ~((1UL << mmu_psize_defs[psize].shift) - 1);
+		if (unlikely(psize == MMU_PAGE_16G))
+			offset = PTRS_PER_PUD;
+		else
+			offset = PTRS_PER_PMD;
 #else
 		BUG();
 		psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */
@@ -78,6 +82,7 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
 		 * support 64k pages, this might be different from the
 		 * hardware page size encoded in the slice table. */
 		addr &= PAGE_MASK;
+		offset = PTRS_PER_PTE;
 	}
 
 
@@ -91,7 +96,7 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
 	}
 	WARN_ON(vsid == 0);
 	vpn = hpt_vpn(addr, vsid, ssize);
-	rpte = __real_pte(__pte(pte), ptep);
+	rpte = __real_pte(__pte(pte), ptep, offset);
 
 	/*
 	 * Check if we have an active batch on this CPU. If not, just
Ram Pai Feb. 8, 2018, 7:22 p.m. | #2
On Thu, Feb 08, 2018 at 04:04:41PM +0530, Aneesh Kumar K.V wrote:
> To support memory keys, we moved the hash pte slot information to the second
> half of the page table. This was ok with PTE entries at level 4 and level 3.
> We already allocate larger page table pages at those level to accomodate extra
> details. For level 4 we already have the extra space which was used to track
> 4k hash page table entry details and at pmd level the extra space was allocated
> to track the THP details.
> 
> With hugetlbfs PTE, we used this extra space at the PMD level to store the
> slot details. But we also support hugetlbfs PTE at PUD leve and PUD level page
> didn't allocate extra space. This resulted in memory corruption.
> 
> Fix this by allocating extra space at PUD level when HUGETLB is enabled. We
> may need further changes to allocate larger space at PMD level when we enable
> HUGETLB. That will be done in next patch.
> 
> Fixes:bf9a95f9a6481bc6e(" powerpc: Free up four 64K PTE bits in 64K backed HPTE pages")

hmm.. did not know that hugetlbs operated at the PUD level. Thanks for
catching this.

> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> ---
> NOTE: In the long run we may want to look at my patch series to remove slot
...snip...
>  }
> 
>  static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
> index 51017726d495..3c14663d457d 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> @@ -232,11 +232,13 @@ extern unsigned long __pmd_index_size;
>  extern unsigned long __pud_index_size;
>  extern unsigned long __pgd_index_size;
>  extern unsigned long __pmd_cache_index;
> +extern unsigned long __pud_cache_index;
>  #define PTE_INDEX_SIZE  __pte_index_size
>  #define PMD_INDEX_SIZE  __pmd_index_size
>  #define PUD_INDEX_SIZE  __pud_index_size
>  #define PGD_INDEX_SIZE  __pgd_index_size
>  #define PMD_CACHE_INDEX __pmd_cache_index
> +#define PUD_CACHE_INDEX __pmd_cache_index

This is a typo. Should be 'pud' not a 'pmd'.

#define PUD_CACHE_INDEX __pud_cache_index
                           ^

>  /*
>   * Because of use of pte fragments and THP, size of page table
..snip...


RP
Ram Pai Feb. 8, 2018, 7:29 p.m. | #3
On Thu, Feb 08, 2018 at 08:46:27PM +0530, Aneesh Kumar K.V wrote:
> "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> writes:
> 
> > To support memory keys, we moved the hash pte slot information to the second
> > half of the page table. This was ok with PTE entries at level 4 and level 3.
> > We already allocate larger page table pages at those level to accomodate extra
> > details. For level 4 we already have the extra space which was used to track
> > 4k hash page table entry details and at pmd level the extra space was allocated
> > to track the THP details.
> >
> > With hugetlbfs PTE, we used this extra space at the PMD level to store the
> > slot details. But we also support hugetlbfs PTE at PUD leve and PUD level page
> > didn't allocate extra space. This resulted in memory corruption.
> >
> > Fix this by allocating extra space at PUD level when HUGETLB is enabled. We
> > may need further changes to allocate larger space at PMD level when we enable
> > HUGETLB. That will be done in next patch.
> >
> > Fixes:bf9a95f9a6481bc6e(" powerpc: Free up four 64K PTE bits in 64K backed HPTE pages")
> >
> > Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> 
> Another fix, I still get random memory corruption with hugetlb test with
> 16G hugepage config.

this fix may not be needed. It random corruption may be artifact of the typo you
had in your first patch?

RP
Aneesh Kumar K.V Feb. 9, 2018, 3:30 p.m. | #4
On 02/09/2018 12:52 AM, Ram Pai wrote:
> On Thu, Feb 08, 2018 at 04:04:41PM +0530, Aneesh Kumar K.V wrote:
>> To support memory keys, we moved the hash pte slot information to the second
>> half of the page table. This was ok with PTE entries at level 4 and level 3.
>> We already allocate larger page table pages at those level to accomodate extra
>> details. For level 4 we already have the extra space which was used to track
>> 4k hash page table entry details and at pmd level the extra space was allocated
>> to track the THP details.
>>
>> With hugetlbfs PTE, we used this extra space at the PMD level to store the
>> slot details. But we also support hugetlbfs PTE at PUD leve and PUD level page
>> didn't allocate extra space. This resulted in memory corruption.
>>
>> Fix this by allocating extra space at PUD level when HUGETLB is enabled. We
>> may need further changes to allocate larger space at PMD level when we enable
>> HUGETLB. That will be done in next patch.
>>
>> Fixes:bf9a95f9a6481bc6e(" powerpc: Free up four 64K PTE bits in 64K backed HPTE pages")
> 
> hmm.. did not know that hugetlbs operated at the PUD level. Thanks for
> catching this.
> 
>>
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
>> ---
>> NOTE: In the long run we may want to look at my patch series to remove slot
> ...snip...
>>   }
>>
>>   static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
>> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
>> index 51017726d495..3c14663d457d 100644
>> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
>> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
>> @@ -232,11 +232,13 @@ extern unsigned long __pmd_index_size;
>>   extern unsigned long __pud_index_size;
>>   extern unsigned long __pgd_index_size;
>>   extern unsigned long __pmd_cache_index;
>> +extern unsigned long __pud_cache_index;
>>   #define PTE_INDEX_SIZE  __pte_index_size
>>   #define PMD_INDEX_SIZE  __pmd_index_size
>>   #define PUD_INDEX_SIZE  __pud_index_size
>>   #define PGD_INDEX_SIZE  __pgd_index_size
>>   #define PMD_CACHE_INDEX __pmd_cache_index
>> +#define PUD_CACHE_INDEX __pmd_cache_index
> 
> This is a typo. Should be 'pud' not a 'pmd'.
> 
> #define PUD_CACHE_INDEX __pud_cache_index

Thanks fixed that.

-aneesh
Aneesh Kumar K.V Feb. 9, 2018, 3:31 p.m. | #5
On 02/09/2018 12:59 AM, Ram Pai wrote:
> On Thu, Feb 08, 2018 at 08:46:27PM +0530, Aneesh Kumar K.V wrote:
>> "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> writes:
>>
>>> To support memory keys, we moved the hash pte slot information to the second
>>> half of the page table. This was ok with PTE entries at level 4 and level 3.
>>> We already allocate larger page table pages at those level to accomodate extra
>>> details. For level 4 we already have the extra space which was used to track
>>> 4k hash page table entry details and at pmd level the extra space was allocated
>>> to track the THP details.
>>>
>>> With hugetlbfs PTE, we used this extra space at the PMD level to store the
>>> slot details. But we also support hugetlbfs PTE at PUD leve and PUD level page
>>> didn't allocate extra space. This resulted in memory corruption.
>>>
>>> Fix this by allocating extra space at PUD level when HUGETLB is enabled. We
>>> may need further changes to allocate larger space at PMD level when we enable
>>> HUGETLB. That will be done in next patch.
>>>
>>> Fixes:bf9a95f9a6481bc6e(" powerpc: Free up four 64K PTE bits in 64K backed HPTE pages")
>>>
>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
>>
>> Another fix, I still get random memory corruption with hugetlb test with
>> 16G hugepage config.
> 
> this fix may not be needed. It random corruption may be artifact of the typo you
> had in your first patch?

Why? the tables at level 2 and leve3 are of different size and we should 
use the right offset to store the slot details. Even with the change you 
mentioned in the previous mail, I still have kernel crashes with 
hugetlbfs test running in parallel to a kernel build.

-aneesh
Aneesh Kumar K.V Feb. 10, 2018, 9:47 a.m. | #6
Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> writes:

> "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> writes:
>
>> To support memory keys, we moved the hash pte slot information to the second
>> half of the page table. This was ok with PTE entries at level 4 and level 3.
>> We already allocate larger page table pages at those level to accomodate extra
>> details. For level 4 we already have the extra space which was used to track
>> 4k hash page table entry details and at pmd level the extra space was allocated
>> to track the THP details.
>>
>> With hugetlbfs PTE, we used this extra space at the PMD level to store the
>> slot details. But we also support hugetlbfs PTE at PUD leve and PUD level page
>> didn't allocate extra space. This resulted in memory corruption.
>>
>> Fix this by allocating extra space at PUD level when HUGETLB is enabled. We
>> may need further changes to allocate larger space at PMD level when we enable
>> HUGETLB. That will be done in next patch.
>>
>> Fixes:bf9a95f9a6481bc6e(" powerpc: Free up four 64K PTE bits in 64K backed HPTE pages")
>>
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
>
> Another fix, I still get random memory corruption with hugetlb test with
> 16G hugepage config.

Another one. I am not sure whether we really want this in this form. But
with this tests are running fine.

-aneesh

commit 658fe8c310a913e69e5bc9a40d4c28a3b88d5c08
Author: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Date:   Sat Feb 10 13:17:34 2018 +0530

    powerpc/mm/hash64: memset the pagetable pages on allocation.
    
    Now that we are using second half of the table to store slot details and we
    don't clear them in the huge_pte_get_and_clear, we need to make sure we zero
    out the range on allocation. This done some extra work because the first half
    of the table is cleared by huge_pte_get_and_clear and memset in this patch
    zero-out the full table page.
    
    We need to do this for pgd and pud because both get allocated from the same slab
    cache.
    
    Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
The other option is to get huget_pte_get_and_clear to clear the second half of the page table.
That requires generic changes, because we don't have hugetlb page size available there.

diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h
index 53df86d3cfce..adb7fba4b6c7 100644
--- a/arch/powerpc/include/asm/book3s/64/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h
@@ -73,10 +73,13 @@ static inline void radix__pgd_free(struct mm_struct *mm, pgd_t *pgd)
 
 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
+	pgd_t *pgd;
 	if (radix_enabled())
 		return radix__pgd_alloc(mm);
-	return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE),
-		pgtable_gfp_flags(mm, GFP_KERNEL));
+	pgd = kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE),
+			       pgtable_gfp_flags(mm, GFP_KERNEL));
+	memset(pgd, 0, PGD_TABLE_SIZE);
+	return pgd;
 }
 
 static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
@@ -93,8 +96,11 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-	return kmem_cache_alloc(PGT_CACHE(PUD_CACHE_INDEX),
-		pgtable_gfp_flags(mm, GFP_KERNEL));
+	pud_t *pud;
+	pud = kmem_cache_alloc(PGT_CACHE(PUD_CACHE_INDEX),
+			       pgtable_gfp_flags(mm, GFP_KERNEL));
+	memset(pud, 0, PUD_TABLE_SIZE);
+	return pud;
 }
 
 static inline void pud_free(struct mm_struct *mm, pud_t *pud)
Ram Pai Feb. 10, 2018, 4:50 p.m. | #7
On Sat, Feb 10, 2018 at 03:17:02PM +0530, Aneesh Kumar K.V wrote:
> Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> writes:
> 
> > "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> writes:
> >
> >> To support memory keys, we moved the hash pte slot information to the second
> >> half of the page table. This was ok with PTE entries at level 4 and level 3.
> >> We already allocate larger page table pages at those level to accomodate extra
> >> details. For level 4 we already have the extra space which was used to track
> >> 4k hash page table entry details and at pmd level the extra space was allocated
> >> to track the THP details.
> >>
> >> With hugetlbfs PTE, we used this extra space at the PMD level to store the
> >> slot details. But we also support hugetlbfs PTE at PUD leve and PUD level page
> >> didn't allocate extra space. This resulted in memory corruption.
> >>
> >> Fix this by allocating extra space at PUD level when HUGETLB is enabled. We
> >> may need further changes to allocate larger space at PMD level when we enable
> >> HUGETLB. That will be done in next patch.
> >>
> >> Fixes:bf9a95f9a6481bc6e(" powerpc: Free up four 64K PTE bits in 64K backed HPTE pages")
> >>
> >> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> >
> > Another fix, I still get random memory corruption with hugetlb test with
> > 16G hugepage config.
> 
> Another one. I am not sure whether we really want this in this form. But
> with this tests are running fine.
> 
> -aneesh
> 
> commit 658fe8c310a913e69e5bc9a40d4c28a3b88d5c08
> Author: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> Date:   Sat Feb 10 13:17:34 2018 +0530
> 
>     powerpc/mm/hash64: memset the pagetable pages on allocation.
>     
>     Now that we are using second half of the table to store slot details and we
>     don't clear them in the huge_pte_get_and_clear, we need to make sure we zero
>     out the range on allocation. This done some extra work because the first half
>     of the table is cleared by huge_pte_get_and_clear and memset in this patch
>     zero-out the full table page.
>     
>     We need to do this for pgd and pud because both get allocated from the same slab
>     cache.

Do we need to zero pgd aswell to resolve your corruption? pud is not
sufficient? Or was it done to avoid issues in the future in case pgd
is used as the leaf; possibly for Terra_huge_pages?

RP
Aneesh Kumar K.V Feb. 10, 2018, 5:14 p.m. | #8
On 02/10/2018 10:20 PM, Ram Pai wrote:
> On Sat, Feb 10, 2018 at 03:17:02PM +0530, Aneesh Kumar K.V wrote:
>> Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> writes:
>>
>>> "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> writes:
>>>
>>>> To support memory keys, we moved the hash pte slot information to the second
>>>> half of the page table. This was ok with PTE entries at level 4 and level 3.
>>>> We already allocate larger page table pages at those level to accomodate extra
>>>> details. For level 4 we already have the extra space which was used to track
>>>> 4k hash page table entry details and at pmd level the extra space was allocated
>>>> to track the THP details.
>>>>
>>>> With hugetlbfs PTE, we used this extra space at the PMD level to store the
>>>> slot details. But we also support hugetlbfs PTE at PUD leve and PUD level page
>>>> didn't allocate extra space. This resulted in memory corruption.
>>>>
>>>> Fix this by allocating extra space at PUD level when HUGETLB is enabled. We
>>>> may need further changes to allocate larger space at PMD level when we enable
>>>> HUGETLB. That will be done in next patch.
>>>>
>>>> Fixes:bf9a95f9a6481bc6e(" powerpc: Free up four 64K PTE bits in 64K backed HPTE pages")
>>>>
>>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
>>>
>>> Another fix, I still get random memory corruption with hugetlb test with
>>> 16G hugepage config.
>>
>> Another one. I am not sure whether we really want this in this form. But
>> with this tests are running fine.
>>
>> -aneesh
>>
>> commit 658fe8c310a913e69e5bc9a40d4c28a3b88d5c08
>> Author: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
>> Date:   Sat Feb 10 13:17:34 2018 +0530
>>
>>      powerpc/mm/hash64: memset the pagetable pages on allocation.
>>      
>>      Now that we are using second half of the table to store slot details and we
>>      don't clear them in the huge_pte_get_and_clear, we need to make sure we zero
>>      out the range on allocation. This done some extra work because the first half
>>      of the table is cleared by huge_pte_get_and_clear and memset in this patch
>>      zero-out the full table page.
>>      
>>      We need to do this for pgd and pud because both get allocated from the same slab
>>      cache.
> 
> Do we need to zero pgd aswell to resolve your corruption? pud is not
> sufficient? Or was it done to avoid issues in the future in case pgd
> is used as the leaf; possibly for Terra_huge_pages?
> 

it is the other way round, we need to zero-out pgd. pud zerout can be 
optional because first half of the page table is always cleared in the 
unmap path.

-aneesh

Patch

diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 30a155c0a6b0..c615abdce119 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -16,6 +16,7 @@ 
 #define PGD_INDEX_SIZE	(32 - PGDIR_SHIFT)
 
 #define PMD_CACHE_INDEX	PMD_INDEX_SIZE
+#define PUD_CACHE_INDEX	PUD_INDEX_SIZE
 
 #ifndef __ASSEMBLY__
 #define PTE_TABLE_SIZE	(sizeof(pte_t) << PTE_INDEX_SIZE)
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index 338b7da468ce..c08b3b032ec0 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -146,7 +146,12 @@  static inline int hash__remap_4k_pfn(struct vm_area_struct *vma, unsigned long a
 #else
 #define H_PMD_TABLE_SIZE	(sizeof(pmd_t) << PMD_INDEX_SIZE)
 #endif
+#ifdef CONFIG_HUGETLB_PAGE
+#define H_PUD_TABLE_SIZE	((sizeof(pud_t) << PUD_INDEX_SIZE) +	\
+				 (sizeof(unsigned long) << PUD_INDEX_SIZE))
+#else
 #define H_PUD_TABLE_SIZE	(sizeof(pud_t) << PUD_INDEX_SIZE)
+#endif
 #define H_PGD_TABLE_SIZE	(sizeof(pgd_t) << PGD_INDEX_SIZE)
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index 0920eff731b3..234f141fb151 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -32,6 +32,16 @@ 
 #else
 #define H_PMD_CACHE_INDEX	H_PMD_INDEX_SIZE
 #endif
+/*
+ * We not store the slot details in the second half of page table.
+ * Increase the pud level table so that hugetlb ptes can be stored
+ * at pud level.
+ */
+#if defined(CONFIG_HUGETLB_PAGE) &&  defined(CONFIG_PPC_64K_PAGES)
+#define H_PUD_CACHE_INDEX	(H_PUD_INDEX_SIZE + 1)
+#else
+#define H_PUD_CACHE_INDEX	(H_PUD_INDEX_SIZE)
+#endif
 /*
  * Define the address range of the kernel non-linear virtual area
  */
diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h
index 1fcfa425cefa..53df86d3cfce 100644
--- a/arch/powerpc/include/asm/book3s/64/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h
@@ -93,13 +93,13 @@  static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-	return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE),
+	return kmem_cache_alloc(PGT_CACHE(PUD_CACHE_INDEX),
 		pgtable_gfp_flags(mm, GFP_KERNEL));
 }
 
 static inline void pud_free(struct mm_struct *mm, pud_t *pud)
 {
-	kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud);
+	kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), pud);
 }
 
 static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
@@ -115,7 +115,7 @@  static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
 	 * ahead and flush the page walk cache
 	 */
 	flush_tlb_pgtable(tlb, address);
-        pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE);
+        pgtable_free_tlb(tlb, pud, PUD_CACHE_INDEX);
 }
 
 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 51017726d495..3c14663d457d 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -232,11 +232,13 @@  extern unsigned long __pmd_index_size;
 extern unsigned long __pud_index_size;
 extern unsigned long __pgd_index_size;
 extern unsigned long __pmd_cache_index;
+extern unsigned long __pud_cache_index;
 #define PTE_INDEX_SIZE  __pte_index_size
 #define PMD_INDEX_SIZE  __pmd_index_size
 #define PUD_INDEX_SIZE  __pud_index_size
 #define PGD_INDEX_SIZE  __pgd_index_size
 #define PMD_CACHE_INDEX __pmd_cache_index
+#define PUD_CACHE_INDEX __pmd_cache_index
 /*
  * Because of use of pte fragments and THP, size of page table
  * are not always derived out of index size above.
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h
index 504a3c36ce5c..03bbd1149530 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -24,6 +24,7 @@  extern int icache_44x_need_flush;
 #define PGD_INDEX_SIZE	(32 - PGDIR_SHIFT)
 
 #define PMD_CACHE_INDEX	PMD_INDEX_SIZE
+#define PUD_CACHE_INDEX	PUD_INDEX_SIZE
 
 #ifndef __ASSEMBLY__
 #define PTE_TABLE_SIZE	(sizeof(pte_t) << PTE_INDEX_SIZE)
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index abddf5830ad5..5c5f75d005ad 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -27,6 +27,7 @@ 
 #else
 #define PMD_CACHE_INDEX	PMD_INDEX_SIZE
 #endif
+#define PUD_CACHE_INDEX PUD_INDEX_SIZE
 
 /*
  * Define the address range of the kernel non-linear virtual area
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 7d07c7e17db6..cf290d415dcd 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1008,6 +1008,7 @@  void __init hash__early_init_mmu(void)
 	__pmd_index_size = H_PMD_INDEX_SIZE;
 	__pud_index_size = H_PUD_INDEX_SIZE;
 	__pgd_index_size = H_PGD_INDEX_SIZE;
+	__pud_cache_index = H_PUD_CACHE_INDEX;
 	__pmd_cache_index = H_PMD_CACHE_INDEX;
 	__pte_table_size = H_PTE_TABLE_SIZE;
 	__pmd_table_size = H_PMD_TABLE_SIZE;
diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c
index eb8c6c8c4851..2b656e67f2ea 100644
--- a/arch/powerpc/mm/init-common.c
+++ b/arch/powerpc/mm/init-common.c
@@ -100,6 +100,6 @@  void pgtable_cache_init(void)
 	 * same size as either the pgd or pmd index except with THP enabled
 	 * on book3s 64
 	 */
-	if (PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE))
-		pgtable_cache_add(PUD_INDEX_SIZE, pud_ctor);
+	if (PUD_CACHE_INDEX && !PGT_CACHE(PUD_CACHE_INDEX))
+		pgtable_cache_add(PUD_CACHE_INDEX, pud_ctor);
 }
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 573a9a2ee455..27d096610369 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -535,6 +535,7 @@  void __init radix__early_init_mmu(void)
 	__pmd_index_size = RADIX_PMD_INDEX_SIZE;
 	__pud_index_size = RADIX_PUD_INDEX_SIZE;
 	__pgd_index_size = RADIX_PGD_INDEX_SIZE;
+	__pud_cache_index = RADIX_PUD_INDEX_SIZE;
 	__pmd_cache_index = RADIX_PMD_INDEX_SIZE;
 	__pte_table_size = RADIX_PTE_TABLE_SIZE;
 	__pmd_table_size = RADIX_PMD_TABLE_SIZE;
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index c9a623c2d8a2..a0f8928c0b86 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -82,6 +82,8 @@  unsigned long __pgd_index_size;
 EXPORT_SYMBOL(__pgd_index_size);
 unsigned long __pmd_cache_index;
 EXPORT_SYMBOL(__pmd_cache_index);
+unsigned long __pud_cache_index;
+EXPORT_SYMBOL(__pud_cache_index);
 unsigned long __pte_table_size;
 EXPORT_SYMBOL(__pte_table_size);
 unsigned long __pmd_table_size;