diff mbox series

[4/7] powerpc: Free up four 64K PTE bits in 64K backed HPTE pages

Message ID 1504910713-7094-5-git-send-email-linuxram@us.ibm.com (mailing list archive)
State Changes Requested
Headers show
Series powerpc: Free up RPAGE_RSV bits | expand

Commit Message

Ram Pai Sept. 8, 2017, 10:44 p.m. UTC
Rearrange 64K PTE bits to  free  up  bits 3, 4, 5  and  6
in the 64K backed HPTE pages. This along with the earlier
patch will  entirely free  up the four bits from 64K PTE.
The bit numbers are  big-endian as defined in the  ISA3.0

This patch  does  the  following change to 64K PTE backed
by 64K HPTE.

H_PAGE_F_SECOND (S) which  occupied  bit  4  moves to the
	second part of the pte to bit 60.
H_PAGE_F_GIX (G,I,X) which occupied bits 5, 6 and 7 also
	moves to the second part of the pte, to bits 61,
	62 and 63 respectively.

Since bit 7 is now freed up, we move H_PAGE_BUSY (B) from
bit 9 to bit 7.

The second part of the PTE will hold
(H_PAGE_F_SECOND|H_PAGE_F_GIX) at bit 60,61,62,63.
NOTE: None of the bits in the secondary PTE were previously
used by the 64k-HPTE backed PTE.

Before the patch, the 64K HPTE backed 64k PTE format was
as follows

 0 1 2 3 4  5  6  7  8 9 10...........................63
 : : : : :  :  :  :  : : :                            :
 v v v v v  v  v  v  v v v                            v

,-,-,-,-,--,--,--,--,-,-,-,-,-,------------------,-,-,-,
|x|x|x| |S |G |I |X |x|B| |x|x|................|x|x|x|x| <- primary pte
'_'_'_'_'__'__'__'__'_'_'_'_'_'________________'_'_'_'_'
| | | | |  |  |  |  | | | | |..................| | | | | <- secondary pte
'_'_'_'_'__'__'__'__'_'_'_'_'__________________'_'_'_'_'

After the patch, the 64k HPTE backed 64k PTE format is
as follows

 0 1 2 3 4  5  6  7  8 9 10...........................63
 : : : : :  :  :  :  : : :                            :
 v v v v v  v  v  v  v v v                            v

,-,-,-,-,--,--,--,--,-,-,-,-,-,------------------,-,-,-,
|x|x|x| |  |  |  |B |x| | |x|x|................|.|.|.|.| <- primary pte
'_'_'_'_'__'__'__'__'_'_'_'_'_'________________'_'_'_'_'
| | | | |  |  |  |  | | | | |..................|S|G|I|X| <- secondary pte
'_'_'_'_'__'__'__'__'_'_'_'_'__________________'_'_'_'_'

The above PTE changes are applicable to hugetlbpages as well.

The patch does the following code changes:

a) moves H_PAGE_F_SECOND and H_PAGE_F_GIX to the 4k PTE
	header since they are no longer needed by the 64k PTEs.
b) abstracts  out __real_pte() and __rpte_to_hidx() so the
	caller  need not know the bit location of the slot.
c) moves the slot bits to the secondary pte.

Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Ram Pai <linuxram@us.ibm.com>
---
 arch/powerpc/include/asm/book3s/64/hash-4k.h  |    3 ++
 arch/powerpc/include/asm/book3s/64/hash-64k.h |   29 +++++++++++-------------
 arch/powerpc/include/asm/book3s/64/hash.h     |    3 --
 arch/powerpc/mm/hash64_64k.c                  |   23 ++++++++-----------
 arch/powerpc/mm/hugetlbpage-hash64.c          |   18 ++++++---------
 5 files changed, 33 insertions(+), 43 deletions(-)

Comments

Balbir Singh Sept. 14, 2017, 1:44 a.m. UTC | #1
On Fri,  8 Sep 2017 15:44:44 -0700
Ram Pai <linuxram@us.ibm.com> wrote:

> Rearrange 64K PTE bits to  free  up  bits 3, 4, 5  and  6
> in the 64K backed HPTE pages. This along with the earlier
> patch will  entirely free  up the four bits from 64K PTE.
> The bit numbers are  big-endian as defined in the  ISA3.0
> 
> This patch  does  the  following change to 64K PTE backed
> by 64K HPTE.
> 
> H_PAGE_F_SECOND (S) which  occupied  bit  4  moves to the
> 	second part of the pte to bit 60.
> H_PAGE_F_GIX (G,I,X) which  occupied  bit 5, 6 and 7 also
> 	moves  to  the   second part of the pte to bit 61,
>        	62, 63, 64 respectively
> 
> since bit 7 is now freed up, we move H_PAGE_BUSY (B) from
> bit  9  to  bit  7.
> 
> The second part of the PTE will hold
> (H_PAGE_F_SECOND|H_PAGE_F_GIX) at bit 60,61,62,63.
> NOTE: None of the bits in the secondary PTE were not used
> by 64k-HPTE backed PTE.
> 
> Before the patch, the 64K HPTE backed 64k PTE format was
> as follows
> 
>  0 1 2 3 4  5  6  7  8 9 10...........................63
>  : : : : :  :  :  :  : : :                            :
>  v v v v v  v  v  v  v v v                            v
> 
> ,-,-,-,-,--,--,--,--,-,-,-,-,-,------------------,-,-,-,
> |x|x|x| |S |G |I |X |x|B| |x|x|................|x|x|x|x| <- primary pte
> '_'_'_'_'__'__'__'__'_'_'_'_'_'________________'_'_'_'_'
> | | | | |  |  |  |  | | | | |..................| | | | | <- secondary pte
> '_'_'_'_'__'__'__'__'_'_'_'_'__________________'_'_'_'_'
> 
> After the patch, the 64k HPTE backed 64k PTE format is
> as follows
> 
>  0 1 2 3 4  5  6  7  8 9 10...........................63
>  : : : : :  :  :  :  : : :                            :
>  v v v v v  v  v  v  v v v                            v
> 
> ,-,-,-,-,--,--,--,--,-,-,-,-,-,------------------,-,-,-,
> |x|x|x| |  |  |  |B |x| | |x|x|................|.|.|.|.| <- primary pte
> '_'_'_'_'__'__'__'__'_'_'_'_'_'________________'_'_'_'_'
> | | | | |  |  |  |  | | | | |..................|S|G|I|X| <- secondary pte
> '_'_'_'_'__'__'__'__'_'_'_'_'__________________'_'_'_'_'
> 
> The above PTE changes is applicable to hugetlbpages aswell.
> 
> The patch does the following code changes:
> 
> a) moves  the  H_PAGE_F_SECOND and  H_PAGE_F_GIX to 4k PTE
> 	header   since it is no more needed b the 64k PTEs.
> b) abstracts  out __real_pte() and __rpte_to_hidx() so the
> 	caller  need not know the bit location of the slot.
> c) moves the slot bits to the secondary pte.
> 
> Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> Signed-off-by: Ram Pai <linuxram@us.ibm.com>
> ---
>  arch/powerpc/include/asm/book3s/64/hash-4k.h  |    3 ++
>  arch/powerpc/include/asm/book3s/64/hash-64k.h |   29 +++++++++++-------------
>  arch/powerpc/include/asm/book3s/64/hash.h     |    3 --
>  arch/powerpc/mm/hash64_64k.c                  |   23 ++++++++-----------
>  arch/powerpc/mm/hugetlbpage-hash64.c          |   18 ++++++---------
>  5 files changed, 33 insertions(+), 43 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
> index e66bfeb..dc153c6 100644
> --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
> +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
> @@ -16,6 +16,9 @@
>  #define H_PUD_TABLE_SIZE	(sizeof(pud_t) << H_PUD_INDEX_SIZE)
>  #define H_PGD_TABLE_SIZE	(sizeof(pgd_t) << H_PGD_INDEX_SIZE)
>  
> +#define H_PAGE_F_GIX_SHIFT	56
> +#define H_PAGE_F_SECOND	_RPAGE_RSV2	/* HPTE is in 2ndary HPTEG */
> +#define H_PAGE_F_GIX	(_RPAGE_RSV3 | _RPAGE_RSV4 | _RPAGE_RPN44)
>  #define H_PAGE_BUSY	_RPAGE_RSV1     /* software: PTE & hash are busy */
>  
>  /* PTE flags to conserve for HPTE identification */
> diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
> index e038f1c..89ef5a9 100644
> --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
> +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
> @@ -12,7 +12,7 @@
>   */
>  #define H_PAGE_COMBO	_RPAGE_RPN0 /* this is a combo 4k page */
>  #define H_PAGE_4K_PFN	_RPAGE_RPN1 /* PFN is for a single 4k page */
> -#define H_PAGE_BUSY	_RPAGE_RPN42     /* software: PTE & hash are busy */
> +#define H_PAGE_BUSY	_RPAGE_RPN44     /* software: PTE & hash are busy */
>  
>  /*
>   * We need to differentiate between explicit huge page and THP huge
> @@ -21,8 +21,7 @@
>  #define H_PAGE_THP_HUGE  H_PAGE_4K_PFN
>  
>  /* PTE flags to conserve for HPTE identification */
> -#define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_F_SECOND | \
> -			 H_PAGE_F_GIX | H_PAGE_HASHPTE | H_PAGE_COMBO)
> +#define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_HASHPTE | H_PAGE_COMBO)
>  /*
>   * we support 16 fragments per PTE page of 64K size.
>   */
> @@ -50,24 +49,22 @@ static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep)
>  	unsigned long *hidxp;
>  
>  	rpte.pte = pte;
> -	rpte.hidx = 0;
> -	if (pte_val(pte) & H_PAGE_COMBO) {
> -		/*
> -		 * Make sure we order the hidx load against the H_PAGE_COMBO
> -		 * check. The store side ordering is done in __hash_page_4K
> -		 */
> -		smp_rmb();
> -		hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);
> -		rpte.hidx = *hidxp;
> -	}
> +	/*
> +	 * Ensure that we do not read the hidx before we read
> +	 * the pte. Because the writer side is  expected
> +	 * to finish writing the hidx first followed by the pte,
> +	 * by using smp_wmb().
> +	 * pte_set_hash_slot() ensures that.
> +	 */
> +	smp_rmb();
> +	hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);
> +	rpte.hidx = *hidxp;
>  	return rpte;
>  }
>  
>  static inline unsigned long __rpte_to_hidx(real_pte_t rpte, unsigned long index)
>  {
> -	if ((pte_val(rpte.pte) & H_PAGE_COMBO))
> -		return (rpte.hidx >> (index<<2)) & 0xf;
> -	return (pte_val(rpte.pte) >> H_PAGE_F_GIX_SHIFT) & 0xf;
> +	return ((rpte.hidx >> (index<<2)) & 0xfUL);
>  }
>  
>  /*
> diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
> index 8ce4112..46f3a23 100644
> --- a/arch/powerpc/include/asm/book3s/64/hash.h
> +++ b/arch/powerpc/include/asm/book3s/64/hash.h
> @@ -8,9 +8,6 @@
>   *
>   */
>  #define H_PTE_NONE_MASK		_PAGE_HPTEFLAGS
> -#define H_PAGE_F_GIX_SHIFT	56
> -#define H_PAGE_F_SECOND		_RPAGE_RSV2	/* HPTE is in 2ndary HPTEG */
> -#define H_PAGE_F_GIX		(_RPAGE_RSV3 | _RPAGE_RSV4 | _RPAGE_RPN44)
>  #define H_PAGE_HASHPTE		_RPAGE_RPN43	/* PTE has associated HPTE */
>  
>  #ifdef CONFIG_PPC_64K_PAGES
> diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c
> index c6c5559..9c63844 100644
> --- a/arch/powerpc/mm/hash64_64k.c
> +++ b/arch/powerpc/mm/hash64_64k.c
> @@ -103,8 +103,8 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
>  		 * On hash insert failure we use old pte value and we don't
>  		 * want slot information there if we have a insert failure.
>  		 */
> -		old_pte &= ~(H_PAGE_HASHPTE | H_PAGE_F_GIX | H_PAGE_F_SECOND);
> -		new_pte &= ~(H_PAGE_HASHPTE | H_PAGE_F_GIX | H_PAGE_F_SECOND);
> +		old_pte &= ~H_PAGE_HASHPTE;
> +		new_pte &= ~H_PAGE_HASHPTE;

Shouldn't we set old/new_pte.slot = invalid? via rpte.hidx

>  		goto htab_insert_hpte;
>  	}
>  	/*
> @@ -227,6 +227,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
>  		    unsigned long vsid, pte_t *ptep, unsigned long trap,
>  		    unsigned long flags, int ssize)
>  {
> +	real_pte_t rpte;
>  	unsigned long hpte_group;
>  	unsigned long rflags, pa;
>  	unsigned long old_pte, new_pte;
> @@ -263,6 +264,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
>  	} while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
>  
>  	rflags = htab_convert_pte_flags(new_pte);
> +	rpte = __real_pte(__pte(old_pte), ptep);
>  
>  	if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
>  	    !cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
> @@ -270,18 +272,13 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
>  
>  	vpn  = hpt_vpn(ea, vsid, ssize);
>  	if (unlikely(old_pte & H_PAGE_HASHPTE)) {
> +		unsigned long gslot;
>  		/*
>  		 * There MIGHT be an HPTE for this pte
>  		 */
> -		hash = hpt_hash(vpn, shift, ssize);
> -		if (old_pte & H_PAGE_F_SECOND)
> -			hash = ~hash;
> -		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
> -		slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT;
> -
> -		if (mmu_hash_ops.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_64K,
> -					       MMU_PAGE_64K, ssize,
> -					       flags) == -1)
> +		gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0);
> +		if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_64K,
> +				MMU_PAGE_64K, ssize, flags) == -1)
>  			old_pte &= ~_PAGE_HPTEFLAGS;
>  	}
>  
> @@ -328,9 +325,9 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
>  					   MMU_PAGE_64K, MMU_PAGE_64K, old_pte);
>  			return -1;
>  		}
> +
>  		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
> -		new_pte |= (slot << H_PAGE_F_GIX_SHIFT) &
> -			(H_PAGE_F_SECOND | H_PAGE_F_GIX);
> +		new_pte |= pte_set_hash_slot(ptep, rpte, 0, slot);
>  	}
>  	*ptep = __pte(new_pte & ~H_PAGE_BUSY);
>  	return 0;
> diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
> index a84bb44..d52d667 100644
> --- a/arch/powerpc/mm/hugetlbpage-hash64.c
> +++ b/arch/powerpc/mm/hugetlbpage-hash64.c
> @@ -22,6 +22,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
>  		     pte_t *ptep, unsigned long trap, unsigned long flags,
>  		     int ssize, unsigned int shift, unsigned int mmu_psize)
>  {
> +	real_pte_t rpte;
>  	unsigned long vpn;
>  	unsigned long old_pte, new_pte;
>  	unsigned long rflags, pa, sz;
> @@ -61,6 +62,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
>  	} while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
>  
>  	rflags = htab_convert_pte_flags(new_pte);
> +	rpte = __real_pte(__pte(old_pte), ptep);
>  
>  	sz = ((1UL) << shift);
>  	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
> @@ -71,16 +73,11 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
>  	/* Check if pte already has an hpte (case 2) */
>  	if (unlikely(old_pte & H_PAGE_HASHPTE)) {
>  		/* There MIGHT be an HPTE for this pte */
> -		unsigned long hash, slot;
> +		unsigned long gslot;
>  
> -		hash = hpt_hash(vpn, shift, ssize);
> -		if (old_pte & H_PAGE_F_SECOND)
> -			hash = ~hash;
> -		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
> -		slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT;
> -
> -		if (mmu_hash_ops.hpte_updatepp(slot, rflags, vpn, mmu_psize,
> -					       mmu_psize, ssize, flags) == -1)
> +		gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0);
> +		if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, mmu_psize,
> +				mmu_psize, ssize, flags) == -1)
>  			old_pte &= ~_PAGE_HPTEFLAGS;
>  	}
>  
> @@ -106,8 +103,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
>  			return -1;
>  		}
>  
> -		new_pte |= (slot << H_PAGE_F_GIX_SHIFT) &
> -			(H_PAGE_F_SECOND | H_PAGE_F_GIX);
> +		new_pte |= pte_set_hash_slot(ptep, rpte, 0, slot);
>  	}
>  
>  	/*

Balbir
Benjamin Herrenschmidt Sept. 14, 2017, 8:13 a.m. UTC | #2
On Fri, 2017-09-08 at 15:44 -0700, Ram Pai wrote:
> The second part of the PTE will hold
> (H_PAGE_F_SECOND|H_PAGE_F_GIX) at bit 60,61,62,63.
> NOTE: None of the bits in the secondary PTE were not used
> by 64k-HPTE backed PTE.

Have you measured the performance impact of this ? The second part of
the PTE being in a different cache line there could be one...

Cheers,
Ben.
Ram Pai Sept. 14, 2017, 5:54 p.m. UTC | #3
On Thu, Sep 14, 2017 at 11:44:49AM +1000, Balbir Singh wrote:
> On Fri,  8 Sep 2017 15:44:44 -0700
> Ram Pai <linuxram@us.ibm.com> wrote:
> 
> > Rearrange 64K PTE bits to  free  up  bits 3, 4, 5  and  6
> > in the 64K backed HPTE pages. This along with the earlier
> > patch will  entirely free  up the four bits from 64K PTE.
> > The bit numbers are  big-endian as defined in the  ISA3.0
> > 
> > This patch  does  the  following change to 64K PTE backed
> > by 64K HPTE.
> > 
> > H_PAGE_F_SECOND (S) which  occupied  bit  4  moves to the
> > 	second part of the pte to bit 60.
> > H_PAGE_F_GIX (G,I,X) which  occupied  bit 5, 6 and 7 also
> > 	moves  to  the   second part of the pte to bit 61,
> >        	62, 63, 64 respectively
> > 
> > since bit 7 is now freed up, we move H_PAGE_BUSY (B) from
> > bit  9  to  bit  7.
> > 
> > The second part of the PTE will hold
> > (H_PAGE_F_SECOND|H_PAGE_F_GIX) at bit 60,61,62,63.
> > NOTE: None of the bits in the secondary PTE were not used
> > by 64k-HPTE backed PTE.
> > 
> > Before the patch, the 64K HPTE backed 64k PTE format was
> > as follows
> > 
> >  0 1 2 3 4  5  6  7  8 9 10...........................63
> >  : : : : :  :  :  :  : : :                            :
> >  v v v v v  v  v  v  v v v                            v
> > 
> > ,-,-,-,-,--,--,--,--,-,-,-,-,-,------------------,-,-,-,
> > |x|x|x| |S |G |I |X |x|B| |x|x|................|x|x|x|x| <- primary pte
> > '_'_'_'_'__'__'__'__'_'_'_'_'_'________________'_'_'_'_'
> > | | | | |  |  |  |  | | | | |..................| | | | | <- secondary pte
> > '_'_'_'_'__'__'__'__'_'_'_'_'__________________'_'_'_'_'
> > 
> > After the patch, the 64k HPTE backed 64k PTE format is
> > as follows
> > 
> >  0 1 2 3 4  5  6  7  8 9 10...........................63
> >  : : : : :  :  :  :  : : :                            :
> >  v v v v v  v  v  v  v v v                            v
> > 
> > ,-,-,-,-,--,--,--,--,-,-,-,-,-,------------------,-,-,-,
> > |x|x|x| |  |  |  |B |x| | |x|x|................|.|.|.|.| <- primary pte
> > '_'_'_'_'__'__'__'__'_'_'_'_'_'________________'_'_'_'_'
> > | | | | |  |  |  |  | | | | |..................|S|G|I|X| <- secondary pte
> > '_'_'_'_'__'__'__'__'_'_'_'_'__________________'_'_'_'_'
> > 
> > The above PTE changes is applicable to hugetlbpages aswell.
> > 
> > The patch does the following code changes:
> > 
> > a) moves  the  H_PAGE_F_SECOND and  H_PAGE_F_GIX to 4k PTE
> > 	header   since it is no more needed b the 64k PTEs.
> > b) abstracts  out __real_pte() and __rpte_to_hidx() so the
> > 	caller  need not know the bit location of the slot.
> > c) moves the slot bits to the secondary pte.
> > 
> > Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> > Signed-off-by: Ram Pai <linuxram@us.ibm.com>
> > ---
> >  arch/powerpc/include/asm/book3s/64/hash-4k.h  |    3 ++
> >  arch/powerpc/include/asm/book3s/64/hash-64k.h |   29 +++++++++++-------------
> >  arch/powerpc/include/asm/book3s/64/hash.h     |    3 --
> >  arch/powerpc/mm/hash64_64k.c                  |   23 ++++++++-----------
> >  arch/powerpc/mm/hugetlbpage-hash64.c          |   18 ++++++---------
> >  5 files changed, 33 insertions(+), 43 deletions(-)
> > 
> > diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
> > index e66bfeb..dc153c6 100644
> > --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
> > +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
> > @@ -16,6 +16,9 @@
> >  #define H_PUD_TABLE_SIZE	(sizeof(pud_t) << H_PUD_INDEX_SIZE)
> >  #define H_PGD_TABLE_SIZE	(sizeof(pgd_t) << H_PGD_INDEX_SIZE)
> >  
> > +#define H_PAGE_F_GIX_SHIFT	56
> > +#define H_PAGE_F_SECOND	_RPAGE_RSV2	/* HPTE is in 2ndary HPTEG */
> > +#define H_PAGE_F_GIX	(_RPAGE_RSV3 | _RPAGE_RSV4 | _RPAGE_RPN44)
> >  #define H_PAGE_BUSY	_RPAGE_RSV1     /* software: PTE & hash are busy */
> >  
> >  /* PTE flags to conserve for HPTE identification */
> > diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
> > index e038f1c..89ef5a9 100644
> > --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
> > +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
> > @@ -12,7 +12,7 @@
> >   */
> >  #define H_PAGE_COMBO	_RPAGE_RPN0 /* this is a combo 4k page */
> >  #define H_PAGE_4K_PFN	_RPAGE_RPN1 /* PFN is for a single 4k page */
> > -#define H_PAGE_BUSY	_RPAGE_RPN42     /* software: PTE & hash are busy */
> > +#define H_PAGE_BUSY	_RPAGE_RPN44     /* software: PTE & hash are busy */
> >  
> >  /*
> >   * We need to differentiate between explicit huge page and THP huge
> > @@ -21,8 +21,7 @@
> >  #define H_PAGE_THP_HUGE  H_PAGE_4K_PFN
> >  
> >  /* PTE flags to conserve for HPTE identification */
> > -#define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_F_SECOND | \
> > -			 H_PAGE_F_GIX | H_PAGE_HASHPTE | H_PAGE_COMBO)
> > +#define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_HASHPTE | H_PAGE_COMBO)
> >  /*
> >   * we support 16 fragments per PTE page of 64K size.
> >   */
> > @@ -50,24 +49,22 @@ static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep)
> >  	unsigned long *hidxp;
> >  
> >  	rpte.pte = pte;
> > -	rpte.hidx = 0;
> > -	if (pte_val(pte) & H_PAGE_COMBO) {
> > -		/*
> > -		 * Make sure we order the hidx load against the H_PAGE_COMBO
> > -		 * check. The store side ordering is done in __hash_page_4K
> > -		 */
> > -		smp_rmb();
> > -		hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);
> > -		rpte.hidx = *hidxp;
> > -	}
> > +	/*
> > +	 * Ensure that we do not read the hidx before we read
> > +	 * the pte. Because the writer side is  expected
> > +	 * to finish writing the hidx first followed by the pte,
> > +	 * by using smp_wmb().
> > +	 * pte_set_hash_slot() ensures that.
> > +	 */
> > +	smp_rmb();
> > +	hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);
> > +	rpte.hidx = *hidxp;
> >  	return rpte;
> >  }
> >  
> >  static inline unsigned long __rpte_to_hidx(real_pte_t rpte, unsigned long index)
> >  {
> > -	if ((pte_val(rpte.pte) & H_PAGE_COMBO))
> > -		return (rpte.hidx >> (index<<2)) & 0xf;
> > -	return (pte_val(rpte.pte) >> H_PAGE_F_GIX_SHIFT) & 0xf;
> > +	return ((rpte.hidx >> (index<<2)) & 0xfUL);
> >  }
> >  
> >  /*
> > diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
> > index 8ce4112..46f3a23 100644
> > --- a/arch/powerpc/include/asm/book3s/64/hash.h
> > +++ b/arch/powerpc/include/asm/book3s/64/hash.h
> > @@ -8,9 +8,6 @@
> >   *
> >   */
> >  #define H_PTE_NONE_MASK		_PAGE_HPTEFLAGS
> > -#define H_PAGE_F_GIX_SHIFT	56
> > -#define H_PAGE_F_SECOND		_RPAGE_RSV2	/* HPTE is in 2ndary HPTEG */
> > -#define H_PAGE_F_GIX		(_RPAGE_RSV3 | _RPAGE_RSV4 | _RPAGE_RPN44)
> >  #define H_PAGE_HASHPTE		_RPAGE_RPN43	/* PTE has associated HPTE */
> >  
> >  #ifdef CONFIG_PPC_64K_PAGES
> > diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c
> > index c6c5559..9c63844 100644
> > --- a/arch/powerpc/mm/hash64_64k.c
> > +++ b/arch/powerpc/mm/hash64_64k.c
> > @@ -103,8 +103,8 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
> >  		 * On hash insert failure we use old pte value and we don't
> >  		 * want slot information there if we have a insert failure.
> >  		 */
> > -		old_pte &= ~(H_PAGE_HASHPTE | H_PAGE_F_GIX | H_PAGE_F_SECOND);
> > -		new_pte &= ~(H_PAGE_HASHPTE | H_PAGE_F_GIX | H_PAGE_F_SECOND);
> > +		old_pte &= ~H_PAGE_HASHPTE;
> > +		new_pte &= ~H_PAGE_HASHPTE;
> 
> Shouldn't we set old/new_pte.slot = invalid? via rpte.hidx

by resetting the H_PAGE_HASHPTE flag, we are invalidating
slot information.  Would that not be sufficient?

RP

> 
> >  		goto htab_insert_hpte;
> >  	}
> >  	/*
> > @@ -227,6 +227,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
> >  		    unsigned long vsid, pte_t *ptep, unsigned long trap,
> >  		    unsigned long flags, int ssize)
> >  {
> > +	real_pte_t rpte;
> >  	unsigned long hpte_group;
> >  	unsigned long rflags, pa;
> >  	unsigned long old_pte, new_pte;
> > @@ -263,6 +264,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
> >  	} while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
> >  
> >  	rflags = htab_convert_pte_flags(new_pte);
> > +	rpte = __real_pte(__pte(old_pte), ptep);
> >  
> >  	if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
> >  	    !cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
> > @@ -270,18 +272,13 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
> >  
> >  	vpn  = hpt_vpn(ea, vsid, ssize);
> >  	if (unlikely(old_pte & H_PAGE_HASHPTE)) {
> > +		unsigned long gslot;
> >  		/*
> >  		 * There MIGHT be an HPTE for this pte
> >  		 */
> > -		hash = hpt_hash(vpn, shift, ssize);
> > -		if (old_pte & H_PAGE_F_SECOND)
> > -			hash = ~hash;
> > -		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
> > -		slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT;
> > -
> > -		if (mmu_hash_ops.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_64K,
> > -					       MMU_PAGE_64K, ssize,
> > -					       flags) == -1)
> > +		gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0);
> > +		if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_64K,
> > +				MMU_PAGE_64K, ssize, flags) == -1)
> >  			old_pte &= ~_PAGE_HPTEFLAGS;
> >  	}
> >  
> > @@ -328,9 +325,9 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
> >  					   MMU_PAGE_64K, MMU_PAGE_64K, old_pte);
> >  			return -1;
> >  		}
> > +
> >  		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
> > -		new_pte |= (slot << H_PAGE_F_GIX_SHIFT) &
> > -			(H_PAGE_F_SECOND | H_PAGE_F_GIX);
> > +		new_pte |= pte_set_hash_slot(ptep, rpte, 0, slot);
> >  	}
> >  	*ptep = __pte(new_pte & ~H_PAGE_BUSY);
> >  	return 0;
> > diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
> > index a84bb44..d52d667 100644
> > --- a/arch/powerpc/mm/hugetlbpage-hash64.c
> > +++ b/arch/powerpc/mm/hugetlbpage-hash64.c
> > @@ -22,6 +22,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
> >  		     pte_t *ptep, unsigned long trap, unsigned long flags,
> >  		     int ssize, unsigned int shift, unsigned int mmu_psize)
> >  {
> > +	real_pte_t rpte;
> >  	unsigned long vpn;
> >  	unsigned long old_pte, new_pte;
> >  	unsigned long rflags, pa, sz;
> > @@ -61,6 +62,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
> >  	} while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
> >  
> >  	rflags = htab_convert_pte_flags(new_pte);
> > +	rpte = __real_pte(__pte(old_pte), ptep);
> >  
> >  	sz = ((1UL) << shift);
> >  	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
> > @@ -71,16 +73,11 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
> >  	/* Check if pte already has an hpte (case 2) */
> >  	if (unlikely(old_pte & H_PAGE_HASHPTE)) {
> >  		/* There MIGHT be an HPTE for this pte */
> > -		unsigned long hash, slot;
> > +		unsigned long gslot;
> >  
> > -		hash = hpt_hash(vpn, shift, ssize);
> > -		if (old_pte & H_PAGE_F_SECOND)
> > -			hash = ~hash;
> > -		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
> > -		slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT;
> > -
> > -		if (mmu_hash_ops.hpte_updatepp(slot, rflags, vpn, mmu_psize,
> > -					       mmu_psize, ssize, flags) == -1)
> > +		gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0);
> > +		if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, mmu_psize,
> > +				mmu_psize, ssize, flags) == -1)
> >  			old_pte &= ~_PAGE_HPTEFLAGS;
> >  	}
> >  
> > @@ -106,8 +103,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
> >  			return -1;
> >  		}
> >  
> > -		new_pte |= (slot << H_PAGE_F_GIX_SHIFT) &
> > -			(H_PAGE_F_SECOND | H_PAGE_F_GIX);
> > +		new_pte |= pte_set_hash_slot(ptep, rpte, 0, slot);
> >  	}
> >  
> >  	/*
> 
> Balbir
Ram Pai Sept. 14, 2017, 6:25 p.m. UTC | #4
On Thu, Sep 14, 2017 at 10:54:08AM -0700, Ram Pai wrote:
> On Thu, Sep 14, 2017 at 11:44:49AM +1000, Balbir Singh wrote:
> > On Fri,  8 Sep 2017 15:44:44 -0700
> > Ram Pai <linuxram@us.ibm.com> wrote:
> > 
> > > Rearrange 64K PTE bits to  free  up  bits 3, 4, 5  and  6
> > > in the 64K backed HPTE pages. This along with the earlier
> > > patch will  entirely free  up the four bits from 64K PTE.
> > > The bit numbers are  big-endian as defined in the  ISA3.0
> > > 
> > > This patch  does  the  following change to 64K PTE backed
> > > by 64K HPTE.
> > > 
> > > H_PAGE_F_SECOND (S) which  occupied  bit  4  moves to the
> > > 	second part of the pte to bit 60.
> > > H_PAGE_F_GIX (G,I,X) which  occupied  bit 5, 6 and 7 also
> > > 	moves  to  the   second part of the pte to bit 61,
> > >        	62, 63, 64 respectively
> > > 
> > > since bit 7 is now freed up, we move H_PAGE_BUSY (B) from
> > > bit  9  to  bit  7.
> > > 
> > > The second part of the PTE will hold
> > > (H_PAGE_F_SECOND|H_PAGE_F_GIX) at bit 60,61,62,63.
> > > NOTE: None of the bits in the secondary PTE were not used
> > > by 64k-HPTE backed PTE.
> > > 
> > > Before the patch, the 64K HPTE backed 64k PTE format was
> > > as follows
> > > 
> > >  0 1 2 3 4  5  6  7  8 9 10...........................63
> > >  : : : : :  :  :  :  : : :                            :
> > >  v v v v v  v  v  v  v v v                            v
> > > 
> > > ,-,-,-,-,--,--,--,--,-,-,-,-,-,------------------,-,-,-,
> > > |x|x|x| |S |G |I |X |x|B| |x|x|................|x|x|x|x| <- primary pte
> > > '_'_'_'_'__'__'__'__'_'_'_'_'_'________________'_'_'_'_'
> > > | | | | |  |  |  |  | | | | |..................| | | | | <- secondary pte
> > > '_'_'_'_'__'__'__'__'_'_'_'_'__________________'_'_'_'_'
> > > 
> > > After the patch, the 64k HPTE backed 64k PTE format is
> > > as follows
> > > 
> > >  0 1 2 3 4  5  6  7  8 9 10...........................63
> > >  : : : : :  :  :  :  : : :                            :
> > >  v v v v v  v  v  v  v v v                            v
> > > 
> > > ,-,-,-,-,--,--,--,--,-,-,-,-,-,------------------,-,-,-,
> > > |x|x|x| |  |  |  |B |x| | |x|x|................|.|.|.|.| <- primary pte
> > > '_'_'_'_'__'__'__'__'_'_'_'_'_'________________'_'_'_'_'
> > > | | | | |  |  |  |  | | | | |..................|S|G|I|X| <- secondary pte
> > > '_'_'_'_'__'__'__'__'_'_'_'_'__________________'_'_'_'_'
> > > 
> > > The above PTE changes is applicable to hugetlbpages aswell.
> > > 
> > > The patch does the following code changes:
> > > 
> > > a) moves  the  H_PAGE_F_SECOND and  H_PAGE_F_GIX to 4k PTE
> > > 	header   since it is no more needed b the 64k PTEs.
> > > b) abstracts  out __real_pte() and __rpte_to_hidx() so the
> > > 	caller  need not know the bit location of the slot.
> > > c) moves the slot bits to the secondary pte.
> > > 
> > > Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> > > Signed-off-by: Ram Pai <linuxram@us.ibm.com>
> > > ---
> > >  arch/powerpc/include/asm/book3s/64/hash-4k.h  |    3 ++
> > >  arch/powerpc/include/asm/book3s/64/hash-64k.h |   29 +++++++++++-------------
> > >  arch/powerpc/include/asm/book3s/64/hash.h     |    3 --
> > >  arch/powerpc/mm/hash64_64k.c                  |   23 ++++++++-----------
> > >  arch/powerpc/mm/hugetlbpage-hash64.c          |   18 ++++++---------
> > >  5 files changed, 33 insertions(+), 43 deletions(-)
> > > 
> > > diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
> > > index e66bfeb..dc153c6 100644
> > > --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
> > > +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
> > > @@ -16,6 +16,9 @@
> > >  #define H_PUD_TABLE_SIZE	(sizeof(pud_t) << H_PUD_INDEX_SIZE)
> > >  #define H_PGD_TABLE_SIZE	(sizeof(pgd_t) << H_PGD_INDEX_SIZE)
> > >  
> > > +#define H_PAGE_F_GIX_SHIFT	56
> > > +#define H_PAGE_F_SECOND	_RPAGE_RSV2	/* HPTE is in 2ndary HPTEG */
> > > +#define H_PAGE_F_GIX	(_RPAGE_RSV3 | _RPAGE_RSV4 | _RPAGE_RPN44)
> > >  #define H_PAGE_BUSY	_RPAGE_RSV1     /* software: PTE & hash are busy */
> > >  
> > >  /* PTE flags to conserve for HPTE identification */
> > > diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
> > > index e038f1c..89ef5a9 100644
> > > --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
> > > +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
> > > @@ -12,7 +12,7 @@
> > >   */
> > >  #define H_PAGE_COMBO	_RPAGE_RPN0 /* this is a combo 4k page */
> > >  #define H_PAGE_4K_PFN	_RPAGE_RPN1 /* PFN is for a single 4k page */
> > > -#define H_PAGE_BUSY	_RPAGE_RPN42     /* software: PTE & hash are busy */
> > > +#define H_PAGE_BUSY	_RPAGE_RPN44     /* software: PTE & hash are busy */
> > >  
> > >  /*
> > >   * We need to differentiate between explicit huge page and THP huge
> > > @@ -21,8 +21,7 @@
> > >  #define H_PAGE_THP_HUGE  H_PAGE_4K_PFN
> > >  
> > >  /* PTE flags to conserve for HPTE identification */
> > > -#define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_F_SECOND | \
> > > -			 H_PAGE_F_GIX | H_PAGE_HASHPTE | H_PAGE_COMBO)
> > > +#define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_HASHPTE | H_PAGE_COMBO)
> > >  /*
> > >   * we support 16 fragments per PTE page of 64K size.
> > >   */
> > > @@ -50,24 +49,22 @@ static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep)
> > >  	unsigned long *hidxp;
> > >  
> > >  	rpte.pte = pte;
> > > -	rpte.hidx = 0;
> > > -	if (pte_val(pte) & H_PAGE_COMBO) {
> > > -		/*
> > > -		 * Make sure we order the hidx load against the H_PAGE_COMBO
> > > -		 * check. The store side ordering is done in __hash_page_4K
> > > -		 */
> > > -		smp_rmb();
> > > -		hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);
> > > -		rpte.hidx = *hidxp;
> > > -	}
> > > +	/*
> > > +	 * Ensure that we do not read the hidx before we read
> > > +	 * the pte. Because the writer side is  expected
> > > +	 * to finish writing the hidx first followed by the pte,
> > > +	 * by using smp_wmb().
> > > +	 * pte_set_hash_slot() ensures that.
> > > +	 */
> > > +	smp_rmb();
> > > +	hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);
> > > +	rpte.hidx = *hidxp;
> > >  	return rpte;
> > >  }
> > >  
> > >  static inline unsigned long __rpte_to_hidx(real_pte_t rpte, unsigned long index)
> > >  {
> > > -	if ((pte_val(rpte.pte) & H_PAGE_COMBO))
> > > -		return (rpte.hidx >> (index<<2)) & 0xf;
> > > -	return (pte_val(rpte.pte) >> H_PAGE_F_GIX_SHIFT) & 0xf;
> > > +	return ((rpte.hidx >> (index<<2)) & 0xfUL);
> > >  }
> > >  
> > >  /*
> > > diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
> > > index 8ce4112..46f3a23 100644
> > > --- a/arch/powerpc/include/asm/book3s/64/hash.h
> > > +++ b/arch/powerpc/include/asm/book3s/64/hash.h
> > > @@ -8,9 +8,6 @@
> > >   *
> > >   */
> > >  #define H_PTE_NONE_MASK		_PAGE_HPTEFLAGS
> > > -#define H_PAGE_F_GIX_SHIFT	56
> > > -#define H_PAGE_F_SECOND		_RPAGE_RSV2	/* HPTE is in 2ndary HPTEG */
> > > -#define H_PAGE_F_GIX		(_RPAGE_RSV3 | _RPAGE_RSV4 | _RPAGE_RPN44)
> > >  #define H_PAGE_HASHPTE		_RPAGE_RPN43	/* PTE has associated HPTE */
> > >  
> > >  #ifdef CONFIG_PPC_64K_PAGES
> > > diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c
> > > index c6c5559..9c63844 100644
> > > --- a/arch/powerpc/mm/hash64_64k.c
> > > +++ b/arch/powerpc/mm/hash64_64k.c
> > > @@ -103,8 +103,8 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
> > >  		 * On hash insert failure we use old pte value and we don't
> > >  		 * want slot information there if we have a insert failure.
> > >  		 */
> > > -		old_pte &= ~(H_PAGE_HASHPTE | H_PAGE_F_GIX | H_PAGE_F_SECOND);
> > > -		new_pte &= ~(H_PAGE_HASHPTE | H_PAGE_F_GIX | H_PAGE_F_SECOND);
> > > +		old_pte &= ~H_PAGE_HASHPTE;
> > > +		new_pte &= ~H_PAGE_HASHPTE;
> > 
> > Shouldn't we set old/new_pte.slot = invalid? via rpte.hidx
> 
> by resetting the H_PAGE_HASHPTE flag, we are invalidating
> slot information.  Would that not be sufficient?

I think I misunderstood your question. Yes, rpte.hidx will have
to be reset to invalid. The code does that further down in that
function.

	if (!(old_pte & H_PAGE_COMBO))
		rpte.hidx = ~0x0UL;


RP
Aneesh Kumar K.V Oct. 23, 2017, 8:52 a.m. UTC | #5
Benjamin Herrenschmidt <benh@kernel.crashing.org> writes:

> On Fri, 2017-09-08 at 15:44 -0700, Ram Pai wrote:
>> The second part of the PTE will hold
>> (H_PAGE_F_SECOND|H_PAGE_F_GIX) at bit 60,61,62,63.
>> NOTE: None of the bits in the secondary PTE were used
>> by 64k-HPTE backed PTE.
>
> Have you measured the performance impact of this ? The second part of
> the PTE being in a different cache line there could be one...
>

I am also looking at a patch series removing the slot tracking
completely. With address randomization turned off, no swap in guest/host,
and making sure we touched most of guest RAM, I didn't find much impact
on performance when we don't track the slot at all. I will post the
patch series with numbers in a day or two. My test was:

while (5000) {
      mmap(128M)
      touch every page of 2048 pages
      munmap()
}

It could also be the best case in my run because I might have always
found the hash pte slot in the primary. In one measurement with swap on
and address randomization enabled, I did find a 50% impact. But then I
was not able to recreate that again. So it could be something I did wrong
in the test setup.

Ram,

Will you be able to get a test run with the above loop?

-aneesh
Ram Pai Oct. 23, 2017, 7:22 p.m. UTC | #6
On Thu, Sep 14, 2017 at 06:13:57PM +1000, Benjamin Herrenschmidt wrote:
> On Fri, 2017-09-08 at 15:44 -0700, Ram Pai wrote:
> > The second part of the PTE will hold
> > (H_PAGE_F_SECOND|H_PAGE_F_GIX) at bit 60,61,62,63.
> > NOTE: None of the bits in the secondary PTE were used
> > by 64k-HPTE backed PTE.
> 
> Have you measured the performance impact of this ? The second part of
> the PTE being in a different cache line there could be one...

Hmm... I missed responding to this comment.

I did a preliminary measurement running mmap bench in the selftest.
I ran it multiple times; almost always the numbers were either equal to
or better than those without the patch series.

RP
Ram Pai Oct. 23, 2017, 11:42 p.m. UTC | #7
On Mon, Oct 23, 2017 at 02:22:44PM +0530, Aneesh Kumar K.V wrote:
> Benjamin Herrenschmidt <benh@kernel.crashing.org> writes:
> 
> > On Fri, 2017-09-08 at 15:44 -0700, Ram Pai wrote:
> >> The second part of the PTE will hold
> >> (H_PAGE_F_SECOND|H_PAGE_F_GIX) at bit 60,61,62,63.
> >> NOTE: None of the bits in the secondary PTE were used
> >> by 64k-HPTE backed PTE.
> >
> > Have you measured the performance impact of this ? The second part of
> > the PTE being in a different cache line there could be one...
> >
> 
> I am also looking at a patch series removing the slot tracking
> completely. With randomize address turned off and no swap in guest/host
> and making sure we touched most of guest ram, I don't find much impact
> in performance when we don't track the slot at all. I will post the
> patch series with numbers in a day or two. But my test was
> 
> while (5000) {
>       mmap(128M)
>       touch every page of 2048 pages
>       munmap()
> }
> 
> I could also be the best case in my run because i might have always
> found the hash pte slot in the primary. In one measurement with swap on
> and address randmization enabled, i did find a 50% impact. But then i
> was not able to recreate that again. So could be something i did wrong
> in the test setup.
> 
> Ram,
> 
> Will you be able to get a test run with the above loop?

Yes. Results with the patch look good; better than without the patch.


/-----------------------------------------------\
|Itteratn| secs w/ patch	|secs w/o patch |
-------------------------------------------------
|1	 | 45.572621     	| 49.046994	|
|2	 | 46.049545     	| 49.378756	|
|3	 | 46.103657     	| 49.223591	|
|4	 | 46.298903     	| 48.991245	|
|5	 | 46.353202     	| 48.988033	|
|6	 | 45.440878     	| 49.175846	|
|7	 | 46.860373     	| 49.008395	|
|8	 | 46.221390     	| 49.236964	|
|9	 | 45.794993     	| 49.171927	|
|10	 | 46.569491     	| 48.995628	|
|-----------------------------------------------|
|average  | 46.1265053		| 49.1217379    |
\-----------------------------------------------/


The code is as follows:


diff --git a/tools/testing/selftests/powerpc/benchmarks/mmap_bench.c b/tools/testing/selftests/powerpc/benchmarks/mmap_bench.c
index 8d084a2..ef2ad87 100644
--- a/tools/testing/selftests/powerpc/benchmarks/mmap_bench.c
+++ b/tools/testing/selftests/powerpc/benchmarks/mmap_bench.c
@@ -10,14 +10,14 @@
 
 #include "utils.h"
 
-#define ITERATIONS 5000000
+#define ITERATIONS 5000
 
 #define MEMSIZE (128 * 1024 * 1024)
 
 int test_mmap(void)
 {
 	struct timespec ts_start, ts_end;
-	unsigned long i = ITERATIONS;
+	unsigned long i = ITERATIONS, j;
 
 	clock_gettime(CLOCK_MONOTONIC, &ts_start);
 
@@ -25,6 +25,10 @@ int test_mmap(void)
 		char *c = mmap(NULL, MEMSIZE, PROT_READ|PROT_WRITE,
 			       MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
 		FAIL_IF(c == MAP_FAILED);
+
+		for (j=0; j < (MEMSIZE >> 16); j++)
+			c[j<<16] = 0xf;
+
 		munmap(c, MEMSIZE);
 	}
Aneesh Kumar K.V Oct. 24, 2017, 3:37 a.m. UTC | #8
On 10/24/2017 12:52 AM, Ram Pai wrote:
> On Thu, Sep 14, 2017 at 06:13:57PM +1000, Benjamin Herrenschmidt wrote:
>> On Fri, 2017-09-08 at 15:44 -0700, Ram Pai wrote:
>>> The second part of the PTE will hold
>>> (H_PAGE_F_SECOND|H_PAGE_F_GIX) at bit 60,61,62,63.
>>> NOTE: None of the bits in the secondary PTE were used
>>> by 64k-HPTE backed PTE.
>>
>> Have you measured the performance impact of this ? The second part of
>> the PTE being in a different cache line there could be one...
> 
> hmm..missed responding to this comment.
> 
> I did a preliminay measurement running mmap bench in the selftest.
> Ran it multiple times. almost always the numbers were either equal-to
> or better-than without the patch-series.

mmap bench doesn't take any page faults. It is just mmap/munmap in a loop.

-aneesh
diff mbox series

Patch

diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index e66bfeb..dc153c6 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -16,6 +16,9 @@ 
 #define H_PUD_TABLE_SIZE	(sizeof(pud_t) << H_PUD_INDEX_SIZE)
 #define H_PGD_TABLE_SIZE	(sizeof(pgd_t) << H_PGD_INDEX_SIZE)
 
+#define H_PAGE_F_GIX_SHIFT	56
+#define H_PAGE_F_SECOND	_RPAGE_RSV2	/* HPTE is in 2ndary HPTEG */
+#define H_PAGE_F_GIX	(_RPAGE_RSV3 | _RPAGE_RSV4 | _RPAGE_RPN44)
 #define H_PAGE_BUSY	_RPAGE_RSV1     /* software: PTE & hash are busy */
 
 /* PTE flags to conserve for HPTE identification */
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index e038f1c..89ef5a9 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -12,7 +12,7 @@ 
  */
 #define H_PAGE_COMBO	_RPAGE_RPN0 /* this is a combo 4k page */
 #define H_PAGE_4K_PFN	_RPAGE_RPN1 /* PFN is for a single 4k page */
-#define H_PAGE_BUSY	_RPAGE_RPN42     /* software: PTE & hash are busy */
+#define H_PAGE_BUSY	_RPAGE_RPN44     /* software: PTE & hash are busy */
 
 /*
  * We need to differentiate between explicit huge page and THP huge
@@ -21,8 +21,7 @@ 
 #define H_PAGE_THP_HUGE  H_PAGE_4K_PFN
 
 /* PTE flags to conserve for HPTE identification */
-#define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_F_SECOND | \
-			 H_PAGE_F_GIX | H_PAGE_HASHPTE | H_PAGE_COMBO)
+#define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_HASHPTE | H_PAGE_COMBO)
 /*
  * we support 16 fragments per PTE page of 64K size.
  */
@@ -50,24 +49,22 @@  static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep)
 	unsigned long *hidxp;
 
 	rpte.pte = pte;
-	rpte.hidx = 0;
-	if (pte_val(pte) & H_PAGE_COMBO) {
-		/*
-		 * Make sure we order the hidx load against the H_PAGE_COMBO
-		 * check. The store side ordering is done in __hash_page_4K
-		 */
-		smp_rmb();
-		hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);
-		rpte.hidx = *hidxp;
-	}
+	/*
+	 * Ensure that we do not read the hidx before we read
+	 * the pte. Because the writer side is  expected
+	 * to finish writing the hidx first followed by the pte,
+	 * by using smp_wmb().
+	 * pte_set_hash_slot() ensures that.
+	 */
+	smp_rmb();
+	hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);
+	rpte.hidx = *hidxp;
 	return rpte;
 }
 
 static inline unsigned long __rpte_to_hidx(real_pte_t rpte, unsigned long index)
 {
-	if ((pte_val(rpte.pte) & H_PAGE_COMBO))
-		return (rpte.hidx >> (index<<2)) & 0xf;
-	return (pte_val(rpte.pte) >> H_PAGE_F_GIX_SHIFT) & 0xf;
+	return ((rpte.hidx >> (index<<2)) & 0xfUL);
 }
 
 /*
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index 8ce4112..46f3a23 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -8,9 +8,6 @@ 
  *
  */
 #define H_PTE_NONE_MASK		_PAGE_HPTEFLAGS
-#define H_PAGE_F_GIX_SHIFT	56
-#define H_PAGE_F_SECOND		_RPAGE_RSV2	/* HPTE is in 2ndary HPTEG */
-#define H_PAGE_F_GIX		(_RPAGE_RSV3 | _RPAGE_RSV4 | _RPAGE_RPN44)
 #define H_PAGE_HASHPTE		_RPAGE_RPN43	/* PTE has associated HPTE */
 
 #ifdef CONFIG_PPC_64K_PAGES
diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c
index c6c5559..9c63844 100644
--- a/arch/powerpc/mm/hash64_64k.c
+++ b/arch/powerpc/mm/hash64_64k.c
@@ -103,8 +103,8 @@  int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
 		 * On hash insert failure we use old pte value and we don't
 		 * want slot information there if we have a insert failure.
 		 */
-		old_pte &= ~(H_PAGE_HASHPTE | H_PAGE_F_GIX | H_PAGE_F_SECOND);
-		new_pte &= ~(H_PAGE_HASHPTE | H_PAGE_F_GIX | H_PAGE_F_SECOND);
+		old_pte &= ~H_PAGE_HASHPTE;
+		new_pte &= ~H_PAGE_HASHPTE;
 		goto htab_insert_hpte;
 	}
 	/*
@@ -227,6 +227,7 @@  int __hash_page_64K(unsigned long ea, unsigned long access,
 		    unsigned long vsid, pte_t *ptep, unsigned long trap,
 		    unsigned long flags, int ssize)
 {
+	real_pte_t rpte;
 	unsigned long hpte_group;
 	unsigned long rflags, pa;
 	unsigned long old_pte, new_pte;
@@ -263,6 +264,7 @@  int __hash_page_64K(unsigned long ea, unsigned long access,
 	} while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
 
 	rflags = htab_convert_pte_flags(new_pte);
+	rpte = __real_pte(__pte(old_pte), ptep);
 
 	if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
 	    !cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
@@ -270,18 +272,13 @@  int __hash_page_64K(unsigned long ea, unsigned long access,
 
 	vpn  = hpt_vpn(ea, vsid, ssize);
 	if (unlikely(old_pte & H_PAGE_HASHPTE)) {
+		unsigned long gslot;
 		/*
 		 * There MIGHT be an HPTE for this pte
 		 */
-		hash = hpt_hash(vpn, shift, ssize);
-		if (old_pte & H_PAGE_F_SECOND)
-			hash = ~hash;
-		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-		slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT;
-
-		if (mmu_hash_ops.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_64K,
-					       MMU_PAGE_64K, ssize,
-					       flags) == -1)
+		gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0);
+		if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_64K,
+				MMU_PAGE_64K, ssize, flags) == -1)
 			old_pte &= ~_PAGE_HPTEFLAGS;
 	}
 
@@ -328,9 +325,9 @@  int __hash_page_64K(unsigned long ea, unsigned long access,
 					   MMU_PAGE_64K, MMU_PAGE_64K, old_pte);
 			return -1;
 		}
+
 		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
-		new_pte |= (slot << H_PAGE_F_GIX_SHIFT) &
-			(H_PAGE_F_SECOND | H_PAGE_F_GIX);
+		new_pte |= pte_set_hash_slot(ptep, rpte, 0, slot);
 	}
 	*ptep = __pte(new_pte & ~H_PAGE_BUSY);
 	return 0;
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index a84bb44..d52d667 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -22,6 +22,7 @@  int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 		     pte_t *ptep, unsigned long trap, unsigned long flags,
 		     int ssize, unsigned int shift, unsigned int mmu_psize)
 {
+	real_pte_t rpte;
 	unsigned long vpn;
 	unsigned long old_pte, new_pte;
 	unsigned long rflags, pa, sz;
@@ -61,6 +62,7 @@  int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 	} while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
 
 	rflags = htab_convert_pte_flags(new_pte);
+	rpte = __real_pte(__pte(old_pte), ptep);
 
 	sz = ((1UL) << shift);
 	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
@@ -71,16 +73,11 @@  int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 	/* Check if pte already has an hpte (case 2) */
 	if (unlikely(old_pte & H_PAGE_HASHPTE)) {
 		/* There MIGHT be an HPTE for this pte */
-		unsigned long hash, slot;
+		unsigned long gslot;
 
-		hash = hpt_hash(vpn, shift, ssize);
-		if (old_pte & H_PAGE_F_SECOND)
-			hash = ~hash;
-		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-		slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT;
-
-		if (mmu_hash_ops.hpte_updatepp(slot, rflags, vpn, mmu_psize,
-					       mmu_psize, ssize, flags) == -1)
+		gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0);
+		if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, mmu_psize,
+				mmu_psize, ssize, flags) == -1)
 			old_pte &= ~_PAGE_HPTEFLAGS;
 	}
 
@@ -106,8 +103,7 @@  int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 			return -1;
 		}
 
-		new_pte |= (slot << H_PAGE_F_GIX_SHIFT) &
-			(H_PAGE_F_SECOND | H_PAGE_F_GIX);
+		new_pte |= pte_set_hash_slot(ptep, rpte, 0, slot);
 	}
 
 	/*