[RFC] powerpc/mm/radix: Optimize tlbiel flush

Message ID 1492673305-29526-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com
State Not Applicable
Headers show

Commit Message

Aneesh Kumar K.V April 20, 2017, 7:28 a.m.
For a page walk cache flush, we don't need to loop with set number. The set
number is ignored with RIC=1 (pwc flush).

For RIC=2 (flush all), in order to flush implementation dependent caches, we can
ignore the set number. Hence we do a RIC=2 flush with set number 0, so we do both
the tlb flush for set 0 and the implementation dependent cache flushes. This is
then followed with a tlb flush for sets 1-127.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
Note: not yet tested.

 arch/powerpc/mm/tlb-radix.c | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

Comments

Anton Blanchard April 20, 2017, 8:23 a.m. | #1
Hi Aneesh,

> For a page walk cache flush, we don't need to loop with set number.
> The set number is ignored with RIC=1 (pwc flush).
> 
> For RIC=2 (flush all), in order to flush implementation dependent
> caches, we can ignore the set number. Hence we do a RIC=2 flush with
> set number 0, so we do both the tlb flush for set 0 and the
> implementation dependent cache flushes. This is then followed with
> a tlb flush for sets 1-127

I've applied your two previous radix tlbiel optimisations as my
baseline, and using the simple exec microbenchmark in a7a9dcd882a6 I
see:

HPT:            100%
Radix baseline: 248%
Radix patched:   95%

So this patch fixes the large regression we see with radix, and is even
faster than our HPT number now. Nice work!

Acked-by: Anton Blanchard <anton@samba.org>

Anton

> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> ---
> Note: not yet tested.
> 
>  arch/powerpc/mm/tlb-radix.c | 28 +++++++++++++++++++++++-----
>  1 file changed, 23 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
> index b68b5219cf45..b827aef38b90 100644
> --- a/arch/powerpc/mm/tlb-radix.c
> +++ b/arch/powerpc/mm/tlb-radix.c
> @@ -43,12 +43,30 @@ static inline void __tlbiel_pid(unsigned long
> pid, int set, */
>  static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
>  {
> -	int set;
> +	int set = 0;
>  
>  	asm volatile("ptesync": : :"memory");
> -	for (set = 0; set < POWER9_TLB_SETS_RADIX ; set++) {
> -		__tlbiel_pid(pid, set, ric);
> +	if (ric == RIC_FLUSH_ALL) {
> +		ric = RIC_FLUSH_TLB;
> +		set = 1;
> +		/* Use set 0 to flush all */
> +		__tlbiel_pid(pid, 0, RIC_FLUSH_ALL);
>  	}
> +
> +	for (; set < POWER9_TLB_SETS_RADIX ; set++)
> +		__tlbiel_pid(pid, set, ric);
> +
> +	asm volatile("ptesync": : :"memory");
> +	asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
> +}
> +
> +static inline void _tlbiel_pwc(unsigned long pid)
> +{
> +	asm volatile("ptesync": : :"memory");
> +	/*
> +	 * for PWC flush, we don't look at set number
> +	 */
> +	__tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
>  	asm volatile("ptesync": : :"memory");
>  	asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
>  }
> @@ -140,7 +158,7 @@ void radix__local_flush_tlb_pwc(struct mmu_gather
> *tlb, unsigned long addr) 
>  	pid = mm->context.id;
>  	if (pid != MMU_NO_CONTEXT)
> -		_tlbiel_pid(pid, RIC_FLUSH_PWC);
> +		_tlbiel_pwc(pid);
>  
>  	preempt_enable();
>  }
> @@ -222,7 +240,7 @@ void radix__flush_tlb_pwc(struct mmu_gather *tlb,
> unsigned long addr) if (lock_tlbie)
>  			raw_spin_unlock(&native_tlbie_lock);
>  	} else
> -		_tlbiel_pid(pid, RIC_FLUSH_PWC);
> +		_tlbiel_pwc(pid);
>  no_context:
>  	preempt_enable();
>  }

Patch

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index b68b5219cf45..b827aef38b90 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -43,12 +43,30 @@  static inline void __tlbiel_pid(unsigned long pid, int set,
  */
 static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
 {
-	int set;
+	int set = 0;
 
 	asm volatile("ptesync": : :"memory");
-	for (set = 0; set < POWER9_TLB_SETS_RADIX ; set++) {
-		__tlbiel_pid(pid, set, ric);
+	if (ric == RIC_FLUSH_ALL) {
+		ric = RIC_FLUSH_TLB;
+		set = 1;
+		/* Use set 0 to flush all */
+		__tlbiel_pid(pid, 0, RIC_FLUSH_ALL);
 	}
+
+	for (; set < POWER9_TLB_SETS_RADIX ; set++)
+		__tlbiel_pid(pid, set, ric);
+
+	asm volatile("ptesync": : :"memory");
+	asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
+}
+
+static inline void _tlbiel_pwc(unsigned long pid)
+{
+	asm volatile("ptesync": : :"memory");
+	/*
+	 * for PWC flush, we don't look at set number
+	 */
+	__tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
 	asm volatile("ptesync": : :"memory");
 	asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
 }
@@ -140,7 +158,7 @@  void radix__local_flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
 
 	pid = mm->context.id;
 	if (pid != MMU_NO_CONTEXT)
-		_tlbiel_pid(pid, RIC_FLUSH_PWC);
+		_tlbiel_pwc(pid);
 
 	preempt_enable();
 }
@@ -222,7 +240,7 @@  void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
 		if (lock_tlbie)
 			raw_spin_unlock(&native_tlbie_lock);
 	} else
-		_tlbiel_pid(pid, RIC_FLUSH_PWC);
+		_tlbiel_pwc(pid);
 no_context:
 	preempt_enable();
 }