Message ID | 1492673305-29526-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com (mailing list archive) |
---|---|
State | Not Applicable |
Headers | show |
Hi Aneesh, > For a page walk cache flush, we don't need to loop with set number. > The set number is ignored with RIC=1 (pwc flush). > > For RIC=2 (flush all), inorder to flush implementation dependent > caches, we can ignore the set number. Hence we do a RIC=2 flush with > set no: 0, so we do both the tlb flush for set 0 and the > implementation dependent cache flushes. This is then followed with > tbl flush for set 1-127 I've applied your two previous radix tlbiel optimisations as my baseline, and using the simple exec microbenchmark in a7a9dcd882a6 I see: HPT: 100% Radix baseline: 248% Radix patched: 95% So this patch fixes the large regression we see with radix, and is even faster than our HPT number now. Nice work! Acked-by: Anton Blanchard <anton@samba.org> Anton > Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> > --- > Note: not yet tested. > > arch/powerpc/mm/tlb-radix.c | 28 +++++++++++++++++++++++----- > 1 file changed, 23 insertions(+), 5 deletions(-) > > diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c > index b68b5219cf45..b827aef38b90 100644 > --- a/arch/powerpc/mm/tlb-radix.c > +++ b/arch/powerpc/mm/tlb-radix.c > @@ -43,12 +43,30 @@ static inline void __tlbiel_pid(unsigned long > pid, int set, */ > static inline void _tlbiel_pid(unsigned long pid, unsigned long ric) > { > - int set; > + int set = 0; > > asm volatile("ptesync": : :"memory"); > - for (set = 0; set < POWER9_TLB_SETS_RADIX ; set++) { > - __tlbiel_pid(pid, set, ric); > + if (ric == RIC_FLUSH_ALL) { > + ric = RIC_FLUSH_TLB; > + set = 1; > + /* Use set 0 to flush all */ > + __tlbiel_pid(pid, 0, RIC_FLUSH_ALL); > } > + > + for (; set < POWER9_TLB_SETS_RADIX ; set++) > + __tlbiel_pid(pid, set, ric); > + > + asm volatile("ptesync": : :"memory"); > + asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); > +} > + > +static inline void _tlbiel_pwc(unsigned long pid) > +{ > + asm volatile("ptesync": : :"memory"); > + /* > + * for PWC flush, we don't 
look at set number > + */ > + __tlbiel_pid(pid, 0, RIC_FLUSH_PWC); > asm volatile("ptesync": : :"memory"); > asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); > } > @@ -140,7 +158,7 @@ void radix__local_flush_tlb_pwc(struct mmu_gather > *tlb, unsigned long addr) > pid = mm->context.id; > if (pid != MMU_NO_CONTEXT) > - _tlbiel_pid(pid, RIC_FLUSH_PWC); > + _tlbiel_pwc(pid); > > preempt_enable(); > } > @@ -222,7 +240,7 @@ void radix__flush_tlb_pwc(struct mmu_gather *tlb, > unsigned long addr) if (lock_tlbie) > raw_spin_unlock(&native_tlbie_lock); > } else > - _tlbiel_pid(pid, RIC_FLUSH_PWC); > + _tlbiel_pwc(pid); > no_context: > preempt_enable(); > }
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index b68b5219cf45..b827aef38b90 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -43,12 +43,30 @@ static inline void __tlbiel_pid(unsigned long pid, int set, */ static inline void _tlbiel_pid(unsigned long pid, unsigned long ric) { - int set; + int set = 0; asm volatile("ptesync": : :"memory"); - for (set = 0; set < POWER9_TLB_SETS_RADIX ; set++) { - __tlbiel_pid(pid, set, ric); + if (ric == RIC_FLUSH_ALL) { + ric = RIC_FLUSH_TLB; + set = 1; + /* Use set 0 to flush all */ + __tlbiel_pid(pid, 0, RIC_FLUSH_ALL); } + + for (; set < POWER9_TLB_SETS_RADIX ; set++) + __tlbiel_pid(pid, set, ric); + + asm volatile("ptesync": : :"memory"); + asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); +} + +static inline void _tlbiel_pwc(unsigned long pid) +{ + asm volatile("ptesync": : :"memory"); + /* + * for PWC flush, we don't look at set number + */ + __tlbiel_pid(pid, 0, RIC_FLUSH_PWC); asm volatile("ptesync": : :"memory"); asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); } @@ -140,7 +158,7 @@ void radix__local_flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) pid = mm->context.id; if (pid != MMU_NO_CONTEXT) - _tlbiel_pid(pid, RIC_FLUSH_PWC); + _tlbiel_pwc(pid); preempt_enable(); } @@ -222,7 +240,7 @@ void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) if (lock_tlbie) raw_spin_unlock(&native_tlbie_lock); } else - _tlbiel_pid(pid, RIC_FLUSH_PWC); + _tlbiel_pwc(pid); no_context: preempt_enable(); }
For a page walk cache flush, we don't need to loop with set number. The set number is ignored with RIC=1 (pwc flush). For RIC=2 (flush all), in order to flush implementation dependent caches, we can ignore the set number. Hence we do a RIC=2 flush with set no: 0, so we do both the tlb flush for set 0 and the implementation dependent cache flushes. This is then followed with tlb flush for set 1-127 Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> --- Note: not yet tested. arch/powerpc/mm/tlb-radix.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-)