@@ -439,6 +439,9 @@ static inline int try_lock_tlbie(unsigned int *lock)
unsigned int tmp, old;
unsigned int token = LOCK_TOKEN;
+ if (mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE))
+ return 1;
+
asm volatile("1:lwarx %1,0,%2\n"
" cmpwi cr0,%1,0\n"
" bne 2f\n"
@@ -452,6 +455,13 @@ static inline int try_lock_tlbie(unsigned int *lock)
return old == 0;
}
+static inline void unlock_tlbie_after_sync(unsigned int *lock)
+{
+ if (mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE))
+ return;
+ WRITE_ONCE(*lock, 0);
+}
+
static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues,
long npages, int global, bool need_sync)
{
@@ -485,7 +495,7 @@ static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues,
}
asm volatile("eieio; tlbsync; ptesync" : : : "memory");
- kvm->arch.tlbie_lock = 0;
+ unlock_tlbie_after_sync(&kvm->arch.tlbie_lock);
} else {
if (need_sync)
asm volatile("ptesync" : : : "memory");
tlbies to an LPAR do not have to be serialised since POWER4, so the
MMU_FTR_LOCKLESS_TLBIE feature can be used to avoid taking the spin
lock in do_tlbies.

Testing was done on a POWER9 system in HPT mode, with a -smp 32 guest
also in HPT mode. 32 instances of the powerpc fork benchmark from
selftests were run with --fork, and the results measured.

Without this patch, total throughput was about 13.5K/sec, and this is
the top of the host profile:

   74.52%  [k] do_tlbies
    2.95%  [k] kvmppc_book3s_hv_page_fault
    1.80%  [k] calc_checksum
    1.80%  [k] kvmppc_vcpu_run_hv
    1.49%  [k] kvmppc_run_core

After this patch, throughput was about 51K/sec, with this profile:

   21.28%  [k] do_tlbies
    5.26%  [k] kvmppc_run_core
    4.88%  [k] kvmppc_book3s_hv_page_fault
    3.30%  [k] _raw_spin_lock_irqsave
    3.25%  [k] gup_pgd_range

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
Since v1:
- Fixed the unlock in the !MMU_FTR_LOCKLESS_TLBIE path

 arch/powerpc/kvm/book3s_hv_rm_mmu.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)