diff mbox

[21/23] KVM: PPC: Book3S HV: Streamline guest entry and exit

Message ID 1426844400-12017-22-git-send-email-paulus@samba.org
State New, archived
Headers show

Commit Message

Paul Mackerras March 20, 2015, 9:39 a.m. UTC
On entry to the guest, secondary threads now load up most of their
state first and only then wait for the primary to switch the MMU,
rather than waiting before loading it.  This means that the secondary
threads get into the guest sooner, in the common case where the
secondary threads get to kvmppc_hv_entry before the primary thread.

On exit, the first thread out increments the exit count and interrupts
the other threads (to get them out of the guest) before saving most
of its state, rather than after.  That means that the other threads
exit sooner, and that the first thread doesn't spend so much time
waiting for the other threads at the point where the MMU gets switched
back to the host.

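This pulls out the code that increments the exit count and interrupts
other threads into a separate function, kvmhv_commence_exit().
This also makes sure that r12 and vcpu->arch.trap are set correctly
in some corner cases: the secondary_too_late and hdec_soon paths,
H_CEDE handling, and an IPI received in kvmppc_read_intr (r12 only).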

Statistics from /sys/kernel/debug/kvm/vm*/vcpu*/timings show the
improvement.  Aggregating across vcpus for a guest with 32 vcpus,
8 threads/vcore, running on a POWER8, gives this before the change:

 rm_entry:     avg 3919.3ns (244 - 56492, 742665 samples)
  rm_exit:     avg 4102.5ns (130 - 36272, 704056 samples)
  rm_intr:     avg 1006.0ns (12 - 75040, 2819905 samples)

and this after the change:

 rm_entry:     avg 2979.8ns (258 - 83740, 836403 samples)
  rm_exit:     avg 3992.9ns (12 - 45572, 838034 samples)
  rm_intr:     avg  922.2ns (12 - 66694, 3127066 samples)

showing a substantial reduction in the time spent in the real-mode
guest entry code, and smaller reductions in the real mode guest exit
and interrupt handling times.  (The test was to start the guest and
boot Fedora 20 big-endian to the login prompt.)

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 240 +++++++++++++++++++-------------
 1 file changed, 141 insertions(+), 99 deletions(-)
diff mbox

Patch

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 04728ce..ff1461d 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -175,6 +175,19 @@  kvmppc_primary_no_guest:
 	/* put the HDEC into the DEC, since HDEC interrupts don't wake us */
 	mfspr	r3, SPRN_HDEC
 	mtspr	SPRN_DEC, r3
+	/*
+	 * Make sure the primary has finished the MMU switch.
+	 * We should never get here on a secondary thread, but
+	 * check it for robustness' sake.
+	 */
+	ld	r5, HSTATE_KVM_VCORE(r13)
+65:	lbz	r0, VCORE_IN_GUEST(r5)
+	cmpwi	r0, 0
+	beq	65b
+	/* Set LPCR. */
+	ld	r8,VCORE_LPCR(r5)
+	mtspr	SPRN_LPCR,r8
+	isync
 	/* set our bit in napping_threads */
 	ld	r5, HSTATE_KVM_VCORE(r13)
 	lbz	r7, HSTATE_PTID(r13)
@@ -206,7 +219,7 @@  kvm_novcpu_wakeup:
 
 	/* check the wake reason */
 	bl	kvmppc_check_wake_reason
-	
+
 	/* see if any other thread is already exiting */
 	lwz	r0, VCORE_ENTRY_EXIT(r5)
 	cmpwi	r0, 0x100
@@ -243,7 +256,12 @@  kvm_novcpu_wakeup:
 
 kvm_novcpu_exit:
 	ld	r4, HSTATE_KVM_VCPU(r13)
-	b	hdec_soon
+	cmpdi	r4, 0
+	beq	13f
+	addi	r3, r4, VCPU_TB_RMEXIT
+	bl	kvmhv_accumulate_time
+13:	bl	kvmhv_commence_exit
+	b	kvmhv_switch_to_host
 
 /*
  * We come in here when wakened from nap mode.
@@ -417,7 +435,7 @@  kvmppc_hv_entry:
 	ld	r9,VCORE_KVM(r5)	/* pointer to struct kvm */
 	lbz	r6,HSTATE_PTID(r13)
 	cmpwi	r6,0
-	bne	20f
+	bne	10f
 	ld	r6,KVM_SDR1(r9)
 	lwz	r7,KVM_LPID(r9)
 	li	r0,LPID_RSVD		/* switch to reserved LPID */
@@ -488,26 +506,9 @@  END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
 	li	r0,1
 	stb	r0,VCORE_IN_GUEST(r5)	/* signal secondaries to continue */
-	b	10f
-
-	/* Secondary threads wait for primary to have done partition switch */
-20:	lbz	r0,VCORE_IN_GUEST(r5)
-	cmpwi	r0,0
-	beq	20b
-
-	/* Set LPCR. */
-10:	ld	r8,VCORE_LPCR(r5)
-	mtspr	SPRN_LPCR,r8
-	isync
-
-	/* Check if HDEC expires soon */
-	mfspr	r3,SPRN_HDEC
-	cmpwi	r3,512		/* 1 microsecond */
-	li	r12,BOOK3S_INTERRUPT_HV_DECREMENTER
-	blt	hdec_soon
 
 	/* Do we have a guest vcpu to run? */
-	cmpdi	r4, 0
+10:	cmpdi	r4, 0
 	beq	kvmppc_primary_no_guest
 kvmppc_got_guest:
 
@@ -832,6 +833,30 @@  END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 	clrrdi	r6,r6,1
 	mtspr	SPRN_CTRLT,r6
 4:
+	/* Secondary threads wait for primary to have done partition switch */
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	lbz	r6, HSTATE_PTID(r13)
+	cmpwi	r6, 0
+	beq	21f
+	lbz	r0, VCORE_IN_GUEST(r5)
+	cmpwi	r0, 0
+	bne	21f
+	HMT_LOW
+20:	lbz	r0, VCORE_IN_GUEST(r5)
+	cmpwi	r0, 0
+	beq	20b
+	HMT_MEDIUM
+21:
+	/* Set LPCR. */
+	ld	r8,VCORE_LPCR(r5)
+	mtspr	SPRN_LPCR,r8
+	isync
+
+	/* Check if HDEC expires soon */
+	mfspr	r3, SPRN_HDEC
+	cmpwi	r3, 512		/* 1 microsecond */
+	blt	hdec_soon
+
 	ld	r6, VCPU_CTR(r4)
 	lwz	r7, VCPU_XER(r4)
 
@@ -936,18 +961,21 @@  END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	b	.
 
 secondary_too_late:
+	li	r12, 0
 	cmpdi	r4, 0
 	beq	11f
+	stw	r12, VCPU_TRAP(r4)
 	addi	r3, r4, VCPU_TB_RMEXIT
 	bl	kvmhv_accumulate_time
 11:	b	kvmhv_switch_to_host
 
 hdec_soon:
-	cmpdi	r4, 0
-	beq	12f
+	li	r12, BOOK3S_INTERRUPT_HV_DECREMENTER
+	stw	r12, VCPU_TRAP(r4)
+	mr	r9, r4
 	addi	r3, r4, VCPU_TB_RMEXIT
 	bl	kvmhv_accumulate_time
-12:	b	kvmhv_do_exit
+	b	guest_exit_cont
 
 /******************************************************************************
  *                                                                            *
@@ -1108,7 +1136,7 @@  guest_exit_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
 	stw	r7, VCPU_DSISR(r9)
 	/* don't overwrite fault_dar/fault_dsisr if HDSI */
 	cmpwi	r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
-	beq	6f
+	beq	mc_cont
 	std	r6, VCPU_FAULT_DAR(r9)
 	stw	r7, VCPU_FAULT_DSISR(r9)
 
@@ -1120,8 +1148,11 @@  mc_cont:
 	mr	r4, r9
 	bl	kvmhv_accumulate_time
 
+	/* Increment exit count, poke other threads to exit */
+	bl	kvmhv_commence_exit
+
 	/* Save guest CTRL register, set runlatch to 1 */
-6:	mfspr	r6,SPRN_CTRLF
+	mfspr	r6,SPRN_CTRLF
 	stw	r6,VCPU_CTRL(r9)
 	andi.	r0,r6,1
 	bne	4f
@@ -1463,83 +1494,14 @@  END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	slbia
 	ptesync
 
-kvmhv_do_exit:			/* r12 = trap, r13 = paca */
 	/*
 	 * POWER7/POWER8 guest -> host partition switch code.
 	 * We don't have to lock against tlbies but we do
 	 * have to coordinate the hardware threads.
 	 */
-	/* Increment the threads-exiting-guest count in the 0xff00
-	   bits of vcore->entry_exit_count */
-	ld	r5,HSTATE_KVM_VCORE(r13)
-	addi	r6,r5,VCORE_ENTRY_EXIT
-41:	lwarx	r3,0,r6
-	addi	r0,r3,0x100
-	stwcx.	r0,0,r6
-	bne	41b
-	isync		/* order stwcx. vs. reading napping_threads */
-
-	/*
-	 * At this point we have an interrupt that we have to pass
-	 * up to the kernel or qemu; we can't handle it in real mode.
-	 * Thus we have to do a partition switch, so we have to
-	 * collect the other threads, if we are the first thread
-	 * to take an interrupt.  To do this, we set the HDEC to 0,
-	 * which causes an HDEC interrupt in all threads within 2ns
-	 * because the HDEC register is shared between all 4 threads.
-	 * However, we don't need to bother if this is an HDEC
-	 * interrupt, since the other threads will already be on their
-	 * way here in that case.
-	 */
-	cmpwi	r3,0x100	/* Are we the first here? */
-	bge	43f
-	cmpwi	r12,BOOK3S_INTERRUPT_HV_DECREMENTER
-	beq	43f
-	li	r0,0
-	mtspr	SPRN_HDEC,r0
-
-	/*
-	 * Send a message or IPI to any napping threads, since an HDEC interrupt
-	 * doesn't wake CPUs up from nap.
-	 */
-	lwz	r3,VCORE_NAPPING_THREADS(r5)
-	lbz	r4,HSTATE_PTID(r13)
-	li	r0,1
-	sld	r0,r0,r4
-	andc.	r3,r3,r0		/* no sense IPI'ing ourselves */
-	beq	43f
-	/* Order entry/exit update vs. IPIs */
-	sync
-BEGIN_FTR_SECTION
-	b	45f
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
-	/* Use msgsnd on POWER8 */
-	lhz	r6, VCORE_PCPU(r5)
-	clrldi	r6, r6, 64-3
-	oris	r6, r6, (PPC_DBELL_SERVER << (63-36))@h
-42:	andi.	r0, r3, 1
-	beq	44f
-	PPC_MSGSND(6)
-44:	srdi.	r3, r3, 1
-	addi	r6, r6, 1
-	bne	42b
-	b	kvmhv_switch_to_host
-	/* Use IPIs on POWER7 */
-45:	mulli	r4,r4,PACA_SIZE		/* get paca for thread 0 */
-	subf	r6,r4,r13
-42:	andi.	r0,r3,1
-	beq	44f
-	ld	r8,HSTATE_XICS_PHYS(r6)	/* get thread's XICS reg addr */
-	li	r0,IPI_PRIORITY
-	li	r7,XICS_MFRR
-	stbcix	r0,r7,r8		/* trigger the IPI */
-44:	srdi.	r3,r3,1
-	addi	r6,r6,PACA_SIZE
-	bne	42b
-
 kvmhv_switch_to_host:
 	/* Secondary threads wait for primary to do partition switch */
-43:	ld	r5,HSTATE_KVM_VCORE(r13)
+	ld	r5,HSTATE_KVM_VCORE(r13)
 	ld	r4,VCORE_KVM(r5)	/* pointer to struct kvm */
 	lbz	r3,HSTATE_PTID(r13)
 	cmpwi	r3,0
@@ -1639,6 +1601,84 @@  END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	mtlr	r0
 	blr
 
+kvmhv_commence_exit:		/* r12 = trap, r13 = paca, doesn't trash r9 */
+	mflr	r0
+	std	r0, PPC_LR_STKOFF(r1)
+	stdu	r1, -PPC_MIN_STKFRM(r1)
+
+	/* Increment the threads-exiting-guest count in the 0xff00
+	   bits of vcore->entry_exit_count */
+	ld	r5,HSTATE_KVM_VCORE(r13)
+	addi	r6,r5,VCORE_ENTRY_EXIT
+41:	lwarx	r3,0,r6
+	addi	r0,r3,0x100
+	stwcx.	r0,0,r6
+	bne	41b
+	isync		/* order stwcx. vs. reading napping_threads */
+
+	/*
+	 * At this point we have an interrupt that we have to pass
+	 * up to the kernel or qemu; we can't handle it in real mode.
+	 * Thus we have to do a partition switch, so we have to
+	 * collect the other threads, if we are the first thread
+	 * to take an interrupt.  To do this, we set the HDEC to 0,
+	 * which causes an HDEC interrupt in all threads within 2ns
+	 * because the HDEC register is shared between all threads of the core.
+	 * However, we don't need to bother if this is an HDEC
+	 * interrupt, since the other threads will already be on their
+	 * way here in that case.
+	 */
+	cmpwi	r3,0x100	/* Are we the first here? */
+	bge	43f
+	cmpwi	r12,BOOK3S_INTERRUPT_HV_DECREMENTER
+	beq	43f
+	li	r0,0
+	mtspr	SPRN_HDEC,r0
+
+	/*
+	 * Send a message or IPI to any napping threads, since an HDEC interrupt
+	 * doesn't wake CPUs up from nap.
+	 */
+	lwz	r3,VCORE_NAPPING_THREADS(r5)
+	lbz	r4,HSTATE_PTID(r13)
+	li	r0,1
+	sld	r0,r0,r4
+	andc.	r3,r3,r0		/* no sense IPI'ing ourselves */
+	beq	43f
+	/* Order entry/exit update vs. IPIs */
+	sync
+BEGIN_FTR_SECTION
+	b	45f
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
+	/* Use msgsnd on POWER8 */
+	lhz	r6, VCORE_PCPU(r5)
+	clrldi	r6, r6, 64-3
+	oris	r6, r6, (PPC_DBELL_SERVER << (63-36))@h
+42:	andi.	r0, r3, 1
+	beq	44f
+	PPC_MSGSND(6)
+44:	srdi.	r3, r3, 1
+	addi	r6, r6, 1
+	bne	42b
+	b	43f
+	/* Use IPIs on POWER7 */
+45:	mulli	r4,r4,PACA_SIZE		/* get paca for thread 0 */
+	subf	r6,r4,r13
+42:	andi.	r0,r3,1
+	beq	44f
+	ld	r8,HSTATE_XICS_PHYS(r6)	/* get thread's XICS reg addr */
+	li	r0,IPI_PRIORITY
+	li	r7,XICS_MFRR
+	stbcix	r0,r7,r8		/* trigger the IPI */
+44:	srdi.	r3,r3,1
+	addi	r6,r6,PACA_SIZE
+	bne	42b
+
+43:	ld	r0, PPC_MIN_STKFRM+PPC_LR_STKOFF(r1)
+	addi	r1, r1, PPC_MIN_STKFRM
+	mtlr	r0
+	blr
+
 /*
  * Check whether an HDSI is an HPTE not found fault or something else.
  * If it is an HPTE not found fault that is due to the guest accessing
@@ -2074,8 +2114,8 @@  _GLOBAL(kvmppc_h_cede)		/* r3 = vcpu pointer, r11 = msr, r13 = paca */
 	lbz	r5,VCPU_PRODDED(r3)
 	cmpwi	r5,0
 	bne	kvm_cede_prodded
-	li	r0,0		/* set trap to 0 to say hcall is handled */
-	stw	r0,VCPU_TRAP(r3)
+	li	r12,0		/* set trap to 0 to say hcall is handled */
+	stw	r12,VCPU_TRAP(r3)
 	li	r0,H_SUCCESS
 	std	r0,VCPU_GPR(R3)(r3)
 
@@ -2279,7 +2319,8 @@  kvm_cede_prodded:
 
 	/* we've ceded but we want to give control to the host */
 kvm_cede_exit:
-	b	hcall_real_fallback
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	b	guest_exit_cont
 
 	/* Try to handle a machine check in real mode */
 machine_check_realmode:
@@ -2417,6 +2458,7 @@  kvmppc_read_intr:
 	bne-	43f
 
 	/* OK, it's an IPI for us */
+	li	r12, 0
 	li	r3, -1
 1:	blr