diff mbox series

[v1,26/55] KVM: PPC: Book3S HV: Change dec_expires to be relative to guest timebase

Message ID 20210726035036.739609-27-npiggin@gmail.com
State New
Headers show
Series [v1,01/55] KVM: PPC: Book3S HV: Remove TM emulation from POWER7/8 path | expand

Commit Message

Nicholas Piggin July 26, 2021, 3:50 a.m. UTC
Change dec_expires to be relative to the guest timebase, and allow
it to be moved into low level P9 guest entry functions, to improve
SPR access scheduling.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/kvm_book3s.h   |  6 +++
 arch/powerpc/include/asm/kvm_host.h     |  2 +-
 arch/powerpc/kvm/book3s_hv.c            | 58 +++++++++++++------------
 arch/powerpc/kvm/book3s_hv_nested.c     |  3 ++
 arch/powerpc/kvm/book3s_hv_p9_entry.c   | 10 ++++-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 14 ------
 6 files changed, 49 insertions(+), 44 deletions(-)

Comments

Michael Ellerman Aug. 7, 2021, 11:17 p.m. UTC | #1
Nicholas Piggin <npiggin@gmail.com> writes:
> Change dec_expires to be relative to the guest timebase, and allow
> it to be moved into low level P9 guest entry functions, to improve
> SPR access scheduling.
>
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>  arch/powerpc/include/asm/kvm_book3s.h   |  6 +++
>  arch/powerpc/include/asm/kvm_host.h     |  2 +-
>  arch/powerpc/kvm/book3s_hv.c            | 58 +++++++++++++------------
>  arch/powerpc/kvm/book3s_hv_nested.c     |  3 ++
>  arch/powerpc/kvm/book3s_hv_p9_entry.c   | 10 ++++-
>  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 14 ------
>  6 files changed, 49 insertions(+), 44 deletions(-)

My p8 is hitting an oops running guests, and bisect points to this. Not
obvious how the change relates to the oops, but maybe you can see it.

cheers


[  716.042962][T16989] Kernel attempted to read user page (0) - exploit attempt? (uid: 0)
[  716.043020][T16989] BUG: Kernel NULL pointer dereference on read at 0x00000000
[  716.043028][T16989] Faulting instruction address: 0xc00000000001e1a8
[  716.043037][T16989] Oops: Kernel access of bad area, sig: 11 [#1]
[  716.043043][T16989] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA PowerNV
[  716.043052][T16989] Modules linked in: xt_MASQUERADE xt_conntrack ipt_REJECT nf_reject_ipv4 xt_tcpudp iptable_mangle iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 nfnetlink ip6table_filter ip6_tables iptable_filter tun bridge stp llc fuse kvm_hv kvm binfmt_misc squashfs mlx4_ib ib_uverbs dm_multipath scsi_dh_rdac ib_core scsi_dh_alua mlx4_en sr_mod cdrom lpfc sg mlx4_core bnx2x crc_t10dif crct10dif_generic scsi_transport_fc mdio vmx_crypto gf128mul crct10dif_vpmsum crct10dif_common leds_powernv powernv_rng led_class crc32c_vpmsum rng_core powernv_op_panel sunrpc ip_tables x_tables autofs4
[  716.043128][T16989] CPU: 56 PID: 16989 Comm: qemu-system-ppc Not tainted 5.14.0-rc4-02329-g9bdd37071243 #1
[  716.043137][T16989] NIP:  c00000000001e1a8 LR: c00000000001e154 CTR: c00000000016ebb0
[  716.043144][T16989] REGS: c0000009f1a93480 TRAP: 0300   Not tainted  (5.14.0-rc4-02329-g9bdd37071243)
[  716.043150][T16989] MSR:  9000000002803033 <SF,HV,VEC,VSX,FP,ME,IR,DR,RI,LE>  CR: 42442444  XER: 20000000
[  716.043167][T16989] CFAR: c00000000000cd0c DAR: 0000000000000000 DSISR: 40000000 IRQMASK: 3
[  716.043167][T16989] GPR00: c00000000001eab8 c0000009f1a93720 c000000002459f00 c0000009c0730270
[  716.043167][T16989] GPR04: 00000000000001f0 0000000000000000 0000000022442448 c0000009c072ec80
[  716.043167][T16989] GPR08: 00000000000000c2 0000000044000000 9000000002803033 0000000000000001
[  716.043167][T16989] GPR12: 0000000000002200 c000000ffffec600 00007fff955f4410 0000000000000000
[  716.043167][T16989] GPR16: 00007fff96280000 00007fff955f0320 00007fff8ee8ebe0 00007fff8e660028
[  716.043167][T16989] GPR20: c000000803807400 c000000858b243bc 000000000000000a c000000002496eb8
[  716.043167][T16989] GPR24: c000000801123650 c0000009c0730250 c0000009c072ec80 0000000002802000
[  716.043167][T16989] GPR28: 0000000000800000 0000000002802000 0000000000800000 c0000009f1a93e80
[  716.043236][T16989] NIP [c00000000001e1a8] restore_math+0x208/0x310
[  716.043247][T16989] LR [c00000000001e154] restore_math+0x1b4/0x310
[  716.043254][T16989] Call Trace:
[  716.043257][T16989] [c0000009f1a93720] [0000000022442448] 0x22442448 (unreliable)
[  716.043267][T16989] [c0000009f1a93780] [c00000000001eab8] __switch_to+0x228/0x2f0
[  716.043274][T16989] [c0000009f1a937e0] [c000000000f7949c] __schedule+0x40c/0xf10
[  716.043283][T16989] [c0000009f1a938b0] [c000000000f7a034] schedule+0x94/0x170
[  716.043291][T16989] [c0000009f1a938e0] [c00800000b8e4474] kvmppc_wait_for_exec+0xdc/0xf8 [kvm_hv]
[  716.043307][T16989] [c0000009f1a93960] [c00800000b8eeb18] kvmppc_vcpu_run_hv+0x900/0x10f0 [kvm_hv]
[  716.043319][T16989] [c0000009f1a93a10] [c00800000b76355c] kvmppc_vcpu_run+0x34/0x48 [kvm]
[  716.043340][T16989] [c0000009f1a93a30] [c00800000b75f188] kvm_arch_vcpu_ioctl_run+0x340/0x450 [kvm]
[  716.043359][T16989] [c0000009f1a93ac0] [c00800000b74d470] kvm_vcpu_ioctl+0x328/0x8f8 [kvm]
[  716.043378][T16989] [c0000009f1a93ca0] [c0000000004fe9d4] sys_ioctl+0x6b4/0x13b0
[  716.043386][T16989] [c0000009f1a93db0] [c00000000002f918] system_call_exception+0x168/0x290
[  716.043394][T16989] [c0000009f1a93e10] [c00000000000c864] system_call_common+0xf4/0x258
[  716.043402][T16989] --- interrupt: c00 at 0x7fff954af010
[  716.043407][T16989] NIP:  00007fff954af010 LR: 0000000116243430 CTR: 0000000000000000
[  716.043413][T16989] REGS: c0000009f1a93e80 TRAP: 0c00   Not tainted  (5.14.0-rc4-02329-g9bdd37071243)
[  716.043419][T16989] MSR:  900000000000d033 <SF,HV,EE,PR,ME,IR,DR,RI,LE>  CR: 22444442  XER: 00000000
[  716.043434][T16989] IRQMASK: 0
[  716.043434][T16989] GPR00: 0000000000000036 00007fff8ee8dc30 00007fff955a7100 000000000000000f
[  716.043434][T16989] GPR04: 000000002000ae80 0000000000000000 00000000000004fb 0000000000000000
[  716.043434][T16989] GPR08: 000000000000000f 0000000000000000 0000000000000000 0000000000000000
[  716.043434][T16989] GPR12: 0000000000000000 00007fff8ee96290 00007fff955f4410 0000000000000000
[  716.043434][T16989] GPR16: 00007fff96280000 00007fff955f0320 00007fff8ee8ebe0 00007fff8e660028
[  716.043434][T16989] GPR20: 0000000000000000 0000000000000000 000000011689b0d0 000000002000ae80
[  716.043434][T16989] GPR24: 00007fff8ffa00ae 0000000000000000 00007fff8ee8f290 00007fff8ffb0010
[  716.043434][T16989] GPR28: 0000000116e010e0 00007fff8ffb0010 0000000000000000 000000002000ae80
[  716.043498][T16989] NIP [00007fff954af010] 0x7fff954af010
[  716.043503][T16989] LR [0000000116243430] 0x116243430
[  716.043507][T16989] --- interrupt: c00
[  716.043511][T16989] Instruction dump:
[  716.043517][T16989] fb610038 67db0200 9907185a 4182005c 7c0802a6 7f63db78 f8010070 4bffeeed
[  716.043529][T16989] 2c3e0000 408200d4 547ddb78 0082812b <eb000000> 387a1860 7fdcf378 7f7edb78
[  716.043543][T16989] ---[ end trace b02ece1d913ff866 ]---
diff mbox series

Patch

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index caaa0f592d8e..15b573671f99 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -406,6 +406,12 @@  static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
 	return vcpu->arch.fault_dar;
 }
 
+/* Expiry time of vcpu DEC relative to host TB */
+static inline u64 kvmppc_dec_expires_host_tb(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.dec_expires - vcpu->arch.vcore->tb_offset;
+}
+
 static inline bool is_kvmppc_resume_guest(int r)
 {
 	return (r == RESUME_GUEST || r == RESUME_GUEST_NV);
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index aee41edcfe6b..f105eaeb4521 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -742,7 +742,7 @@  struct kvm_vcpu_arch {
 
 	struct hrtimer dec_timer;
 	u64 dec_jiffies;
-	u64 dec_expires;
+	u64 dec_expires;	/* Relative to guest timebase. */
 	unsigned long pending_exceptions;
 	u8 ceded;
 	u8 prodded;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 4d757e4904c4..027ae0b60e70 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -2237,8 +2237,7 @@  static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 		*val = get_reg_val(id, vcpu->arch.vcore->arch_compat);
 		break;
 	case KVM_REG_PPC_DEC_EXPIRY:
-		*val = get_reg_val(id, vcpu->arch.dec_expires +
-				   vcpu->arch.vcore->tb_offset);
+		*val = get_reg_val(id, vcpu->arch.dec_expires);
 		break;
 	case KVM_REG_PPC_ONLINE:
 		*val = get_reg_val(id, vcpu->arch.online);
@@ -2490,8 +2489,7 @@  static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 		r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val));
 		break;
 	case KVM_REG_PPC_DEC_EXPIRY:
-		vcpu->arch.dec_expires = set_reg_val(id, *val) -
-			vcpu->arch.vcore->tb_offset;
+		vcpu->arch.dec_expires = set_reg_val(id, *val);
 		break;
 	case KVM_REG_PPC_ONLINE:
 		i = set_reg_val(id, *val);
@@ -2877,13 +2875,13 @@  static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
 	unsigned long dec_nsec, now;
 
 	now = get_tb();
-	if (now > vcpu->arch.dec_expires) {
+	if (now > kvmppc_dec_expires_host_tb(vcpu)) {
 		/* decrementer has already gone negative */
 		kvmppc_core_queue_dec(vcpu);
 		kvmppc_core_prepare_to_enter(vcpu);
 		return;
 	}
-	dec_nsec = tb_to_ns(vcpu->arch.dec_expires - now);
+	dec_nsec = tb_to_ns(kvmppc_dec_expires_host_tb(vcpu) - now);
 	hrtimer_start(&vcpu->arch.dec_timer, dec_nsec, HRTIMER_MODE_REL);
 	vcpu->arch.timer_running = 1;
 }
@@ -3355,7 +3353,7 @@  static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
 		 */
 		spin_unlock(&vc->lock);
 		/* cancel pending dec exception if dec is positive */
-		if (now < vcpu->arch.dec_expires &&
+		if (now < kvmppc_dec_expires_host_tb(vcpu) &&
 		    kvmppc_core_pending_dec(vcpu))
 			kvmppc_core_dequeue_dec(vcpu);
 
@@ -4174,20 +4172,6 @@  static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 
 	load_spr_state(vcpu);
 
-	/*
-	 * When setting DEC, we must always deal with irq_work_raise via NMI vs
-	 * setting DEC. The problem occurs right as we switch into guest mode
-	 * if a NMI hits and sets pending work and sets DEC, then that will
-	 * apply to the guest and not bring us back to the host.
-	 *
-	 * irq_work_raise could check a flag (or possibly LPCR[HDICE] for
-	 * example) and set HDEC to 1? That wouldn't solve the nested hv
-	 * case which needs to abort the hcall or zero the time limit.
-	 *
-	 * XXX: Another day's problem.
-	 */
-	mtspr(SPRN_DEC, vcpu->arch.dec_expires - tb);
-
 	if (kvmhv_on_pseries()) {
 		/*
 		 * We need to save and restore the guest visible part of the
@@ -4213,6 +4197,23 @@  static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 			hvregs.vcpu_token = vcpu->vcpu_id;
 		}
 		hvregs.hdec_expiry = time_limit;
+
+		/*
+		 * When setting DEC, we must always deal with irq_work_raise
+		 * via NMI vs setting DEC. The problem occurs right as we
+		 * switch into guest mode if a NMI hits and sets pending work
+		 * and sets DEC, then that will apply to the guest and not
+		 * bring us back to the host.
+		 *
+		 * irq_work_raise could check a flag (or possibly LPCR[HDICE]
+		 * for example) and set HDEC to 1? That wouldn't solve the
+		 * nested hv case which needs to abort the hcall or zero the
+		 * time limit.
+		 *
+		 * XXX: Another day's problem.
+		 */
+		mtspr(SPRN_DEC, kvmppc_dec_expires_host_tb(vcpu) - tb);
+
 		mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
 		mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
 		trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs),
@@ -4224,6 +4225,12 @@  static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 		vcpu->arch.psscr = mfspr(SPRN_PSSCR_PR);
 		mtspr(SPRN_PSSCR_PR, host_psscr);
 
+		dec = mfspr(SPRN_DEC);
+		if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */
+			dec = (s32) dec;
+		tb = mftb();
+		vcpu->arch.dec_expires = dec + (tb + vc->tb_offset);
+
 		/* H_CEDE has to be handled now, not later */
 		if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
 		    kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
@@ -4231,6 +4238,7 @@  static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 			kvmppc_set_gpr(vcpu, 3, 0);
 			trap = 0;
 		}
+
 	} else {
 		kvmppc_xive_push_vcpu(vcpu);
 		trap = kvmhv_vcpu_entry_p9(vcpu, time_limit, lpcr);
@@ -4262,12 +4270,6 @@  static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 			vcpu->arch.slb_max = 0;
 	}
 
-	dec = mfspr(SPRN_DEC);
-	if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */
-		dec = (s32) dec;
-	tb = mftb();
-	vcpu->arch.dec_expires = dec + tb;
-
 	store_spr_state(vcpu);
 
 	restore_p9_host_os_sprs(vcpu, &host_os_sprs);
@@ -4752,7 +4754,7 @@  int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
 	 * by L2 and the L1 decrementer is provided in hdec_expires
 	 */
 	if (kvmppc_core_pending_dec(vcpu) &&
-			((get_tb() < vcpu->arch.dec_expires) ||
+			((get_tb() < kvmppc_dec_expires_host_tb(vcpu)) ||
 			 (trap == BOOK3S_INTERRUPT_SYSCALL &&
 			  kvmppc_get_gpr(vcpu, 3) == H_ENTER_NESTED)))
 		kvmppc_core_dequeue_dec(vcpu);
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
index 3ffc63ffebc5..fad7bc8736ea 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -380,6 +380,7 @@  long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
 	/* convert TB values/offsets to host (L0) values */
 	hdec_exp = l2_hv.hdec_expiry - vc->tb_offset;
 	vc->tb_offset += l2_hv.tb_offset;
+	vcpu->arch.dec_expires += l2_hv.tb_offset;
 
 	/* set L1 state to L2 state */
 	vcpu->arch.nested = l2;
@@ -421,6 +422,8 @@  long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
 	if (l2_regs.msr & MSR_TS_MASK)
 		vcpu->arch.shregs.msr |= MSR_TS_S;
 	vc->tb_offset = saved_l1_hv.tb_offset;
+	/* XXX: is this always the same delta as saved_l1_hv.tb_offset? */
+	vcpu->arch.dec_expires -= l2_hv.tb_offset;
 	restore_hv_regs(vcpu, &saved_l1_hv);
 	vcpu->arch.purr += delta_purr;
 	vcpu->arch.spurr += delta_spurr;
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index fb9cb34445ea..814b0dfd590f 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -188,7 +188,7 @@  int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
 	struct kvm *kvm = vcpu->kvm;
 	struct kvm_nested_guest *nested = vcpu->arch.nested;
 	struct kvmppc_vcore *vc = vcpu->arch.vcore;
-	s64 hdec;
+	s64 hdec, dec;
 	u64 tb, purr, spurr;
 	u64 *exsave;
 	bool ri_set;
@@ -317,6 +317,8 @@  int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
 	 */
 	mtspr(SPRN_HDEC, hdec);
 
+	mtspr(SPRN_DEC, vcpu->arch.dec_expires - tb);
+
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 tm_return_to_guest:
 #endif
@@ -461,6 +463,12 @@  int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
 	vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2);
 	vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3);
 
+	dec = mfspr(SPRN_DEC);
+	if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */
+		dec = (s32) dec;
+	tb = mftb();
+	vcpu->arch.dec_expires = dec + tb;
+
 	/* Preserve PSSCR[FAKE_SUSPEND] until we've called kvmppc_save_tm_hv */
 	mtspr(SPRN_PSSCR, host_psscr |
 	      (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 05be8648937d..16cb3240df52 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -808,10 +808,6 @@  END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 	 * Set the decrementer to the guest decrementer.
 	 */
 	ld	r8,VCPU_DEC_EXPIRES(r4)
-	/* r8 is a host timebase value here, convert to guest TB */
-	ld	r5,HSTATE_KVM_VCORE(r13)
-	ld	r6,VCORE_TB_OFFSET_APPL(r5)
-	add	r8,r8,r6
 	mftb	r7
 	subf	r3,r7,r8
 	mtspr	SPRN_DEC,r3
@@ -1186,9 +1182,6 @@  guest_bypass:
 	mftb	r6
 	extsw	r5,r5
 16:	add	r5,r5,r6
-	/* r5 is a guest timebase value here, convert to host TB */
-	ld	r4,VCORE_TB_OFFSET_APPL(r3)
-	subf	r5,r4,r5
 	std	r5,VCPU_DEC_EXPIRES(r9)
 
 	/* Increment exit count, poke other threads to exit */
@@ -2153,10 +2146,6 @@  END_FTR_SECTION_IFCLR(CPU_FTR_TM)
 67:
 	/* save expiry time of guest decrementer */
 	add	r3, r3, r5
-	ld	r4, HSTATE_KVM_VCPU(r13)
-	ld	r5, HSTATE_KVM_VCORE(r13)
-	ld	r6, VCORE_TB_OFFSET_APPL(r5)
-	subf	r3, r6, r3	/* convert to host TB value */
 	std	r3, VCPU_DEC_EXPIRES(r4)
 
 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
@@ -2253,9 +2242,6 @@  END_FTR_SECTION_IFCLR(CPU_FTR_TM)
 
 	/* Restore guest decrementer */
 	ld	r3, VCPU_DEC_EXPIRES(r4)
-	ld	r5, HSTATE_KVM_VCORE(r13)
-	ld	r6, VCORE_TB_OFFSET_APPL(r5)
-	add	r3, r3, r6	/* convert host TB to guest TB value */
 	mftb	r7
 	subf	r3, r7, r3
 	mtspr	SPRN_DEC, r3