[RFC,01/10] KVM: PPC: Book3S HV: Align physical CPU thread numbers with virtual

Message ID 20130906035026.GN29710@iris.ozlabs.ibm.com

Commit Message

Paul Mackerras Sept. 6, 2013, 3:50 a.m. UTC
On a threaded processor such as POWER7, we group VCPUs into virtual
cores and arrange that the VCPUs in a virtual core run on the same
physical core.  Currently we don't enforce any correspondence between
virtual thread numbers within a virtual core and physical thread
numbers.  Physical threads are allocated to runnable virtual threads
(VCPUs) starting at 0, on a first-come, first-served basis.

POWER8 implements a new "msgsndp" instruction which guest kernels can
use to interrupt other threads in the same core or sub-core.  Since
the instruction takes the destination physical thread ID as a parameter,
it becomes necessary to align the physical thread IDs with the virtual
thread IDs, that is, to make sure virtual thread N within a virtual
core always runs on physical thread N.
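
As an illustration (a sketch only, not part of the patch), the fixed
mapping amounts to deriving the virtual thread ID from the vcpu ID,
assuming vcpu IDs are assigned contiguously within a virtual core:

/*
 * Sketch: mirrors what kvmppc_core_vcpu_create() now does with
 * vcore->first_vcpuid = core * threads_per_core.
 */
static int vcpu_ptid(int vcpu_id, int threads_per_core)
{
	int first_vcpuid = (vcpu_id / threads_per_core) * threads_per_core;

	return vcpu_id - first_vcpuid;	/* always 0 .. threads_per_core - 1 */
}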

This means that thread 0, which is where we call
__kvmppc_vcore_entry, may end up running a vcpu other than the one
whose task called kvmppc_run_core(), or no vcpu at all, if, for
example, thread 0 of the virtual core is currently executing in
userspace.  Thus we can't rely on thread 0 to be the master
responsible for switching the MMU.  Instead we now have an explicit
'is_master' flag which is set for the vcpu whose task called
kvmppc_run_core().  The master then has to wait for thread 0 to enter
real mode before switching the MMU.  Also, we no longer pass the vcpu
pointer to __kvmppc_vcore_entry, but instead let the assembly code
load it from the PACA.
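
In outline (abridged from the book3s_hv.c hunk below), the run loop
now brackets the core entry like this:

	/* in kvmppc_run_vcpu() */
	vc->runner = vcpu;
	vcpu->arch.is_master = 1;
	/* ... decide whether to sleep or to enter the guest ... */
	kvmppc_run_core(vc);
	vc->runner = NULL;
	vcpu->arch.is_master = 0;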

Since the assembly code will need to know the kvm pointer and the
thread ID for threads which don't have a vcpu, we move the thread
ID into the PACA and we add a kvm pointer to the virtual core
structure.
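
Abridged (full context in the header hunks below), the new fields are:

struct kvmppc_host_state {	/* per physical thread, in the PACA */
	...
	u8 ptid;		/* physical thread ID */
	...
};

struct kvmppc_vcore {
	...
	int first_vcpuid;	/* vcpu_id of virtual thread 0 */
	...
	struct kvm *kvm;	/* so a thread with no vcpu can find the VM */
	...
};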

In the case where thread 0 has no vcpu to run, we arrange for it to
go to nap mode, using a new flag value in the PACA 'napping' field so
that when it wakes we can distinguish this case from the other nap
cases.
We set the bit for the thread in the vcore 'napping_threads' field
so that when other threads come out of the guest they will send an
IPI to thread 0 to wake it up.  When it does wake up, we clear that
bit, see what caused the wakeup, and either exit back to the kernel,
or start running virtual thread 0 in the case where it now wants to
enter the guest and the other threads are still in the guest.
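
The bit manipulation is done with lwarx/stwcx. in the assembly below;
in plain C (a sketch using C11 atomics, not how the kernel codes it),
it is equivalent to:

#include <stdatomic.h>

/* Publish or retract this thread's bit in vcore->napping_threads so
 * that threads exiting the guest know who needs a wakeup IPI. */
static void set_napping(atomic_int *napping_threads, int ptid)
{
	atomic_fetch_or(napping_threads, 1 << ptid);
}

static void clear_napping(atomic_int *napping_threads, int ptid)
{
	atomic_fetch_and(napping_threads, ~(1 << ptid));
}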

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/kvm_book3s_asm.h |   1 +
 arch/powerpc/include/asm/kvm_host.h       |   4 +
 arch/powerpc/kernel/asm-offsets.c         |   5 +-
 arch/powerpc/kvm/book3s_hv.c              |  49 ++++----
 arch/powerpc/kvm/book3s_hv_interrupts.S   |   6 +-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 188 +++++++++++++++++++++++++-----
 6 files changed, 192 insertions(+), 61 deletions(-)

Patch

diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index 22f4606..8669ac0 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -87,6 +87,7 @@  struct kvmppc_host_state {
 	u8 hwthread_req;
 	u8 hwthread_state;
 	u8 host_ipi;
+	u8 ptid;
 	struct kvm_vcpu *kvm_vcpu;
 	struct kvmppc_vcore *kvm_vcore;
 	unsigned long xics_phys;
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 5a40270..6a43455 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -284,6 +284,8 @@  struct kvmppc_vcore {
 	int n_woken;
 	int nap_count;
 	int napping_threads;
+	int real_mode_threads;
+	int first_vcpuid;
 	u16 pcpu;
 	u16 last_cpu;
 	u8 vcore_state;
@@ -294,6 +296,7 @@  struct kvmppc_vcore {
 	u64 stolen_tb;
 	u64 preempt_tb;
 	struct kvm_vcpu *runner;
+	struct kvm *kvm;
 	u64 tb_offset;		/* guest timebase - host timebase */
 	u32 arch_compat;
 	ulong pcr;
@@ -575,6 +578,7 @@  struct kvm_vcpu_arch {
 	int state;
 	int ptid;
 	bool timer_running;
+	u8 is_master;
 	wait_queue_head_t cpu_run;
 
 	struct kvm_vcpu_arch_shared *shared;
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 115dd64..51d7eed 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -514,13 +514,15 @@  int main(void)
 	DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
 	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
 	DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
-	DEFINE(VCPU_PTID, offsetof(struct kvm_vcpu, arch.ptid));
+	DEFINE(VCPU_IS_MASTER, offsetof(struct kvm_vcpu, arch.is_master));
 	DEFINE(VCPU_CFAR, offsetof(struct kvm_vcpu, arch.cfar));
 	DEFINE(VCPU_PPR, offsetof(struct kvm_vcpu, arch.ppr));
 	DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count));
 	DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count));
 	DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest));
 	DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads));
+	DEFINE(VCORE_RM_THREADS, offsetof(struct kvmppc_vcore, real_mode_threads));
+	DEFINE(VCORE_KVM, offsetof(struct kvmppc_vcore, kvm));
 	DEFINE(VCORE_TB_OFFSET, offsetof(struct kvmppc_vcore, tb_offset));
 	DEFINE(VCORE_PCR, offsetof(struct kvmppc_vcore, pcr));
 	DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) -
@@ -590,6 +592,7 @@  int main(void)
 	HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys);
 	HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr);
 	HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi);
+	HSTATE_FIELD(HSTATE_PTID, ptid);
 	HSTATE_FIELD(HSTATE_MMCR, host_mmcr);
 	HSTATE_FIELD(HSTATE_PMC, host_pmc);
 	HSTATE_FIELD(HSTATE_PURR, host_purr);
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 731e46e..e0791ed58 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1030,6 +1030,8 @@  struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 			spin_lock_init(&vcore->lock);
 			init_waitqueue_head(&vcore->wq);
 			vcore->preempt_tb = TB_NIL;
+			vcore->first_vcpuid = core * threads_per_core;
+			vcore->kvm = kvm;
 		}
 		kvm->arch.vcores[core] = vcore;
 		kvm->arch.online_vcores++;
@@ -1043,6 +1045,7 @@  struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	++vcore->num_threads;
 	spin_unlock(&vcore->lock);
 	vcpu->arch.vcore = vcore;
+	vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid;
 
 	vcpu->arch.cpu_type = KVM_CPU_3S_64;
 	kvmppc_sanity_check(vcpu);
@@ -1100,7 +1103,7 @@  static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
 	}
 }
 
-extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
+extern void __kvmppc_vcore_entry(void);
 
 static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
 				   struct kvm_vcpu *vcpu)
@@ -1174,13 +1177,14 @@  static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
 	tpaca = &paca[cpu];
 	tpaca->kvm_hstate.kvm_vcpu = vcpu;
 	tpaca->kvm_hstate.kvm_vcore = vc;
-	tpaca->kvm_hstate.napping = 0;
+	tpaca->kvm_hstate.ptid = vcpu->arch.ptid;
 	vcpu->cpu = vc->pcpu;
 	smp_wmb();
 #if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
-	if (vcpu->arch.ptid) {
+	if (cpu != smp_processor_id()) {
 		xics_wake_cpu(cpu);
-		++vc->n_woken;
+		if (vcpu->arch.ptid)
+			++vc->n_woken;
 	}
 #endif
 }
@@ -1237,10 +1241,10 @@  static int on_primary_thread(void)
  */
 static void kvmppc_run_core(struct kvmppc_vcore *vc)
 {
-	struct kvm_vcpu *vcpu, *vcpu0, *vnext;
+	struct kvm_vcpu *vcpu, *vnext;
 	long ret;
 	u64 now;
-	int ptid, i, need_vpa_update;
+	int i, need_vpa_update;
 	int srcu_idx;
 	struct kvm_vcpu *vcpus_to_update[threads_per_core];
 
@@ -1265,6 +1269,7 @@  static void kvmppc_run_core(struct kvmppc_vcore *vc)
 	vc->vcore_state = VCORE_STARTING;
 	vc->in_guest = 0;
 	vc->napping_threads = 0;
+	vc->real_mode_threads = 0;
 
 	/*
 	 * Updating any of the vpas requires calling kvmppc_pin_guest_page,
@@ -1278,25 +1283,6 @@  static void kvmppc_run_core(struct kvmppc_vcore *vc)
 	}
 
 	/*
-	 * Assign physical thread IDs, first to non-ceded vcpus
-	 * and then to ceded ones.
-	 */
-	ptid = 0;
-	vcpu0 = NULL;
-	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
-		if (!vcpu->arch.ceded) {
-			if (!ptid)
-				vcpu0 = vcpu;
-			vcpu->arch.ptid = ptid++;
-		}
-	}
-	if (!vcpu0)
-		goto out;	/* nothing to run; should never happen */
-	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
-		if (vcpu->arch.ceded)
-			vcpu->arch.ptid = ptid++;
-
-	/*
 	 * Make sure we are running on thread 0, and that
 	 * secondary threads are offline.
 	 */
@@ -1312,15 +1298,19 @@  static void kvmppc_run_core(struct kvmppc_vcore *vc)
 		kvmppc_create_dtl_entry(vcpu, vc);
 	}
 
+	/* Set this explicitly in case thread 0 doesn't have a vcpu */
+	get_paca()->kvm_hstate.kvm_vcore = vc;
+	get_paca()->kvm_hstate.ptid = 0;
+
 	vc->vcore_state = VCORE_RUNNING;
 	preempt_disable();
 	spin_unlock(&vc->lock);
 
 	kvm_guest_enter();
 
-	srcu_idx = srcu_read_lock(&vcpu0->kvm->srcu);
+	srcu_idx = srcu_read_lock(&vc->kvm->srcu);
 
-	__kvmppc_vcore_entry(NULL, vcpu0);
+	__kvmppc_vcore_entry();
 
 	spin_lock(&vc->lock);
 	/* disable sending of IPIs on virtual external irqs */
@@ -1335,7 +1325,7 @@  static void kvmppc_run_core(struct kvmppc_vcore *vc)
 	vc->vcore_state = VCORE_EXITING;
 	spin_unlock(&vc->lock);
 
-	srcu_read_unlock(&vcpu0->kvm->srcu, srcu_idx);
+	srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
 
 	/* make sure updates to secondary vcpu structs are visible now */
 	smp_mb();
@@ -1443,7 +1433,6 @@  static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	if (!signal_pending(current)) {
 		if (vc->vcore_state == VCORE_RUNNING &&
 		    VCORE_EXIT_COUNT(vc) == 0) {
-			vcpu->arch.ptid = vc->n_runnable - 1;
 			kvmppc_create_dtl_entry(vcpu, vc);
 			kvmppc_start_thread(vcpu);
 		} else if (vc->vcore_state == VCORE_SLEEPING) {
@@ -1474,6 +1463,7 @@  static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 		if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
 			break;
 		vc->runner = vcpu;
+		vcpu->arch.is_master = 1;
 		n_ceded = 0;
 		list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
 			if (!v->arch.pending_exceptions)
@@ -1486,6 +1476,7 @@  static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 		else
 			kvmppc_run_core(vc);
 		vc->runner = NULL;
+		vcpu->arch.is_master = 0;
 	}
 
 	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S
index 37f1cc4..2a85bf6 100644
--- a/arch/powerpc/kvm/book3s_hv_interrupts.S
+++ b/arch/powerpc/kvm/book3s_hv_interrupts.S
@@ -35,7 +35,7 @@ 
  ****************************************************************************/
 
 /* Registers:
- *  r4: vcpu pointer
+ *  none
  */
 _GLOBAL(__kvmppc_vcore_entry)
 
@@ -69,7 +69,6 @@  END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 	mtmsrd  r10,1
 
 	/* Save host PMU registers */
-	/* R4 is live here (vcpu pointer) but not r3 or r5 */
 	li	r3, 1
 	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
 	mfspr	r7, SPRN_MMCR0		/* save MMCR0 */
@@ -134,16 +133,15 @@  END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	 * enters the guest with interrupts enabled.
 	 */
 BEGIN_FTR_SECTION
+	ld	r4, HSTATE_KVM_VCPU(r13)
 	ld	r0, VCPU_PENDING_EXC(r4)
 	li	r7, (1 << BOOK3S_IRQPRIO_EXTERNAL)
 	oris	r7, r7, (1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
 	and.	r0, r0, r7
 	beq	32f
-	mr	r31, r4
 	lhz	r3, PACAPACAINDEX(r13)
 	bl	smp_send_reschedule
 	nop
-	mr	r4, r31
 32:
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 #endif /* CONFIG_SMP */
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 31030f3..0721d2a 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -53,6 +53,10 @@  kvmppc_skip_Hinterrupt:
 	hrfid
 	b	.
 
+/* Values in HSTATE_NAPPING(r13) */
+#define NAPPING_CEDE	1
+#define NAPPING_NOVCPU	2
+
 /*
  * Call kvmppc_hv_entry in real mode.
  * Must be called with interrupts hard-disabled.
@@ -77,6 +81,16 @@  _GLOBAL(kvmppc_hv_entry_trampoline)
 	RFI
 
 kvmppc_call_hv_entry:
+	/* Indicate that we are now in real mode */
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	li	r0, 1
+	stw	r0, VCORE_RM_THREADS(r5)
+
+	/* any guest vcpu to run? */
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	cmpdi	r4, 0
+	beq	kvmppc_call_no_guest
+kvmppc_got_guest:
 	bl	kvmppc_hv_entry
 
 	/* Back from guest - restore host state and return to caller */
@@ -91,15 +105,6 @@  kvmppc_call_hv_entry:
 	ld	r3,PACA_SPRG3(r13)
 	mtspr	SPRN_SPRG3,r3
 
-	/*
-	 * Reload DEC.  HDEC interrupts were disabled when
-	 * we reloaded the host's LPCR value.
-	 */
-	ld	r3, HSTATE_DECEXP(r13)
-	mftb	r4
-	subf	r4, r4, r3
-	mtspr	SPRN_DEC, r4
-
 	/* Reload the host's PMU registers */
 	ld	r3, PACALPPACAPTR(r13)	/* is the host using the PMU? */
 	lbz	r4, LPPACA_PMCINUSE(r3)
@@ -134,6 +139,16 @@  END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	isync
 23:
 
+kvmppc_vcore_exit:
+	/*
+	 * Reload DEC.  HDEC interrupts were disabled when
+	 * we reloaded the host's LPCR value.
+	 */
+	ld	r3, HSTATE_DECEXP(r13)
+	mftb	r4
+	subf	r4, r4, r3
+	mtspr	SPRN_DEC, r4
+
 	/*
 	 * For external and machine check interrupts, we need
 	 * to call the Linux handler to process the interrupt.
@@ -173,16 +188,114 @@  END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
 13:	b	machine_check_fwnmi
 
+kvmppc_call_no_guest:
+	/* We handle this much like a ceded vcpu */
+	/* First wait for the partition switch to happen */
+	HMT_LOW
+3:	lbz	r3,VCORE_IN_GUEST(r5)
+	cmpwi	r3,0
+	beq	3b
+	HMT_MEDIUM
+	/* now load up LPCR */
+	ld	r9, VCORE_KVM(r5)
+	ld	r8, KVM_LPCR(r9)
+	mtspr	SPRN_LPCR, r8
+	isync
+
+	/* set our bit in napping_threads */
+	lbz	r7, HSTATE_PTID(r13)
+	li	r0, 1
+	sld	r0, r0, r7
+	addi	r6, r5, VCORE_NAPPING_THREADS
+1:	lwarx	r3, 0, r6
+	or	r3, r3, r0
+	stwcx.	r3, 0, r6
+	bne	1b
+	/* order napping_threads update vs testing entry_exit_count */
+	lwsync
+	li	r12, 0
+	lwz	r7, VCORE_ENTRY_EXIT(r5)
+	cmpwi	r7, 0x100
+	bge	kvm_novcpu_exit	/* another thread already exiting */
+	li	r3, NAPPING_NOVCPU
+	stb	r3, HSTATE_NAPPING(r13)
+	li	r3, 1
+	stb	r3, HSTATE_HWTHREAD_REQ(r13)
+	std	r1, HSTATE_HOST_R1(r13)
+
+	b	kvm_do_nap
+
+kvm_novcpu_wakeup:
+	ld	r1, HSTATE_HOST_R1(r13)
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	li	r0, 0
+	stb	r0, HSTATE_NAPPING(r13)
+	stb	r0, HSTATE_HWTHREAD_REQ(r13)
+
+	/* see if any other thread is already exiting */
+	li	r12, 0
+	lwz	r0, VCORE_ENTRY_EXIT(r5)
+	cmpwi	r0, 0x100
+	bge	kvm_novcpu_exit
+
+	/* clear our bit in napping_threads */
+	lbz	r7, HSTATE_PTID(r13)
+	li	r0, 1
+	sld	r0, r0, r7
+	addi	r6, r5, VCORE_NAPPING_THREADS
+4:	lwarx	r3, 0, r6
+	andc	r3, r3, r0
+	stwcx.	r3, 0, r6
+	bne	4b
+
+	/* Check the wake reason in SRR1 to see why we got here */
+	mfspr	r3, SPRN_SRR1
+	rlwinm	r3, r3, 44-31, 0x7	/* extract wake reason field */
+	cmpwi	r3, 4			/* was it an external interrupt? */
+	bne	kvm_novcpu_exit		/* if not, exit the guest */
+
+	/* extern interrupt - read and handle it */
+	li	r12, BOOK3S_INTERRUPT_EXTERNAL
+	bl	kvmppc_read_intr
+	cmpdi	r3, 0
+	bge	kvm_novcpu_exit
+	li	r12, 0
+
+	/* Got an IPI but other vcpus aren't yet exiting, must be a latecomer */
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	cmpdi	r4, 0
+	bne	kvmppc_got_guest
+
+kvm_novcpu_exit:
+	ld	r5, HSTATE_KVM_VCORE(r13)
+
+	/* if other threads aren't yet exiting, poke them to do so */
+	lwz	r3, VCORE_ENTRY_EXIT(r5)
+	cmpwi	r3, 0x100
+	bge	2f
+	li	r3, 0			/* set HDEC to 0 */
+	mtspr	SPRN_HDEC, r3
+2:
+	/* wait for the MMU switch */
+	HMT_LOW
+3:	lbz	r3, VCORE_IN_GUEST(r5)
+	cmpwi	r3, 0
+	bne	3b
+	HMT_MEDIUM
+	/* reload host LPCR */
+	ld	r9, VCORE_KVM(r5)
+	ld	r8, KVM_HOST_LPCR(r9)
+	mtspr	SPRN_LPCR, r8
+	isync
+	b	kvmppc_vcore_exit
 
 /*
- * We come in here when wakened from nap mode on a secondary hw thread.
+ * We come in here when wakened from nap mode.
  * Relocation is off and most register values are lost.
  * r13 points to the PACA.
  */
 	.globl	kvm_start_guest
 kvm_start_guest:
-	ld	r1,PACAEMERGSP(r13)
-	subi	r1,r1,STACK_FRAME_OVERHEAD
 	ld	r2,PACATOC(r13)
 
 	li	r0,KVM_HWTHREAD_IN_KVM
@@ -194,8 +307,13 @@  kvm_start_guest:
 
 	/* were we napping due to cede? */
 	lbz	r0,HSTATE_NAPPING(r13)
-	cmpwi	r0,0
-	bne	kvm_end_cede
+	cmpwi	r0,NAPPING_CEDE
+	beq	kvm_end_cede
+	cmpwi	r0,NAPPING_NOVCPU
+	beq	kvm_novcpu_wakeup
+
+	ld	r1,PACAEMERGSP(r13)
+	subi	r1,r1,STACK_FRAME_OVERHEAD
 
 	/*
 	 * We weren't napping due to cede, so this must be a secondary
@@ -270,6 +388,7 @@  kvm_start_guest:
 kvm_no_guest:
 	li	r0, KVM_HWTHREAD_IN_NAP
 	stb	r0, HSTATE_HWTHREAD_STATE(r13)
+kvm_do_nap:
 	li	r3, LPCR_PECE0
 	mfspr	r4, SPRN_LPCR
 	rlwimi	r4, r3, 0, LPCR_PECE0 | LPCR_PECE1
@@ -443,9 +562,15 @@  END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 
 	/* Primary thread switches to guest partition. */
 	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
-	lwz	r6,VCPU_PTID(r4)
+	lbz	r6,VCPU_IS_MASTER(r4)
+	cmpwi	r6,0
+	beq	20f
+	/* wait for thread 0 to get into real mode */
+	HMT_LOW
+50:	lwz	r6,VCORE_RM_THREADS(r5)
 	cmpwi	r6,0
-	bne	20f
+	beq	50b
+	HMT_MEDIUM
 	ld	r6,KVM_SDR1(r9)
 	lwz	r7,KVM_LPID(r9)
 	li	r0,LPID_RSVD		/* switch to reserved LPID */
@@ -1014,8 +1139,6 @@  END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	 */
 	cmpwi	r3,0x100	/* Are we the first here? */
 	bge	43f
-	cmpwi	r3,1		/* Are any other threads in the guest? */
-	ble	43f
 	cmpwi	r12,BOOK3S_INTERRUPT_HV_DECREMENTER
 	beq	40f
 	li	r0,0
@@ -1026,7 +1149,7 @@  END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	 * doesn't wake CPUs up from nap.
 	 */
 	lwz	r3,VCORE_NAPPING_THREADS(r5)
-	lwz	r4,VCPU_PTID(r9)
+	lbz	r4,HSTATE_PTID(r13)
 	li	r0,1
 	sld	r0,r0,r4
 	andc.	r3,r3,r0		/* no sense IPI'ing ourselves */
@@ -1046,9 +1169,9 @@  END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	/* Secondary threads wait for primary to do partition switch */
 43:	ld	r4,VCPU_KVM(r9)		/* pointer to struct kvm */
 	ld	r5,HSTATE_KVM_VCORE(r13)
-	lwz	r3,VCPU_PTID(r9)
+	lwz	r3,VCPU_IS_MASTER(r9)
 	cmpwi	r3,0
-	beq	15f
+	bne	15f
 	HMT_LOW
 13:	lbz	r3,VCORE_IN_GUEST(r5)
 	cmpwi	r3,0
@@ -1315,10 +1438,13 @@  secondary_too_late:
 	cmpwi	r3,0
 	bne	13b
 	HMT_MEDIUM
-	li	r0, KVM_GUEST_MODE_NONE
-	stb	r0, HSTATE_IN_GUEST(r13)
-	ld	r11,PACA_SLBSHADOWPTR(r13)
 
+	ld	r6, VCORE_KVM(r5)
+	ld	r8, KVM_HOST_LPCR(r6)
+	mtspr	SPRN_LPCR, r8
+	isync
+
+	ld	r11,PACA_SLBSHADOWPTR(r13)
 	.rept	SLB_NUM_BOLTED
 	ld	r5,SLBSHADOW_SAVEAREA(r11)
 	ld	r6,SLBSHADOW_SAVEAREA+8(r11)
@@ -1327,6 +1453,14 @@  secondary_too_late:
 	slbmte	r6,r5
 1:	addi	r11,r11,16
 	.endr
+
+	li	r6, 0
+	mtspr	SPRN_AMR, r6
+	li	r0, KVM_GUEST_MODE_NONE
+	stb	r0, HSTATE_IN_GUEST(r13)
+	ld	r7, HSTATE_DSCR(r13)
+	mtspr	SPRN_DSCR, r7
+
 	b	22b
 
 /*
@@ -1615,7 +1749,7 @@  END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206)
 	 * up to the host.
 	 */
 	ld	r5,HSTATE_KVM_VCORE(r13)
-	lwz	r6,VCPU_PTID(r3)
+	lbz	r6,HSTATE_PTID(r13)
 	lwz	r8,VCORE_ENTRY_EXIT(r5)
 	clrldi	r8,r8,56
 	li	r0,1
@@ -1628,7 +1762,7 @@  END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206)
 	bge	kvm_cede_exit
 	stwcx.	r4,0,r6
 	bne	31b
-	li	r0,1
+	li	r0,NAPPING_CEDE
 	stb	r0,HSTATE_NAPPING(r13)
 	/* order napping_threads update vs testing entry_exit_count */
 	lwsync
@@ -1717,7 +1851,7 @@  kvm_end_cede:
 
 	/* clear our bit in vcore->napping_threads */
 33:	ld	r5,HSTATE_KVM_VCORE(r13)
-	lwz	r3,VCPU_PTID(r4)
+	lbz	r3,HSTATE_PTID(r13)
 	li	r0,1
 	sld	r0,r0,r3
 	addi	r6,r5,VCORE_NAPPING_THREADS