diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 58c7264..62fbd38 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -246,11 +246,12 @@ struct kvm_arch {
 	int using_mmu_notifiers;
 	u32 hpt_order;
 	atomic_t vcpus_running;
+	u32 online_vcores;
 	unsigned long hpt_npte;
 	unsigned long hpt_mask;
 	atomic_t hpte_mod_interest;
 	spinlock_t slot_phys_lock;
-	unsigned short last_vcpu[NR_CPUS];
+	cpumask_t need_tlb_flush;
 	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
 	struct kvmppc_linear_info *hpt_li;
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
@@ -275,6 +276,7 @@ struct kvmppc_vcore {
 	int nap_count;
 	int napping_threads;
 	u16 pcpu;
+	u16 last_cpu;
 	u8 vcore_state;
 	u8 in_guest;
 	struct list_head runnable_threads;
@@ -523,7 +525,6 @@ struct kvm_vcpu_arch {
 	u64 dec_jiffies;
 	u64 dec_expires;
 	unsigned long pending_exceptions;
-	u16 last_cpu;
 	u8 ceded;
 	u8 prodded;
 	u32 last_inst;
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 7523539..4e23ba2 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -441,8 +441,7 @@ int main(void)
 	DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr));
 	DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1));
 	DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock));
-	DEFINE(KVM_ONLINE_CPUS, offsetof(struct kvm, online_vcpus.counter));
-	DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu));
+	DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits));
 	DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr));
 	DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor));
 	DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v));
@@ -470,7 +469,6 @@ int main(void)
 	DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
 	DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max));
 	DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr));
-	DEFINE(VCPU_LAST_CPU, offsetof(struct kvm_vcpu, arch.last_cpu));
 	DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr));
 	DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
 	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 1029e22..2d61e01 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -148,11 +148,8 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
 		 * Reset all the reverse-mapping chains for all memslots
 		 */
 		kvmppc_rmap_reset(kvm);
-		/*
-		 * Set the whole last_vcpu array to an invalid vcpu number.
-		 * This ensures that each vcpu will flush its TLB on next entry.
-		 */
-		memset(kvm->arch.last_vcpu, 0xff, sizeof(kvm->arch.last_vcpu));
+		/* Ensure that each vcpu will flush its TLB on next entry. */
+		cpumask_setall(&kvm->arch.need_tlb_flush);
 		*htab_orderp = order;
 		err = 0;
 	} else {
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index a4f59db..ddbec60 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -853,7 +853,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 		goto free_vcpu;
 
 	vcpu->arch.shared = &vcpu->arch.shregs;
-	vcpu->arch.last_cpu = -1;
 	vcpu->arch.mmcr[0] = MMCR0_FC;
 	vcpu->arch.ctrl = CTRL_RUNLATCH;
 	/* default to host PVR, since we can't spoof it */
@@ -880,6 +879,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 			vcore->preempt_tb = TB_NIL;
 		}
 		kvm->arch.vcores[core] = vcore;
+		kvm->arch.online_vcores++;
 	}
 	mutex_unlock(&kvm->lock);
 
@@ -1802,6 +1802,13 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 		return -ENOMEM;
 	kvm->arch.lpid = lpid;
 
+	/*
+	 * Since we don't flush the TLB when tearing down a VM,
+	 * and this lpid might have previously been used,
+	 * make sure we flush on each core before running the new VM.
+	 */
+	cpumask_setall(&kvm->arch.need_tlb_flush);
+
 	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
 
 	kvm->arch.rma = NULL;
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index fc3da32..7e1f7e2 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -35,6 +35,37 @@ static void *real_vmalloc_addr(void *x)
 	return __va(addr);
 }
 
+/* Return 1 if we need to do a global tlbie, 0 if we can use tlbiel */
+static int global_invalidates(struct kvm *kvm, unsigned long flags)
+{
+	int global;
+
+	/*
+	 * If there is only one vcore, and it's currently running,
+	 * we can use tlbiel as long as we mark all other physical
+	 * cores as potentially having stale TLB entries for this lpid.
+	 * If we're not using MMU notifiers, we never take pages away
+	 * from the guest, so we can use tlbiel if requested.
+	 * Otherwise, don't use tlbiel.
+	 */
+	if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcore)
+		global = 0;
+	else if (kvm->arch.using_mmu_notifiers)
+		global = 1;
+	else
+		global = !(flags & H_LOCAL);
+
+	if (!global) {
+		/* any other core might now have stale TLB entries... */
+		smp_wmb();
+		cpumask_setall(&kvm->arch.need_tlb_flush);
+		cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu,
+				  &kvm->arch.need_tlb_flush);
+	}
+
+	return global;
+}
+
 /*
  * Add this HPTE into the chain for the real page.
  * Must be called with the chain locked; it unlocks the chain.
@@ -390,7 +421,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
 	if (v & HPTE_V_VALID) {
 		hpte[0] &= ~HPTE_V_VALID;
 		rb = compute_tlbie_rb(v, hpte[1], pte_index);
-		if (!(flags & H_LOCAL) && atomic_read(&kvm->online_vcpus) > 1) {
+		if (global_invalidates(kvm, flags)) {
 			while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
 				cpu_relax();
 			asm volatile("ptesync" : : : "memory");
@@ -565,8 +596,6 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 		return H_NOT_FOUND;
 	}
 
-	if (atomic_read(&kvm->online_vcpus) == 1)
-		flags |= H_LOCAL;
 	v = hpte[0];
 	bits = (flags << 55) & HPTE_R_PP0;
 	bits |= (flags << 48) & HPTE_R_KEY_HI;
@@ -587,7 +616,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 	if (v & HPTE_V_VALID) {
 		rb = compute_tlbie_rb(v, r, pte_index);
 		hpte[0] = v & ~HPTE_V_VALID;
-		if (!(flags & H_LOCAL)) {
+		if (global_invalidates(kvm, flags)) {
 			while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
 				cpu_relax();
 			asm volatile("ptesync" : : : "memory");
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index b31fb4f..dbb5ced 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -313,7 +313,33 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
 	mtspr	SPRN_LPID,r7
 	isync
+
+	/* See if we need to flush the TLB */
+	lhz	r6,PACAPACAINDEX(r13)	/* test_bit(cpu, need_tlb_flush) */
+	clrldi	r7,r6,64-6		/* extract bit number (6 bits) */
+	srdi	r6,r6,6			/* doubleword number */
+	sldi	r6,r6,3			/* address offset */
+	add	r6,r6,r9
+	addi	r6,r6,KVM_NEED_FLUSH	/* dword in kvm->arch.need_tlb_flush */
 	li	r0,1
+	sld	r0,r0,r7
+	ld	r7,0(r6)
+	and.	r7,r7,r0
+	beq	22f
+23:	ldarx	r7,0,r6			/* if set, clear the bit */
+	andc	r7,r7,r0
+	stdcx.	r7,0,r6
+	bne	23b
+	li	r6,128			/* and flush the TLB */
+	mtctr	r6
+	li	r7,0x800		/* IS field = 0b10 */
+	ptesync
+28:	tlbiel	r7
+	addi	r7,r7,0x1000
+	bdnz	28b
+	ptesync
+
+22:	li	r0,1
 	stb	r0,VCORE_IN_GUEST(r5)	/* signal secondaries to continue */
 	b	10f
 
@@ -336,36 +362,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	mr	r9,r4
 	blt	hdec_soon
 
-	/*
-	 * Invalidate the TLB if we could possibly have stale TLB
-	 * entries for this partition on this core due to the use
-	 * of tlbiel.
-	 * XXX maybe only need this on primary thread?
-	 */
-	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
-	lwz	r5,VCPU_VCPUID(r4)
-	lhz	r6,PACAPACAINDEX(r13)
-	rldimi	r6,r5,0,62		/* XXX map as if threads 1:1 p:v */
-	lhz	r8,VCPU_LAST_CPU(r4)
-	sldi	r7,r6,1			/* see if this is the same vcpu */
-	add	r7,r7,r9		/* as last ran on this pcpu */
-	lhz	r0,KVM_LAST_VCPU(r7)
-	cmpw	r6,r8			/* on the same cpu core as last time? */
-	bne	3f
-	cmpw	r0,r5			/* same vcpu as this core last ran? */
-	beq	1f
-3:	sth	r6,VCPU_LAST_CPU(r4)	/* if not, invalidate partition TLB */
-	sth	r5,KVM_LAST_VCPU(r7)
-	li	r6,128
-	mtctr	r6
-	li	r7,0x800		/* IS field = 0b10 */
-	ptesync
-2:	tlbiel	r7
-	addi	r7,r7,0x1000
-	bdnz	2b
-	ptesync
-1:
-
 	/* Save purr/spurr */
 	mfspr	r5,SPRN_PURR
 	mfspr	r6,SPRN_SPURR
