Patchwork [5/7] powerpc/kvm/xics: Switch ICP to use atomic updates

login
register
mail settings
Submitter Benjamin Herrenschmidt
Date Aug. 17, 2012, 8:38 a.m.
Message ID <1345192735.11751.73.camel@pasglop>
Download mbox | patch
Permalink /patch/178172/
State New
Headers show

Comments

Benjamin Herrenschmidt - Aug. 17, 2012, 8:38 a.m.
This changes the implementation of the ICP (presentation
controller which is attached to each CPU) to operate
based on an atomic update mechanism rather than mutexes.

This is done to make it possible to implement large
portions of it to work in "real mode" (ie. MMU off)
in order to speed things up significantly on HV KVM.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/kvm_book3s.h |    1 +
 arch/powerpc/kvm/book3s.c             |    2 +-
 arch/powerpc/kvm/book3s_xics.c        |  781 +++++++++++++++++++++------------
 3 files changed, 501 insertions(+), 283 deletions(-)



--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index f0e0c6a..0221996 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -140,6 +140,7 @@  extern int kvmppc_mmu_hv_init(void);
 extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
+extern void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
 extern void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags);
 extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
 			   bool upper, u32 val);
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 5c631e4..b57fb3ae 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -104,7 +104,7 @@  static int kvmppc_book3s_vec2irqprio(unsigned int vec)
 	return prio;
 }
 
-static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
+void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
 					  unsigned int vec)
 {
 	unsigned long old_pending = vcpu->arch.pending_exceptions;
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index 5638e21..254d0fe 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -25,6 +25,7 @@ 
 #define MASKED	0xff
 
 #define XICS_DBG(fmt...) do { } while(0)
+//#define XICS_DBG(fmt...) do { trace_printk(fmt); } while(0)
 
 #undef DEBUG_REALMODE
 
@@ -32,11 +33,11 @@ 
  * LOCKING
  * =======
  *
- * Each ICP has its own lock, and there is one lock for the ICS (ie. all
- * information about irq sources).
+ * Each ICS has a mutex protecting the information about the IRQ
+ * sources and avoiding simultaneous deliveries if the same interrupt.
  *
- * The ICS lock nests inside any of the ICP locks. ie. you are allowed
- * to take the ICS lock while holding an ICP lock, but not vice versa.
+ * ICP operations are done via a single compare & swap transaction
+ * (most ICP state fits in the union kvmppc_icp_state)
  */
 
 /*
@@ -48,6 +49,21 @@ 
  * 10 bit
  */
 
+/*
+ * TODO
+ * ====
+ *
+ * - To speed up resends, keep a bitmap of "resend" set bits in the
+ *   ICS
+ *
+ * - Speed up server# -> ICP lookup (array ? hash table ?)
+ *
+ * - Make ICS lockless as well, or at least a per-interrupt lock or hashed
+ *   locks array to improve scalability
+ *
+ * - ioctl's to save/restore the entire state for snapshot & migration
+ */
+
 #define KVMPPC_XICS_MAX_BUID	0xfff
 #define KVMPPC_XICS_IRQ_COUNT	0x1000
 #define KVMPPC_XICS_BUID_SHIFT	12
@@ -67,18 +83,25 @@  struct ics_irq_state {
 #define ICP_RESEND_MAP_SIZE	\
 	((KVMPPC_XICS_MAX_BUID + BITS_PER_LONG - 1) / BITS_PER_LONG)
 
+/* Atomic ICP state, updated with a single compare & swap */
+union kvmppc_icp_state {
+	unsigned long raw;
+	struct {
+		u8 out_ee : 1;
+		u8 need_resend : 1;
+		u8 cppr;
+		u8 mfrr;
+		u8 pending_pri;
+		u32 xisr;
+	};
+};
+
 struct kvmppc_icp {
-	struct mutex lock;
 	struct kvm_vcpu *vcpu;
-	u32 pending_irq;	/* XISR */
-	u8  pending_priority;
-	u8  current_priority;	/* CPPR */
-	u8  mfrr;		/* MFRR */
-	bool need_resend;
+	union kvmppc_icp_state state;
 	unsigned long resend_map[ICP_RESEND_MAP_SIZE];
 };
 
-
 struct kvmppc_ics {
 	struct mutex lock;
 	u16 buid;
@@ -112,6 +135,11 @@  static struct kvmppc_ics *kvmppc_xics_find_ics(struct kvmppc_xics *xics,
 	u16 src = irq & KVMPPC_XICS_SRC_MASK;
 	struct kvmppc_ics *ics;
 
+	if (WARN_ON_ONCE(!buid || buid > KVMPPC_XICS_MAX_BUID)) {
+		XICS_DBG("kvmppc_xics_find_ics: irq %#x BUID out of range !\n",
+			 irq);
+		return NULL;
+	}
 	ics = xics->ics[buid - 1];
 	if (!ics)
 		return NULL;
@@ -125,123 +153,43 @@  static struct kvmppc_ics *kvmppc_xics_find_ics(struct kvmppc_xics *xics,
 
 /* -- ICS routines -- */
 
-static void icp_deliver_irq(struct kvmppc_xics *xics,
-			    struct kvmppc_icp *icp,
-			    struct kvmppc_ics *ics, u16 src);
-
-static void __ics_reject_irq(struct kvmppc_icp *icp,
-			     struct kvmppc_ics *ics, u16 src)
-{
-	struct ics_irq_state *state = &ics->irq_state[src];
-
-	XICS_DBG("server %d reject src %#x\n", icp->vcpu->vcpu_id, src);
-
-	/* XXX check if it still level & asserted ? */
-	state->resend = 1;
-	set_bit(ics->buid, icp->resend_map);
-	icp->need_resend = true;
-}
-
-static void ics_reject_irq(struct kvmppc_xics *xics,
-			   struct kvmppc_icp *icp, u32 irq)
-{
-	struct kvmppc_ics *ics;
-	u16 src;
-
-	lockdep_assert_held(&icp->lock);
-
-	ics = kvmppc_xics_find_ics(xics, irq, &src);
-	if (!ics) {
-		pr_warning("ics_reject_irq: IRQ 0x%06x not found !\n", irq);
-		return;
-	}
-
-	mutex_lock(&ics->lock);
-	__ics_reject_irq(icp, ics, src);
-	mutex_unlock(&ics->lock);
-}
-
-static void ics_eoi(struct kvmppc_xics *xics, struct kvmppc_icp *icp, 
-		    u32 irq)
-{
-	struct ics_irq_state *state;
-	struct kvmppc_ics *ics;
-	u16 src;
-
-	XICS_DBG("ics_eoi 0x%06x\n", irq);
-
-	lockdep_assert_held(&icp->lock);
-
-	ics = kvmppc_xics_find_ics(xics, irq, &src);
-	if (!ics) {
-		pr_warning("ics_eoi: IRQ 0x%06x not found !\n", irq);
-		return;
-	}
-	state = &ics->irq_state[src];
-
-	mutex_lock(&ics->lock);
-
-	/* If it's an LSI and still asserted we resend */
-	if (state->asserted) {
-		state->resend = 1;
-		set_bit(ics->buid, icp->resend_map);
-		icp->need_resend = true;
-	}
-
-	mutex_unlock(&ics->lock);
-}
+static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			    u32 new_irq);
 
-static void ics_deliver_irq(struct kvmppc_xics *xics,
-			    u32 irq, u32 level)
+static void ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level)
 {
-	struct kvmppc_icp *icp;
 	struct ics_irq_state *state;
 	struct kvmppc_ics *ics;	
-	bool deliver = false;
-	u32 server;
 	u16 src;
 
-	XICS_DBG("ics deliver 0x%06x (level: %d)\n", irq, level);
+	XICS_DBG("ics deliver %#x (level: %d)\n", irq, level);
 
 	ics = kvmppc_xics_find_ics(xics, irq, &src);
 	if (!ics) {
-		pr_warning("ics_deliver_irq: IRQ 0x%06x not found !\n", irq);
+		XICS_DBG("ics_deliver_irq: IRQ 0x%06x not found !\n", irq);
 		return;
 	}
 	state = &ics->irq_state[src];
 
-	mutex_lock(&ics->lock);
-
+	/*
+	 * We set state->asserted locklessly. This should be fine as
+	 * we are the only setter, thus concurrent access is undefined
+	 * to begin with.
+	 */
 	if (level == KVM_INTERRUPT_SET_LEVEL)
 		state->asserted = 1;
 	else if (level == KVM_INTERRUPT_UNSET) {
 		state->asserted = 0;
-		goto unlock;
+		return;
 	}
 
-	if (state->priority != MASKED) {
-		deliver = true;
-		server = state->server;
-	} else {
-		XICS_DBG("masked pending\n");
-		state->masked_pending = 1;
-	}	
-
-unlock:
-	mutex_unlock(&ics->lock);
-
-	if (deliver) {
-		 icp = kvmppc_xics_find_server(xics->kvm, server);
-		 /* Configured server not found... XXX FALLBACK */
-		 if (icp) 
-			 icp_deliver_irq(xics, icp, ics, src);
-	}
+	/* Attempt delivery */
+	icp_deliver_irq(xics, NULL, irq);
 }
 
 static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
 			     struct kvmppc_icp *icp)
 {
-	u32 server = icp->vcpu->vcpu_id;
 	int i;
 
 	mutex_lock(&ics->lock);
@@ -249,40 +197,20 @@  static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
 	for (i = 0; i < ics->nr_irqs; i++) {
 		struct ics_irq_state *state = &ics->irq_state[i];
 
-		if (!state->resend || state->server != server)
+		if (!state->resend)
 			continue;
 
-		XICS_DBG("resend 0x%06x prio %d\n", state->number,
+		XICS_DBG("resend %#x prio %#x\n", state->number,
 			      state->priority);
 
-		state->resend = 0;
-		if (state->priority == MASKED)
-			continue;
-
 		mutex_unlock(&ics->lock);
-		icp_deliver_irq(xics, icp, ics, i);
+		icp_deliver_irq(xics, icp, state->number);
 		mutex_lock(&ics->lock);
 	}
 
 	mutex_unlock(&ics->lock);
 }
 
-static void icp_check_resend(struct kvmppc_xics *xics,
-			     struct kvmppc_icp *icp)
-{
-	u32 buid;
-	
-	for_each_set_bit(buid, icp->resend_map, xics->max_buid + 1) {
-		struct kvmppc_ics *ics = xics->ics[buid - 1];
-
-		if (!test_and_clear_bit(buid, icp->resend_map))
-			continue;
-		if (!ics)
-			continue;
-		ics_check_resend(xics, ics, icp);
-	}
-}
-
 int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority)
 {
 	struct kvmppc_xics *xics = kvm->arch.xics;
@@ -306,20 +234,22 @@  int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority)
 
 	mutex_lock(&ics->lock);
 
+	XICS_DBG("set_xive %#x server %#x prio %#x MP:%d RS:%d\n",
+		 irq, server, priority,
+		 state->masked_pending, state->resend);
+
 	state->server = server;
 	state->priority = priority;
 	deliver = false;
-	if (state->masked_pending && state->priority != MASKED) {
+	if ((state->masked_pending || state->resend) && priority != MASKED) {
 		state->masked_pending = 0;
 		deliver = true;
 	}
 
 	mutex_unlock(&ics->lock);
 
-	XICS_DBG("irq 0x%06x server %d prio %#x\n", irq, server, priority);
-
 	if (deliver)
-		icp_deliver_irq(xics, icp, ics, src);
+		icp_deliver_irq(xics, icp, irq);
 
 	return 0;
 }
@@ -344,218 +274,501 @@  int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority)
 	*priority = state->priority;
 	mutex_unlock(&ics->lock);
 
-	XICS_DBG("irq 0x%06x server %d prio %#x\n",
-		      irq, state->server, state->priority);
-
 	return 0;
 }
 
 /* -- ICP routines, including hcalls -- */
 
-static void icp_external_interrupt(struct kvmppc_icp *icp)
+static inline bool icp_try_update(struct kvmppc_icp *icp,
+				  union kvmppc_icp_state old,
+				  union kvmppc_icp_state new,
+				  bool change_self)
 {
-	unsigned int vec = BOOK3S_INTERRUPT_EXTERNAL_LEVEL;
-
-	lockdep_assert_held(&icp->lock);
-
-	kvmppc_book3s_queue_irqprio(icp->vcpu, vec);
-	kvm_vcpu_kick(icp->vcpu);
+	bool success;
+
+	/* Calculate new output value */
+	new.out_ee = (new.xisr && (new.pending_pri < new.cppr));
+
+	/* Attempt atomic update */
+	success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw;
+	if (!success)
+		goto bail;
+
+	XICS_DBG("UPD [%04x] - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n",
+		 icp->vcpu->vcpu_id,
+		 old.cppr, old.mfrr, old.pending_pri, old.xisr,
+		 old.need_resend, old.out_ee);
+	XICS_DBG("UPD        - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n",
+		 new.cppr, new.mfrr, new.pending_pri, new.xisr,
+		 new.need_resend, new.out_ee);
+	/*
+	 * Check for output state update
+	 *
+	 * Note that this is racy since another processor could be updating
+	 * the state already. This is why we never clear the interrupt output
+	 * here, we only ever set it. The clear only happens prior to doing
+	 * an update and only by the processor itself. Currently we do it
+	 * in Accept (H_XIRR) and Up_Cppr (H_XPPR).
+	 *
+	 * We also do not try to figure out whether the EE state has changed,
+	 * we unconditionally set it if the new state calls for it for the
+	 * same reason.
+	 */
+	if (new.out_ee) {
+		kvmppc_book3s_queue_irqprio(icp->vcpu,
+					    BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+		if (!change_self)
+			kvm_vcpu_kick(icp->vcpu);
+	}
+ bail:
+	return success;
 }
 
-static void icp_deliver_irq(struct kvmppc_xics *xics,
-			    struct kvmppc_icp *icp,
-			    struct kvmppc_ics *ics, u16 src)
+static void icp_check_resend(struct kvmppc_xics *xics,
+			     struct kvmppc_icp *icp)
 {
-	struct ics_irq_state state_copy;
-
-	mutex_lock(&icp->lock);
-
-	/* Snapshot irq state */
-	mutex_lock(&ics->lock);
-	state_copy = ics->irq_state[src];
+	u32 buid;
+	
+	/* Order this load with the test for need_resend in the caller */
+	smp_rmb();
+	for_each_set_bit(buid, icp->resend_map, xics->max_buid + 1) {
+		struct kvmppc_ics *ics = xics->ics[buid - 1];
 
-	if (state_copy.priority > icp->current_priority) {
-		/* CPU is not interested in us */
-		__ics_reject_irq(icp, ics, src);
-		mutex_unlock(&ics->lock);
-		goto out;
+		if (!test_and_clear_bit(buid, icp->resend_map))
+			continue;
+		if (!ics)
+			continue;
+		ics_check_resend(xics, ics, icp);
 	}
+}
 
-	if (icp->pending_irq) {
-		/* An interrupt is pending */
-		if (icp->pending_priority <= state_copy.priority) {
-			/* pending irq is equally or more favoured */
-			__ics_reject_irq(icp, ics, src);
-			mutex_unlock(&ics->lock);
-			goto out;
+static bool icp_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
+			       u32 *reject)
+{
+	union kvmppc_icp_state old_state, new_state;
+	bool success;
+
+	XICS_DBG("try deliver %#x(P:%#x) to server %#x\n", irq, priority,
+		 icp->vcpu->vcpu_id);
+
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		*reject = 0;
+
+		/* See if we can deliver */
+		success = new_state.cppr > priority &&
+			new_state.mfrr > priority &&
+			new_state.pending_pri > priority;
+
+		/*
+		 * If we can, check for a rejection and perform the
+		 * delivery
+		 */
+		if (success) {
+			*reject = new_state.xisr;
+			new_state.xisr = irq;
+			new_state.pending_pri = priority;
+		} else {
+			/*
+			 * If we failed to deliver we set need_resend
+			 * so a subsequent CPPR state change causes us
+			 * to try a new delivery.
+			 */
+			new_state.need_resend = true;
 		}
-	}
-	mutex_unlock(&ics->lock);
-
-	/* We are more favoured, reject pending irq */
-	if (icp->pending_irq)
-		ics_reject_irq(xics, icp, icp->pending_irq);
-
-	icp->pending_irq = state_copy.number;
-	icp->pending_priority = state_copy.priority;
 
-	XICS_DBG("irq 0x%06x pending on %d prio %#x\n",
-		     state_copy.number, state_copy.server, state_copy.priority);
+	} while(!icp_try_update(icp, old_state, new_state, false));
 
-	icp_external_interrupt(icp);
-
-out:
-	mutex_unlock(&icp->lock);
+	return success;
 }
 
-static void icp_check_ipi(struct kvmppc_xics *xics, struct kvmppc_icp *icp)
+static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			    u32 new_irq)
 {
-	lockdep_assert_held(&icp->lock);
-
-	if (icp->mfrr >= icp->current_priority)
+	struct ics_irq_state *state;
+	struct kvmppc_ics *ics;
+	u32 reject;
+	u16 src;	
+
+	/*
+	 * This is used both for initial delivery of an interrupt and
+	 * for subsequent rejection.
+	 *
+	 * Rejection can be racy vs. resends. We have evaluated the
+	 * rejection in an atomic ICP transaction which is now complete,
+	 * so potentially the ICP can already accept the interrupt again.
+	 *
+	 * So we need to retry the delivery. Essentially the reject path
+	 * boils down to a failed delivery. Always.
+	 *
+	 * Now the interrupt could also have moved to a different target,
+	 * thus we may need to re-do the ICP lookup as well
+	 */
+	 
+ again:
+	/* Get the ICS state and lock it */
+	ics = kvmppc_xics_find_ics(xics, new_irq, &src);
+	if (!ics) {
+		XICS_DBG("icp_deliver_irq: IRQ 0x%06x not found !\n", new_irq);
 		return;
+	}
+	state = &ics->irq_state[src];
 
-	XICS_DBG("cpu %d can take IPI mfrr=%#x\n",
-		     icp->vcpu->vcpu_id, icp->mfrr);
+	/* Get a lock on the ICS */
+	mutex_lock(&ics->lock);
 
-	if (icp->pending_irq) {
-		/* IPI is less favoured */
-		if (icp->pending_priority <= icp->mfrr) {
-			XICS_DBG("ODD: pending_prio=%#x pending_irq=%#x\n",
-				     icp->pending_priority, icp->pending_irq);
-			return;
+	/* Get our server */
+	if (!icp || state->server != icp->vcpu->vcpu_id) {
+		icp = kvmppc_xics_find_server(xics->kvm, state->server);
+		if (!icp) {
+			pr_warning("icp_deliver_irq: IRQ 0x%06x server 0x%x"
+				   " not found !\n", new_irq, state->server);
+			goto out;
 		}
+	}
 
-		/* IPI is more favoured, reject the other interrupt */
-		ics_reject_irq(xics, icp, icp->pending_irq);
+	/* Clear the resend bit of that interrupt */
+	state->resend = 0;
+
+	/*
+	 * If masked, bail out
+	 *
+	 * Note: PAPR doesn't mention anything about masked pending
+	 * when doing a resend, only when doing a delivery.
+	 *
+	 * However that would have the effect of losing a masked
+	 * interrupt that was rejected and isn't consistent with
+	 * the whole masked_pending business which is about not
+	 * losing interrupts that occur while masked.
+	 *
+	 * I don't differenciate normal deliveries and resends, this
+	 * implementation will differ from PAPR and not lose such
+	 * interrupts.
+	 */
+	if (state->priority == MASKED) {
+		XICS_DBG("irq %#x masked pending\n", new_irq);
+		state->masked_pending = 1;
+		goto out;
 	}
 
-	icp->pending_irq = XICS_IPI;
-	icp->pending_priority = icp->mfrr;
-	icp_external_interrupt(icp);
+	/*
+	 * Try the delivery, this will set the need_resend flag
+	 * in the ICP as part of the atomic transaction if the
+	 * delivery is not possible.
+	 *
+	 * Note that if successful, the new delivery might have itself
+	 * rejected an interrupt that was "delivered" before we took the
+	 * icp mutex.
+	 *
+	 * In this case we do the whole sequence all over again for the
+	 * new guy. We cannot assume that the rejected interrupt is less
+	 * favored than the new one, and thus doesn't need to be delivered,
+	 * because by the time we exit icp_try_to_deliver() the target
+	 * processor may well have alrady consumed & completed it, and thus
+	 * the rejected interrupt might actually be already acceptable.
+	 */
+	if (icp_try_to_deliver(icp, new_irq, state->priority, &reject)) {
+		/*
+		 * Delivery was successful, did we reject somebody else ?
+		 */
+		if (reject && reject != XICS_IPI) {
+			mutex_unlock(&ics->lock);
+			new_irq = reject;
+			goto again;
+		}
+	} else {
+		/*
+		 * We failed to deliver the interrupt we need to set the
+		 * resend map bit and mark the ICS state as needing a resend
+		 */
+		set_bit(ics->buid, icp->resend_map);
+		state->resend = 1;
+
+		/*
+		 * If the need_resend flag got cleared in the ICP some time
+		 * between icp_try_to_deliver() atomic update and now, then
+		 * we know it might have missed the resend_map bit. So we
+		 * retry
+		 */
+		smp_mb();
+		if (!icp->state.need_resend) {
+			mutex_unlock(&ics->lock);
+			goto again;
+		}
+	}
+ out:
+	mutex_unlock(&ics->lock);
 }
 
-static u32 icp_accept(struct kvm_vcpu *vcpu, struct kvmppc_icp *icp)
+static void icp_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			  u8 new_cppr)
 {
-	u32 xirr;
-
-	mutex_lock(&icp->lock);
-
-	kvmppc_core_dequeue_external(vcpu);
-
-	/* The XIRR is the pending interrupt & current priority */
-	xirr = icp->pending_irq | (icp->current_priority << 24);
-
-	/* The pending priority becomes current */
-	icp->current_priority = icp->pending_priority;
+	union kvmppc_icp_state old_state, new_state;
+	bool resend;
+
+	/*
+	 * This handles several related states in one operation:
+	 *
+	 * ICP State: Down_CPPR
+	 *
+	 * Load CPPR with new value and if the XISR is 0
+	 * then check for resends:
+	 *
+	 * ICP State: Resend
+	 *
+	 * If MFRR is more favored than CPPR, check for IPIs
+	 * and notify ICS of a potential resend. This is done
+	 * asynchronously (when used in real mode, we will have
+	 * to exit here).
+	 *
+	 * We do not handle the complete Check_IPI as documented
+	 * here. In the PAPR, this state will be used for both
+	 * Set_MFRR and Down_CPPR. However, we know that we aren't
+	 * changing the MFRR state here so we don't need to handle
+	 * the case of an MFRR causing a reject of a pending irq,
+	 * this will have been handled when the MFRR was set in the
+	 * first place.
+	 *
+	 * Thus we don't have to handle rejects, only resends.
+	 *
+	 * When implementing real mode for HV KVM, resend will lead to
+	 * a H_TOO_HARD return and the whole transaction will be handled
+	 * in virtual mode.
+	 */
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		/* Down_CPPR */
+		new_state.cppr = new_cppr;
+
+		/*
+		 * Cut down Resend / Check_IPI / IPI
+		 *
+		 * The logic is that we cannot have a pending interrupt
+		 * trumped by an IPI at this point (see above), so we
+		 * know that either the pending interrupt is already an
+		 * IPI (in which case we don't care to override it) or
+		 * it's either more favored than us or non existent
+		 */
+		if (new_state.mfrr < new_cppr &&
+		    new_state.mfrr <= new_state.pending_pri) {
+			WARN_ON(new_state.xisr != XICS_IPI &&
+				new_state.xisr != 0);
+			new_state.pending_pri = new_state.mfrr;
+			new_state.xisr = XICS_IPI;
+		}
 
-	/* Clear the pending interrupt */
-	icp->pending_irq = 0;
+		/* Latch/clear resend bit */
+		resend = new_state.need_resend;
+		new_state.need_resend = 0;
 
-	mutex_unlock(&icp->lock);
+	} while(!icp_try_update(icp, old_state, new_state, true));
 
-	return xirr;
+	/*
+	 * Now handle resend checks. Those are asynchronous to the ICP
+	 * state update in HW (ie bus transactions) so we can handle them
+	 * separately here too
+	 */
+	if (resend)
+		icp_check_resend(xics, icp);
 }
 
-static unsigned long h_xirr(struct kvm_vcpu *vcpu)
+static noinline unsigned long h_xirr(struct kvm_vcpu *vcpu)
 {
+	union kvmppc_icp_state old_state, new_state;
 	struct kvmppc_icp *icp = vcpu->arch.icp;
 	u32 xirr;
 
-	xirr = icp_accept(vcpu, icp);
+	/* First, remove EE from the processor */
+	kvmppc_book3s_dequeue_irqprio(icp->vcpu,
+				      BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+
+	/*
+	 * ICP State: Accept_Interrupt
+	 *
+	 * Return the pending interrupt (if any) along with the
+	 * current CPPR, then clear the XISR & set CPPR to the
+	 * pending priority
+	 */
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		xirr = old_state.xisr | (((u32)old_state.cppr) << 24);
+		new_state.cppr = new_state.pending_pri;
+		new_state.pending_pri = 0xff;
+		new_state.xisr = 0;
+
+	} while(!icp_try_update(icp, old_state, new_state, true));
 
 	XICS_DBG("h_xirr vcpu %d xirr %#x\n", vcpu->vcpu_id, xirr);
 
 	return xirr;
 }
 
-static int h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
-		 unsigned long mfrr)
+static noinline int h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
+			  unsigned long mfrr)
 {
+        union kvmppc_icp_state old_state, new_state;
 	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
 	struct kvmppc_icp *icp;
+	u32 reject;
+	bool local;
 
 	XICS_DBG("h_ipi vcpu %d to server %lu mfrr %#lx\n",
 			vcpu->vcpu_id, server, mfrr);
 
-	icp = kvmppc_xics_find_server(vcpu->kvm, server);
+	local = vcpu->vcpu_id == server;
+	if (local)
+		icp = vcpu->arch.icp;
+	else
+		icp = kvmppc_xics_find_server(vcpu->kvm, server);
 	if (!icp)
 		return H_PARAMETER;
 
-	mutex_lock(&icp->lock);
+	/*
+	 * ICP state: Set_MFRR
+	 *
+	 * If the CPPR is more favored than the new MFRR, then
+	 * nothing needs to be done as there can be no XISR to
+	 * reject.
+	 *
+	 * If the CPPR is less favored, then we might be replacing
+	 * an interrupt, and thus need to possibly reject it as in
+	 *
+	 * ICP state: Check_IPI
+	 */
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		/* Set_MFRR */
+		new_state.mfrr = mfrr;
+
+		/* Check_IPI */
+		reject = 0;
+		if (mfrr < new_state.cppr) {
+			/* Reject a pending interrupt if not an IPI */
+			if (mfrr <= new_state.pending_pri)
+				reject = new_state.xisr;
+			new_state.pending_pri = mfrr;
+			new_state.xisr = XICS_IPI;
+		}
 
-	icp->mfrr = mfrr;
-	icp_check_ipi(xics, icp);
+	} while(!icp_try_update(icp, old_state, new_state, local));
 
-	mutex_unlock(&icp->lock);
+	/* Handle reject */
+	if (reject && reject != XICS_IPI)
+		icp_deliver_irq(xics, icp, reject);
+		
 
 	return H_SUCCESS;
 }
 
-static void h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
+static noinline void h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
 {
+	union kvmppc_icp_state old_state, new_state;
 	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
 	struct kvmppc_icp *icp = vcpu->arch.icp;
-	u8 old_priority;
-	bool check_resend = false;
+	u32 reject;
 
 	XICS_DBG("h_cppr vcpu %d cppr %#lx\n", vcpu->vcpu_id, cppr);
 
-	mutex_lock(&icp->lock);
-
-	old_priority = icp->current_priority;
-	icp->current_priority = cppr;
+	/*
+	 * ICP State: Set_CPPR
+	 *
+	 * We can safely compare the new value with the current
+	 * value outside of the transaction as the CPPR is only
+	 * ever changed by the processor on itself
+	 */
+	if (cppr > icp->state.cppr)
+		icp_down_cppr(xics, icp, cppr);
+	else if (cppr == icp->state.cppr)
+		return;
 
-	if (icp->pending_irq &&
-	    icp->current_priority < icp->pending_priority) {
-		u32 pending = icp->pending_irq;
-		/* Pending irq is less favoured than our new priority */
-		icp->pending_irq = 0;
-		kvmppc_core_dequeue_external(vcpu);
-		ics_reject_irq(xics, icp, pending);
-	}
+	/*
+	 * ICP State: Up_CPPR
+	 *
+	 * The processor is raising its priority, this can result
+	 * in a rejection of a pending interrupt:
+	 *
+	 * ICP State: Reject_Current
+	 *
+	 * We can remove EE from the current processor, the update
+	 * transaction will set it again if needed
+	 */
+	kvmppc_book3s_dequeue_irqprio(icp->vcpu,
+				      BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		reject = 0;
+		new_state.cppr = cppr;
+
+		if (cppr <= new_state.pending_pri) {
+			reject = new_state.xisr;
+			new_state.xisr = 0;
+			new_state.pending_pri = 0xff;
+		}
 
-	/* Check if there is anything we can accept now */
-	if (!icp->pending_irq)
-		icp_check_ipi(xics, icp);
-	if (!icp->pending_irq && icp->need_resend) {
-		check_resend = true;
-		icp->need_resend = false;
-	}
-	
-	mutex_unlock(&icp->lock);
+	} while(!icp_try_update(icp, old_state, new_state, true));
 
-	if (check_resend)
-		icp_check_resend(xics, icp);
+	/*
+	 * Check for rejects. They are handled by doing a new delivery
+	 * attempt (see comments in icp_deliver_irq).
+	 */
+	if (reject && reject != XICS_IPI)
+		icp_deliver_irq(xics, icp, reject);
 }
 
-static void h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+static noinline int h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
 {
 	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
 	struct kvmppc_icp *icp = vcpu->arch.icp;
-	bool check_resend = false;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u32 irq = xirr & 0x00ffffff;
+	u16 src;
 
 	XICS_DBG("h_eoi vcpu %d eoi %#lx\n", vcpu->vcpu_id, xirr);
 
-	mutex_lock(&icp->lock);
-
-	icp->current_priority = xirr >> 24;
-
-	/* If nothing is pending since accept, check for an IPI */
-	if (!icp->pending_irq)
-		icp_check_ipi(xics, icp);
-
-	if (!icp->pending_irq && icp->need_resend) {
-		check_resend = true;
-		icp->need_resend = false;
+	/*
+	 * ICP State: EOI
+	 *
+	 * Note: If EOI is incorrectly used by SW to lower the CPPR
+	 * value (ie more favored), we do not check for rejection of
+	 * a pending interrupt, this is a SW error and PAPR sepcifies
+	 * that we don't have to deal with it.
+	 *
+	 * The sending of an EOI to the ICS is handled after the
+	 * CPPR update
+	 *
+	 * ICP State: Down_CPPR which we handle
+	 * in a separate function as it's shared with H_CPPR.
+	 */
+	icp_down_cppr(xics, icp, xirr >> 24);
+
+	/* IPIs have no EOI */
+	if (irq == XICS_IPI)
+		return H_SUCCESS;
+	/*
+	 * EOI handling: If the interrupt is still asserted, we need to
+	 * resend it. We can take a lockless "peek" at the ICS state here.
+	 *
+	 * "Message" interrupts will never have "asserted" set
+	 */
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics) {
+		XICS_DBG("h_eoi: IRQ 0x%06x not found !\n", irq);
+		return H_PARAMETER;
 	}
+	state = &ics->irq_state[src];
 
-	ics_eoi(xics, icp, xirr & 0xFFFFFF);
-
-	mutex_unlock(&icp->lock);
+	/* Still asserted, resend it */
+	if (state->asserted)
+		icp_deliver_irq(xics, icp, irq);
 
-	if (check_resend)
-		icp_check_resend(xics, icp);
+	return H_SUCCESS;
 }
 
 int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
@@ -576,7 +789,7 @@  int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
 		h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4));
 		break;
 	case H_EOI:
-		h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
+		rc = h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
 		break;
 	case H_IPI:
 		rc = h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4),
@@ -604,18 +817,16 @@  static int xics_debug_show(struct seq_file *m, void *private)
 
 	kvm_for_each_vcpu(i, vcpu, kvm) {
 		struct kvmppc_icp *icp = vcpu->arch.icp;
+		union kvmppc_icp_state state;
 
 		if (!icp)
 			continue;
 
-		mutex_lock(&icp->lock);
-
-		seq_printf(m, "cpu server %#x pending %#x pending prio %#x cppr %#x "
-			   "mfrr %#x\n", vcpu->vcpu_id, icp->pending_irq,
-			   icp->pending_priority, icp->current_priority,
-			   icp->mfrr);
-
-		mutex_unlock(&icp->lock);
+		state.raw = ACCESS_ONCE(icp->state.raw);
+		seq_printf(m, "cpu server %#x XIRR:%#x PPRI:%#x CPPR:%#x "
+			   "MFRR:%#x OUT:%d NR:%d\n", vcpu->vcpu_id, state.xisr,
+			   state.pending_pri, state.cppr, state.mfrr,
+			   state.out_ee, state.need_resend);
 	}
 
 	for (buid = 1; buid <= KVMPPC_XICS_MAX_BUID; buid++) {
@@ -624,18 +835,20 @@  static int xics_debug_show(struct seq_file *m, void *private)
 		if (!ics)
 			continue;
 
-		seq_printf(m, "=========\nICS state for BUID 0x%x\n=========\n", buid);
+		seq_printf(m, "=========\nICS state for BUID 0x%x\n=========\n",
+			   buid);
 
 		mutex_lock(&ics->lock);
 
 		for (i = 0; i < ics->nr_irqs; i++) {
 			struct ics_irq_state *irq = &ics->irq_state[i];
 
-			seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x "
-				   "asserted %d resend %d masked pending %d\n",
+			seq_printf(m, "irq 0x%06x: server %#x prio %#x save"
+				   " prio %#x asserted %d resend %d masked"
+				   " pending %d\n",
 				   irq->number, irq->server, irq->priority,
-				   irq->saved_priority, irq->asserted, irq->resend,
-				   irq->masked_pending);
+				   irq->saved_priority, irq->asserted,
+				   irq->resend, irq->masked_pending);
 
 		}
 		mutex_unlock(&ics->lock);
@@ -672,14 +885,16 @@  static void xics_debugfs_init(struct kvmppc_xics *xics)
 	kfree(name);
 }
 
-static int kvmppc_xics_create_ics(struct kvmppc_xics *xics, u16 buid, u16 nr_irq)
+static int kvmppc_xics_create_ics(struct kvmppc_xics *xics, u16 buid,
+				  u16 nr_irq)
 {
 	struct kvmppc_ics *ics;
 	int i, size;
 
 
 	/* Create the ICS */
-	size = sizeof(struct kvmppc_ics) + sizeof(struct ics_irq_state) * nr_irqs;
+	size = sizeof(struct kvmppc_ics) +
+		sizeof(struct ics_irq_state) * nr_irqs;
 	ics = kzalloc(size, GFP_KERNEL);
 	if (!ics)
 		return -ENOMEM;
@@ -710,9 +925,9 @@  int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu)
 	if (!icp)
 		return -ENOMEM;
 
-	mutex_init(&icp->lock);
 	icp->vcpu = vcpu;
-	icp->mfrr = MASKED;
+	icp->state.mfrr = MASKED;
+	icp->state.pending_pri = MASKED;
 	vcpu->arch.icp = icp;
 
 	XICS_DBG("created server for vcpu %d\n", vcpu->vcpu_id);
@@ -848,6 +1063,8 @@  int kvmppc_xics_ioctl(struct kvm *kvm, unsigned ioctl, unsigned long arg)
 	void __user *argp = (void __user *)arg;
 	int rc;
 
+	BUILD_BUG_ON(sizeof(union kvmppc_icp_state) != sizeof(unsigned long));
+
 	switch (ioctl) {
 	case KVM_CREATE_IRQCHIP: {
 		struct kvm_irqchip_args args;