diff mbox

[RFC,5/9] KVM: PPC: Book3S HV: Add support for real mode ICP in XICS emulation

Message ID 20121105032233.GF22409@drongo
State New, archived
Headers show

Commit Message

Paul Mackerras Nov. 5, 2012, 3:22 a.m. UTC
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>

This adds an implementation of the XICS hypercalls in real mode for HV
KVM, which allows us to avoid exiting the guest MMU context on all
threads for a variety of operations such as fetching a pending
interrupt, EOI of messages, IPIs, etc.

For debugging purposes, the use of the real mode implementation can be
disabled by setting the KVM_ICP_FLAG_NOREALMODE bit in the icp.flags
field of struct kvm_irqchip_args.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/uapi/asm/kvm.h  |    1 +
 arch/powerpc/kvm/Makefile            |    1 +
 arch/powerpc/kvm/book3s_hv_rm_xics.c |  402 ++++++++++++++++++++++++++++++++++
 arch/powerpc/kvm/book3s_xics.c       |  153 +++++--------
 arch/powerpc/kvm/book3s_xics.h       |  116 ++++++++++
 5 files changed, 572 insertions(+), 101 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_hv_rm_xics.c
 create mode 100644 arch/powerpc/kvm/book3s_xics.h
diff mbox

Patch

diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index 145c645..55c1907 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -314,6 +314,7 @@  struct kvm_irqchip_args {
 		 * structures.
 		 */
 		struct {
+#define KVM_ICP_FLAG_NOREALMODE		0x00000001 /* Disable real mode ICP */
 			__u32 flags;
 		} icp;
 
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index ec2f8da..c3d958d 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -72,6 +72,7 @@  kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
 	book3s_hv_rmhandlers.o \
 	book3s_hv_rm_mmu.o \
 	book3s_64_vio_hv.o \
+	book3s_hv_rm_xics.o \
 	book3s_hv_builtin.o
 
 kvm-book3s_64-module-objs := \
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
new file mode 100644
index 0000000..49bb25b
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -0,0 +1,402 @@ 
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
+#include <asm/hvcall.h>
+#include <asm/xics.h>
+#include <asm/debug.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+
+#include "book3s_xics.h"
+
+#define DEBUG_PASSUP
+
+static inline void rm_writeb(unsigned long paddr, u8 val)
+{
+	__asm__ __volatile__("sync; stbcix %0,0,%1"
+		: : "r" (val), "r" (paddr) : "memory");
+}
+
+static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, struct kvm_vcpu *this_vcpu)
+{
+	struct kvmppc_icp *this_icp = this_vcpu->arch.icp;
+	unsigned long xics_phys;
+	int cpu;
+
+	/* Mark the target VCPU as having an interrupt pending */
+	vcpu->stat.queue_intr++;
+	set_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
+
+	/* Kick self ? Just set MER and return */
+	if (vcpu == this_vcpu) {
+		mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_MER);
+		return;
+	}
+
+	/* Check if the core is loaded, if not, too hard */
+	cpu = vcpu->cpu;
+	if (cpu < 0 || cpu >= nr_cpu_ids) {
+		this_icp->rm_action |= XICS_RM_KICK_VCPU;
+		this_icp->rm_kick_target = vcpu;
+		return;
+	}
+	/* In SMT cpu will always point to thread 0, we adjust it */
+	cpu += vcpu->arch.ptid;
+
+	/* Not too hard, then poke the target */
+	xics_phys = paca[cpu].kvm_hstate.xics_phys;
+	rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY);
+}
+
+static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu)
+{
+	/* Note: Only called on self ! */
+	clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
+	mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_MER);
+}
+
+static inline bool icp_rm_try_update(struct kvmppc_icp *icp,
+				     union kvmppc_icp_state old,
+				     union kvmppc_icp_state new)
+{
+	struct kvm_vcpu *this_vcpu = local_paca->kvm_hstate.kvm_vcpu;
+	bool success;
+
+	/* Calculate new output value */
+	new.out_ee = (new.xisr && (new.pending_pri < new.cppr));
+
+	/* Attempt atomic update */
+	success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw;
+	if (!success)
+		goto bail;
+
+	/*
+	 * Check for output state update
+	 *
+	 * Note that this is racy since another processor could be updating
+	 * the state already. This is why we never clear the interrupt output
+	 * here, we only ever set it. The clear only happens prior to doing
+	 * an update and only by the processor itself. Currently we do it
+	 * in Accept (H_XIRR) and Up_Cppr (H_XPPR).
+	 *
+	 * We also do not try to figure out whether the EE state has changed,
+	 * we unconditionally set it if the new state calls for it. The reason
+	 * for that is that we opportunistically remove the pending interrupt
+	 * flag when raising CPPR, so we need to set it back here if an
+	 * interrupt is still pending.
+	 */
+	if (new.out_ee)
+		icp_rm_set_vcpu_irq(icp->vcpu, this_vcpu);
+
+	/* Expose the state change for debug purposes */
+	this_vcpu->arch.icp->rm_dbgstate = new;
+	this_vcpu->arch.icp->rm_dbgtgt = icp->vcpu;
+
+ bail:
+	return success;
+}
+
+static inline int check_too_hard(struct kvmppc_xics *xics, struct kvmppc_icp *icp)
+{
+	return (xics->real_mode_dbg || icp->rm_action) ? H_TOO_HARD : H_SUCCESS;
+}
+
+static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			     u8 new_cppr)
+{
+	union kvmppc_icp_state old_state, new_state;
+	bool resend;
+
+	/*
+	 * This handles several related states in one operation:
+	 *
+	 * ICP State: Down_CPPR
+	 *
+	 * Load CPPR with new value and if the XISR is 0
+	 * then check for resends:
+	 *
+	 * ICP State: Resend
+	 *
+	 * If MFRR is more favored than CPPR, check for IPIs
+	 * and notify ICS of a potential resend. This is done
+	 * asynchronously (when used in real mode, we will have
+	 * to exit here).
+	 *
+	 * We do not handle the complete Check_IPI as documented
+	 * here. In the PAPR, this state will be used for both
+	 * Set_MFRR and Down_CPPR. However, we know that we aren't
+	 * changing the MFRR state here so we don't need to handle
+	 * the case of an MFRR causing a reject of a pending irq,
+	 * this will have been handled when the MFRR was set in the
+	 * first place.
+	 *
+	 * Thus we don't have to handle rejects, only resends.
+	 *
+	 * When implementing real mode for HV KVM, resend will lead to
+	 * a H_TOO_HARD return and the whole transaction will be handled
+	 * in virtual mode.
+	 */
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		/* Down_CPPR */
+		new_state.cppr = new_cppr;
+
+		/*
+		 * Cut down Resend / Check_IPI / IPI
+		 *
+		 * The logic is that we cannot have a pending interrupt
+		 * trumped by an IPI at this point (see above), so we
+		 * know that either the pending interrupt is already an
+		 * IPI (in which case we don't care to override it) or
+		 * it's either more favored than us or non existent
+		 */
+		if (new_state.mfrr < new_cppr &&
+		    new_state.mfrr <= new_state.pending_pri) {
+			new_state.pending_pri = new_state.mfrr;
+			new_state.xisr = XICS_IPI;
+		}
+
+		/* Latch/clear resend bit */
+		resend = new_state.need_resend;
+		new_state.need_resend = 0;
+
+	} while (!icp_rm_try_update(icp, old_state, new_state));
+
+	/*
+	 * Now handle resend checks. Those are asynchronous to the ICP
+	 * state update in HW (ie bus transactions) so we can handle them
+	 * separately here as well.
+	 */
+	if (resend)
+		icp->rm_action |= XICS_RM_CHECK_RESEND;
+}
+
+
+unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu)
+{
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	u32 xirr;
+
+	if (!xics->real_mode)
+		return H_TOO_HARD;
+
+	/* First clear the interrupt */
+	icp_rm_clr_vcpu_irq(icp->vcpu);
+
+	/*
+	 * ICP State: Accept_Interrupt
+	 *
+	 * Return the pending interrupt (if any) along with the
+	 * current CPPR, then clear the XISR & set CPPR to the
+	 * pending priority
+	 */
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		xirr = old_state.xisr | (((u32)old_state.cppr) << 24);
+		if (!old_state.xisr)
+			break;
+		new_state.cppr = new_state.pending_pri;
+		new_state.pending_pri = 0xff;
+		new_state.xisr = 0;
+
+	} while (!icp_rm_try_update(icp, old_state, new_state));
+
+	/* Return the result in GPR4 */
+	vcpu->arch.gpr[4] = xirr;
+
+	return check_too_hard(xics, icp);
+}
+
+/*
+ * Real-mode H_IPI: set the MFRR of the ICP serving @server (ICP states
+ * Set_MFRR then Check_IPI).
+ * Returns H_TOO_HARD if real mode is off, if real_mode_dbg is set, or if
+ * work was left in this vcpu's rm_action for virtual mode to complete;
+ * H_PARAMETER if no vcpu has id @server; H_SUCCESS otherwise.
+ */
+int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, unsigned long mfrr)
+{
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp, *this_icp = vcpu->arch.icp;
+	u32 reject;
+	bool resend;
+
+	if (!xics->real_mode)
+		return H_TOO_HARD;
+
+	if (vcpu->vcpu_id == server)
+		icp = this_icp;
+	else
+		icp = kvmppc_xics_find_server(vcpu->kvm, server);
+	if (!icp)
+		return H_PARAMETER;
+
+	/*
+	 * If the CPPR is more favored than the new MFRR, nothing needs
+	 * to be done as there can be no XISR to reject. Otherwise the
+	 * IPI may displace the pending interrupt, which must then be
+	 * rejected so that it is delivered again later.
+	 */
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		/* Set_MFRR */
+		new_state.mfrr = mfrr;
+
+		/* Check_IPI */
+		reject = 0;
+		resend = false;
+		if (mfrr < new_state.cppr) {
+			/* Keep a more favored pending irq, else it is lost */
+			if (mfrr <= new_state.pending_pri) {
+				reject = new_state.xisr;
+				new_state.pending_pri = mfrr;
+				new_state.xisr = XICS_IPI;
+			}
+		}
+
+		if (mfrr > old_state.mfrr && mfrr > new_state.cppr) {
+			resend = new_state.need_resend;
+			new_state.need_resend = 0;
+		}
+	} while (!icp_rm_try_update(icp, old_state, new_state));
+
+	/* Pass rejects to virtual mode */
+	if (reject && reject != XICS_IPI) {
+		this_icp->rm_action |= XICS_RM_REJECT;
+		this_icp->rm_reject = reject;
+	}
+
+	/* Pass resends to virtual mode */
+	if (resend)
+		this_icp->rm_action |= XICS_RM_CHECK_RESEND;
+
+	return check_too_hard(xics, this_icp);
+}
+
+int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
+{
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	u32 reject;
+
+	if (!xics->real_mode)
+		return H_TOO_HARD;
+
+	/*
+	 * ICP State: Set_CPPR
+	 *
+	 * We can safely compare the new value with the current
+	 * value outside of the transaction as the CPPR is only
+	 * ever changed by the processor on itself
+	 */
+	if (cppr > icp->state.cppr) {
+		icp_rm_down_cppr(xics, icp, cppr);
+		goto bail;
+	} else if (cppr == icp->state.cppr)
+		return H_SUCCESS;
+
+	/*
+	 * ICP State: Up_CPPR
+	 *
+	 * The processor is raising its priority, this can result
+	 * in a rejection of a pending interrupt:
+	 *
+	 * ICP State: Reject_Current
+	 *
+	 * We can remove EE from the current processor, the update
+	 * transaction will set it again if needed
+	 */
+	icp_rm_clr_vcpu_irq(icp->vcpu);
+
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		reject = 0;
+		new_state.cppr = cppr;
+
+		if (cppr <= new_state.pending_pri) {
+			reject = new_state.xisr;
+			new_state.xisr = 0;
+			new_state.pending_pri = 0xff;
+		}
+
+	} while (!icp_rm_try_update(icp, old_state, new_state));
+
+	/* Pass rejects to virtual mode */
+	if (reject && reject != XICS_IPI) {
+		icp->rm_action |= XICS_RM_REJECT;
+		icp->rm_reject = reject;
+	}
+ bail:
+	return check_too_hard(xics, icp);
+}
+
+int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+{
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u32 irq = xirr & 0x00ffffff;
+	u16 src;
+
+	if (!xics->real_mode)
+		return H_TOO_HARD;
+
+	/*
+	 * Real-mode H_EOI. ICP State: EOI
+	 *
+	 * Note: If EOI is incorrectly used by SW to lower the CPPR
+	 * value (ie more favored), we do not check for rejection of
+	 * a pending interrupt; this is a SW error and PAPR specifies
+	 * that we don't have to deal with it.
+	 *
+	 * The sending of an EOI to the ICS is handled after the
+	 * CPPR update.
+	 *
+	 * ICP State: Down_CPPR, which we handle in a separate
+	 * function as it's shared with H_CPPR.
+	 */
+	icp_rm_down_cppr(xics, icp, xirr >> 24);
+
+	/* IPIs have no EOI */
+	if (irq == XICS_IPI)
+		goto bail;
+	/*
+	 * EOI handling: If the interrupt is still asserted, we need to
+	 * resend it. We can take a lockless "peek" at the ICS state here.
+	 *
+	 * "Message" interrupts will never have "asserted" set.
+	 */
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics)
+		goto bail;
+	state = &ics->irq_state[src];
+
+	/* Still asserted: queue it as a reject so virtual mode resends it */
+	if (state->asserted) {
+		icp->rm_action |= XICS_RM_REJECT;
+		icp->rm_reject = irq;
+	}
+ bail:
+	return check_too_hard(xics, icp);
+}
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index ffcdb7e..3858c14 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -22,7 +22,7 @@ 
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 
-#define MASKED	0xff
+#include "book3s_xics.h"
 
 #define XICS_DBG(fmt...) do { } while (0)
 //#define XICS_DBG(fmt...) do { trace_printk(fmt); } while (0)
@@ -64,93 +64,6 @@ 
  * - ioctl's to save/restore the entire state for snapshot & migration
  */
 
-#define KVMPPC_XICS_MAX_BUID	0xfff
-#define KVMPPC_XICS_IRQ_COUNT	0x1000
-#define KVMPPC_XICS_BUID_SHIFT	12
-#define KVMPPC_XICS_SRC_MASK	0xfff
-
-/* State for one irq in an ics */
-struct ics_irq_state {
-	u32 number;
-	u32 server;
-	u8  priority;
-	u8  saved_priority; /* currently unused */
-	u8  resend;
-	u8  masked_pending;
-	u8  asserted; /* Only for LSI */
-};
-
-#define ICP_RESEND_MAP_SIZE	\
-	((KVMPPC_XICS_MAX_BUID + BITS_PER_LONG - 1) / BITS_PER_LONG)
-
-/* Atomic ICP state, updated with a single compare & swap */
-union kvmppc_icp_state {
-	unsigned long raw;
-	struct {
-		u8 out_ee : 1;
-		u8 need_resend : 1;
-		u8 cppr;
-		u8 mfrr;
-		u8 pending_pri;
-		u32 xisr;
-	};
-};
-
-struct kvmppc_icp {
-	struct kvm_vcpu *vcpu;
-	union kvmppc_icp_state state;
-	unsigned long resend_map[ICP_RESEND_MAP_SIZE];
-};
-
-struct kvmppc_ics {
-	struct mutex lock;
-	u16 buid;
-	u16 nr_irqs;
-	struct ics_irq_state irq_state[];
-};
-
-struct kvmppc_xics {
-	struct kvm *kvm;
-	struct dentry *dentry;
-	u32 max_buid;
-	struct kvmppc_ics *ics[KVMPPC_XICS_MAX_BUID]; /* [1...MAX_BUID] */
-};
-
-static struct kvmppc_icp *kvmppc_xics_find_server(struct kvm *kvm, u32 nr)
-{
-	struct kvm_vcpu *vcpu = NULL;
-	int i;
-
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		if (nr == vcpu->vcpu_id)
-			return vcpu->arch.icp;
-	}
-	return NULL;
-}
-
-static struct kvmppc_ics *kvmppc_xics_find_ics(struct kvmppc_xics *xics,
-					       u32 irq, u16 *source)
-{
-	u16 buid = irq >> KVMPPC_XICS_BUID_SHIFT;
-	u16 src = irq & KVMPPC_XICS_SRC_MASK;
-	struct kvmppc_ics *ics;
-
-	if (WARN_ON_ONCE(!buid || buid > KVMPPC_XICS_MAX_BUID)) {
-		XICS_DBG("kvmppc_xics_find_ics: irq %#x BUID out of range !\n",
-			 irq);
-		return NULL;
-	}
-	ics = xics->ics[buid - 1];
-	if (!ics)
-		return NULL;
-	if (src >= ics->nr_irqs)
-		return NULL;
-	if (source)
-		*source = src;
-	return ics;
-}
-
-
 /* -- ICS routines -- */
 
 static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
@@ -311,8 +224,10 @@  static inline bool icp_try_update(struct kvmppc_icp *icp,
 	 * in Accept (H_XIRR) and Up_Cppr (H_XPPR).
 	 *
 	 * We also do not try to figure out whether the EE state has changed,
-	 * we unconditionally set it if the new state calls for it for the
-	 * same reason.
+	 * we unconditionally set it if the new state calls for it. The reason
+	 * for that is that we opportunistically remove the pending interrupt
+	 * flag when raising CPPR, so we need to set it back here if an
+	 * interrupt is still pending.
 	 */
 	if (new.out_ee) {
 		kvmppc_book3s_queue_irqprio(icp->vcpu,
@@ -574,7 +489,7 @@  static void icp_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 		icp_check_resend(xics, icp);
 }
 
-static noinline unsigned long h_xirr(struct kvm_vcpu *vcpu)
+static noinline unsigned long kvmppc_h_xirr(struct kvm_vcpu *vcpu)
 {
 	union kvmppc_icp_state old_state, new_state;
 	struct kvmppc_icp *icp = vcpu->arch.icp;
@@ -608,8 +523,8 @@  static noinline unsigned long h_xirr(struct kvm_vcpu *vcpu)
 	return xirr;
 }
 
-static noinline int h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
-			  unsigned long mfrr)
+static noinline int kvmppc_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
+				 unsigned long mfrr)
 {
         union kvmppc_icp_state old_state, new_state;
 	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
@@ -677,7 +592,7 @@  static noinline int h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
 	return H_SUCCESS;
 }
 
-static noinline void h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
+static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
 {
 	union kvmppc_icp_state old_state, new_state;
 	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
@@ -734,7 +649,7 @@  static noinline void h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
 		icp_deliver_irq(xics, icp, reject);
 }
 
-static noinline int h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
 {
 	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
 	struct kvmppc_icp *icp = vcpu->arch.icp;
@@ -784,29 +699,54 @@  static noinline int h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
 	return H_SUCCESS;
 }
 
+static int noinline kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
+{
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+
+	XICS_DBG("XICS_RM: H_%x completing, act: %x state: %lx tgt: %p\n",
+		 hcall, icp->rm_action, icp->rm_dbgstate.raw, icp->rm_dbgtgt);
+
+	if (icp->rm_action & XICS_RM_KICK_VCPU)
+		kvmppc_fast_vcpu_kick(icp->rm_kick_target);
+	if (icp->rm_action & XICS_RM_CHECK_RESEND)
+		icp_check_resend(xics, icp);
+	if (icp->rm_action & XICS_RM_REJECT)
+		icp_deliver_irq(xics, icp, icp->rm_reject);
+
+	icp->rm_action = 0;
+
+	return H_SUCCESS;
+}
+
 int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
 {
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
 	unsigned long res;
 	int rc = H_SUCCESS;
 
 	/* Check if we have an ICP */
-	if (!vcpu->arch.icp || !vcpu->kvm->arch.xics)
+	if (!xics || !vcpu->arch.icp)
 		return H_HARDWARE;
 
+	/* Check for real mode returning too hard */
+	if (xics->real_mode)
+		return kvmppc_xics_rm_complete(vcpu, req);
+
 	switch (req) {
 	case H_XIRR:
-		res = h_xirr(vcpu);
+		res = kvmppc_h_xirr(vcpu);
 		kvmppc_set_gpr(vcpu, 4, res);
 		break;
 	case H_CPPR:
-		h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4));
+		kvmppc_h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4));
 		break;
 	case H_EOI:
-		rc = h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
+		rc = kvmppc_h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
 		break;
 	case H_IPI:
-		rc = h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4),
-			   kvmppc_get_gpr(vcpu, 5));
+		rc = kvmppc_h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4),
+				  kvmppc_get_gpr(vcpu, 5));
 		break;
 	}
 
@@ -1004,6 +944,17 @@  static int kvm_vm_ioctl_create_icp(struct kvm *kvm,
 	kvm->arch.xics = xics;
 	xics_debugfs_init(xics);
 
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	if (cpu_has_feature(CPU_FTR_ARCH_206)) {
+		/* Real mode is on unless userspace set the NOREALMODE bit */
+		if (!(args->icp.flags & KVM_ICP_FLAG_NOREALMODE))
+			xics->real_mode = true;
+#ifdef DEBUG_REALMODE
+		xics->real_mode_dbg = true;
+#endif
+	}
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
+
 out:
 	mutex_unlock(&kvm->lock);
 	return rc;
diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h
new file mode 100644
index 0000000..951eacb
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_xics.h
@@ -0,0 +1,116 @@ 
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _KVM_PPC_BOOK3S_XICS_H
+#define _KVM_PPC_BOOK3S_XICS_H
+
+#define KVMPPC_XICS_MAX_BUID	0xfff
+#define KVMPPC_XICS_IRQ_COUNT	0x1000
+#define KVMPPC_XICS_BUID_SHIFT	12
+#define KVMPPC_XICS_SRC_MASK	0xfff
+
+#define MASKED	0xff
+
+/* State for one irq in an ics */
+struct ics_irq_state {
+	u32 number;
+	u32 server;
+	u8  priority;
+	u8  saved_priority; /* currently unused */
+	u8  resend;
+	u8  masked_pending;
+	u8  asserted; /* Only for LSI */
+};
+
+#define ICP_RESEND_MAP_SIZE	\
+	((KVMPPC_XICS_MAX_BUID + BITS_PER_LONG - 1) / BITS_PER_LONG)
+
+/* Atomic ICP state, updated with a single compare & swap */
+union kvmppc_icp_state {
+	unsigned long raw;
+	struct {
+		u8 out_ee : 1;
+		u8 need_resend : 1;
+		u8 cppr;
+		u8 mfrr;
+		u8 pending_pri;
+		u32 xisr;
+	};
+};
+
+struct kvmppc_icp {
+	struct kvm_vcpu *vcpu;
+	union kvmppc_icp_state state;
+	unsigned long resend_map[ICP_RESEND_MAP_SIZE];
+
+	/* Real mode might find something too hard, here's the action
+	 * it might request from virtual mode
+	 */
+#define XICS_RM_KICK_VCPU	0x1
+#define XICS_RM_CHECK_RESEND	0x2
+#define XICS_RM_REJECT		0x4
+	u32 rm_action;
+	struct kvm_vcpu *rm_kick_target;
+	u32  rm_reject;
+
+	/* Debug stuff for real mode */
+	union kvmppc_icp_state rm_dbgstate;
+	struct kvm_vcpu *rm_dbgtgt;
+};
+
+struct kvmppc_ics {
+	struct mutex lock;
+	u16 buid;
+	u16 nr_irqs;
+	struct ics_irq_state irq_state[];
+};
+
+struct kvmppc_xics {
+	struct kvm *kvm;
+	struct dentry *dentry;
+	u32 max_buid;
+	bool real_mode;
+	bool real_mode_dbg;
+	struct kvmppc_ics *ics[KVMPPC_XICS_MAX_BUID]; /* [1...MAX_BUID] */
+};
+
+static inline struct kvmppc_icp *kvmppc_xics_find_server(struct kvm *kvm,
+							 u32 nr)
+{
+	struct kvm_vcpu *vcpu = NULL;
+	int i;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		if (nr == vcpu->vcpu_id)
+			return vcpu->arch.icp;
+	}
+	return NULL;
+}
+
+static inline struct kvmppc_ics *kvmppc_xics_find_ics(struct kvmppc_xics *xics,
+						      u32 irq, u16 *source)
+{
+	u16 buid = irq >> KVMPPC_XICS_BUID_SHIFT;
+	u16 src = irq & KVMPPC_XICS_SRC_MASK;
+	struct kvmppc_ics *ics;
+
+	if (WARN_ON_ONCE(!buid || buid > KVMPPC_XICS_MAX_BUID))
+		return NULL;
+	ics = xics->ics[buid - 1];
+	if (!ics)
+		return NULL;
+	if (src >= ics->nr_irqs)
+		return NULL;
+	if (source)
+		*source = src;
+	return ics;
+}
+
+
+#endif /* _KVM_PPC_BOOK3S_XICS_H */