Patchwork [2/8] KVM: PPC: Book3S: Add kernel emulation for the XICS interrupt controller

login
register
mail settings
Submitter Paul Mackerras
Date April 11, 2013, 5:54 a.m.
Message ID <20130411055400.GD9042@drongo>
Download mbox | patch
Permalink /patch/235565/
State New
Headers show

Comments

Paul Mackerras - April 11, 2013, 5:54 a.m.
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>

This adds in-kernel emulation of the XICS (eXternal Interrupt
Controller Specification) interrupt controller specified by PAPR, for
both HV and PR KVM guests.

The XICS emulation supports up to 1048560 interrupt sources.
Interrupt source numbers below 16 are reserved; 0 is used to mean no
interrupt and 2 is used for IPIs.  Internally these are represented in
blocks of 1024, called ICS (interrupt controller source) entities, but
that is not visible to userspace.

Each vcpu gets one ICP (interrupt controller presentation) entity,
used to store the per-vcpu state such as vcpu priority, pending
interrupt state, IPI request, etc.

This does not include any API or any way to connect vcpus to their
ICP state; that will be added in later patches.

This is based on an initial implementation by Michael Ellerman
<michael@ellerman.id.au> reworked by Benjamin Herrenschmidt and
Paul Mackerras.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/kvm_book3s.h |    1 +
 arch/powerpc/include/asm/kvm_host.h   |    9 +
 arch/powerpc/include/asm/kvm_ppc.h    |   27 +
 arch/powerpc/include/uapi/asm/kvm.h   |    1 +
 arch/powerpc/kvm/Kconfig              |    8 +
 arch/powerpc/kvm/Makefile             |    3 +
 arch/powerpc/kvm/book3s.c             |    2 +-
 arch/powerpc/kvm/book3s_hv.c          |    9 +
 arch/powerpc/kvm/book3s_pr_papr.c     |   16 +
 arch/powerpc/kvm/book3s_rtas.c        |   51 +-
 arch/powerpc/kvm/book3s_xics.c        |  942 +++++++++++++++++++++++++++++++++
 arch/powerpc/kvm/book3s_xics.h        |  113 ++++
 arch/powerpc/kvm/powerpc.c            |    5 +
 13 files changed, 1185 insertions(+), 2 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_xics.c
 create mode 100644 arch/powerpc/kvm/book3s_xics.h

Patch

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 5a56e1c..17c9a15 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -142,6 +142,7 @@  extern int kvmppc_mmu_hv_init(void);
 extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
+extern void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
 extern void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags);
 extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
 			   bool upper, u32 val);
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 8fe8ef5..c7497dd 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -188,6 +188,10 @@  struct kvmppc_linear_info {
 	int		 type;
 };
 
+/* XICS components, defined in boo3s_xics.c */
+struct kvmppc_xics;
+struct kvmppc_icp;
+
 /*
  * The reverse mapping array has one entry for each HPTE,
  * which stores the guest's view of the second word of the HPTE
@@ -256,6 +260,7 @@  struct kvm_arch {
 #ifdef CONFIG_PPC_BOOK3S_64
 	struct list_head spapr_tce_tables;
 	struct list_head rtas_tokens;
+	struct kvmppc_xics *xics;
 #endif
 };
 
@@ -378,6 +383,7 @@  struct kvmppc_booke_debug_reg {
 
 #define KVMPPC_IRQ_DEFAULT	0
 #define KVMPPC_IRQ_MPIC		1
+#define KVMPPC_IRQ_XICS		2
 
 struct openpic;
 
@@ -562,6 +568,9 @@  struct kvm_vcpu_arch {
 
 	int irq_type;		/* one of KVM_IRQ_* */
 	struct openpic *mpic;	/* KVM_IRQ_MPIC */
+#ifdef CONFIG_KVM_XICS
+	struct kvmppc_icp *icp; /* XICS presentation controller */
+#endif
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	struct kvm_vcpu_arch_shared shregs;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index f4e66c4..5f5821b 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -130,6 +130,8 @@  extern long kvmppc_prepare_vrma(struct kvm *kvm,
 extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
 			struct kvm_memory_slot *memslot, unsigned long porder);
 extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
+extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd);
+
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 				struct kvm_create_spapr_tce *args);
 extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
@@ -169,6 +171,8 @@  int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq);
 extern int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp);
 extern int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu);
 extern void kvmppc_rtas_tokens_free(struct kvm *kvm);
+extern int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority);
+extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority);
 
 /*
  * Cuts out inst bits with ordering according to spec.
@@ -268,6 +272,27 @@  static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 
 static inline void kvm_linear_init(void)
 {}
+
+#endif
+
+#ifdef CONFIG_PPC_BOOK3S_64
+static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.irq_type == KVMPPC_IRQ_XICS;
+}
+extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
+extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server);
+extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args);
+#else
+static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
+	{ return 0; }
+static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
+static inline int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu,
+					 unsigned long server)
+	{ return -EINVAL; }
+static inline int kvm_vm_ioctl_xics_irq(struct kvm *kvm,
+					struct kvm_irq_level *args)
+	{ return -ENOTTY; }
 #endif
 
 static inline void kvmppc_set_epr(struct kvm_vcpu *vcpu, u32 epr)
@@ -282,6 +307,8 @@  static inline void kvmppc_set_epr(struct kvm_vcpu *vcpu, u32 epr)
 void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu);
 int kvmppc_mpic_connect_vcpu(struct file *mpic_filp, struct kvm_vcpu *vcpu,
 			     u32 cpu);
+int kvmppc_xics_connect_vcpu(struct file *xics_filp, struct kvm_vcpu *vcpu,
+			     u32 cpu);
 
 int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
 			      struct kvm_config_tlb *cfg);
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index a599ea5..6beb876 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -428,4 +428,5 @@  struct kvm_get_htab_header {
 #define KVM_REG_PPC_CLEAR_TSR	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x88)
 #define KVM_REG_PPC_TCR		(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x89)
 #define KVM_REG_PPC_TSR		(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x8a)
+
 #endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index a87139b..c036c26 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -155,6 +155,14 @@  config KVM_MPIC
 	bool "KVM in-kernel MPIC emulation"
 	depends on KVM
 
+config KVM_XICS
+	bool "KVM in-kernel XICS emulation"
+	depends on KVM_BOOK3S_64
+	---help---
+	  Include support for the XICS (eXternal Interrupt Controller
+	  Specification) interrupt controller architecture used on
+	  IBM POWER (pSeries) servers.
+
 
 source drivers/vhost/Kconfig
 
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index d2c8a88..cccd85f 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -79,6 +79,9 @@  kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
 	book3s_hv_ras.o \
 	book3s_hv_builtin.o
 
+kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
+	book3s_xics.o
+
 kvm-book3s_64-module-objs := \
 	../../../virt/kvm/kvm_main.o \
 	../../../virt/kvm/eventfd.o \
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 6548445..c5a4478 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -104,7 +104,7 @@  static int kvmppc_book3s_vec2irqprio(unsigned int vec)
 	return prio;
 }
 
-static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
+void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
 					  unsigned int vec)
 {
 	unsigned long old_pending = vcpu->arch.pending_exceptions;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index c445a3d..506b5ea 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -528,6 +528,15 @@  int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 
 		/* Send the error out to userspace via KVM_RUN */
 		return rc;
+
+	case H_XIRR:
+	case H_CPPR:
+	case H_EOI:
+	case H_IPI:
+		if (kvmppc_xics_enabled(vcpu)) {
+			ret = kvmppc_xics_hcall(vcpu, req);
+			break;
+		} /* fallthrough */
 	default:
 		return RESUME_HOST;
 	}
diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
index 4efa4a4..9802245 100644
--- a/arch/powerpc/kvm/book3s_pr_papr.c
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -227,6 +227,15 @@  static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu)
 	return EMULATE_DONE;
 }
 
+static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
+{
+	long rc = kvmppc_xics_hcall(vcpu, cmd);
+	if (rc == H_TOO_HARD)
+		return EMULATE_FAIL;
+	kvmppc_set_gpr(vcpu, 3, rc);
+	return EMULATE_DONE;
+}
+
 int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
 {
 	switch (cmd) {
@@ -246,6 +255,13 @@  int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
 		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
 		vcpu->stat.halt_wakeup++;
 		return EMULATE_DONE;
+	case H_XIRR:
+	case H_CPPR:
+	case H_EOI:
+	case H_IPI:
+		if (kvmppc_xics_enabled(vcpu))
+			return kvmppc_h_pr_xics_hcall(vcpu, cmd);
+		break;
 	case H_RTAS:
 		if (list_empty(&vcpu->kvm->arch.rtas_tokens))
 			return RESUME_HOST;
diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c
index 8a324e8..6a6c1fe 100644
--- a/arch/powerpc/kvm/book3s_rtas.c
+++ b/arch/powerpc/kvm/book3s_rtas.c
@@ -18,12 +18,61 @@ 
 #include <asm/rtas.h>
 
 
+static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
+{
+	u32 irq, server, priority;
+	int rc;
+
+	if (args->nargs != 3 || args->nret != 1) {
+		rc = -3;
+		goto out;
+	}
+
+	irq = args->args[0];
+	server = args->args[1];
+	priority = args->args[2];
+
+	rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority);
+	if (rc)
+		rc = -3;
+out:
+	args->rets[0] = rc;
+}
+
+static void kvm_rtas_get_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
+{
+	u32 irq, server, priority;
+	int rc;
+
+	if (args->nargs != 1 || args->nret != 3) {
+		rc = -3;
+		goto out;
+	}
+
+	irq = args->args[0];
+
+	server = priority = 0;
+	rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority);
+	if (rc) {
+		rc = -3;
+		goto out;
+	}
+
+	args->rets[1] = server;
+	args->rets[2] = priority;
+out:
+	args->rets[0] = rc;
+}
+
 struct rtas_handler {
 	void (*handler)(struct kvm_vcpu *vcpu, struct rtas_args *args);
 	char *name;
 };
 
-static struct rtas_handler rtas_handlers[] = { };
+static struct rtas_handler rtas_handlers[] = {
+	{ .name = "ibm,set-xive", .handler = kvm_rtas_set_xive },
+	{ .name = "ibm,get-xive", .handler = kvm_rtas_get_xive },
+};
 
 struct rtas_token_definition {
 	struct list_head list;
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
new file mode 100644
index 0000000..175a3c2
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -0,0 +1,942 @@ 
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+#include <linux/gfp.h>
+
+#include <asm/uaccess.h>
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
+#include <asm/hvcall.h>
+#include <asm/xics.h>
+#include <asm/debug.h>
+
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include "book3s_xics.h"
+
+#define XICS_DBG(fmt...) do { } while (0)
+//#define XICS_DBG(fmt...) do { trace_printk(fmt); } while (0)
+
+/*
+ * LOCKING
+ * =======
+ *
+ * Each ICS has a mutex protecting the information about the IRQ
+ * sources and avoiding simultaneous deliveries if the same interrupt.
+ *
+ * ICP operations are done via a single compare & swap transaction
+ * (most ICP state fits in the union kvmppc_icp_state)
+ */
+
+/*
+ * TODO
+ * ====
+ *
+ * - To speed up resends, keep a bitmap of "resend" set bits in the
+ *   ICS
+ *
+ * - Speed up server# -> ICP lookup (array ? hash table ?)
+ *
+ * - Make ICS lockless as well, or at least a per-interrupt lock or hashed
+ *   locks array to improve scalability
+ *
+ * - ioctl's to save/restore the entire state for snapshot & migration
+ */
+
+/* -- ICS routines -- */
+
+static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			    u32 new_irq);
+
+static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level)
+{
+	struct ics_irq_state *state;
+	struct kvmppc_ics *ics;	
+	u16 src;
+
+	XICS_DBG("ics deliver %#x (level: %d)\n", irq, level);
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics) {
+		XICS_DBG("ics_deliver_irq: IRQ 0x%06x not found !\n", irq);
+		return -EINVAL;
+	}
+	state = &ics->irq_state[src];
+	if (!state->exists)
+		return -EINVAL;
+
+	/*
+	 * We set state->asserted locklessly. This should be fine as
+	 * we are the only setter, thus concurrent access is undefined
+	 * to begin with.
+	 */
+	if (level == KVM_INTERRUPT_SET_LEVEL)
+		state->asserted = 1;
+	else if (level == KVM_INTERRUPT_UNSET) {
+		state->asserted = 0;
+		return 0;
+	}
+
+	/* Attempt delivery */
+	icp_deliver_irq(xics, NULL, irq);
+
+	return 0;
+}
+
+static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
+			     struct kvmppc_icp *icp)
+{
+	int i;
+
+	mutex_lock(&ics->lock);
+
+	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+		struct ics_irq_state *state = &ics->irq_state[i];
+
+		if (!state->resend)
+			continue;
+
+		XICS_DBG("resend %#x prio %#x\n", state->number,
+			      state->priority);
+
+		mutex_unlock(&ics->lock);
+		icp_deliver_irq(xics, icp, state->number);
+		mutex_lock(&ics->lock);
+	}
+
+	mutex_unlock(&ics->lock);
+}
+
+int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	struct kvmppc_icp *icp;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u16 src;
+	bool deliver;
+
+	if (!xics)
+		return -ENODEV;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics)
+		return -EINVAL;
+	state = &ics->irq_state[src];
+
+	icp = kvmppc_xics_find_server(kvm, server);
+	if (!icp)
+		return -EINVAL;
+
+	mutex_lock(&ics->lock);
+
+	XICS_DBG("set_xive %#x server %#x prio %#x MP:%d RS:%d\n",
+		 irq, server, priority,
+		 state->masked_pending, state->resend);
+
+	state->server = server;
+	state->priority = priority;
+	deliver = false;
+	if ((state->masked_pending || state->resend) && priority != MASKED) {
+		state->masked_pending = 0;
+		deliver = true;
+	}
+
+	mutex_unlock(&ics->lock);
+
+	if (deliver)
+		icp_deliver_irq(xics, icp, irq);
+
+	return 0;
+}
+
+int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u16 src;
+
+	if (!xics)
+		return -ENODEV;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics)
+		return -EINVAL;
+	state = &ics->irq_state[src];
+
+	mutex_lock(&ics->lock);
+	*server = state->server;
+	*priority = state->priority;
+	mutex_unlock(&ics->lock);
+
+	return 0;
+}
+
+/* -- ICP routines, including hcalls -- */
+
+static inline bool icp_try_update(struct kvmppc_icp *icp,
+				  union kvmppc_icp_state old,
+				  union kvmppc_icp_state new,
+				  bool change_self)
+{
+	bool success;
+
+	/* Calculate new output value */
+	new.out_ee = (new.xisr && (new.pending_pri < new.cppr));
+
+	/* Attempt atomic update */
+	success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw;
+	if (!success)
+		goto bail;
+
+	XICS_DBG("UPD [%04x] - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n",
+		 icp->vcpu->vcpu_id,
+		 old.cppr, old.mfrr, old.pending_pri, old.xisr,
+		 old.need_resend, old.out_ee);
+	XICS_DBG("UPD        - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n",
+		 new.cppr, new.mfrr, new.pending_pri, new.xisr,
+		 new.need_resend, new.out_ee);
+	/*
+	 * Check for output state update
+	 *
+	 * Note that this is racy since another processor could be updating
+	 * the state already. This is why we never clear the interrupt output
+	 * here, we only ever set it. The clear only happens prior to doing
+	 * an update and only by the processor itself. Currently we do it
+	 * in Accept (H_XIRR) and Up_Cppr (H_XPPR).
+	 *
+	 * We also do not try to figure out whether the EE state has changed,
+	 * we unconditionally set it if the new state calls for it for the
+	 * same reason.
+	 */
+	if (new.out_ee) {
+		kvmppc_book3s_queue_irqprio(icp->vcpu,
+					    BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+		if (!change_self)
+			kvm_vcpu_kick(icp->vcpu);
+	}
+ bail:
+	return success;
+}
+
+static void icp_check_resend(struct kvmppc_xics *xics,
+			     struct kvmppc_icp *icp)
+{
+	u32 icsid;
+	
+	/* Order this load with the test for need_resend in the caller */
+	smp_rmb();
+	for_each_set_bit(icsid, icp->resend_map, xics->max_icsid + 1) {
+		struct kvmppc_ics *ics = xics->ics[icsid];
+
+		if (!test_and_clear_bit(icsid, icp->resend_map))
+			continue;
+		if (!ics)
+			continue;
+		ics_check_resend(xics, ics, icp);
+	}
+}
+
+static bool icp_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
+			       u32 *reject)
+{
+	union kvmppc_icp_state old_state, new_state;
+	bool success;
+
+	XICS_DBG("try deliver %#x(P:%#x) to server %#x\n", irq, priority,
+		 icp->vcpu->vcpu_id);
+
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		*reject = 0;
+
+		/* See if we can deliver */
+		success = new_state.cppr > priority &&
+			new_state.mfrr > priority &&
+			new_state.pending_pri > priority;
+
+		/*
+		 * If we can, check for a rejection and perform the
+		 * delivery
+		 */
+		if (success) {
+			*reject = new_state.xisr;
+			new_state.xisr = irq;
+			new_state.pending_pri = priority;
+		} else {
+			/*
+			 * If we failed to deliver we set need_resend
+			 * so a subsequent CPPR state change causes us
+			 * to try a new delivery.
+			 */
+			new_state.need_resend = true;
+		}
+
+	} while (!icp_try_update(icp, old_state, new_state, false));
+
+	return success;
+}
+
+static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			    u32 new_irq)
+{
+	struct ics_irq_state *state;
+	struct kvmppc_ics *ics;
+	u32 reject;
+	u16 src;	
+
+	/*
+	 * This is used both for initial delivery of an interrupt and
+	 * for subsequent rejection.
+	 *
+	 * Rejection can be racy vs. resends. We have evaluated the
+	 * rejection in an atomic ICP transaction which is now complete,
+	 * so potentially the ICP can already accept the interrupt again.
+	 *
+	 * So we need to retry the delivery. Essentially the reject path
+	 * boils down to a failed delivery. Always.
+	 *
+	 * Now the interrupt could also have moved to a different target,
+	 * thus we may need to re-do the ICP lookup as well
+	 */
+	 
+ again:
+	/* Get the ICS state and lock it */
+	ics = kvmppc_xics_find_ics(xics, new_irq, &src);
+	if (!ics) {
+		XICS_DBG("icp_deliver_irq: IRQ 0x%06x not found !\n", new_irq);
+		return;
+	}
+	state = &ics->irq_state[src];
+
+	/* Get a lock on the ICS */
+	mutex_lock(&ics->lock);
+
+	/* Get our server */
+	if (!icp || state->server != icp->vcpu->vcpu_id) {
+		icp = kvmppc_xics_find_server(xics->kvm, state->server);
+		if (!icp) {
+			pr_warning("icp_deliver_irq: IRQ 0x%06x server 0x%x"
+				   " not found !\n", new_irq, state->server);
+			goto out;
+		}
+	}
+
+	/* Clear the resend bit of that interrupt */
+	state->resend = 0;
+
+	/*
+	 * If masked, bail out
+	 *
+	 * Note: PAPR doesn't mention anything about masked pending
+	 * when doing a resend, only when doing a delivery.
+	 *
+	 * However that would have the effect of losing a masked
+	 * interrupt that was rejected and isn't consistent with
+	 * the whole masked_pending business which is about not
+	 * losing interrupts that occur while masked.
+	 *
+	 * I don't differenciate normal deliveries and resends, this
+	 * implementation will differ from PAPR and not lose such
+	 * interrupts.
+	 */
+	if (state->priority == MASKED) {
+		XICS_DBG("irq %#x masked pending\n", new_irq);
+		state->masked_pending = 1;
+		goto out;
+	}
+
+	/*
+	 * Try the delivery, this will set the need_resend flag
+	 * in the ICP as part of the atomic transaction if the
+	 * delivery is not possible.
+	 *
+	 * Note that if successful, the new delivery might have itself
+	 * rejected an interrupt that was "delivered" before we took the
+	 * icp mutex.
+	 *
+	 * In this case we do the whole sequence all over again for the
+	 * new guy. We cannot assume that the rejected interrupt is less
+	 * favored than the new one, and thus doesn't need to be delivered,
+	 * because by the time we exit icp_try_to_deliver() the target
+	 * processor may well have alrady consumed & completed it, and thus
+	 * the rejected interrupt might actually be already acceptable.
+	 */
+	if (icp_try_to_deliver(icp, new_irq, state->priority, &reject)) {
+		/*
+		 * Delivery was successful, did we reject somebody else ?
+		 */
+		if (reject && reject != XICS_IPI) {
+			mutex_unlock(&ics->lock);
+			new_irq = reject;
+			goto again;
+		}
+	} else {
+		/*
+		 * We failed to deliver the interrupt we need to set the
+		 * resend map bit and mark the ICS state as needing a resend
+		 */
+		set_bit(ics->icsid, icp->resend_map);
+		state->resend = 1;
+
+		/*
+		 * If the need_resend flag got cleared in the ICP some time
+		 * between icp_try_to_deliver() atomic update and now, then
+		 * we know it might have missed the resend_map bit. So we
+		 * retry
+		 */
+		smp_mb();
+		if (!icp->state.need_resend) {
+			mutex_unlock(&ics->lock);
+			goto again;
+		}
+	}
+ out:
+	mutex_unlock(&ics->lock);
+}
+
+static void icp_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			  u8 new_cppr)
+{
+	union kvmppc_icp_state old_state, new_state;
+	bool resend;
+
+	/*
+	 * This handles several related states in one operation:
+	 *
+	 * ICP State: Down_CPPR
+	 *
+	 * Load CPPR with new value and if the XISR is 0
+	 * then check for resends:
+	 *
+	 * ICP State: Resend
+	 *
+	 * If MFRR is more favored than CPPR, check for IPIs
+	 * and notify ICS of a potential resend. This is done
+	 * asynchronously (when used in real mode, we will have
+	 * to exit here).
+	 *
+	 * We do not handle the complete Check_IPI as documented
+	 * here. In the PAPR, this state will be used for both
+	 * Set_MFRR and Down_CPPR. However, we know that we aren't
+	 * changing the MFRR state here so we don't need to handle
+	 * the case of an MFRR causing a reject of a pending irq,
+	 * this will have been handled when the MFRR was set in the
+	 * first place.
+	 *
+	 * Thus we don't have to handle rejects, only resends.
+	 *
+	 * When implementing real mode for HV KVM, resend will lead to
+	 * a H_TOO_HARD return and the whole transaction will be handled
+	 * in virtual mode.
+	 */
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		/* Down_CPPR */
+		new_state.cppr = new_cppr;
+
+		/*
+		 * Cut down Resend / Check_IPI / IPI
+		 *
+		 * The logic is that we cannot have a pending interrupt
+		 * trumped by an IPI at this point (see above), so we
+		 * know that either the pending interrupt is already an
+		 * IPI (in which case we don't care to override it) or
+		 * it's either more favored than us or non existent
+		 */
+		if (new_state.mfrr < new_cppr &&
+		    new_state.mfrr <= new_state.pending_pri) {
+			WARN_ON(new_state.xisr != XICS_IPI &&
+				new_state.xisr != 0);
+			new_state.pending_pri = new_state.mfrr;
+			new_state.xisr = XICS_IPI;
+		}
+
+		/* Latch/clear resend bit */
+		resend = new_state.need_resend;
+		new_state.need_resend = 0;
+
+	} while (!icp_try_update(icp, old_state, new_state, true));
+
+	/*
+	 * Now handle resend checks. Those are asynchronous to the ICP
+	 * state update in HW (ie bus transactions) so we can handle them
+	 * separately here too
+	 */
+	if (resend)
+		icp_check_resend(xics, icp);
+}
+
+static noinline unsigned long h_xirr(struct kvm_vcpu *vcpu)
+{
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	u32 xirr;
+
+	/* First, remove EE from the processor */
+	kvmppc_book3s_dequeue_irqprio(icp->vcpu,
+				      BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+
+	/*
+	 * ICP State: Accept_Interrupt
+	 *
+	 * Return the pending interrupt (if any) along with the
+	 * current CPPR, then clear the XISR & set CPPR to the
+	 * pending priority
+	 */
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		xirr = old_state.xisr | (((u32)old_state.cppr) << 24);
+		if (!old_state.xisr)
+			break;
+		new_state.cppr = new_state.pending_pri;
+		new_state.pending_pri = 0xff;
+		new_state.xisr = 0;
+
+	} while (!icp_try_update(icp, old_state, new_state, true));
+
+	XICS_DBG("h_xirr vcpu %d xirr %#x\n", vcpu->vcpu_id, xirr);
+
+	return xirr;
+}
+
+static noinline int h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
+			  unsigned long mfrr)
+{
+        union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp;
+	u32 reject;
+	bool resend;
+	bool local;
+
+	XICS_DBG("h_ipi vcpu %d to server %lu mfrr %#lx\n",
+			vcpu->vcpu_id, server, mfrr);
+
+	local = vcpu->vcpu_id == server;
+	if (local)
+		icp = vcpu->arch.icp;
+	else
+		icp = kvmppc_xics_find_server(vcpu->kvm, server);
+	if (!icp)
+		return H_PARAMETER;
+
+	/*
+	 * ICP state: Set_MFRR
+	 *
+	 * If the CPPR is more favored than the new MFRR, then
+	 * nothing needs to be rejected as there can be no XISR to
+	 * reject.  If the MFRR is being made less favored then
+	 * there might be a previously-rejected interrupt needing
+	 * to be resent.
+	 *
+	 * If the CPPR is less favored, then we might be replacing
+	 * an interrupt, and thus need to possibly reject it as in
+	 *
+	 * ICP state: Check_IPI
+	 */
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		/* Set_MFRR */
+		new_state.mfrr = mfrr;
+
+		/* Check_IPI */
+		reject = 0;
+		resend = false;
+		if (mfrr < new_state.cppr) {
+			/* Reject a pending interrupt if not an IPI */
+			if (mfrr <= new_state.pending_pri)
+				reject = new_state.xisr;
+			new_state.pending_pri = mfrr;
+			new_state.xisr = XICS_IPI;
+		}
+
+		if (mfrr > old_state.mfrr && mfrr > new_state.cppr) {
+			resend = new_state.need_resend;
+			new_state.need_resend = 0;
+		}
+	} while (!icp_try_update(icp, old_state, new_state, local));
+
+	/* Handle reject */
+	if (reject && reject != XICS_IPI)
+		icp_deliver_irq(xics, icp, reject);
+		
+	/* Handle resend */
+	if (resend)
+		icp_check_resend(xics, icp);
+
+	return H_SUCCESS;
+}
+
+static noinline void h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
+{
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	u32 reject;
+
+	XICS_DBG("h_cppr vcpu %d cppr %#lx\n", vcpu->vcpu_id, cppr);
+
+	/*
+	 * ICP State: Set_CPPR
+	 *
+	 * We can safely compare the new value with the current
+	 * value outside of the transaction as the CPPR is only
+	 * ever changed by the processor on itself
+	 */
+	if (cppr > icp->state.cppr)
+		icp_down_cppr(xics, icp, cppr);
+	else if (cppr == icp->state.cppr)
+		return;
+
+	/*
+	 * ICP State: Up_CPPR
+	 *
+	 * The processor is raising its priority, this can result
+	 * in a rejection of a pending interrupt:
+	 *
+	 * ICP State: Reject_Current
+	 *
+	 * We can remove EE from the current processor, the update
+	 * transaction will set it again if needed
+	 */
+	kvmppc_book3s_dequeue_irqprio(icp->vcpu,
+				      BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		reject = 0;
+		new_state.cppr = cppr;
+
+		if (cppr <= new_state.pending_pri) {
+			reject = new_state.xisr;
+			new_state.xisr = 0;
+			new_state.pending_pri = 0xff;
+		}
+
+	} while (!icp_try_update(icp, old_state, new_state, true));
+
+	/*
+	 * Check for rejects. They are handled by doing a new delivery
+	 * attempt (see comments in icp_deliver_irq).
+	 */
+	if (reject && reject != XICS_IPI)
+		icp_deliver_irq(xics, icp, reject);
+}
+
+static noinline int h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+{
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u32 irq = xirr & 0x00ffffff;
+	u16 src;
+
+	XICS_DBG("h_eoi vcpu %d eoi %#lx\n", vcpu->vcpu_id, xirr);
+
+	/*
+	 * ICP State: EOI
+	 *
+	 * Note: If EOI is incorrectly used by SW to lower the CPPR
+	 * value (ie more favored), we do not check for rejection of
+	 * a pending interrupt, this is a SW error and PAPR sepcifies
+	 * that we don't have to deal with it.
+	 *
+	 * The sending of an EOI to the ICS is handled after the
+	 * CPPR update
+	 *
+	 * ICP State: Down_CPPR which we handle
+	 * in a separate function as it's shared with H_CPPR.
+	 */
+	icp_down_cppr(xics, icp, xirr >> 24);
+
+	/* IPIs have no EOI */
+	if (irq == XICS_IPI)
+		return H_SUCCESS;
+	/*
+	 * EOI handling: If the interrupt is still asserted, we need to
+	 * resend it. We can take a lockless "peek" at the ICS state here.
+	 *
+	 * "Message" interrupts will never have "asserted" set
+	 */
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics) {
+		XICS_DBG("h_eoi: IRQ 0x%06x not found !\n", irq);
+		return H_PARAMETER;
+	}
+	state = &ics->irq_state[src];
+
+	/* Still asserted, resend it */
+	if (state->asserted)
+		icp_deliver_irq(xics, icp, irq);
+
+	return H_SUCCESS;
+}
+
+int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
+{
+	unsigned long res;
+	int rc = H_SUCCESS;
+
+	/* Check if we have an ICP */
+	if (!vcpu->arch.icp || !vcpu->kvm->arch.xics)
+		return H_HARDWARE;
+
+	switch (req) {
+	case H_XIRR:
+		res = h_xirr(vcpu);
+		kvmppc_set_gpr(vcpu, 4, res);
+		break;
+	case H_CPPR:
+		h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4));
+		break;
+	case H_EOI:
+		rc = h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
+		break;
+	case H_IPI:
+		rc = h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4),
+			   kvmppc_get_gpr(vcpu, 5));
+		break;
+	}
+
+	return rc;
+}
+
+
+/* -- Initialisation code etc. -- */
+
+static int xics_debug_show(struct seq_file *m, void *private)
+{
+	struct kvmppc_xics *xics = m->private;
+	struct kvm *kvm = xics->kvm;
+	struct kvm_vcpu *vcpu;
+	int icsid, i;
+
+	if (!kvm)
+		return 0;
+
+	seq_printf(m, "=========\nICP state\n=========\n");
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		struct kvmppc_icp *icp = vcpu->arch.icp;
+		union kvmppc_icp_state state;
+
+		if (!icp)
+			continue;
+
+		state.raw = ACCESS_ONCE(icp->state.raw);
+		seq_printf(m, "cpu server %#x XIRR:%#x PPRI:%#x CPPR:%#x "
+			   "MFRR:%#x OUT:%d NR:%d\n", vcpu->vcpu_id, state.xisr,
+			   state.pending_pri, state.cppr, state.mfrr,
+			   state.out_ee, state.need_resend);
+	}
+
+	for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) {
+		struct kvmppc_ics *ics = xics->ics[icsid];
+
+		if (!ics)
+			continue;
+
+		seq_printf(m, "=========\nICS state for ICS 0x%x\n=========\n",
+			   icsid);
+
+		mutex_lock(&ics->lock);
+
+		for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+			struct ics_irq_state *irq = &ics->irq_state[i];
+
+			seq_printf(m, "irq 0x%06x: server %#x prio %#x save"
+				   " prio %#x asserted %d resend %d masked"
+				   " pending %d\n",
+				   irq->number, irq->server, irq->priority,
+				   irq->saved_priority, irq->asserted,
+				   irq->resend, irq->masked_pending);
+
+		}
+		mutex_unlock(&ics->lock);
+	}
+	return 0;
+}
+
+static int xics_debug_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, xics_debug_show, inode->i_private);
+}
+
+static const struct file_operations xics_debug_fops = {
+	.open = xics_debug_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static void xics_debugfs_init(struct kvmppc_xics *xics)
+{
+	char *name;
+
+	name = kasprintf(GFP_KERNEL, "kvm-xics-%p", xics);
+	if (!name) {
+		pr_err("%s: no memory for name\n", __func__);
+		return;
+	}
+
+	xics->dentry = debugfs_create_file(name, S_IRUGO, powerpc_debugfs_root,
+					   xics, &xics_debug_fops);
+
+	pr_debug("%s: created %s\n", __func__, name);
+	kfree(name);
+}
+
+struct kvmppc_ics *kvmppc_xics_create_ics(struct kvmppc_xics *xics,
+					  int irq)
+{
+	struct kvmppc_ics *ics;
+	int i, icsid;
+
+	icsid = irq >> KVMPPC_XICS_ICS_SHIFT;
+
+	mutex_lock(&xics->kvm->lock);
+
+	/* ICS already exists - somebody else got here first */
+	if (xics->ics[icsid])
+		goto out;
+
+	/* Create the ICS */
+	ics = kzalloc(sizeof(struct kvmppc_ics), GFP_KERNEL);
+	if (!ics)
+		goto out;
+
+	mutex_init(&ics->lock);
+	ics->icsid = icsid;
+
+	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+		ics->irq_state[i].number = (icsid << KVMPPC_XICS_ICS_SHIFT) | i;
+		ics->irq_state[i].priority = MASKED;
+		ics->irq_state[i].saved_priority = MASKED;
+	}
+	smp_wmb();
+	xics->ics[icsid] = ics;
+
+	if (icsid > xics->max_icsid)
+		xics->max_icsid = icsid;
+
+ out:
+	mutex_unlock(&xics->kvm->lock);
+	return xics->ics[icsid];
+}
+
+int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server_num)
+{
+	struct kvmppc_icp *icp;
+
+	if (!vcpu->kvm->arch.xics)
+		return -ENODEV;
+
+	if (kvmppc_xics_find_server(vcpu->kvm, server_num))
+		return -EEXIST;
+
+	icp = kzalloc(sizeof(struct kvmppc_icp), GFP_KERNEL);
+	if (!icp)
+		return -ENOMEM;
+
+	icp->vcpu = vcpu;
+	icp->server_num = server_num;
+	icp->state.mfrr = MASKED;
+	icp->state.pending_pri = MASKED;
+	vcpu->arch.icp = icp;
+
+	XICS_DBG("created server for vcpu %d\n", vcpu->vcpu_id);
+
+	return 0;
+}
+
+/* -- ioctls -- */
+
+int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args)
+{
+	struct kvmppc_xics *xics;
+	int r;
+
+	/* locking against multiple callers? */
+
+	xics = kvm->arch.xics;
+	if (!xics)
+		return -ENODEV;
+
+	switch (args->level) {
+	case KVM_INTERRUPT_SET:
+	case KVM_INTERRUPT_SET_LEVEL:
+	case KVM_INTERRUPT_UNSET:
+		r = ics_deliver_irq(xics, args->irq, args->level);
+		break;
+	default:
+		r = -EINVAL;
+	}
+
+	return r;
+}
+
+void kvmppc_xics_free(struct kvmppc_xics *xics)
+{
+	int i;
+
+	debugfs_remove(xics->dentry);
+
+	if (xics->kvm) {
+		kvm_put_kvm(xics->kvm);
+		xics->kvm->arch.xics = NULL;
+		xics->kvm = NULL;
+	}
+
+	for (i = 0; i <= xics->max_icsid; i++) {
+		if (xics->ics[i])
+			kfree(xics->ics[i]);
+	}
+	kfree(xics);
+}
+
+int kvm_create_xics(struct kvm *kvm, u32 type)
+{
+	struct kvmppc_xics *xics;
+
+	/* Already there ? */
+	if (kvm->arch.xics)
+		return -EEXIST;
+
+	xics = kzalloc(sizeof(*xics), GFP_KERNEL);
+	if (!xics)
+		return -ENOMEM;
+
+	xics->kvm = kvm;
+	kvm->arch.xics = xics;
+	xics_debugfs_init(xics);
+
+	kvm_get_kvm(kvm);
+	return 0;
+}
+
+void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu)
+{
+	if (!vcpu->arch.icp)
+		return;
+	kfree(vcpu->arch.icp);
+	vcpu->arch.icp = NULL;
+	vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
+}
diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h
new file mode 100644
index 0000000..c44034f
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_xics.h
@@ -0,0 +1,113 @@ 
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _KVM_PPC_BOOK3S_XICS_H
+#define _KVM_PPC_BOOK3S_XICS_H
+
+/*
+ * We use a two-level tree to store interrupt source information.
+ * There are up to 1024 ICS nodes, each of which can represent
+ * 1024 sources.
+ */
+#define KVMPPC_XICS_MAX_ICS_ID	1023
+#define KVMPPC_XICS_ICS_SHIFT	10
+#define KVMPPC_XICS_IRQ_PER_ICS	(1 << KVMPPC_XICS_ICS_SHIFT)
+#define KVMPPC_XICS_SRC_MASK	(KVMPPC_XICS_IRQ_PER_ICS - 1)
+
+/*
+ * Interrupt source numbers below this are reserved, for example
+ * 0 is "no interrupt", and 2 is used for IPIs.
+ */
+#define KVMPPC_XICS_FIRST_IRQ	16
+#define KVMPPC_XICS_NR_IRQS	((KVMPPC_XICS_MAX_ICS_ID + 1) * KVMPPC_XICS_IRQ_PER_ICS)
+
+/* Priority value to use for disabling an interrupt */
+#define MASKED	0xff
+
+/* State for one irq source */
+struct ics_irq_state {
+	u32 number;
+	u32 server;
+	u8  priority;
+	u8  saved_priority; /* currently unused */
+	u8  resend;
+	u8  masked_pending;
+	u8  asserted; /* Only for LSI */
+	u8  exists;
+};
+
+/* Atomic ICP state, updated with a single compare & swap */
+union kvmppc_icp_state {
+	unsigned long raw;
+	struct {
+		u8 out_ee : 1;
+		u8 need_resend : 1;
+		u8 cppr;
+		u8 mfrr;
+		u8 pending_pri;
+		u32 xisr;
+	};
+};
+
+/* One bit per ICS */
+#define ICP_RESEND_MAP_SIZE	(KVMPPC_XICS_MAX_ICS_ID / BITS_PER_LONG + 1)
+
+struct kvmppc_icp {
+	struct kvm_vcpu *vcpu;
+	unsigned long server_num;
+	union kvmppc_icp_state state;
+	unsigned long resend_map[ICP_RESEND_MAP_SIZE];
+};
+
+struct kvmppc_ics {
+	struct mutex lock;
+	u16 icsid;
+	struct ics_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS];
+};
+
+struct kvmppc_xics {
+	struct kvm *kvm;
+	struct dentry *dentry;
+	u32 max_icsid;
+	atomic_t usecount;
+	struct kvmppc_ics *ics[KVMPPC_XICS_MAX_ICS_ID + 1];
+};
+
+static inline struct kvmppc_icp *kvmppc_xics_find_server(struct kvm *kvm,
+							 u32 nr)
+{
+	struct kvm_vcpu *vcpu = NULL;
+	int i;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		if (vcpu->arch.icp && nr == vcpu->arch.icp->server_num)
+			return vcpu->arch.icp;
+	}
+	return NULL;
+}
+
+static inline struct kvmppc_ics *kvmppc_xics_find_ics(struct kvmppc_xics *xics,
+						      u32 irq, u16 *source)
+{
+	u32 icsid = irq >> KVMPPC_XICS_ICS_SHIFT;
+	u16 src = irq & KVMPPC_XICS_SRC_MASK;
+	struct kvmppc_ics *ics;
+
+	if (source)
+		*source = src;
+	if (icsid > KVMPPC_XICS_MAX_ICS_ID)
+		return NULL;
+	ics = xics->ics[icsid];
+	if (!ics)
+		return NULL;
+	return ics;
+}
+
+
+#endif /* _KVM_PPC_BOOK3S_XICS_H */
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 5cde31d..bc6d7cf 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -472,6 +472,11 @@  void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 		kvmppc_mpic_put(vcpu->arch.mpic);
 		break;
 #endif
+#ifdef CONFIG_KVM_XICS
+	case KVMPPC_IRQ_XICS:
+		kvmppc_xics_free_icp(vcpu);
+		break;
+#endif
 	default:
 		break;
 	}