Patchwork [RFC,3/9] KVM: PPC: Book3S: Add kernel emulation for the XICS interrupt controller

login
register
mail settings
Submitter Paul Mackerras
Date Nov. 5, 2012, 3:21 a.m.
Message ID <20121105032109.GD22409@drongo>
Download mbox | patch
Permalink /patch/197131/
State New
Headers show

Comments

Paul Mackerras - Nov. 5, 2012, 3:21 a.m.
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>

This adds in-kernel emulation of the XICS (eXternal Interrupt
Controller Specification) interrupt controller specified by PAPR, for
both HV and PR KVM guests.

It supports up to 4095 "BUIDs" (blocks of interrupts) of up to 4096
interrupts each.  Each vcpu gets one ICP (interrupt controller
presentation) entity, and each BUID gets one ICS (interrupt controller
source) entity.  These entities are created with the new
KVM_CREATE_IRQCHIP_ARGS ioctl, which takes a structure indicating
which type of controller to create and the parameters pertaining to it
(flags for both, BUID and number of interrupt sources for ICS).
We don't use the existing KVM_CREATE_IRQCHIP ioctl because it doesn't
allow us to pass any parameters defining the new interrupt controller.

This is based on an initial implementation by Michael Ellerman
<michael@ellerman.id.au> reworked by myself.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 Documentation/virtual/kvm/api.txt     |   17 +
 arch/powerpc/include/asm/kvm_book3s.h |    1 +
 arch/powerpc/include/asm/kvm_host.h   |    8 +
 arch/powerpc/include/asm/kvm_ppc.h    |   18 +
 arch/powerpc/include/uapi/asm/kvm.h   |   32 +
 arch/powerpc/kvm/Makefile             |    1 +
 arch/powerpc/kvm/book3s.c             |    2 +-
 arch/powerpc/kvm/book3s_hv.c          |   20 +
 arch/powerpc/kvm/book3s_pr.c          |   13 +
 arch/powerpc/kvm/book3s_pr_papr.c     |   16 +
 arch/powerpc/kvm/book3s_rtas.c        |   51 +-
 arch/powerpc/kvm/book3s_xics.c        | 1112 +++++++++++++++++++++++++++++++++
 arch/powerpc/kvm/powerpc.c            |   21 +
 include/uapi/linux/kvm.h              |    6 +-
 14 files changed, 1315 insertions(+), 3 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_xics.c

Patch

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 8ca0a7c..fbe018e 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2090,6 +2090,23 @@  associated with the service will be forgotten, and subsequent RTAS
 calls by the guest for that service will be passed to userspace to be
 handled.
 
+4.79 KVM_CREATE_IRQCHIP_ARGS
+
+Capability: KVM_CAP_SPAPR_XICS
+Architectures: ppc
+Type: vm ioctl
+Parameters: struct kvm_irqchip_args
+Returns: 0 on success, -1 on error
+
+Creates an interrupt controller model in the kernel.  This is
+currently only implemented for the XICS (eXternal Interrupt Controller
+Specification) model defined in PAPR.  This ioctl creates either an
+interrupt controller presentation (ICP) or an interrupt controller
+source (ICS) entity, depending on the type field of the argument
+struct.  When creating an ICS, the argument struct further indicates
+the BUID (Bus Unit ID) and number of interrupt sources for the new
+ICS.
+
 
 5. The kvm_run structure
 ------------------------
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 36fcf41..fbda9b1 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -140,6 +140,7 @@  extern int kvmppc_mmu_hv_init(void);
 extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
+extern void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
 extern void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags);
 extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
 			   bool upper, u32 val);
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 8e745a8..0136e1e 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -190,6 +190,10 @@  struct kvmppc_linear_info {
 	int		 type;
 };
 
+/* XICS components, defined in book3s_xics.c */
+struct kvmppc_xics;
+struct kvmppc_icp;
+
 /*
  * The reverse mapping array has one entry for each HPTE,
  * which stores the guest's view of the second word of the HPTE
@@ -256,6 +260,7 @@  struct kvm_arch {
 #ifdef CONFIG_PPC_BOOK3S_64
 	struct list_head spapr_tce_tables;
 	struct list_head rtas_tokens;
+	struct kvmppc_xics *xics;
 #endif
 };
 
@@ -565,6 +570,9 @@  struct kvm_vcpu_arch {
 	u64 busy_stolen;
 	u64 busy_preempt;
 #endif
+#ifdef CONFIG_PPC_BOOK3S_64
+	struct kvmppc_icp *icp; /* XICS presentation controller */
+#endif
 };
 
 /* Values for vcpu->arch.state */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 996aeca..c74fd20 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -131,6 +131,12 @@  extern long kvmppc_prepare_vrma(struct kvm *kvm,
 extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
 			struct kvm_memory_slot *memslot, unsigned long porder);
 extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
+extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd);
+extern int kvmppc_xics_ioctl(struct kvm *kvm, unsigned ioctl, unsigned long arg);
+extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
+extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu);
+extern void kvmppc_xics_free(struct kvm *kvm);
+
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 				struct kvm_create_spapr_tce *args);
 extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
@@ -166,6 +172,8 @@  extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu);
 extern int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp);
 extern int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu);
 extern void kvmppc_rtas_tokens_free(struct kvm *kvm);
+extern int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority);
+extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority);
 
 /*
  * Cuts out inst bits with ordering according to spec.
@@ -262,6 +270,16 @@  static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 
 static inline void kvm_linear_init(void)
 {}
+
+#endif
+
+#ifdef CONFIG_PPC_BOOK3S_64
+static inline int kvmppc_xics_enabled(struct kvm *kvm)
+{
+	return kvm->arch.xics != NULL;
+}
+#else
+static inline int kvmppc_xics_enabled(struct kvm *kvm) { return 0; }
 #endif
 
 int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index fd0a6f7..145c645 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -302,6 +302,38 @@  struct kvm_rtas_token_args {
 	__u64 token;	/* Use a token of 0 to undefine a mapping */
 };
 
+/* for KVM_CAP_SPAPR_XICS */
+#define __KVM_HAVE_IRQCHIP_ARGS
+struct kvm_irqchip_args {
+#define KVM_IRQCHIP_TYPE_ICP	0	/* XICS: ICP (presentation controller) */
+#define KVM_IRQCHIP_TYPE_ICS	1	/* XICS: ICS (source controller) */
+	__u32 type;
+	union {
+		/* XICS ICP arguments. This needs to be called once before
+		 * creating any VCPU to initialize the main kernel XICS data
+		 * structures.
+		 */
+		struct {
+			__u32 flags;
+		} icp;
+
+		/* XICS ICS arguments. You can call this for every BUID you
+		 * want to make available.
+		 *
+		 * The BUID is 12 bits, the interrupt number within a BUID
+		 * is up to 12 bits as well. The resulting interrupt numbers
+		 * exposed to the guest are BUID || IRQ which is 24 bit
+		 *
+		 * BUID cannot be 0.
+		 */
+		struct {
+			__u32 flags;
+			__u16 buid;
+			__u16 nr_irqs;
+		} ics;
+	};
+};
+
 struct kvm_book3e_206_tlb_entry {
 	__u32 mas8;
 	__u32 mas1;
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 536f65f..ec2f8da 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -81,6 +81,7 @@  kvm-book3s_64-module-objs := \
 	book3s.o \
 	book3s_64_vio.o \
 	book3s_rtas.o \
+	book3s_xics.o \
 	$(kvm-book3s_64-objs-y)
 
 kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs)
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 6548445..c5a4478 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -104,7 +104,7 @@  static int kvmppc_book3s_vec2irqprio(unsigned int vec)
 	return prio;
 }
 
-static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
+void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
 					  unsigned int vec)
 {
 	unsigned long old_pending = vcpu->arch.pending_exceptions;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 8284d55..0d6616e 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -528,6 +528,14 @@  int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 
 		/* Send the error out to userspace via KVM_RUN */
 		return rc;
+	case H_XIRR:
+	case H_CPPR:
+	case H_EOI:
+	case H_IPI:
+		if (kvmppc_xics_enabled(vcpu->kvm)) {
+			ret = kvmppc_xics_hcall(vcpu, req);
+			break;
+		} /* fallthrough */
 	default:
 		return RESUME_HOST;
 	}
@@ -876,6 +884,13 @@  struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	spin_lock_init(&vcpu->arch.tbacct_lock);
 	vcpu->arch.busy_preempt = TB_NIL;
 
+	/* Create the XICS */
+	if (kvmppc_xics_enabled(kvm)) {
+		err = kvmppc_xics_create_icp(vcpu);
+		if (err < 0)
+			goto free_vcpu;
+	}
+
 	kvmppc_mmu_book3s_hv_init(vcpu);
 
 	vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
@@ -926,6 +941,8 @@  void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 		kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.vpa.pinned_addr);
 	spin_unlock(&vcpu->arch.vpa_update_lock);
 	kvm_vcpu_uninit(vcpu);
+	if (kvmppc_xics_enabled(vcpu->kvm))
+		kvmppc_xics_free_icp(vcpu);
 	kmem_cache_free(kvm_vcpu_cache, vcpu);
 }
 
@@ -1876,6 +1893,9 @@  void kvmppc_core_destroy_vm(struct kvm *kvm)
 
 	kvmppc_rtas_tokens_free(kvm);
 
+	if (kvmppc_xics_enabled(kvm))
+		kvmppc_xics_free(kvm);
+
 	kvmppc_free_hpt(kvm);
 	WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
 }
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index b853696..26acf5c 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1052,6 +1052,13 @@  struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	if (err < 0)
 		goto uninit_vcpu;
 
+	/* Create the XICS */
+	if (kvmppc_xics_enabled(kvm)) {
+		err = kvmppc_xics_create_icp(vcpu);
+		if (err < 0)
+			goto free_vcpu;
+	}
+
 	return vcpu;
 
 uninit_vcpu:
@@ -1068,6 +1075,8 @@  void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
 
+	if (kvmppc_xics_enabled(vcpu->kvm))
+		kvmppc_xics_free_icp(vcpu);
 	free_page((unsigned long)vcpu->arch.shared & PAGE_MASK);
 	kvm_vcpu_uninit(vcpu);
 	kfree(vcpu_book3s->shadow_vcpu);
@@ -1278,6 +1287,7 @@  int kvmppc_core_init_vm(struct kvm *kvm)
 {
 #ifdef CONFIG_PPC64
 	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
+	INIT_LIST_HEAD(&kvm->arch.rtas_tokens);
 #endif
 
 	return 0;
@@ -1288,6 +1298,9 @@  void kvmppc_core_destroy_vm(struct kvm *kvm)
 #ifdef CONFIG_PPC64
 	WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
 #endif
+	if (kvmppc_xics_enabled(kvm))
+		kvmppc_xics_free(kvm);
+
 }
 
 static int kvmppc_book3s_init(void)
diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
index 4efa4a4..94cec5b 100644
--- a/arch/powerpc/kvm/book3s_pr_papr.c
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -227,6 +227,15 @@  static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu)
 	return EMULATE_DONE;
 }
 
+/*
+ * Forward an XICS hypercall to the in-kernel XICS emulation.
+ *
+ * A result of H_TOO_HARD is reported as EMULATE_FAIL; any other result
+ * is stored in guest GPR 3 and EMULATE_DONE is returned.
+ */
+static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
+{
+	long status;
+
+	status = kvmppc_xics_hcall(vcpu, cmd);
+	if (status != H_TOO_HARD) {
+		kvmppc_set_gpr(vcpu, 3, status);
+		return EMULATE_DONE;
+	}
+	return EMULATE_FAIL;
+}
+
 int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
 {
 	switch (cmd) {
@@ -246,6 +255,13 @@  int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
 		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
 		vcpu->stat.halt_wakeup++;
 		return EMULATE_DONE;
+	case H_XIRR:
+	case H_CPPR:
+	case H_EOI:
+	case H_IPI:
+		if (kvmppc_xics_enabled(vcpu->kvm))
+			return kvmppc_h_pr_xics_hcall(vcpu, cmd);
+		break;
 	case H_RTAS:
 		if (list_empty(&vcpu->kvm->arch.rtas_tokens))
 			return RESUME_HOST;
diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c
index 8a324e8..6a6c1fe 100644
--- a/arch/powerpc/kvm/book3s_rtas.c
+++ b/arch/powerpc/kvm/book3s_rtas.c
@@ -18,12 +18,61 @@ 
 #include <asm/rtas.h>
 
 
+/*
+ * RTAS "ibm,set-xive" handler: reads (irq, server, priority) from the
+ * RTAS argument buffer and applies them via kvmppc_xics_set_xive().
+ *
+ * rets[0] receives 0 on success, or -3 when the argument/return counts
+ * are not 3/1 or when kvmppc_xics_set_xive() reports a failure.
+ */
+static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
+{
+	int status = -3;
+
+	if (args->nargs == 3 && args->nret == 1) {
+		u32 irq = args->args[0];
+		u32 server = args->args[1];
+		u32 priority = args->args[2];
+
+		if (!kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority))
+			status = 0;
+	}
+
+	args->rets[0] = status;
+}
+
+/*
+ * RTAS "ibm,get-xive" handler: looks up the server and priority of the
+ * interrupt given in args[0] via kvmppc_xics_get_xive().
+ *
+ * On success rets[1] and rets[2] receive the server and priority and
+ * rets[0] is 0.  If the argument/return counts are not 1/3, or the
+ * lookup fails, only rets[0] is written, with -3.
+ */
+static void kvm_rtas_get_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
+{
+	u32 server = 0, priority = 0;
+	int status = -3;
+	u32 irq;
+
+	if (args->nargs != 1 || args->nret != 3)
+		goto done;
+
+	irq = args->args[0];
+	if (kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority))
+		goto done;
+
+	args->rets[1] = server;
+	args->rets[2] = priority;
+	status = 0;
+done:
+	args->rets[0] = status;
+}
+
 struct rtas_handler {
 	void (*handler)(struct kvm_vcpu *vcpu, struct rtas_args *args);
 	char *name;
 };
 
-static struct rtas_handler rtas_handlers[] = { };
+static struct rtas_handler rtas_handlers[] = {
+	{ .name = "ibm,set-xive", .handler = kvm_rtas_set_xive },
+	{ .name = "ibm,get-xive", .handler = kvm_rtas_get_xive },
+};
 
 struct rtas_token_definition {
 	struct list_head list;
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
new file mode 100644
index 0000000..1538be2
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -0,0 +1,1112 @@ 
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+#include <linux/gfp.h>
+
+#include <asm/uaccess.h>
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
+#include <asm/hvcall.h>
+#include <asm/xics.h>
+#include <asm/debug.h>
+
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#define MASKED	0xff
+
+#define XICS_DBG(fmt...) do { } while (0)
+//#define XICS_DBG(fmt...) do { trace_printk(fmt); } while (0)
+
+#undef DEBUG_REALMODE
+
+/*
+ * LOCKING
+ * =======
+ *
+ * Each ICS has a mutex protecting the information about the IRQ
+ * sources and avoiding simultaneous deliveries of the same interrupt.
+ *
+ * ICP operations are done via a single compare & swap transaction
+ * (most ICP state fits in the union kvmppc_icp_state)
+ */
+
+/*
+ * Interrupt numbering
+ * ===================
+ *
+ * The 24-bit global interrupt numbers are divided in two components,
+ * the BUID and the interrupt source. The upper 12 bits hold the BUID
+ * and the lower 12 bits hold the source number within that BUID.
+ */
+
+/*
+ * TODO
+ * ====
+ *
+ * - To speed up resends, keep a bitmap of "resend" set bits in the
+ *   ICS
+ *
+ * - Speed up server# -> ICP lookup (array ? hash table ?)
+ *
+ * - Make ICS lockless as well, or at least a per-interrupt lock or hashed
+ *   locks array to improve scalability
+ *
+ * - ioctl's to save/restore the entire state for snapshot & migration
+ */
+
+#define KVMPPC_XICS_MAX_BUID	0xfff
+#define KVMPPC_XICS_IRQ_COUNT	0x1000
+#define KVMPPC_XICS_BUID_SHIFT	12
+#define KVMPPC_XICS_SRC_MASK	0xfff
+
+/* State for one irq in an ics */
+struct ics_irq_state {
+	u32 number;		/* global irq number, passed to icp_deliver_irq() */
+	u32 server;		/* vcpu_id of the target server */
+	u8  priority;		/* MASKED (0xff) means the source is masked */
+	u8  saved_priority; /* currently unused */
+	u8  resend;		/* set when a delivery failed and must be retried */
+	u8  masked_pending;	/* set when delivery was attempted while masked */
+	u8  asserted; /* Only for LSI */
+};
+
+#define ICP_RESEND_MAP_SIZE	\
+	((KVMPPC_XICS_MAX_BUID + BITS_PER_LONG - 1) / BITS_PER_LONG)
+
+/*
+ * Atomic ICP state, updated with a single compare & swap.
+ *
+ * "raw" overlays the fields below so that the whole state can be read
+ * and replaced in one cmpxchg64() (see icp_try_update()).
+ */
+union kvmppc_icp_state {
+	unsigned long raw;
+	struct {
+		u8 out_ee : 1;		/* recomputed by icp_try_update() */
+		u8 need_resend : 1;	/* a delivery failed; resend_map needs a scan */
+		u8 cppr;		/* current processor priority */
+		u8 mfrr;		/* written by H_IPI (see h_ipi()) */
+		u8 pending_pri;		/* priority of the interrupt held in xisr */
+		u32 xisr;		/* pending interrupt number, 0 if none */
+	};
+};
+
+/* Presentation controller state, one per vcpu (vcpu->arch.icp) */
+struct kvmppc_icp {
+	struct kvm_vcpu *vcpu;
+	union kvmppc_icp_state state;
+	/* One bit per BUID, set by icp_deliver_irq() when a delivery fails */
+	unsigned long resend_map[ICP_RESEND_MAP_SIZE];
+};
+
+/* Source controller state: the interrupt sources belonging to one BUID */
+struct kvmppc_ics {
+	struct mutex lock;	/* held while irq_state[] entries are examined/updated */
+	u16 buid;
+	u16 nr_irqs;		/* number of entries in irq_state[] */
+	struct ics_irq_state irq_state[];
+};
+
+/* Per-VM XICS state, reached through kvm->arch.xics */
+struct kvmppc_xics {
+	struct kvm *kvm;
+	struct dentry *dentry;
+	u32 max_buid;	/* bounds the resend_map scan in icp_check_resend() */
+	/* Indexed by BUID - 1 (see kvmppc_xics_find_ics()) */
+	struct kvmppc_ics *ics[KVMPPC_XICS_MAX_BUID]; /* [1...MAX_BUID] */
+};
+
+/*
+ * Look up the ICP of the vcpu whose vcpu_id equals @nr.
+ *
+ * Returns that vcpu's arch.icp pointer, or NULL when no vcpu of @kvm
+ * has a matching id.
+ */
+static struct kvmppc_icp *kvmppc_xics_find_server(struct kvm *kvm, u32 nr)
+{
+	struct kvmppc_icp *found = NULL;
+	struct kvm_vcpu *cur = NULL;
+	int idx;
+
+	kvm_for_each_vcpu(idx, cur, kvm) {
+		if (cur->vcpu_id != nr)
+			continue;
+		found = cur->arch.icp;
+		break;
+	}
+
+	return found;
+}
+
+/*
+ * Map a global interrupt number to its ICS and per-ICS source index.
+ *
+ * @xics:   per-VM XICS state
+ * @irq:    global interrupt number (BUID in the bits above
+ *          KVMPPC_XICS_BUID_SHIFT, source number in the low 12 bits)
+ * @source: if non-NULL, receives the source index within the ICS
+ *
+ * Returns the ICS, or NULL when the BUID is 0 or above
+ * KVMPPC_XICS_MAX_BUID, when no ICS exists for that BUID, or when the
+ * source index is not below the ICS's nr_irqs.
+ */
+static struct kvmppc_ics *kvmppc_xics_find_ics(struct kvmppc_xics *xics,
+					       u32 irq, u16 *source)
+{
+	/*
+	 * Keep the BUID in a u32: irq >> 12 can be up to 20 bits wide, and
+	 * truncating it to 16 bits before the range check would let an
+	 * out-of-range number such as 0x10001005 alias a valid source.
+	 */
+	u32 buid = irq >> KVMPPC_XICS_BUID_SHIFT;
+	u16 src = irq & KVMPPC_XICS_SRC_MASK;
+	struct kvmppc_ics *ics;
+
+	if (WARN_ON_ONCE(!buid || buid > KVMPPC_XICS_MAX_BUID)) {
+		XICS_DBG("kvmppc_xics_find_ics: irq %#x BUID out of range !\n",
+			 irq);
+		return NULL;
+	}
+	/* The ics[] array is indexed by BUID - 1; BUID 0 is not valid */
+	ics = xics->ics[buid - 1];
+	if (!ics)
+		return NULL;
+	if (src >= ics->nr_irqs)
+		return NULL;
+	if (source)
+		*source = src;
+	return ics;
+}
+
+
+/* -- ICS routines -- */
+
+static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			    u32 new_irq);
+
+/*
+ * Entry point for an interrupt source changing level.
+ *
+ * Records the asserted state for KVM_INTERRUPT_SET_LEVEL and
+ * KVM_INTERRUPT_UNSET; an UNSET only clears the flag and returns.
+ * Every other level value goes on to attempt a delivery through
+ * icp_deliver_irq(), which looks up the target ICP itself.
+ * Unknown interrupt numbers are silently ignored.
+ */
+static void ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level)
+{
+	struct ics_irq_state *state;
+	struct kvmppc_ics *ics;	
+	u16 src;
+
+	XICS_DBG("ics deliver %#x (level: %d)\n", irq, level);
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics) {
+		XICS_DBG("ics_deliver_irq: IRQ 0x%06x not found !\n", irq);
+		return;
+	}
+	state = &ics->irq_state[src];
+
+	/*
+	 * We set state->asserted locklessly. This should be fine as
+	 * we are the only setter, thus concurrent access is undefined
+	 * to begin with.
+	 */
+	if (level == KVM_INTERRUPT_SET_LEVEL)
+		state->asserted = 1;
+	else if (level == KVM_INTERRUPT_UNSET) {
+		state->asserted = 0;
+		return;
+	}
+
+	/* Attempt delivery */
+	icp_deliver_irq(xics, NULL, irq);
+}
+
+/*
+ * Retry delivery of every source in @ics whose resend flag is set.
+ *
+ * The ICS mutex is dropped around each icp_deliver_irq() call because
+ * icp_deliver_irq() takes the mutex of the ICS it delivers from.
+ */
+static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
+			     struct kvmppc_icp *icp)
+{
+	int i;
+
+	mutex_lock(&ics->lock);
+
+	for (i = 0; i < ics->nr_irqs; i++) {
+		struct ics_irq_state *state = &ics->irq_state[i];
+
+		if (!state->resend)
+			continue;
+
+		XICS_DBG("resend %#x prio %#x\n", state->number,
+			      state->priority);
+
+		mutex_unlock(&ics->lock);
+		icp_deliver_irq(xics, icp, state->number);
+		mutex_lock(&ics->lock);
+	}
+
+	mutex_unlock(&ics->lock);
+}
+
+/*
+ * Set the target server and priority of interrupt @irq.
+ *
+ * If the source had a masked-pending or resend condition recorded and
+ * the new priority is not MASKED, a delivery is attempted after the
+ * ICS mutex has been released.
+ *
+ * Returns 0 on success, -ENODEV if the VM has no XICS, or -EINVAL if
+ * either @irq or @server cannot be found.
+ */
+int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	struct kvmppc_icp *icp;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u16 src;
+	bool deliver;
+
+	if (!xics)
+		return -ENODEV;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics)
+		return -EINVAL;
+	state = &ics->irq_state[src];
+
+	icp = kvmppc_xics_find_server(kvm, server);
+	if (!icp)
+		return -EINVAL;
+
+	mutex_lock(&ics->lock);
+
+	XICS_DBG("set_xive %#x server %#x prio %#x MP:%d RS:%d\n",
+		 irq, server, priority,
+		 state->masked_pending, state->resend);
+
+	state->server = server;
+	state->priority = priority;
+	deliver = false;
+	if ((state->masked_pending || state->resend) && priority != MASKED) {
+		state->masked_pending = 0;
+		deliver = true;
+	}
+
+	mutex_unlock(&ics->lock);
+
+	/* Deliver outside the lock: icp_deliver_irq() takes it again */
+	if (deliver)
+		icp_deliver_irq(xics, icp, irq);
+
+	return 0;
+}
+
+/*
+ * Read back the server and priority currently recorded for @irq.
+ *
+ * Both values are sampled with the ICS mutex held.  Returns 0 on
+ * success, -ENODEV if the VM has no XICS, or -EINVAL if @irq does not
+ * map to a known source; the output pointers are written only on
+ * success.
+ */
+int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	struct ics_irq_state *entry;
+	struct kvmppc_ics *ics;
+	u16 idx;
+
+	if (xics == NULL)
+		return -ENODEV;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &idx);
+	if (ics == NULL)
+		return -EINVAL;
+
+	entry = ics->irq_state + idx;
+
+	mutex_lock(&ics->lock);
+	*server = entry->server;
+	*priority = entry->priority;
+	mutex_unlock(&ics->lock);
+
+	return 0;
+}
+
+/* -- ICP routines, including hcalls -- */
+
+/*
+ * Attempt to replace the ICP state @old with @new in one atomic step.
+ *
+ * @new.out_ee is recomputed here before the swap.  If the swap succeeds
+ * and the new state calls for an external interrupt, the interrupt is
+ * queued on the vcpu, and the vcpu is kicked unless @change_self is set.
+ *
+ * Returns true if the state was updated, false if it changed under us
+ * (callers re-read the state and retry).
+ */
+static inline bool icp_try_update(struct kvmppc_icp *icp,
+				  union kvmppc_icp_state old,
+				  union kvmppc_icp_state new,
+				  bool change_self)
+{
+	bool success;
+
+	/* Calculate new output value */
+	new.out_ee = (new.xisr && (new.pending_pri < new.cppr));
+
+	/* Attempt atomic update */
+	success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw;
+	if (!success)
+		goto bail;
+
+	XICS_DBG("UPD [%04x] - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n",
+		 icp->vcpu->vcpu_id,
+		 old.cppr, old.mfrr, old.pending_pri, old.xisr,
+		 old.need_resend, old.out_ee);
+	XICS_DBG("UPD        - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n",
+		 new.cppr, new.mfrr, new.pending_pri, new.xisr,
+		 new.need_resend, new.out_ee);
+	/*
+	 * Check for output state update
+	 *
+	 * Note that this is racy since another processor could be updating
+	 * the state already. This is why we never clear the interrupt output
+	 * here, we only ever set it. The clear only happens prior to doing
+	 * an update and only by the processor itself. Currently we do it
+	 * in Accept (H_XIRR) and Up_Cppr (H_CPPR).
+	 *
+	 * We also do not try to figure out whether the EE state has changed,
+	 * we unconditionally set it if the new state calls for it for the
+	 * same reason.
+	 */
+	if (new.out_ee) {
+		kvmppc_book3s_queue_irqprio(icp->vcpu,
+					    BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+		if (!change_self)
+			kvm_vcpu_kick(icp->vcpu);
+	}
+ bail:
+	return success;
+}
+
+/*
+ * Walk the ICP's resend_map and, for every BUID whose bit is set, clear
+ * the bit and ask the corresponding ICS to retry its flagged sources.
+ *
+ * NOTE(review): the ICS lookup uses buid - 1, so this relies on bit 0
+ * never being set in resend_map (i.e. no ICS with BUID 0) - confirm
+ * against the ICS creation code.
+ */
+static void icp_check_resend(struct kvmppc_xics *xics,
+			     struct kvmppc_icp *icp)
+{
+	u32 buid;
+	
+	/* Order this load with the test for need_resend in the caller */
+	smp_rmb();
+	for_each_set_bit(buid, icp->resend_map, xics->max_buid + 1) {
+		struct kvmppc_ics *ics = xics->ics[buid - 1];
+
+		/* Skip the BUID if someone else cleared the bit first */
+		if (!test_and_clear_bit(buid, icp->resend_map))
+			continue;
+		if (!ics)
+			continue;
+		ics_check_resend(xics, ics, icp);
+	}
+}
+
+/*
+ * Try to make @irq the pending interrupt of @icp.
+ *
+ * The interrupt is accepted only if @priority is numerically lower than
+ * the ICP's CPPR, MFRR and current pending priority.  On acceptance,
+ * *@reject receives the interrupt number that was previously pending
+ * (0 if none); otherwise *@reject is 0 and need_resend is set in the
+ * ICP state.  The update is retried until the atomic swap succeeds.
+ *
+ * Returns true if the interrupt was accepted.
+ */
+static bool icp_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
+			       u32 *reject)
+{
+	union kvmppc_icp_state old_state, new_state;
+	bool success;
+
+	XICS_DBG("try deliver %#x(P:%#x) to server %#x\n", irq, priority,
+		 icp->vcpu->vcpu_id);
+
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		*reject = 0;
+
+		/* See if we can deliver */
+		success = new_state.cppr > priority &&
+			new_state.mfrr > priority &&
+			new_state.pending_pri > priority;
+
+		/*
+		 * If we can, check for a rejection and perform the
+		 * delivery
+		 */
+		if (success) {
+			*reject = new_state.xisr;
+			new_state.xisr = irq;
+			new_state.pending_pri = priority;
+		} else {
+			/*
+			 * If we failed to deliver we set need_resend
+			 * so a subsequent CPPR state change causes us
+			 * to try a new delivery.
+			 */
+			new_state.need_resend = true;
+		}
+
+	} while (!icp_try_update(icp, old_state, new_state, false));
+
+	return success;
+}
+
+/*
+ * Deliver (or re-deliver) interrupt @new_irq.
+ *
+ * @icp may be NULL, or may not match the source's current server; in
+ * both cases the target ICP is looked up from the source's server
+ * field.  If the delivery displaces another (non-IPI) interrupt, the
+ * whole sequence is repeated for the displaced interrupt.
+ */
+static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			    u32 new_irq)
+{
+	struct ics_irq_state *state;
+	struct kvmppc_ics *ics;
+	u32 reject;
+	u16 src;	
+
+	/*
+	 * This is used both for initial delivery of an interrupt and
+	 * for subsequent rejection.
+	 *
+	 * Rejection can be racy vs. resends. We have evaluated the
+	 * rejection in an atomic ICP transaction which is now complete,
+	 * so potentially the ICP can already accept the interrupt again.
+	 *
+	 * So we need to retry the delivery. Essentially the reject path
+	 * boils down to a failed delivery. Always.
+	 *
+	 * Now the interrupt could also have moved to a different target,
+	 * thus we may need to re-do the ICP lookup as well
+	 */
+	 
+ again:
+	/* Get the ICS state and lock it */
+	ics = kvmppc_xics_find_ics(xics, new_irq, &src);
+	if (!ics) {
+		XICS_DBG("icp_deliver_irq: IRQ 0x%06x not found !\n", new_irq);
+		return;
+	}
+	state = &ics->irq_state[src];
+
+	/* Get a lock on the ICS */
+	mutex_lock(&ics->lock);
+
+	/* Get our server */
+	if (!icp || state->server != icp->vcpu->vcpu_id) {
+		icp = kvmppc_xics_find_server(xics->kvm, state->server);
+		if (!icp) {
+			pr_warning("icp_deliver_irq: IRQ 0x%06x server 0x%x"
+				   " not found !\n", new_irq, state->server);
+			goto out;
+		}
+	}
+
+	/* Clear the resend bit of that interrupt */
+	state->resend = 0;
+
+	/*
+	 * If masked, bail out
+	 *
+	 * Note: PAPR doesn't mention anything about masked pending
+	 * when doing a resend, only when doing a delivery.
+	 *
+	 * However that would have the effect of losing a masked
+	 * interrupt that was rejected and isn't consistent with
+	 * the whole masked_pending business which is about not
+	 * losing interrupts that occur while masked.
+	 *
+	 * I don't differentiate normal deliveries and resends, this
+	 * implementation will differ from PAPR and not lose such
+	 * interrupts.
+	 */
+	if (state->priority == MASKED) {
+		XICS_DBG("irq %#x masked pending\n", new_irq);
+		state->masked_pending = 1;
+		goto out;
+	}
+
+	/*
+	 * Try the delivery, this will set the need_resend flag
+	 * in the ICP as part of the atomic transaction if the
+	 * delivery is not possible.
+	 *
+	 * Note that if successful, the new delivery might have itself
+	 * rejected an interrupt that was "delivered" before we took the
+	 * ICS mutex.
+	 *
+	 * In this case we do the whole sequence all over again for the
+	 * new guy. We cannot assume that the rejected interrupt is less
+	 * favored than the new one, and thus doesn't need to be delivered,
+	 * because by the time we exit icp_try_to_deliver() the target
+	 * processor may well have already consumed & completed it, and thus
+	 * the rejected interrupt might actually be already acceptable.
+	 */
+	if (icp_try_to_deliver(icp, new_irq, state->priority, &reject)) {
+		/*
+		 * Delivery was successful, did we reject somebody else ?
+		 */
+		if (reject && reject != XICS_IPI) {
+			mutex_unlock(&ics->lock);
+			new_irq = reject;
+			goto again;
+		}
+	} else {
+		/*
+		 * We failed to deliver the interrupt we need to set the
+		 * resend map bit and mark the ICS state as needing a resend
+		 */
+		set_bit(ics->buid, icp->resend_map);
+		state->resend = 1;
+
+		/*
+		 * If the need_resend flag got cleared in the ICP some time
+		 * between icp_try_to_deliver() atomic update and now, then
+		 * we know it might have missed the resend_map bit. So we
+		 * retry
+		 */
+		smp_mb();
+		if (!icp->state.need_resend) {
+			mutex_unlock(&ics->lock);
+			goto again;
+		}
+	}
+ out:
+	mutex_unlock(&ics->lock);
+}
+
+/*
+ * Load @new_cppr into the ICP's CPPR, present a pending IPI if the MFRR
+ * now qualifies, and clear the need_resend flag.  If need_resend was
+ * set, the ICP's resend map is scanned after the atomic update.
+ */
+static void icp_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			  u8 new_cppr)
+{
+	union kvmppc_icp_state old_state, new_state;
+	bool resend;
+
+	/*
+	 * This handles several related states in one operation:
+	 *
+	 * ICP State: Down_CPPR
+	 *
+	 * Load CPPR with new value and if the XISR is 0
+	 * then check for resends:
+	 *
+	 * ICP State: Resend
+	 *
+	 * If MFRR is more favored than CPPR, check for IPIs
+	 * and notify ICS of a potential resend. This is done
+	 * asynchronously (when used in real mode, we will have
+	 * to exit here).
+	 *
+	 * We do not handle the complete Check_IPI as documented
+	 * here. In the PAPR, this state will be used for both
+	 * Set_MFRR and Down_CPPR. However, we know that we aren't
+	 * changing the MFRR state here so we don't need to handle
+	 * the case of an MFRR causing a reject of a pending irq,
+	 * this will have been handled when the MFRR was set in the
+	 * first place.
+	 *
+	 * Thus we don't have to handle rejects, only resends.
+	 *
+	 * When implementing real mode for HV KVM, resend will lead to
+	 * a H_TOO_HARD return and the whole transaction will be handled
+	 * in virtual mode.
+	 */
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		/* Down_CPPR */
+		new_state.cppr = new_cppr;
+
+		/*
+		 * Cut down Resend / Check_IPI / IPI
+		 *
+		 * The logic is that we cannot have a pending interrupt
+		 * trumped by an IPI at this point (see above), so we
+		 * know that either the pending interrupt is already an
+		 * IPI (in which case we don't care to override it) or
+		 * it's either more favored than us or non existent
+		 */
+		if (new_state.mfrr < new_cppr &&
+		    new_state.mfrr <= new_state.pending_pri) {
+			WARN_ON(new_state.xisr != XICS_IPI &&
+				new_state.xisr != 0);
+			new_state.pending_pri = new_state.mfrr;
+			new_state.xisr = XICS_IPI;
+		}
+
+		/* Latch/clear resend bit */
+		resend = new_state.need_resend;
+		new_state.need_resend = 0;
+
+	} while (!icp_try_update(icp, old_state, new_state, true));
+
+	/*
+	 * Now handle resend checks. Those are asynchronous to the ICP
+	 * state update in HW (ie bus transactions) so we can handle them
+	 * separately here too
+	 */
+	if (resend)
+		icp_check_resend(xics, icp);
+}
+
+/*
+ * H_XIRR: accept the pending interrupt of the calling vcpu.
+ *
+ * Returns the XIRR value: the pending interrupt number (XISR) in the
+ * low bits and the CPPR at the time of the call in bits 24-31.  When
+ * no interrupt is pending the ICP state is left unchanged and the XISR
+ * part of the result is 0.
+ */
+static noinline unsigned long h_xirr(struct kvm_vcpu *vcpu)
+{
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	u32 xirr;
+
+	/* First, remove EE from the processor */
+	kvmppc_book3s_dequeue_irqprio(icp->vcpu,
+				      BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+
+	/*
+	 * ICP State: Accept_Interrupt
+	 *
+	 * Return the pending interrupt (if any) along with the
+	 * current CPPR, then clear the XISR & set CPPR to the
+	 * pending priority
+	 */
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		xirr = old_state.xisr | (((u32)old_state.cppr) << 24);
+		/* Nothing pending: no state change needed */
+		if (!old_state.xisr)
+			break;
+		new_state.cppr = new_state.pending_pri;
+		new_state.pending_pri = 0xff;
+		new_state.xisr = 0;
+
+	} while (!icp_try_update(icp, old_state, new_state, true));
+
+	XICS_DBG("h_xirr vcpu %d xirr %#x\n", vcpu->vcpu_id, xirr);
+
+	return xirr;
+}
+
+/*
+ * H_IPI: set the MFRR of server @server to @mfrr.
+ *
+ * If the new MFRR is more favored than the target's CPPR, an IPI is
+ * made pending; a non-IPI interrupt displaced by it is re-delivered
+ * afterwards.  If the MFRR was made less favored, a latched resend
+ * request is processed.
+ *
+ * Returns H_PARAMETER if @server does not name a known ICP, otherwise
+ * H_SUCCESS.
+ */
+static noinline int h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
+			  unsigned long mfrr)
+{
+        union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp;
+	u32 reject;
+	bool resend;
+	bool local;
+
+	XICS_DBG("h_ipi vcpu %d to server %lu mfrr %#lx\n",
+			vcpu->vcpu_id, server, mfrr);
+
+	/* A vcpu targeting itself can skip the server lookup (and the kick) */
+	local = vcpu->vcpu_id == server;
+	if (local)
+		icp = vcpu->arch.icp;
+	else
+		icp = kvmppc_xics_find_server(vcpu->kvm, server);
+	if (!icp)
+		return H_PARAMETER;
+
+	/*
+	 * ICP state: Set_MFRR
+	 *
+	 * If the CPPR is more favored than the new MFRR, then
+	 * nothing needs to be rejected as there can be no XISR to
+	 * reject.  If the MFRR is being made less favored then
+	 * there might be a previously-rejected interrupt needing
+	 * to be resent.
+	 *
+	 * If the CPPR is less favored, then we might be replacing
+	 * an interrupt, and thus need to possibly reject it as in
+	 *
+	 * ICP state: Check_IPI
+	 */
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		/* Set_MFRR */
+		new_state.mfrr = mfrr;
+
+		/* Check_IPI */
+		reject = 0;
+		resend = false;
+		if (mfrr < new_state.cppr) {
+			/* Reject a pending interrupt if not an IPI */
+			if (mfrr <= new_state.pending_pri)
+				reject = new_state.xisr;
+			new_state.pending_pri = mfrr;
+			new_state.xisr = XICS_IPI;
+		}
+
+		if (mfrr > old_state.mfrr && mfrr > new_state.cppr) {
+			resend = new_state.need_resend;
+			new_state.need_resend = 0;
+		}
+	} while (!icp_try_update(icp, old_state, new_state, local));
+
+	/* Handle reject */
+	if (reject && reject != XICS_IPI)
+		icp_deliver_irq(xics, icp, reject);
+		
+	/* Handle resend */
+	if (resend)
+		icp_check_resend(xics, icp);
+
+	return H_SUCCESS;
+}
+
+/*
+ * H_CPPR hypercall: set the CPPR of the calling vcpu's ICP to @cppr.
+ *
+ * A numerically larger value is handed to icp_down_cppr(); an equal
+ * value is a no-op.  The Up_CPPR path below may reject the interrupt
+ * currently pending in the XISR, which is then passed to
+ * icp_deliver_irq() for a new delivery attempt.
+ */
+static noinline void h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
+{
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	u32 reject;
+
+	XICS_DBG("h_cppr vcpu %d cppr %#lx\n", vcpu->vcpu_id, cppr);
+
+	/*
+	 * ICP State: Set_CPPR
+	 *
+	 * We can safely compare the new value with the current
+	 * value outside of the transaction as the CPPR is only
+	 * ever changed by the processor on itself
+	 */
+	/*
+	 * NOTE(review): there is no return after icp_down_cppr(), so that
+	 * case also continues into the Up_CPPR code below; confirm this
+	 * is intended.
+	 */
+	if (cppr > icp->state.cppr)
+		icp_down_cppr(xics, icp, cppr);
+	else if (cppr == icp->state.cppr)
+		return;
+
+	/*
+	 * ICP State: Up_CPPR
+	 *
+	 * The processor is raising its priority, this can result
+	 * in a rejection of a pending interrupt:
+	 *
+	 * ICP State: Reject_Current
+	 *
+	 * We can remove EE from the current processor, the update
+	 * transaction will set it again if needed
+	 */
+	kvmppc_book3s_dequeue_irqprio(icp->vcpu,
+				      BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+
+	do {
+		/* Snapshot the state; redone until icp_try_update() succeeds */
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		reject = 0;
+		new_state.cppr = cppr;
+
+		/*
+		 * The pending interrupt is no longer more favored than the
+		 * new CPPR: pull it out of the XISR and reset pending_pri.
+		 */
+		if (cppr <= new_state.pending_pri) {
+			reject = new_state.xisr;
+			new_state.xisr = 0;
+			new_state.pending_pri = 0xff;
+		}
+
+	} while (!icp_try_update(icp, old_state, new_state, true));
+
+	/*
+	 * Check for rejects. They are handled by doing a new delivery
+	 * attempt (see comments in icp_deliver_irq).
+	 */
+	if (reject && reject != XICS_IPI)
+		icp_deliver_irq(xics, icp, reject);
+}
+
+/*
+ * H_EOI hypercall for the calling vcpu.
+ *
+ * @xirr carries the CPPR to restore in its top byte and the source
+ * number being EOI'ed in its low 24 bits.
+ *
+ * Returns H_PARAMETER if the source does not belong to any ICS,
+ * H_SUCCESS otherwise.
+ */
+static noinline int h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+{
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	struct kvmppc_ics *ics;
+	u32 source = xirr & 0x00ffffff;
+	u16 idx;
+
+	XICS_DBG("h_eoi vcpu %d eoi %#lx\n", vcpu->vcpu_id, xirr);
+
+	/*
+	 * ICP State: EOI
+	 *
+	 * The CPPR update comes first and goes through the Down_CPPR
+	 * helper shared with H_CPPR.  If software misuses EOI to make
+	 * the CPPR more favored we do not look for a pending interrupt
+	 * to reject: that is a software error which PAPR specifies we
+	 * don't have to deal with.
+	 */
+	icp_down_cppr(xics, icp, xirr >> 24);
+
+	/* Only real sources have an ICS side to the EOI; IPIs do not */
+	if (source != XICS_IPI) {
+		ics = kvmppc_xics_find_ics(xics, source, &idx);
+		if (!ics) {
+			XICS_DBG("h_eoi: IRQ 0x%06x not found !\n", source);
+			return H_PARAMETER;
+		}
+
+		/*
+		 * Lockless peek at the ICS state: a source that is still
+		 * asserted gets delivered again.  "Message" interrupts
+		 * never have "asserted" set.
+		 */
+		if (ics->irq_state[idx].asserted)
+			icp_deliver_irq(xics, icp, source);
+	}
+
+	return H_SUCCESS;
+}
+
+/*
+ * Dispatch one of the XICS hypercalls (H_XIRR, H_CPPR, H_EOI, H_IPI)
+ * for @vcpu.  Arguments are read from guest GPR4/GPR5 and the H_XIRR
+ * result is written to GPR4.
+ *
+ * Returns H_HARDWARE when the vcpu has no ICP or the VM has no XICS,
+ * otherwise the handler's status; H_SUCCESS for the handlers that
+ * return no status and for any other @req value.
+ */
+int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
+{
+	/* Both the per-vcpu ICP and the per-VM XICS must exist */
+	if (!(vcpu->arch.icp && vcpu->kvm->arch.xics))
+		return H_HARDWARE;
+
+	switch (req) {
+	case H_XIRR: {
+		unsigned long xirr = h_xirr(vcpu);
+
+		kvmppc_set_gpr(vcpu, 4, xirr);
+		return H_SUCCESS;
+	}
+	case H_CPPR:
+		h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4));
+		return H_SUCCESS;
+	case H_EOI:
+		return h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
+	case H_IPI:
+		return h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4),
+			     kvmppc_get_gpr(vcpu, 5));
+	}
+
+	return H_SUCCESS;
+}
+
+
+/* -- Initialisation code etc. -- */
+
+/*
+ * seq_file show routine for the XICS debugfs entry.
+ *
+ * Prints one line per vcpu that has an ICP, then, for every ICS that
+ * exists, one line per interrupt source (under that ICS's lock).
+ * Prints nothing if the XICS is no longer attached to a VM.
+ */
+static int xics_debug_show(struct seq_file *m, void *private)
+{
+	struct kvmppc_xics *xics = m->private;
+	struct kvm *kvm = xics->kvm;
+	struct kvm_vcpu *vcpu;
+	int slot, n;
+
+	if (!kvm)
+		return 0;
+
+	seq_printf(m, "=========\nICP state\n=========\n");
+
+	kvm_for_each_vcpu(n, vcpu, kvm) {
+		union kvmppc_icp_state snap;
+
+		if (!vcpu->arch.icp)
+			continue;
+
+		/* One read of the raw word so all fields belong together */
+		snap.raw = ACCESS_ONCE(vcpu->arch.icp->state.raw);
+		seq_printf(m, "cpu server %#x XIRR:%#x PPRI:%#x CPPR:%#x "
+			   "MFRR:%#x OUT:%d NR:%d\n", vcpu->vcpu_id, snap.xisr,
+			   snap.pending_pri, snap.cppr, snap.mfrr,
+			   snap.out_ee, snap.need_resend);
+	}
+
+	/* The ICS for BUID b is stored in slot b - 1 */
+	for (slot = 0; slot < KVMPPC_XICS_MAX_BUID; slot++) {
+		struct kvmppc_ics *ics = xics->ics[slot];
+		struct ics_irq_state *irq;
+
+		if (!ics)
+			continue;
+
+		seq_printf(m, "=========\nICS state for BUID 0x%x\n=========\n",
+			   slot + 1);
+
+		mutex_lock(&ics->lock);
+		for (n = 0; n < ics->nr_irqs; n++) {
+			irq = &ics->irq_state[n];
+			seq_printf(m, "irq 0x%06x: server %#x prio %#x save"
+				   " prio %#x asserted %d resend %d masked"
+				   " pending %d\n",
+				   irq->number, irq->server, irq->priority,
+				   irq->saved_priority, irq->asserted,
+				   irq->resend, irq->masked_pending);
+		}
+		mutex_unlock(&ics->lock);
+	}
+
+	return 0;
+}
+
+/* debugfs open: bind the inode's private pointer to xics_debug_show() */
+static int xics_debug_open(struct inode *inode, struct file *file)
+{
+	void *data = inode->i_private;
+
+	return single_open(file, xics_debug_show, data);
+}
+
+/*
+ * File operations for the XICS debugfs entry: opened through
+ * xics_debug_open() (a single_open() seq_file), read and seek via the
+ * seq_file helpers, released with single_release().
+ */
+static const struct file_operations xics_debug_fops = {
+	.open = xics_debug_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+/*
+ * Create the debugfs file "kvm-xics-<pointer>" for @xics under
+ * powerpc_debugfs_root and remember its dentry in xics->dentry.
+ * Failure to allocate the name is logged and otherwise ignored.
+ */
+static void xics_debugfs_init(struct kvmppc_xics *xics)
+{
+	char *fname = kasprintf(GFP_KERNEL, "kvm-xics-%p", xics);
+
+	if (!fname) {
+		pr_err("%s: no memory for name\n", __func__);
+		return;
+	}
+
+	xics->dentry = debugfs_create_file(fname, S_IRUGO,
+					   powerpc_debugfs_root,
+					   xics, &xics_debug_fops);
+	pr_debug("%s: created %s\n", __func__, fname);
+
+	/* The temporary name buffer is not needed past this point */
+	kfree(fname);
+}
+
+/*
+ * Allocate and publish the ICS for @buid with @nr_irqs sources.
+ *
+ * Every source starts with priority and saved priority MASKED and a
+ * number made of the BUID in the upper bits and its index in the low
+ * bits.  The ICS is stored in xics->ics[buid - 1] and xics->max_buid
+ * is raised if needed.
+ *
+ * Returns 0 on success or -ENOMEM.
+ */
+static int kvmppc_xics_create_ics(struct kvmppc_xics *xics, u16 buid,
+				  u16 nr_irqs)
+{
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *irq;
+	int idx, size;
+
+	/* Header plus the trailing array of per-source state */
+	size = sizeof(*ics) + sizeof(ics->irq_state[0]) * nr_irqs;
+	ics = kzalloc(size, GFP_KERNEL);
+	if (!ics)
+		return -ENOMEM;
+
+	mutex_init(&ics->lock);
+	ics->buid = buid;
+	ics->nr_irqs = nr_irqs;
+
+	for (idx = 0; idx < nr_irqs; idx++) {
+		irq = &ics->irq_state[idx];
+		irq->number = (buid << KVMPPC_XICS_BUID_SHIFT) | idx;
+		irq->priority = MASKED;
+		irq->saved_priority = MASKED;
+	}
+
+	/* Initialisation must be ordered before the pointer is published */
+	smp_wmb();
+	xics->ics[buid - 1] = ics;
+
+	if (xics->max_buid < buid)
+		xics->max_buid = buid;
+
+	return 0;
+}
+
+/*
+ * Allocate the ICP for @vcpu and attach it as vcpu->arch.icp.
+ * The new ICP starts with MFRR and pending priority set to MASKED.
+ *
+ * Returns 0 on success or -ENOMEM.
+ */
+int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_icp *new_icp = kzalloc(sizeof(*new_icp), GFP_KERNEL);
+
+	if (!new_icp)
+		return -ENOMEM;
+
+	new_icp->vcpu = vcpu;
+	new_icp->state.mfrr = MASKED;
+	new_icp->state.pending_pri = MASKED;
+
+	/* Fully set up: hand it to the vcpu */
+	vcpu->arch.icp = new_icp;
+
+	XICS_DBG("created server for vcpu %d\n", vcpu->vcpu_id);
+
+	return 0;
+}
+
+/* Free @vcpu's ICP, if it has one, and clear vcpu->arch.icp */
+void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+
+	if (icp) {
+		kfree(icp);
+		vcpu->arch.icp = NULL;
+	}
+}
+
+/*
+ * Tear down the XICS of @kvm, if there is one: remove its debugfs
+ * entry, detach it from the VM, then free every ICS up to max_buid
+ * and the XICS itself.
+ *
+ * When an XICS exists the caller is expected to hold kvm->lock
+ * (checked with lockdep).
+ */
+void kvmppc_xics_free(struct kvm *kvm)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	int slot;
+
+	if (!xics)
+		return;
+
+	lockdep_assert_held(&kvm->lock);
+
+	debugfs_remove(xics->dentry);
+
+	/* Break the links in both directions */
+	if (xics->kvm) {
+		xics->kvm->arch.xics = NULL;
+		xics->kvm = NULL;
+	}
+
+	/* kfree() accepts NULL, so empty slots need no special case */
+	for (slot = 0; slot < xics->max_buid; slot++)
+		kfree(xics->ics[slot]);
+
+	kfree(xics);
+}
+
+/* -- ioctls -- */
+
+/*
+ * Handle KVM_CREATE_IRQCHIP_ARGS for the ICP type: allocate the per-VM
+ * XICS structure, link it to @kvm and create its debugfs entry.
+ *
+ * @args is currently not examined.
+ *
+ * Returns 0 on success, -EEXIST if the VM already has an XICS, or
+ * -ENOMEM.  kvm->lock is taken for the duration and is released on
+ * every path.
+ */
+static int kvm_vm_ioctl_create_icp(struct kvm *kvm,
+				   struct kvm_irqchip_args *args)
+{
+	struct kvmppc_xics *xics;
+	int rc = 0;
+
+	mutex_lock(&kvm->lock);
+
+	/*
+	 * Already there ?  Leave through "out": returning directly here
+	 * would exit with kvm->lock still held.
+	 */
+	if (kvm->arch.xics) {
+		rc = -EEXIST;
+		goto out;
+	}
+
+	xics = kzalloc(sizeof(*xics), GFP_KERNEL);
+	if (!xics) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	xics->kvm = kvm;
+	kvm->arch.xics = xics;
+	xics_debugfs_init(xics);
+
+out:
+	mutex_unlock(&kvm->lock);
+	return rc;
+}
+
+/*
+ * Handle KVM_CREATE_IRQCHIP_ARGS for the ICS type: create the ICS for
+ * args->ics.buid with args->ics.nr_irqs interrupt sources.
+ *
+ * Returns 0 on success, -ENODEV if the VM has no XICS yet, -EINVAL for
+ * an out-of-range BUID (including 0) or source count, -EEXIST if the
+ * BUID already has an ICS, or -ENOMEM.
+ */
+static int kvm_vm_ioctl_create_ics(struct kvm *kvm,
+				   struct kvm_irqchip_args *args)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	u16 nr_irqs, buid;
+	int rc;
+
+	if (!xics)
+		return -ENODEV;
+
+	nr_irqs = args->ics.nr_irqs;
+	buid = args->ics.buid;
+
+	/*
+	 * Sanity checks.  BUID 0 is bogus and is refused like any other
+	 * invalid parameter.  All of this runs before kvm->lock is taken,
+	 * so these paths must return directly rather than go through the
+	 * "out" label, which unlocks.
+	 */
+	if (buid == 0 || buid > KVMPPC_XICS_MAX_BUID ||
+	    nr_irqs == 0 || nr_irqs > KVMPPC_XICS_IRQ_COUNT)
+		return -EINVAL;
+
+	mutex_lock(&kvm->lock);
+
+	/* BUID already exists */
+	if (xics->ics[buid - 1]) {
+		rc = -EEXIST;
+		goto out;
+	}
+
+	/* Create the ICS */
+	rc = kvmppc_xics_create_ics(xics, buid, nr_irqs);
+out:
+	mutex_unlock(&kvm->lock);
+	return rc;
+}
+
+/*
+ * Handle KVM_IRQ_LINE: pass args->irq and args->level on to
+ * ics_deliver_irq().
+ *
+ * Returns 0 on success, -ENODEV if the VM has no XICS, or -EINVAL if
+ * the level is not one of KVM_INTERRUPT_SET, KVM_INTERRUPT_SET_LEVEL
+ * or KVM_INTERRUPT_UNSET.
+ */
+static int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+
+	/* Open question kept from the original: locking against multiple callers? */
+
+	if (!xics)
+		return -ENODEV;
+
+	if (args->level != KVM_INTERRUPT_SET &&
+	    args->level != KVM_INTERRUPT_SET_LEVEL &&
+	    args->level != KVM_INTERRUPT_UNSET)
+		return -EINVAL;
+
+	ics_deliver_irq(xics, args->irq, args->level);
+
+	return 0;
+}
+
+/*
+ * VM ioctl entry point for the XICS emulation.
+ *
+ * KVM_CREATE_IRQCHIP_ARGS creates either the ICP side or one ICS,
+ * depending on the "type" field of the user-supplied arguments;
+ * KVM_IRQ_LINE changes the state of an interrupt source.
+ *
+ * Returns the handler's result, -EFAULT if the arguments cannot be
+ * copied from user space, -EINVAL for an unknown irqchip type, or
+ * -ENOTTY for any other ioctl.
+ */
+int kvmppc_xics_ioctl(struct kvm *kvm, unsigned ioctl, unsigned long arg)
+{
+	void __user *uptr = (void __user *)arg;
+	int rc;
+
+	/* The ICP state must have exactly the size of an unsigned long */
+	BUILD_BUG_ON(sizeof(union kvmppc_icp_state) != sizeof(unsigned long));
+
+	switch (ioctl) {
+	case KVM_CREATE_IRQCHIP_ARGS: {
+		struct kvm_irqchip_args chip;
+
+		if (copy_from_user(&chip, uptr, sizeof(chip)))
+			rc = -EFAULT;
+		else if (chip.type == KVM_IRQCHIP_TYPE_ICP)
+			rc = kvm_vm_ioctl_create_icp(kvm, &chip);
+		else if (chip.type == KVM_IRQCHIP_TYPE_ICS)
+			rc = kvm_vm_ioctl_create_ics(kvm, &chip);
+		else
+			rc = -EINVAL;
+		break;
+	}
+
+	case KVM_IRQ_LINE: {
+		struct kvm_irq_level line;
+
+		if (copy_from_user(&line, uptr, sizeof(line)))
+			rc = -EFAULT;
+		else
+			rc = kvm_vm_ioctl_xics_irq(kvm, &line);
+		break;
+	}
+
+	default:
+		rc = -ENOTTY;
+		break;
+	}
+
+	return rc;
+}
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index ba552f8..90b5b5c 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -374,6 +374,7 @@  int kvm_dev_ioctl_check_extension(long ext)
 		break;
 #ifdef CONFIG_PPC_BOOK3S_64
 	case KVM_CAP_PPC_GET_SMMU_INFO:
+	case KVM_CAP_SPAPR_XICS:
 		r = 1;
 		break;
 #endif
@@ -954,6 +955,26 @@  long kvm_arch_vm_ioctl(struct file *filp,
 		r = kvm_vm_ioctl_rtas_define_token(kvm, argp);
 		break;
 	}
+	case KVM_IRQ_LINE: {
+		struct kvm *kvm = filp->private_data;
+
+		r = -ENOTTY;
+		if (kvmppc_xics_enabled(kvm))
+			r = kvmppc_xics_ioctl(kvm, ioctl, arg);
+		break;
+	}
+	case KVM_CREATE_IRQCHIP_ARGS: {
+		struct kvm *kvm = filp->private_data;
+		u32 type;
+
+		r = -EFAULT;
+		if (get_user(type, (u32 __user *)argp))
+			break;
+		r = -EINVAL;
+		if (type == KVM_IRQCHIP_TYPE_ICP || type == KVM_IRQCHIP_TYPE_ICS)
+			r = kvmppc_xics_ioctl(kvm, ioctl, arg);
+		break;
+	}
 #endif /* CONFIG_PPC_BOOK3S_64 */
 	default:
 		r = -ENOTTY;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 3a92001..8674d32 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -115,6 +115,7 @@  struct kvm_irq_level {
 	 * ACPI gsi notion of irq.
 	 * For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47..
 	 * For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23..
+	 * On powerpc SPAPR, the ICS source number, level is ignored.
 	 */
 	union {
 		__u32 irq;
@@ -635,6 +636,7 @@  struct kvm_ppc_smmu_info {
 #define KVM_CAP_IRQFD_RESAMPLE 82
 #define KVM_CAP_PPC_BOOKE_WATCHDOG 83
 #define KVM_CAP_PPC_RTAS 84
+#define KVM_CAP_SPAPR_XICS 85
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -862,7 +864,9 @@  struct kvm_s390_ucas_mapping {
 #define KVM_ALLOCATE_RMA	  _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
 /* Available with KVM_CAP_PPC_RTAS */
 #define KVM_PPC_RTAS_DEFINE_TOKEN _IOW(KVMIO,  0xaa, struct kvm_rtas_token_args)
-
+#ifdef __KVM_HAVE_IRQCHIP_ARGS
+#define KVM_CREATE_IRQCHIP_ARGS   _IOW(KVMIO,  0xab, struct kvm_irqchip_args)
+#endif
 
 /*
  * ioctls for vcpu fds