diff mbox

[3/9] KVM: PPC: Book3S: Add kernel emulation for the XICS interrupt controller

Message ID 20130215000108.GD17099@iris.ozlabs.ibm.com
State New, archived
Headers show

Commit Message

Paul Mackerras Feb. 15, 2013, 12:01 a.m. UTC
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>

This adds in-kernel emulation of the XICS (eXternal Interrupt
Controller Specification) interrupt controller specified by PAPR, for
both HV and PR KVM guests.

This adds a new KVM_CREATE_IRQCHIP_ARGS ioctl, which is like
KVM_CREATE_IRQCHIP in that it indicates that the virtual machine
should use in-kernel interrupt controller emulation, but also takes an
argument struct that contains the type of interrupt controller
architecture and an optional parameter.  Currently only one type value
is defined, that which indicates the XICS architecture.

The XICS emulation supports up to 1048560 interrupt sources.
Interrupt source numbers below 16 are reserved; 0 is used to mean no
interrupt and 2 is used for IPIs.  Internally these are represented in
blocks of 1024, called ICS (interrupt controller source) entities, but
that is not visible to userspace.

Two other new ioctls allow userspace to control the interrupt
sources.  The KVM_IRQCHIP_SET_SOURCES ioctl sets the priority,
destination cpu, level/edge sensitivity and pending state of a range
of interrupt sources, creating them if they don't already exist.  The
KVM_IRQCHIP_GET_SOURCES ioctl returns that information for a range of
interrupt sources (they are required to already exist).

Each vcpu gets one ICP (interrupt controller presentation) entity.
They are created automatically when the vcpu is created provided the
KVM_CREATE_IRQCHIP_ARGS ioctl has been performed.

This is based on an initial implementation by Michael Ellerman
<michael@ellerman.id.au> reworked by Benjamin Herrenschmidt and
Paul Mackerras.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 Documentation/virtual/kvm/api.txt     |   51 ++
 arch/powerpc/include/asm/kvm_book3s.h |    1 +
 arch/powerpc/include/asm/kvm_host.h   |    8 +
 arch/powerpc/include/asm/kvm_ppc.h    |   19 +
 arch/powerpc/kvm/Makefile             |    1 +
 arch/powerpc/kvm/book3s.c             |    2 +-
 arch/powerpc/kvm/book3s_hv.c          |   20 +
 arch/powerpc/kvm/book3s_pr.c          |   13 +
 arch/powerpc/kvm/book3s_pr_papr.c     |   16 +
 arch/powerpc/kvm/book3s_rtas.c        |   51 +-
 arch/powerpc/kvm/book3s_xics.c        | 1101 +++++++++++++++++++++++++++++++++
 arch/powerpc/kvm/book3s_xics.h        |  111 ++++
 arch/powerpc/kvm/powerpc.c            |   23 +
 include/uapi/linux/kvm.h              |   29 +
 14 files changed, 1444 insertions(+), 2 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_xics.c
 create mode 100644 arch/powerpc/kvm/book3s_xics.h

Comments

Paul Mackerras Feb. 15, 2013, 11:18 p.m. UTC | #1
On Fri, Feb 15, 2013 at 02:05:41PM -0600, Scott Wood wrote:
> On 02/14/2013 06:01:08 PM, Paul Mackerras wrote:
> >From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> >
> >This adds in-kernel emulation of the XICS (eXternal Interrupt
> >Controller Specification) interrupt controller specified by PAPR, for
> >both HV and PR KVM guests.
> >
> >This adds a new KVM_CREATE_IRQCHIP_ARGS ioctl, which is like
> >KVM_CREATE_IRQCHIP in that it indicates that the virtual machine
> >should use in-kernel interrupt controller emulation, but also takes an
> >argument struct that contains the type of interrupt controller
> >architecture and an optional parameter.  Currently only one type value
> >is defined, that which indicates the XICS architecture.
> 
> Would the device config API I posted a couple days ago work for you?

I suppose it could be made to work.  It doesn't feel like a natural
fit though, because your API seems to assume (AFAICT) that a device is
manipulated via registers at specific physical addresses, so I would
have to invent an artificial set of registers with addresses and bit
layouts, that aren't otherwise required.  The XICS is operated from
the guest side via hcalls, not via emulated MMIO.

Part of the reason I went with the API that I did is that that was
what was agreed on at KVM Forum (as far as I can tell, not having been
at the meeting).  Your device API seems to be quite different to that.

> >The XICS emulation supports up to 1048560 interrupt sources.
> >Interrupt source numbers below 16 are reserved; 0 is used to mean no
> >interrupt and 2 is used for IPIs.  Internally these are represented in
> >blocks of 1024, called ICS (interrupt controller source) entities, but
> >that is not visible to userspace.
> >
> >Two other new ioctls allow userspace to control the interrupt
> >sources.  The KVM_IRQCHIP_SET_SOURCES ioctl sets the priority,
> >destination cpu, level/edge sensitivity and pending state of a range
> >of interrupt sources, creating them if they don't already exist.  The
> >KVM_IRQCHIP_GET_SOURCES ioctl returns that information for a range of
> >interrupt sources (they are required to already exist).
> 
> Why is it userspace's job to control these?  If you use KVM_IRQ_PENDING

These are primarily there to support live migration.  For live
migration, userspace needs to be able to extract the entire state of
the virtual machine from the old guest, and then set the new guest to
that exact same state.  We have live migration working in qemu for
pSeries guests with in-kernel XICS emulation using this interface.
If you're not doing live migration, the only time userspace needs to
use these is at initialization, when it does a SET_SOURCES to create
the interrupt sources it wants the VM to have.

> for interrupt injection, what if there's a race with the user changing
> other flags via MMIO?  Maybe this isn't an issue with XICS, but this is
> being presented as a generic API.

They're not used while the guest is running, as I said, but even if
they were, there is appropriate locking in there to handle any races.

> >+4.80 KVM_CREATE_IRQCHIP_ARGS
> >+
> >+Capability: KVM_CAP_IRQCHIP_ARGS
> >+Architectures: ppc
> 
> Why just ppc?

Just because only PPC has code to handle it at this point.  I would
hope that ARM and others could pick it up.  Maybe I should make it:

+Architectures: all (but so far only implemented on ppc)

or something.

> >+struct kvm_irq_sources {
> >+	__u32 irq;
> >+	__u32 nr_irqs;
> >+	__u64 __user *irqbuf;
> >+};
> 
> Please no pointers in UAPI -- this would require a compat wrapper with
> 32-bit user and 64-bit kernel.

Hmmm, you're right.  I suppose it will have to be a fixed-size buffer,
which is generally less efficient, but since it's mainly for migration
that probably doesn't matter.  In fact, I could probably use the
existing KVM_GET_IRQCHIP and KVM_SET_IRQCHIP ioctls, if I use the
`chip_id' field for `irq' and the `pad' field for `nr_irqs'.

> >+/* irqbuf entries are laid out like this: */
> >+#define KVM_IRQ_SERVER_SHIFT	0
> >+#define KVM_IRQ_SERVER_MASK	0xffffffffULL
> >+#define KVM_IRQ_PRIORITY_SHIFT	32
> >+#define KVM_IRQ_PRIORITY_MASK	0xff
> >+#define KVM_IRQ_LEVEL_SENSITIVE	(1ULL << 40)
> >+#define KVM_IRQ_MASKED		(1ULL << 41)
> >+#define KVM_IRQ_PENDING		(1ULL << 42)
> 
> What does "server" mean?  Do you mean "laid out like this for XICS"?

Sorry, I should have made that "destination" rather than "server".
You're right, "server" is confusing, but it just means "where the
interrupt is sent to be handled".  It has nothing particularly to do
with "server" computers.

> Let's please have a clean separation between what is generic and what is
> implementation-specific.

I believe that the interface is pretty cleanly generic - the model is
a set of interrupt sources and some per-vcpu state, with priorities to
decide which interrupts get delivered when.  That describes the basics
of just about any SMP-capable interrupt controller, including MPIC.

MPIC would still need an extra interface for userspace to save and
restore things like the timer registers at live migration time, and
for userspace to configure the base address, MPIC version, etc., of
course.

Paul.
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paul Mackerras Feb. 16, 2013, 2:56 a.m. UTC | #2
On Fri, Feb 15, 2013 at 05:59:11PM -0600, Scott Wood wrote:
> On 02/15/2013 05:18:31 PM, Paul Mackerras wrote:
> >On Fri, Feb 15, 2013 at 02:05:41PM -0600, Scott Wood wrote:
> >> On 02/14/2013 06:01:08 PM, Paul Mackerras wrote:
> >> >From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> >> >
> >> >This adds in-kernel emulation of the XICS (eXternal Interrupt
> >> >Controller Specification) interrupt controller specified by
> >PAPR, for
> >> >both HV and PR KVM guests.
> >> >
> >> >This adds a new KVM_CREATE_IRQCHIP_ARGS ioctl, which is like
> >> >KVM_CREATE_IRQCHIP in that it indicates that the virtual machine
> >> >should use in-kernel interrupt controller emulation, but also
> >takes an
> >> >argument struct that contains the type of interrupt controller
> >> >architecture and an optional parameter.  Currently only one
> >type value
> >> >is defined, that which indicates the XICS architecture.
> >>
> >> Would the device config API I posted a couple days ago work for you?
> >
> >I suppose it could be made to work.  It doesn't feel like a natural
> >fit though, because your API seems to assume (AFAICT) that a device is
> >manipulated via registers at specific physical addresses, so I would
> >have to invent an artificial set of registers with addresses and bit
> >layouts, that aren't otherwise required.  The XICS is operated from
> >the guest side via hcalls, not via emulated MMIO.
> 
> I don't think it makes such an assumption.  The MPIC device has
> physical registers, so it exposes them, but it also exposes things
> that are not physical registers (e.g. the per-IRQ input state).  The
> generic device control layer leaves interpretation of attributes up
> to the device.
> 
> I think it would be easier to fit XICS into the device control api
> model than to fit MPIC into this model, not to mention what would
> happen if we later want to emulate some other type of device -- x86
> already has at least one non-irqchip emulated device (i8254).

I have no particular objection to the device control API per se, but
I have two objections to using it as the primary interface to the XICS
emulation.

First, I dislike the magical side-effect where creating a device of a
particular type (e.g. MPIC or XICS) automatically attaches it to the
interrupt lines of the vcpus.  I prefer an explicit request to do
in-kernel interrupt control.  Further, the magic means that you can
only have one instance of the device, whereas you might want to model
the interrupt controller architecture as several devices.  You could
do that using several device types, but then the interconnections
between them would also be magic.

Secondly, it means that we are completely abandoning any attempt to
define an abstract or generic interface to in-kernel interrupt
controller emulations.  Each device will have its own unique set of
attribute groups and its own unique userspace code to drive it, with
no commonality between them.

> >Part of the reason I went with the API that I did is that that was
> >what was agreed on at KVM Forum (as far as I can tell, not having been
> >at the meeting).  Your device API seems to be quite different to that.
> 
> I wasn't there either.  It's fine to have discussions at such
> events, but it should not preclude others from having input, or
> better ideas from being considered afterward.

Sure - I was just trying to fit in with the expressed wish of the
maintainer.

> >> >The XICS emulation supports up to 1048560 interrupt sources.
> >> >Interrupt source numbers below 16 are reserved; 0 is used to
> >mean no
> >> >interrupt and 2 is used for IPIs.  Internally these are
> >represented in
> >> >blocks of 1024, called ICS (interrupt controller source)
> >entities, but
> >> >that is not visible to userspace.
> >> >
> >> >Two other new ioctls allow userspace to control the interrupt
> >> >sources.  The KVM_IRQCHIP_SET_SOURCES ioctl sets the priority,
> >> >destination cpu, level/edge sensitivity and pending state of a
> >range
> >> >of interrupt sources, creating them if they don't already
> >exist.  The
> >> >KVM_IRQCHIP_GET_SOURCES ioctl returns that information for a
> >range of
> >> >interrupt sources (they are required to already exist).
> >>
> >> Why is it userspace's job to control these?  If you use
> >KVM_IRQ_PENDING
> >
> >These are primarily there to support live migration.  For live
> >migration, userspace needs to be able to extract the entire state of
> >the virtual machine from the old guest, and then set the new guest to
> >that exact same state.
> 
> In that case, the state it describes is insufficient for MPIC.

Yes, MPIC has other random stuff in it besides interrupt control, so
that's not surprising.

> >We have live migration working in qemu for
> >pSeries guests with in-kernel XICS emulation using this interface.
> >If you're not doing live migration,
> 
> We don't yet, but would prefer not to assume that it'll never happen.
> 
> >> for interrupt injection, what if there's a race with the user
> >changing
> >> other flags via MMIO?  Maybe this isn't an issue with XICS, but
> >this is
> >> being presented as a generic API.
> >
> >They're not used while the guest is running, as I said, but even if
> >they were, there is appropriate locking in there to handle any races.
> 
> OK, KVM_IRQ_LINE is still used for interrupt injection.  I was
> hoping to avoid going through a standardized interface that forces a
> global interrupt numberspace.

Why?

> How do MSIs get injected?

Just like other interrupts - from the point of view of the interrupt
controller they're edge-triggered interrupt sources.

> BTW, do you have any plans regarding irqfd?

I'm going to look at that next.

> 
> >> >+struct kvm_irq_sources {
> >> >+	__u32 irq;
> >> >+	__u32 nr_irqs;
> >> >+	__u64 __user *irqbuf;
> >> >+};
> >>
> >> Please no pointers in UAPI -- this would require a compat
> >wrapper with
> >> 32-bit user and 64-bit kernel.
> >
> >Hmmm, you're right.  I suppose it will have to be a fixed-size buffer,
> 
> It doesn't need to be a fixed size buffer.  You can still have
> pointers, but they need to be represented as a plain "__u64" with
> users casting the pointer.
> 
> >> >+/* irqbuf entries are laid out like this: */
> >> >+#define KVM_IRQ_SERVER_SHIFT	0
> >> >+#define KVM_IRQ_SERVER_MASK	0xffffffffULL
> >> >+#define KVM_IRQ_PRIORITY_SHIFT	32
> >> >+#define KVM_IRQ_PRIORITY_MASK	0xff
> >> >+#define KVM_IRQ_LEVEL_SENSITIVE	(1ULL << 40)
> >> >+#define KVM_IRQ_MASKED		(1ULL << 41)
> >> >+#define KVM_IRQ_PENDING		(1ULL << 42)
> >>
> >> What does "server" mean?  Do you mean "laid out like this for XICS"?
> >
> >Sorry, I should have made that "destination" rather than "server".
> >You're right, "server" is confusing, but it just means "where the
> >interrupt is sent to be handled".  It has nothing particularly to do
> >with "server" computers.
> 
> Right, I was aware of the IBM terminology here -- just thought it
> wasn't appropriate in a generic interface.

Sure.

> What about interrupt controllers that allow multiple destinations?

The destination can be an identifier for a group of vcpus, or even a
bitmap -- that's why I made it 32 bits.

> More than 256 priorities?  Different "levels" of output (normal,
> critical, machine check)?  Programmable vector numbers?  Active
> high/low control?

There are plenty of bits free in the 64 bits per source that I have
allowed.  We can accommodate those things.  (BTW, I think having more
than 256 priorities would be insane - do you know of any actual
example that does?)

> I just don't think irqchip state can be sanely made generic.
> 
> >> Let's please have a clean separation between what is generic and
> >what is
> >> implementation-specific.
> >
> >I believe that the interface is pretty cleanly generic - the model is
> >a set of interrupt sources and some per-vcpu state, with priorities to
> >decide which interrupts get delivered when.  That describes the basics
> >of just about any SMP-capable interrupt controller, including MPIC.
> 
> The per-vcpu state isn't even part of this AFAICT.  It's an
> XICS-specific ONE_REG -- which is fine, but all that's left of the
> "generic" API is the get/set sources which is an imperfect match to
> our per-IRQ state and it's not clear how an implementation should
> extend it.

Yes, the names of the bitfields in the ICP state word are
XICS-specific, but the concepts are pretty generic - current processor
priority, identifier for interrupt awaiting service, pending IPI
request priority, pending interrupt request priority.

> It would be straightforward to map it to the device control api by
> having XICS declare an attribute group that corresponds to IRQ
> sources (like KVM_DEV_MPIC_GRP_IRQ_ACTIVE) and the attribute data be
> what you've defined for kvm_irq_sources.  Or if you want to preserve
> the "set multiple IRQs at once" approach, you could have an
> attribute that takes exactly a kvm_irq_sources struct, though it
> might be better to have a generic batched attribute set/get ioctl
> (as has been proposed for ONE_REG if the need arises).

Sure, the device control API can probably do just about anything.
We could even use it to create and control vcpus and memory slots, but
that doesn't mean we should.

> >MPIC would still need an extra interface for userspace to save and
> >restore things like the timer registers at live migration time, and
> >for userspace to configure the base address, MPIC version, etc., of
> >course.
> 
> Yes, which pretty much means we need the device control api anyway
> -- or MPIC-specific ioctls, which I wanted to avoid.

I agree that MPIC has enough extra stuff in it, besides interrupt
control, that it probably needs your device control API, at least for
that extra stuff.

Paul.
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paul Mackerras Feb. 16, 2013, 4:51 a.m. UTC | #3
On Fri, Feb 15, 2013 at 09:57:06PM -0600, Scott Wood wrote:
> On 02/15/2013 08:56:14 PM, Paul Mackerras wrote:
> >I have no particular objection to the device control API per se, but
> >I have two objections to using it as the primary interface to the XICS
> >emulation.
> >
> >First, I dislike the magical side-effect where creating a device of a
> >particular type (e.g. MPIC or XICS) automatically attaches it to the
> >interrupt lines of the vcpus.  I prefer an explicit request to do
> >in-kernel interrupt control.
> 
> OK.  This is device-specific behavior, so you could define it
> differently for XICS than MPIC.  I suppose we could change it for
> MPIC as well, to leave an opening for the unlikely case where we'd
> want to model an MPIC that isn't directly connected to the CPUs.

You can have cascaded MPICs; I once had a powerbook that had one MPIC
cascaded into another.

> How is the explicit request made in this patchset?

The KVM_CREATE_IRQCHIP_ARGS ioctl says that you want emulation of a
specific interrupt controller architecture connected to the vcpus'
external interrupt inputs.  In that sense it's explicit, compared to a
generic "create device" ioctl that could be for any device.

> >Secondly, it means that we are completely abandoning any attempt to
> >define an abstract or generic interface to in-kernel interrupt
> >controller emulations.  Each device will have its own unique set of
> >attribute groups and its own unique userspace code to drive it, with
> >no commonality between them.
> 
> Yes.  I am unconvinced that such an abstraction is well-advised
> (especially after seeing existing "generic" interfaces that are
> clearly APIC-oriented).  This isn't like normal driver interfaces
> where we're abstracting away hardware differences to let generic
> code use a device.  Userspace knows what kind of device it wants,
> and how it wants it to integrate with the rest of the emulated
> system.  We'd have to go out of our way to apply the abstraction on
> *both* ends.  What do we get from that other than a chance that the
> abstraction leaks?  What significant code actually becomes common?
> kvm_set_irq() is just a thin wrapper around the ioctl.

I'd think there could be some code reduction in the live migration
code, but I'd need a qemu hacker to chime in here.

> >> OK, KVM_IRQ_LINE is still used for interrupt injection.  I was
> >> hoping to avoid going through a standardized interface that forces a
> >> global interrupt numberspace.
> >
> >Why?
> 
> The standardized interface doesn't make things any easier (as noted
> above, the caller is already mpic-specific code), and we'd have to
> come up with a scheme for flattening our interrupt numberspace
> (rather than introduce new attribute groups for things like IPI and
> timer interrupts).  It may still be necessary when it comes to
> irqfd, though...

With 24 bits of interrupt source number, you don't have to flatten it
very far.  IPIs are handled separately and don't appear in the
interrupt source space.

> >> How do MSIs get injected?
> >
> >Just like other interrupts - from the point of view of the interrupt
> >controller they're edge-triggered interrupt sources.
> 
> Ah right, I guess this is all set up via hcalls for XICS.
> 
> With MPIC exposing its registers via the device control api,
> everything just works -- the PCI device generates a write to the
> MPIC's memory region, the QEMU MPIC stub sends the write to the
> kernel as for any other MMIO access (this passthrough is also useful
> for debugging), the in-kernel MPIC sees the write to the "generate
> an MSI" register and does its thing.  Compare that to all the
> special MSI code for APIC... :-)

You're doing a round trip to userspace for every MPIC register access
by the guest?  Seriously?  If that's so, then why bother with
in-kernel emulation at all?  Once you're back in userspace, it's just
as fast to do the emulation there as in the kernel.

> >There are plenty of bits free in the 64 bits per source that I have
> >allowed.  We can accommodate those things.
> 
> MPIC vector numbers take up 16 of the bits.  The architected
> interrupt level field is 8 bits, though only a handful of values are
> actually needed.  Add a couple binary flags, and it gets pretty
> tight if a third type of interrupt controller starts wanting
> something new.

There's enough space for MPIC to have 16 bits of vector and some
flags.  We don't need to overdesign this.

> >Yes, the names of the bitfields in the ICP state word are
> >XICS-specific, but the concepts are pretty generic - current processor
> >priority, identifier for interrupt awaiting service, pending IPI
> >request priority, pending interrupt request priority.
> 
> We don't have separate concepts of "pending IPI request priority"
> and "pending interrupt request priority".  There can be multiple

Sorry, I meant "pending interrupt request".  You do have that, it's
what you read from the interrupt acknowledge register.

> interrupts awaiting service (or even in service, if different
> priorities).  We have both "current task priority" (which is a
> user-set mask-by-priority register) and the priority of the
> highest-prio in-service interrupt -- which would "current processor
> priority" be?  Etc.

It would be the current task priority.  I assume MPIC maintains a
16-bit map of the interrupt priorities in service, so that would need
to be added.  (XICS doesn't maintain the stack in hardware, which
would require 256 bits of state, but instead gets software to do
that.)

Paul.
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Wood Feb. 18, 2013, 10:43 p.m. UTC | #4
On 02/15/2013 10:51:16 PM, Paul Mackerras wrote:
> On Fri, Feb 15, 2013 at 09:57:06PM -0600, Scott Wood wrote:
> > On 02/15/2013 08:56:14 PM, Paul Mackerras wrote:
> > >I have no particular objection to the device control API per se,  
> but
> > >I have two objections to using it as the primary interface to the  
> XICS
> > >emulation.
> > >
> > >First, I dislike the magical side-effect where creating a device  
> of a
> > >particular type (e.g. MPIC or XICS) automatically attaches it to  
> the
> > >interrupt lines of the vcpus.  I prefer an explicit request to do
> > >in-kernel interrupt control.
> >
> > OK.  This is device-specific behavior, so you could define it
> > differently for XICS than MPIC.  I suppose we could change it for
> > MPIC as well, to leave an opening for the unlikely case where we'd
> > want to model an MPIC that isn't directly connected to the CPUs.
> 
> You can have cascaded MPICs; I once had a powerbook that had one MPIC
> cascaded into another.

OK.

> > How is the explicit request made in this patchset?
> 
> The KVM_CREATE_IRQCHIP_ARGS ioctl says that you want emulation of a
> specific interrupt controller architecture connected to the vcpus'
> external interrupt inputs.  In that sense it's explicit, compared to a
> generic "create device" ioctl that could be for any device.

Hooking up to the CPU's interrupt lines is implicit in creating an MPIC  
(and I'm fine with changing that), not in creating any device.  I don't  
see how it's worse than being implicit in calling  
KVM_CREATE_IRQCHIP_ARGS (which doesn't allow for cascaded irqchips).

> > The standardized interface doesn't make things any easier (as noted
> > above, the caller is already mpic-specific code), and we'd have to
> > come up with a scheme for flattening our interrupt numberspace
> > (rather than introduce new attribute groups for things like IPI and
> > timer interrupts).  It may still be necessary when it comes to
> > irqfd, though...
> 
> With 24 bits of interrupt source number, you don't have to flatten it
> very far.  IPIs are handled separately and don't appear in the
> interrupt source space.

They do need to appear in the interrupt source space if we want to  
inject or irqfd them.  Most users won't want to do that, but we have  
had a customer directly assign IPIs (to communicate with an OS running  
on the other CPU in an AMP setup -- host Linux was non-SMP so wasn't  
using them) and MPIC timers to a guest.

> > >> How do MSIs get injected?
> > >
> > >Just like other interrupts - from the point of view of the  
> interrupt
> > >controller they're edge-triggered interrupt sources.
> >
> > Ah right, I guess this is all set up via hcalls for XICS.
> >
> > With MPIC exposing its registers via the device control api,
> > everything just works -- the PCI device generates a write to the
> > MPIC's memory region, the QEMU MPIC stub sends the write to the
> > kernel as for any other MMIO access (this passthrough is also useful
> > for debugging), the in-kernel MPIC sees the write to the "generate
> > an MSI" register and does its thing.  Compare that to all the
> > special MSI code for APIC... :-)
> 
> You're doing a round trip to userspace for every MPIC register access
> by the guest?  Seriously?

No.  Accesses by the guest get handled in the kernel.  Accesses in  
QEMU, including MSIs generated by virtio, get forwarded to the kernel.

> > >There are plenty of bits free in the 64 bits per source that I have
> > >allowed.  We can accommodate those things.
> >
> > MPIC vector numbers take up 16 of the bits.  The architected
> > interrupt level field is 8 bits, though only a handful of values are
> > actually needed.  Add a couple binary flags, and it gets pretty
> > tight if a third type of interrupt controller starts wanting
> > something new.
> 
> There's enough space for MPIC to have 16 bits of vector and some
> flags.  We don't need to overdesign this.

I view anything other than passing the actual MPIC register values  
around as overdesign here, given that it is communication between  
hw/kvm/mpic.c on the QEMU side and arch/powerpc/kvm/mpic.c on the  
kernel side.

> > interrupts awaiting service (or even in service, if different
> > priorities).  We have both "current task priority" (which is a
> > user-set mask-by-priority register) and the priority of the
> > highest-prio in-service interrupt -- which would "current processor
> > priority" be?  Etc.
> 
> It would be the current task priority.  I assume MPIC maintains a
> 16-bit map of the interrupt priorities in service, so that would need
> to be added.

We don't maintain such a map in the emulation code.  We have a per-CPU  
bitmap of the actual interrupt sources pending/active, which is another  
attribute that would need to be added in order to support migration on  
MPIC.

-Scott
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paul Mackerras Feb. 20, 2013, 12:41 a.m. UTC | #5
On Mon, Feb 18, 2013 at 04:43:27PM -0600, Scott Wood wrote:
> On 02/15/2013 10:51:16 PM, Paul Mackerras wrote:
> >The KVM_CREATE_IRQCHIP_ARGS ioctl says that you want emulation of a
> >specific interrupt controller architecture connected to the vcpus'
> >external interrupt inputs.  In that sense it's explicit, compared to a
> >generic "create device" ioctl that could be for any device.
> 
> Hooking up to the CPU's interrupt lines is implicit in creating an
> MPIC (and I'm fine with changing that), not in creating any device.
> I don't see how it's worse than being implicit in calling
> KVM_CREATE_IRQCHIP_ARGS (which doesn't allow for cascaded irqchips).

First, KVM_CREATE_IRQCHIP_ARGS specifies the overall architecture of
the interrupt control subsystem, so yes it does allow for cascaded
controllers.

Secondly, the difference is that if you see a KVM_CREATE_IRQCHIP_ARGS
call, you know that the vcpus' interrupt inputs will be driven by
kernel code.  If you see a KVM_CREATE_DEVICE call, you don't know
that; they might be, or they might not be.

> >You're doing a round trip to userspace for every MPIC register access
> >by the guest?  Seriously?
> 
> No.  Accesses by the guest get handled in the kernel.  Accesses in
> QEMU, including MSIs generated by virtio, get forwarded to the
> kernel.

OK, I missed the path where that gets done, then.

> >It would be the current task priority.  I assume MPIC maintains a
> >16-bit map of the interrupt priorities in service, so that would need
> >to be added.
> 
> We don't maintain such a map in the emulation code.  We have a

Oh, so how do you handle EOI of nested interrupts?  How do you know
what to reset the CPU priority to in that case?

> per-CPU bitmap of the actual interrupt sources pending/active, which
> is another attribute that would need to be added in order to support
> migration on MPIC.

Not really, that can be recomputed from the sources easily enough.

Paul.
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti Feb. 20, 2013, 7:58 p.m. UTC | #6
On Sat, Feb 16, 2013 at 01:56:14PM +1100, Paul Mackerras wrote:
> On Fri, Feb 15, 2013 at 05:59:11PM -0600, Scott Wood wrote:
> > On 02/15/2013 05:18:31 PM, Paul Mackerras wrote:
> > >On Fri, Feb 15, 2013 at 02:05:41PM -0600, Scott Wood wrote:
> > >> On 02/14/2013 06:01:08 PM, Paul Mackerras wrote:
> > >> >From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> > >> >
> > >> >This adds in-kernel emulation of the XICS (eXternal Interrupt
> > >> >Controller Specification) interrupt controller specified by
> > >PAPR, for
> > >> >both HV and PR KVM guests.
> > >> >
> > >> >This adds a new KVM_CREATE_IRQCHIP_ARGS ioctl, which is like
> > >> >KVM_CREATE_IRQCHIP in that it indicates that the virtual machine
> > >> >should use in-kernel interrupt controller emulation, but also
> > >takes an
> > >> >argument struct that contains the type of interrupt controller
> > >> >architecture and an optional parameter.  Currently only one
> > >type value
> > >> >is defined, that which indicates the XICS architecture.
> > >>
> > >> Would the device config API I posted a couple days ago work for you?
> > >
> > >I suppose it could be made to work.  It doesn't feel like a natural
> > >fit though, because your API seems to assume (AFAICT) that a device is
> > >manipulated via registers at specific physical addresses, so I would
> > >have to invent an artificial set of registers with addresses and bit
> > >layouts, that aren't otherwise required.  The XICS is operated from
> > >the guest side via hcalls, not via emulated MMIO.
> > 
> > I don't think it makes such an assumption.  The MPIC device has
> > physical registers, so it exposes them, but it also exposes things
> > that are not physical registers (e.g. the per-IRQ input state).  The
> > generic device control layer leaves interpretation of attributes up
> > to the device.
> > 
> > I think it would be easier to fit XICS into the device control api
> > model than to fit MPIC into this model, not to mention what would
> > happen if we later want to emulate some other type of device -- x86
> > already has at least one non-irqchip emulated device (i8254).
> 
> I have no particular objection to the device control API per se, but
> I have two objections to using it as the primary interface to the XICS
> emulation.
> 
> First, I dislike the magical side-effect where creating a device of a
> particular type (e.g. MPIC or XICS) automatically attaches it to the
> interrupt lines of the vcpus.  I prefer an explicit request to do
> in-kernel interrupt control. 

This is probably a stupid question, but why the
KVM_SET_IRQCHIP/KVM_SET_GSI_ROUTING interface is not appropriate for
your purposes?

x86 sets up a default GSI->IRQCHIP PIN mapping on creation (during
KVM_SET_IRQCHIP), but it can be modified with KVM_SET_GSI_ROUTING.

>  Further, the magic means that you can
> only have one instance of the device, whereas you might want to model
> the interrupt controller architecture as several devices.  You could
> do that using several device types, but then the interconnections
> between them would also be magic.
> 
> Secondly, it means that we are completely abandoning any attempt to
> define an abstract or generic interface to in-kernel interrupt
> controller emulations.  Each device will have its own unique set of
> attribute groups and its own unique userspace code to drive it, with
> no commonality between them.

<snip>

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti Feb. 21, 2013, 1:09 a.m. UTC | #7
On Wed, Feb 20, 2013 at 06:20:51PM -0600, Scott Wood wrote:
> On 02/20/2013 01:58:54 PM, Marcelo Tosatti wrote:
> >This is probably a stupid question, but why the
> >KVM_SET_IRQCHIP/KVM_SET_GSI_ROUTING interface is not appropriate for
> >your purposes?
> >
> >x86 sets up a default GSI->IRQCHIP PIN mapping on creation (during
> >KVM_SET_IRQCHIP), but it can be modified with KVM_SET_GSI_ROUTING.
> 
> To start, the whole IRQ routing stuff is poorly documented.
> 
> Am I supposed to make up GSI numbers and use the routing thing to
> associate them with real interrupts? 

I have no idea. Is mapping from one integer linear space (GSIs) 
to real interrupts suitable for you? 

> Are there constraints on what
> sort of GSI numbers I can choose (I now see in the code that
> KVM_MAX_IRQ_ROUTES is returned from the capability check, but where
> is that documented?  

Don't think it is.

> It looks like the APIC implementation has
> default routes, where is that documented?)? 

In the code.

> Where does the code live to manage this table, and how APICy is it (looks like the
> answer is "irq_comm.c, and very")? 

Thinking faster than typing? Not sure what you mean.

> I suppose I could write another
> implementation of the table management code for MPIC, though the
> placement of "irqchip" inside the route entry, rather than as an
> argument to KVM_IRQ_LINE, suggests the table is supposed to be
> global, not in the individual interrupt controller.

Yes the table is global. It maps GSI ("Global System Interrupt" IIRC)
(integer) to (irqchip,pin) pair.

> It looks like I'm going to have to do this anyway for irqfd, though
> that doesn't make the other uses of the device control api go away.
> Even KVM_DEV_MPIC_GRP_IRQ_ACTIVE would still be useful for reading
> the status for debugging (reading device registers doesn't quite do
> it, since the "active" bit won't show up if the interrupt is
> masked).  

> At that point, is it more offensive to make it read-only
> even though it would be trivial to make it read/write (which would
> allow users who don't need it to bypass the routing API), or to make
> it read/write and live with there being more than one way to do
> something?

Can't follow this sentence.

> KVM_SET_IRQCHIP is not suitable because we have more than 512 bytes
> of state, and because it doesn't allow debugging access to device
> registers (e.g. inspecting from the QEMU command line), and because
> it's hard to add new pieces of state if we realize we left something
> out.  It reminds me of GET/SET_SREGS.  With that, I did what you
> seem to want here, which is to adapt the existing interfaces, using
> feature flags to control optional state.  It ended up being a mess,
> and ONE_REG was introduced as a replacement.  The device control API
> is the equivalent of ONE_REG for things other than vcpus.
> 
> -Scott

- ACK on 512 bytes not sufficient. Add another ioctl, SET_IRQCHIP2?
- Agree on poor extensibility of interface. Adding a reserved amount
of space as padding and versioning such as has been done so far 
is not acceptable? 
- Debugging: why is reading the entire register state not acceptable? Yes,
  it's slow.

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Feb. 24, 2013, 2:08 p.m. UTC | #8
On Wed, Feb 20, 2013 at 06:20:51PM -0600, Scott Wood wrote:
> On 02/20/2013 01:58:54 PM, Marcelo Tosatti wrote:
> >This is probably a stupid question, but why the
> >KVM_SET_IRQCHIP/KVM_SET_GSI_ROUTING interface is not appropriate for
> >your purposes?
> >
> >x86 sets up a default GSI->IRQCHIP PIN mapping on creation (during
> >KVM_SET_IRQCHIP), but it can be modified with KVM_SET_GSI_ROUTING.
> 
> To start, the whole IRQ routing stuff is poorly documented.
> 
> Am I supposed to make up GSI numbers and use the routing thing to
> associate them with real interrupts?
You can consider GSI to be a cookie that you use to refer to whatever
data you've put into routing table by KVM_IRQ_LINE/irqfd interface.
Even on x86, when irq routing is used to inject MSI interrupt, this is
exactly how GSI is used. In MSI case it does not have a meaning besides
"look at that interrupt entry to see what MSI should be injected".

>                                       Are there constraints on what
> sort of GSI numbers I can choose (I now see in the code that
> KVM_MAX_IRQ_ROUTES is returned from the capability check, but where
> is that documented?
The only constraint is that the number should be smaller than
KVM_MAX_IRQ_ROUTES, but this is an implementation detail. The current
implementation uses an array to map from GSI to data; if a lot more
entries than currently allowed are needed, the implementation may be
changed to a different data structure.

>                      It looks like the APIC implementation has
> default routes, where is that documented?)?
It is very PC centric, should not be even compiled for other arches.

>                                              Where does the code
> live to manage this table, and how APICy is it (looks like the
> answer is "irq_comm.c, and very")?
It is a mistake to refer to the irq routing table as APICy :). It is
certainly PC centric currently, but there is at least one HW layer
between it and the APIC. PC has global GSI space, each GSI can be
delivered via different irq chip. Some GSIs can be delivered through
multiple irq chips. Irq routing table provides mapping between GSI and
irq chips it should be delivered through. Some irq chips deliver
interrupt via APIC some not, but this is different story.

The work is needed to make the code not PC centric, but it should not be
a lot of work.

>                                     I suppose I could write another
> implementation of the table management code for MPIC, though the
> placement of "irqchip" inside the route entry, rather than as an
> argument to KVM_IRQ_LINE, suggests the table is supposed to be
> global, not in the individual interrupt controller.
> 
Yes, it is global. It sits between emulated devices and irq chips.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paul Mackerras Feb. 25, 2013, 12:59 a.m. UTC | #9
On Wed, Feb 20, 2013 at 04:58:54PM -0300, Marcelo Tosatti wrote:

> This is probably a stupid question, but why the
> KVM_SET_IRQCHIP/KVM_SET_GSI_ROUTING interface is not appropriate for
> your purposes?
> 
> x86 sets up a default GSI->IRQCHIP PIN mapping on creation (during
> KVM_SET_IRQCHIP), but it can be modified with KVM_SET_GSI_ROUTING.

So, I see Scott already answered from the point of view of his MPIC
emulation stuff, but I'll answer too from the point of view of my XICS
emulation code.

My understanding, possibly imperfect, is that in a real system the
routing of GSIs to IOAPICs would either be hardwired or set up by the
BIOS, described in ACPI tables, and not modified by the operating
system.  Is that correct?  So my belief is that the GSI routing is
fundamentally distinct from and handled differently from the routing
of interrupts to CPUs, which is fully under the control of the OS.

In the XICS model we have a set of interrupt sources, each identified
by a 24-bit number.  Control operations on an interrupt source just
identify the source by its number.  Thus the interrupt source number
is like a GSI, but we don't need to map that to a different space
(e.g. IOAPIC identifier and input number) in order to operate on it,
we can just operate on it directly.

Paul.
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf March 21, 2013, 9:20 a.m. UTC | #10
On 25.02.2013, at 01:59, Paul Mackerras wrote:

> On Wed, Feb 20, 2013 at 04:58:54PM -0300, Marcelo Tosatti wrote:
> 
>> This is probably a stupid question, but why the
>> KVM_SET_IRQCHIP/KVM_SET_GSI_ROUTING interface is not appropriate for
>> your purposes?
>> 
>> x86 sets up a default GSI->IRQCHIP PIN mapping on creation (during
>> KVM_SET_IRQCHIP), but it can be modified with KVM_SET_GSI_ROUTING.
> 
> So, I see Scott already answered from the point of view of his MPIC
> emulation stuff, but I'll answer too from the point of view of my XICS
> emulation code.
> 
> My understanding, possibly imperfect, is that in a real system the
> routing of GSIs to IOAPICs would either be hardwired or set up by the
> BIOS, described in ACPI tables, and not modified by the operating
> system.  Is that correct?  So my belief is that the GSI routing is
> fundamentally distinct from and handled differently from the routing
> of interrupts to CPUs, which is fully under the control of the OS.

It's a different layer. I guess there's really some confusion on names here :). I'm always confused when I read "sources" and you apparently get confused when you read about GSIs.

GSIs are an ACPI concept. It's not x86 specific, it's also not APIC specific. It's just a global name space for IRQs.

Imagine you have 2 MPICs in your system. But you only want to use a single token / number to access any IRQ on any chip. That's where GSIs come into play. They map different irqchip IRQs onto a flat number space. To speak with x86 names:

Virtualization perspective:

  QEMU -> GSI -> IOAPIC -> LAPIC -> CPU

Device perspective:

  Device irq line -> IOAPIC -> LAPIC -> CPU


The "IOAPIC" is the piece of hardware that interrupt lines get attached to. You connect a pin on it to an irq pin of your device. That talks to the LAPIC to actually schedule interrupts on target CPUs. The LAPIC then fetches interrupts and pulls the CPU's interrupt line.

Of course, things are slightly more complicated in the x86 world, as everything behind the IOAPIC also carries a payload defining which pin actually got triggered, but you get the idea.

So really just consider GSIs as a "global flat number space" for irqchip pins.


Alex

> In the XICS model we have a set of interrupt sources, each identified
> by a 24-bit number.  Control operations on an interrupt source just
> identify the source by its number.  Thus the interrupt source number
> is like a GSI, but we don't need to map that to a different space
> (e.g. IOAPIC identifier and input number) in order to operate on it,
> we can just operate on it directly.
> 
> Paul.
> --
> To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index d3e2d60..0ff9dcf 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2141,6 +2141,57 @@  associated with the service will be forgotten, and subsequent RTAS
 calls by the guest for that service will be passed to userspace to be
 handled.
 
+4.80 KVM_CREATE_IRQCHIP_ARGS
+
+Capability: KVM_CAP_IRQCHIP_ARGS
+Architectures: ppc
+Type: vm ioctl
+Parameters: struct kvm_irqchip_args
+Returns: 0 on success, -1 on error
+
+Creates an interrupt controller model in the kernel.  The type field
+of the argument struct indicates the interrupt controller architecture
+of the virtual machine.  Currently the only value permitted for the
+type field is 1, indicating the XICS (eXternal Interrupt Controller
+Specification) model defined in PAPR.  For XICS, this ioctl indicates
+to the kernel that an interrupt controller presentation (ICP) entity
+should be created for every vcpu, and interrupt controller source
+(ICS) entities should be created to accommodate the sources that are
+configured with the KVM_IRQCHIP_SET_SOURCES ioctl.
+
+4.81 KVM_IRQCHIP_GET_SOURCES
+
+Capability: KVM_CAP_IRQCHIP_ARGS
+Architectures: ppc
+Type: vm ioctl
+Parameters: struct kvm_irq_sources
+Returns: 0 on success, -1 on error
+
+Copies configuration and status information about a range of interrupt
+sources into a user-supplied buffer.  The argument struct gives the
+starting interrupt source number and the number of interrupt sources.
+The user buffer is an array of 64-bit quantities, one per interrupt
+source, with (from the least-significant bit) 32 bits of interrupt
+server number, 8 bits of priority, and 1 bit each for a
+level-sensitive indicator, a masked indicator, and a pending
+indicator.  If some of the sources in the range don't exist, that is,
+have not yet been created with the KVM_IRQCHIP_SET_SOURCES ioctl,
+this returns an ENODEV error.
+
+4.82 KVM_IRQCHIP_SET_SOURCES
+
+Capability: KVM_CAP_IRQCHIP_ARGS
+Architectures: ppc
+Type: vm ioctl
+Parameters: struct kvm_irq_sources
+Returns: 0 on success, -1 on error
+
+Sets the configuration and status for a range of interrupt sources
+from information supplied in a user-supplied buffer, creating the
+sources if they don't already exist.  The argument struct gives the
+starting interrupt source number and the number of interrupt sources.
+The user buffer is formatted as for KVM_IRQCHIP_GET_SOURCES.
+
 
 5. The kvm_run structure
 ------------------------
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 5a56e1c..17c9a15 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -142,6 +142,7 @@  extern int kvmppc_mmu_hv_init(void);
 extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
+extern void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
 extern void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags);
 extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
 			   bool upper, u32 val);
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 8295dc7..b05e7cd 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -188,6 +188,10 @@  struct kvmppc_linear_info {
 	int		 type;
 };
 
+/* XICS components, defined in book3s_xics.c */
+struct kvmppc_xics;
+struct kvmppc_icp;
+
 /*
  * The reverse mapping array has one entry for each HPTE,
  * which stores the guest's view of the second word of the HPTE
@@ -256,6 +260,7 @@  struct kvm_arch {
 #ifdef CONFIG_PPC_BOOK3S_64
 	struct list_head spapr_tce_tables;
 	struct list_head rtas_tokens;
+	struct kvmppc_xics *xics;
 #endif
 };
 
@@ -572,6 +577,9 @@  struct kvm_vcpu_arch {
 	u64 busy_stolen;
 	u64 busy_preempt;
 #endif
+#ifdef CONFIG_PPC_BOOK3S_64
+	struct kvmppc_icp *icp; /* XICS presentation controller */
+#endif
 };
 
 /* Values for vcpu->arch.state */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index be611f6..f0fd22b 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -130,6 +130,13 @@  extern long kvmppc_prepare_vrma(struct kvm *kvm,
 extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
 			struct kvm_memory_slot *memslot, unsigned long porder);
 extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
+extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd);
+extern int kvmppc_xics_ioctl(struct kvm *kvm, unsigned ioctl, unsigned long arg);
+extern int kvmppc_xics_create(struct kvm *kvm, struct kvm_irqchip_args *args);
+extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
+extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu);
+extern void kvmppc_xics_free(struct kvm *kvm);
+
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 				struct kvm_create_spapr_tce *args);
 extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
@@ -167,6 +174,8 @@  extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *);
 extern int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp);
 extern int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu);
 extern void kvmppc_rtas_tokens_free(struct kvm *kvm);
+extern int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority);
+extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority);
 
 /*
  * Cuts out inst bits with ordering according to spec.
@@ -263,6 +272,16 @@  static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 
 static inline void kvm_linear_init(void)
 {}
+
+#endif
+
+#ifdef CONFIG_PPC_BOOK3S_64
+static inline int kvmppc_xics_enabled(struct kvm *kvm)
+{
+	return kvm->arch.xics != NULL;
+}
+#else
+static inline int kvmppc_xics_enabled(struct kvm *kvm) { return 0; }
 #endif
 
 static inline void kvmppc_set_epr(struct kvm_vcpu *vcpu, u32 epr)
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 432132c..e2eb04c 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -87,6 +87,7 @@  kvm-book3s_64-module-objs := \
 	book3s.o \
 	book3s_64_vio.o \
 	book3s_rtas.o \
+	book3s_xics.o \
 	$(kvm-book3s_64-objs-y)
 
 kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs)
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 6548445..c5a4478 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -104,7 +104,7 @@  static int kvmppc_book3s_vec2irqprio(unsigned int vec)
 	return prio;
 }
 
-static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
+void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
 					  unsigned int vec)
 {
 	unsigned long old_pending = vcpu->arch.pending_exceptions;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 567c264..aa3a0db 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -528,6 +528,14 @@  int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 
 		/* Send the error out to userspace via KVM_RUN */
 		return rc;
+	case H_XIRR:
+	case H_CPPR:
+	case H_EOI:
+	case H_IPI:
+		if (kvmppc_xics_enabled(vcpu->kvm)) {
+			ret = kvmppc_xics_hcall(vcpu, req);
+			break;
+		} /* fallthrough */
 	default:
 		return RESUME_HOST;
 	}
@@ -886,6 +894,13 @@  struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	spin_lock_init(&vcpu->arch.tbacct_lock);
 	vcpu->arch.busy_preempt = TB_NIL;
 
+	/* Create the XICS */
+	if (kvmppc_xics_enabled(kvm)) {
+		err = kvmppc_xics_create_icp(vcpu);
+		if (err < 0)
+			goto free_vcpu;
+	}
+
 	kvmppc_mmu_book3s_hv_init(vcpu);
 
 	vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
@@ -937,6 +952,8 @@  void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 		kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.vpa.pinned_addr);
 	spin_unlock(&vcpu->arch.vpa_update_lock);
 	kvm_vcpu_uninit(vcpu);
+	if (kvmppc_xics_enabled(vcpu->kvm))
+		kvmppc_xics_free_icp(vcpu);
 	kmem_cache_free(kvm_vcpu_cache, vcpu);
 }
 
@@ -1882,6 +1899,9 @@  void kvmppc_core_destroy_vm(struct kvm *kvm)
 
 	kvmppc_rtas_tokens_free(kvm);
 
+	if (kvmppc_xics_enabled(kvm))
+		kvmppc_xics_free(kvm);
+
 	kvmppc_free_hpt(kvm);
 	WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
 }
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 73ed11c..9b2237f 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1069,6 +1069,13 @@  struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	if (err < 0)
 		goto uninit_vcpu;
 
+	/* Create the XICS */
+	if (kvmppc_xics_enabled(kvm)) {
+		err = kvmppc_xics_create_icp(vcpu);
+		if (err < 0)
+			goto free_vcpu;
+	}
+
 	return vcpu;
 
 uninit_vcpu:
@@ -1085,6 +1092,8 @@  void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
 
+	if (kvmppc_xics_enabled(vcpu->kvm))
+		kvmppc_xics_free_icp(vcpu);
 	free_page((unsigned long)vcpu->arch.shared & PAGE_MASK);
 	kvm_vcpu_uninit(vcpu);
 	kfree(vcpu_book3s->shadow_vcpu);
@@ -1293,6 +1302,7 @@  int kvmppc_core_init_vm(struct kvm *kvm)
 {
 #ifdef CONFIG_PPC64
 	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
+	INIT_LIST_HEAD(&kvm->arch.rtas_tokens);
 #endif
 
 	return 0;
@@ -1303,6 +1313,9 @@  void kvmppc_core_destroy_vm(struct kvm *kvm)
 #ifdef CONFIG_PPC64
 	WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
 #endif
+	if (kvmppc_xics_enabled(kvm))
+		kvmppc_xics_free(kvm);
+
 }
 
 static int kvmppc_book3s_init(void)
diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
index 4efa4a4..94cec5b 100644
--- a/arch/powerpc/kvm/book3s_pr_papr.c
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -227,6 +227,15 @@  static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu)
 	return EMULATE_DONE;
 }
 
+static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
+{
+	long rc = kvmppc_xics_hcall(vcpu, cmd);
+	if (rc == H_TOO_HARD)
+		return EMULATE_FAIL;
+	kvmppc_set_gpr(vcpu, 3, rc);
+	return EMULATE_DONE;
+}
+
 int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
 {
 	switch (cmd) {
@@ -246,6 +255,13 @@  int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
 		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
 		vcpu->stat.halt_wakeup++;
 		return EMULATE_DONE;
+	case H_XIRR:
+	case H_CPPR:
+	case H_EOI:
+	case H_IPI:
+		if (kvmppc_xics_enabled(vcpu->kvm))
+			return kvmppc_h_pr_xics_hcall(vcpu, cmd);
+		break;
 	case H_RTAS:
 		if (list_empty(&vcpu->kvm->arch.rtas_tokens))
 			return RESUME_HOST;
diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c
index 8a324e8..6a6c1fe 100644
--- a/arch/powerpc/kvm/book3s_rtas.c
+++ b/arch/powerpc/kvm/book3s_rtas.c
@@ -18,12 +18,61 @@ 
 #include <asm/rtas.h>
 
 
+static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
+{
+	u32 irq, server, priority;
+	int rc;
+
+	if (args->nargs != 3 || args->nret != 1) {
+		rc = -3;
+		goto out;
+	}
+
+	irq = args->args[0];
+	server = args->args[1];
+	priority = args->args[2];
+
+	rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority);
+	if (rc)
+		rc = -3;
+out:
+	args->rets[0] = rc;
+}
+
+static void kvm_rtas_get_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
+{
+	u32 irq, server, priority;
+	int rc;
+
+	if (args->nargs != 1 || args->nret != 3) {
+		rc = -3;
+		goto out;
+	}
+
+	irq = args->args[0];
+
+	server = priority = 0;
+	rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority);
+	if (rc) {
+		rc = -3;
+		goto out;
+	}
+
+	args->rets[1] = server;
+	args->rets[2] = priority;
+out:
+	args->rets[0] = rc;
+}
+
 struct rtas_handler {
 	void (*handler)(struct kvm_vcpu *vcpu, struct rtas_args *args);
 	char *name;
 };
 
-static struct rtas_handler rtas_handlers[] = { };
+static struct rtas_handler rtas_handlers[] = {
+	{ .name = "ibm,set-xive", .handler = kvm_rtas_set_xive },
+	{ .name = "ibm,get-xive", .handler = kvm_rtas_get_xive },
+};
 
 struct rtas_token_definition {
 	struct list_head list;
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
new file mode 100644
index 0000000..7749060
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -0,0 +1,1101 @@ 
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+#include <linux/gfp.h>
+
+#include <asm/uaccess.h>
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
+#include <asm/hvcall.h>
+#include <asm/xics.h>
+#include <asm/debug.h>
+
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include "book3s_xics.h"
+
+#define XICS_DBG(fmt...) do { } while (0)
+//#define XICS_DBG(fmt...) do { trace_printk(fmt); } while (0)
+
+/*
+ * LOCKING
+ * =======
+ *
+ * Each ICS has a mutex protecting the information about the IRQ
+ * sources and avoiding simultaneous deliveries of the same interrupt.
+ *
+ * ICP operations are done via a single compare & swap transaction
+ * (most ICP state fits in the union kvmppc_icp_state)
+ */
+
+/*
+ * TODO
+ * ====
+ *
+ * - To speed up resends, keep a bitmap of "resend" set bits in the
+ *   ICS
+ *
+ * - Speed up server# -> ICP lookup (array ? hash table ?)
+ *
+ * - Make ICS lockless as well, or at least a per-interrupt lock or hashed
+ *   locks array to improve scalability
+ *
+ * - ioctl's to save/restore the entire state for snapshot & migration
+ */
+
+/* -- ICS routines -- */
+
+static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			    u32 new_irq);
+
+static void ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level)
+{
+	struct ics_irq_state *state;
+	struct kvmppc_ics *ics;	
+	u16 src;
+
+	XICS_DBG("ics deliver %#x (level: %d)\n", irq, level);
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics) {
+		XICS_DBG("ics_deliver_irq: IRQ 0x%06x not found !\n", irq);
+		return;
+	}
+	state = &ics->irq_state[src];
+
+	/*
+	 * We set state->asserted locklessly. This should be fine as
+	 * we are the only setter, thus concurrent access is undefined
+	 * to begin with.
+	 */
+	if (level == KVM_INTERRUPT_SET_LEVEL)
+		state->asserted = 1;
+	else if (level == KVM_INTERRUPT_UNSET) {
+		state->asserted = 0;
+		return;
+	}
+
+	/* Attempt delivery */
+	icp_deliver_irq(xics, NULL, irq);
+}
+
+static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
+			     struct kvmppc_icp *icp)
+{
+	int i;
+
+	mutex_lock(&ics->lock);
+
+	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+		struct ics_irq_state *state = &ics->irq_state[i];
+
+		if (!state->resend)
+			continue;
+
+		XICS_DBG("resend %#x prio %#x\n", state->number,
+			      state->priority);
+
+		mutex_unlock(&ics->lock);
+		icp_deliver_irq(xics, icp, state->number);
+		mutex_lock(&ics->lock);
+	}
+
+	mutex_unlock(&ics->lock);
+}
+
+int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	struct kvmppc_icp *icp;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u16 src;
+	bool deliver;
+
+	if (!xics)
+		return -ENODEV;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics)
+		return -EINVAL;
+	state = &ics->irq_state[src];
+
+	icp = kvmppc_xics_find_server(kvm, server);
+	if (!icp)
+		return -EINVAL;
+
+	mutex_lock(&ics->lock);
+
+	XICS_DBG("set_xive %#x server %#x prio %#x MP:%d RS:%d\n",
+		 irq, server, priority,
+		 state->masked_pending, state->resend);
+
+	state->server = server;
+	state->priority = priority;
+	deliver = false;
+	if ((state->masked_pending || state->resend) && priority != MASKED) {
+		state->masked_pending = 0;
+		deliver = true;
+	}
+
+	mutex_unlock(&ics->lock);
+
+	if (deliver)
+		icp_deliver_irq(xics, icp, irq);
+
+	return 0;
+}
+
+int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u16 src;
+
+	if (!xics)
+		return -ENODEV;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics)
+		return -EINVAL;
+	state = &ics->irq_state[src];
+
+	mutex_lock(&ics->lock);
+	*server = state->server;
+	*priority = state->priority;
+	mutex_unlock(&ics->lock);
+
+	return 0;
+}
+
+/* -- ICP routines, including hcalls -- */
+
+static inline bool icp_try_update(struct kvmppc_icp *icp,
+				  union kvmppc_icp_state old,
+				  union kvmppc_icp_state new,
+				  bool change_self)
+{
+	bool success;
+
+	/* Calculate new output value */
+	new.out_ee = (new.xisr && (new.pending_pri < new.cppr));
+
+	/* Attempt atomic update */
+	success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw;
+	if (!success)
+		goto bail;
+
+	XICS_DBG("UPD [%04x] - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n",
+		 icp->vcpu->vcpu_id,
+		 old.cppr, old.mfrr, old.pending_pri, old.xisr,
+		 old.need_resend, old.out_ee);
+	XICS_DBG("UPD        - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n",
+		 new.cppr, new.mfrr, new.pending_pri, new.xisr,
+		 new.need_resend, new.out_ee);
+	/*
+	 * Check for output state update
+	 *
+	 * Note that this is racy since another processor could be updating
+	 * the state already. This is why we never clear the interrupt output
+	 * here, we only ever set it. The clear only happens prior to doing
+	 * an update and only by the processor itself. Currently we do it
+	 * in Accept (H_XIRR) and Up_Cppr (H_CPPR).
+	 *
+	 * We also do not try to figure out whether the EE state has changed,
+	 * we unconditionally set it if the new state calls for it for the
+	 * same reason.
+	 */
+	if (new.out_ee) {
+		kvmppc_book3s_queue_irqprio(icp->vcpu,
+					    BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+		if (!change_self)
+			kvm_vcpu_kick(icp->vcpu);
+	}
+ bail:
+	return success;
+}
+
+static void icp_check_resend(struct kvmppc_xics *xics,
+			     struct kvmppc_icp *icp)
+{
+	u32 icsid;
+	
+	/* Order this load with the test for need_resend in the caller */
+	smp_rmb();
+	for_each_set_bit(icsid, icp->resend_map, xics->max_icsid + 1) {
+		struct kvmppc_ics *ics = xics->ics[icsid];
+
+		if (!test_and_clear_bit(icsid, icp->resend_map))
+			continue;
+		if (!ics)
+			continue;
+		ics_check_resend(xics, ics, icp);
+	}
+}
+
+static bool icp_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
+			       u32 *reject)
+{
+	union kvmppc_icp_state old_state, new_state;
+	bool success;
+
+	XICS_DBG("try deliver %#x(P:%#x) to server %#x\n", irq, priority,
+		 icp->vcpu->vcpu_id);
+
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		*reject = 0;
+
+		/* See if we can deliver */
+		success = new_state.cppr > priority &&
+			new_state.mfrr > priority &&
+			new_state.pending_pri > priority;
+
+		/*
+		 * If we can, check for a rejection and perform the
+		 * delivery
+		 */
+		if (success) {
+			*reject = new_state.xisr;
+			new_state.xisr = irq;
+			new_state.pending_pri = priority;
+		} else {
+			/*
+			 * If we failed to deliver we set need_resend
+			 * so a subsequent CPPR state change causes us
+			 * to try a new delivery.
+			 */
+			new_state.need_resend = true;
+		}
+
+	} while (!icp_try_update(icp, old_state, new_state, false));
+
+	return success;
+}
+
+static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			    u32 new_irq)
+{
+	struct ics_irq_state *state;
+	struct kvmppc_ics *ics;
+	u32 reject;
+	u16 src;	
+
+	/*
+	 * This is used both for initial delivery of an interrupt and
+	 * for subsequent rejection.
+	 *
+	 * Rejection can be racy vs. resends. We have evaluated the
+	 * rejection in an atomic ICP transaction which is now complete,
+	 * so potentially the ICP can already accept the interrupt again.
+	 *
+	 * So we need to retry the delivery. Essentially the reject path
+	 * boils down to a failed delivery. Always.
+	 *
+	 * Now the interrupt could also have moved to a different target,
+	 * thus we may need to re-do the ICP lookup as well
+	 */
+	 
+ again:
+	/* Get the ICS state and lock it */
+	ics = kvmppc_xics_find_ics(xics, new_irq, &src);
+	if (!ics) {
+		XICS_DBG("icp_deliver_irq: IRQ 0x%06x not found !\n", new_irq);
+		return;
+	}
+	state = &ics->irq_state[src];
+
+	/* Get a lock on the ICS */
+	mutex_lock(&ics->lock);
+
+	/* Get our server */
+	if (!icp || state->server != icp->vcpu->vcpu_id) {
+		icp = kvmppc_xics_find_server(xics->kvm, state->server);
+		if (!icp) {
+			pr_warning("icp_deliver_irq: IRQ 0x%06x server 0x%x"
+				   " not found !\n", new_irq, state->server);
+			goto out;
+		}
+	}
+
+	/* Clear the resend bit of that interrupt */
+	state->resend = 0;
+
+	/*
+	 * If masked, bail out
+	 *
+	 * Note: PAPR doesn't mention anything about masked pending
+	 * when doing a resend, only when doing a delivery.
+	 *
+	 * However that would have the effect of losing a masked
+	 * interrupt that was rejected and isn't consistent with
+	 * the whole masked_pending business which is about not
+	 * losing interrupts that occur while masked.
+	 *
+	 * I don't differentiate normal deliveries and resends, this
+	 * implementation will differ from PAPR and not lose such
+	 * interrupts.
+	 */
+	if (state->priority == MASKED) {
+		XICS_DBG("irq %#x masked pending\n", new_irq);
+		state->masked_pending = 1;
+		goto out;
+	}
+
+	/*
+	 * Try the delivery, this will set the need_resend flag
+	 * in the ICP as part of the atomic transaction if the
+	 * delivery is not possible.
+	 *
+	 * Note that if successful, the new delivery might have itself
+	 * rejected an interrupt that was "delivered" before we took the
+	 * ICS mutex.
+	 *
+	 * In this case we do the whole sequence all over again for the
+	 * new guy. We cannot assume that the rejected interrupt is less
+	 * favored than the new one, and thus doesn't need to be delivered,
+	 * because by the time we exit icp_try_to_deliver() the target
+	 * processor may well have already consumed & completed it, and thus
+	 * the rejected interrupt might actually be already acceptable.
+	 */
+	if (icp_try_to_deliver(icp, new_irq, state->priority, &reject)) {
+		/*
+		 * Delivery was successful, did we reject somebody else ?
+		 */
+		if (reject && reject != XICS_IPI) {
+			mutex_unlock(&ics->lock);
+			new_irq = reject;
+			goto again;
+		}
+	} else {
+		/*
+		 * We failed to deliver the interrupt we need to set the
+		 * resend map bit and mark the ICS state as needing a resend
+		 */
+		set_bit(ics->icsid, icp->resend_map);
+		state->resend = 1;
+
+		/*
+		 * If the need_resend flag got cleared in the ICP some time
+		 * between icp_try_to_deliver() atomic update and now, then
+		 * we know it might have missed the resend_map bit. So we
+		 * retry
+		 */
+		smp_mb();
+		if (!icp->state.need_resend) {
+			mutex_unlock(&ics->lock);
+			goto again;
+		}
+	}
+ out:
+	mutex_unlock(&ics->lock);
+}
+
+static void icp_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			  u8 new_cppr)
+{
+	union kvmppc_icp_state old_state, new_state;
+	bool resend;
+
+	/*
+	 * This handles several related states in one operation:
+	 *
+	 * ICP State: Down_CPPR
+	 *
+	 * Load CPPR with new value and if the XISR is 0
+	 * then check for resends:
+	 *
+	 * ICP State: Resend
+	 *
+	 * If MFRR is more favored than CPPR, check for IPIs
+	 * and notify ICS of a potential resend. This is done
+	 * asynchronously (when used in real mode, we will have
+	 * to exit here).
+	 *
+	 * We do not handle the complete Check_IPI as documented
+	 * here. In the PAPR, this state will be used for both
+	 * Set_MFRR and Down_CPPR. However, we know that we aren't
+	 * changing the MFRR state here so we don't need to handle
+	 * the case of an MFRR causing a reject of a pending irq,
+	 * this will have been handled when the MFRR was set in the
+	 * first place.
+	 *
+	 * Thus we don't have to handle rejects, only resends.
+	 *
+	 * When implementing real mode for HV KVM, resend will lead to
+	 * a H_TOO_HARD return and the whole transaction will be handled
+	 * in virtual mode.
+	 */
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		/* Down_CPPR */
+		new_state.cppr = new_cppr;
+
+		/*
+		 * Cut down Resend / Check_IPI / IPI
+		 *
+		 * The logic is that we cannot have a pending interrupt
+		 * trumped by an IPI at this point (see above), so we
+		 * know that either the pending interrupt is already an
+		 * IPI (in which case we don't care to override it) or
+		 * it's either more favored than us or non existent
+		 */
+		if (new_state.mfrr < new_cppr &&
+		    new_state.mfrr <= new_state.pending_pri) {
+			WARN_ON(new_state.xisr != XICS_IPI &&
+				new_state.xisr != 0);
+			new_state.pending_pri = new_state.mfrr;
+			new_state.xisr = XICS_IPI;
+		}
+
+		/* Latch/clear resend bit */
+		resend = new_state.need_resend;
+		new_state.need_resend = 0;
+
+	} while (!icp_try_update(icp, old_state, new_state, true));
+
+	/*
+	 * Now handle resend checks. Those are asynchronous to the ICP
+	 * state update in HW (ie bus transactions) so we can handle them
+	 * separately here too
+	 */
+	if (resend)
+		icp_check_resend(xics, icp);
+}
+
+static noinline unsigned long h_xirr(struct kvm_vcpu *vcpu)
+{
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	u32 xirr;
+
+	/* First, remove EE from the processor */
+	kvmppc_book3s_dequeue_irqprio(icp->vcpu,
+				      BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+
+	/*
+	 * ICP State: Accept_Interrupt
+	 *
+	 * Return the pending interrupt (if any) along with the
+	 * current CPPR, then clear the XISR & set CPPR to the
+	 * pending priority
+	 */
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		xirr = old_state.xisr | (((u32)old_state.cppr) << 24);
+		if (!old_state.xisr)
+			break;
+		new_state.cppr = new_state.pending_pri;
+		new_state.pending_pri = 0xff;
+		new_state.xisr = 0;
+
+	} while (!icp_try_update(icp, old_state, new_state, true));
+
+	XICS_DBG("h_xirr vcpu %d xirr %#x\n", vcpu->vcpu_id, xirr);
+
+	return xirr;
+}
+
+/* H_IPI: set the MFRR of server's ICP.  Returns H_SUCCESS or H_PARAMETER. */
+static noinline int h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
+			  unsigned long mfrr)
+{
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp;
+	u32 reject;
+	bool resend, local;
+
+	XICS_DBG("h_ipi vcpu %d to server %lu mfrr %#lx\n",
+			vcpu->vcpu_id, server, mfrr);
+
+	local = vcpu->vcpu_id == server;
+	icp = local ? vcpu->arch.icp :
+		      kvmppc_xics_find_server(vcpu->kvm, server);
+	if (!icp)
+		return H_PARAMETER;
+
+	/*
+	 * ICP state: Set_MFRR
+	 *
+	 * If the CPPR is more favored than the new MFRR, then
+	 * nothing needs to be rejected as there can be no XISR to
+	 * reject.  If the MFRR is being made less favored then
+	 * there might be a previously-rejected interrupt needing
+	 * to be resent.
+	 *
+	 * If the CPPR is less favored, the IPI replaces (and rejects)
+	 * the pending interrupt only when it is at least as favored;
+	 * a more favored pending interrupt must be left in place, as in
+	 *
+	 * ICP state: Check_IPI
+	 */
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		/* Set_MFRR */
+		new_state.mfrr = mfrr;
+
+		/* Check_IPI */
+		reject = 0;
+		resend = false;
+		if (mfrr < new_state.cppr) {
+			/* Replace (and reject) a less favored pending irq */
+			if (mfrr <= new_state.pending_pri) {
+				reject = new_state.xisr;
+				new_state.pending_pri = mfrr;
+				new_state.xisr = XICS_IPI;
+			}
+		}
+
+		if (mfrr > old_state.mfrr && mfrr > new_state.cppr) {
+			resend = new_state.need_resend;
+			new_state.need_resend = 0;
+		}
+	} while (!icp_try_update(icp, old_state, new_state, local));
+
+	/* Handle reject */
+	if (reject && reject != XICS_IPI)
+		icp_deliver_irq(xics, icp, reject);
+
+	/* Handle resend */
+	if (resend)
+		icp_check_resend(xics, icp);
+
+	return H_SUCCESS;
+}
+
+static noinline void h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
+{
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	u32 reject;
+
+	XICS_DBG("h_cppr vcpu %d cppr %#lx\n", vcpu->vcpu_id, cppr);
+
+	/*
+	 * ICP State: Set_CPPR
+	 *
+	 * We can safely compare the new value with the current
+	 * value outside of the transaction as the CPPR is only
+	 * ever changed by the processor on itself
+	 */
+	if (cppr > icp->state.cppr)
+		icp_down_cppr(xics, icp, cppr);
+	else if (cppr == icp->state.cppr)
+		return;
+
+	/*
+	 * ICP State: Up_CPPR
+	 *
+	 * The processor is raising its priority, this can result
+	 * in a rejection of a pending interrupt:
+	 *
+	 * ICP State: Reject_Current
+	 *
+	 * We can remove EE from the current processor, the update
+	 * transaction will set it again if needed
+	 */
+	kvmppc_book3s_dequeue_irqprio(icp->vcpu,
+				      BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		reject = 0;
+		new_state.cppr = cppr;
+
+		if (cppr <= new_state.pending_pri) {
+			reject = new_state.xisr;
+			new_state.xisr = 0;
+			new_state.pending_pri = 0xff;
+		}
+
+	} while (!icp_try_update(icp, old_state, new_state, true));
+
+	/*
+	 * Check for rejects. They are handled by doing a new delivery
+	 * attempt (see comments in icp_deliver_irq).
+	 */
+	if (reject && reject != XICS_IPI)
+		icp_deliver_irq(xics, icp, reject);
+}
+
+static noinline int h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+{
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u32 irq = xirr & 0x00ffffff;
+	u16 src;
+
+	XICS_DBG("h_eoi vcpu %d eoi %#lx\n", vcpu->vcpu_id, xirr);
+
+	/*
+	 * ICP State: EOI
+	 *
+	 * Note: If EOI is incorrectly used by SW to lower the CPPR
+	 * value (ie more favored), we do not check for rejection of
+	 * a pending interrupt, this is a SW error and PAPR specifies
+	 * that we don't have to deal with it.
+	 *
+	 * The sending of an EOI to the ICS is handled after the
+	 * CPPR update
+	 *
+	 * ICP State: Down_CPPR which we handle
+	 * in a separate function as it's shared with H_CPPR.
+	 */
+	icp_down_cppr(xics, icp, xirr >> 24);
+
+	/* IPIs have no EOI */
+	if (irq == XICS_IPI)
+		return H_SUCCESS;
+	/*
+	 * EOI handling: If the interrupt is still asserted, we need to
+	 * resend it. We can take a lockless "peek" at the ICS state here.
+	 *
+	 * "Message" interrupts will never have "asserted" set
+	 */
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics) {
+		XICS_DBG("h_eoi: IRQ 0x%06x not found !\n", irq);
+		return H_PARAMETER;
+	}
+	state = &ics->irq_state[src];
+
+	/* Still asserted, resend it */
+	if (state->asserted)
+		icp_deliver_irq(xics, icp, irq);
+
+	return H_SUCCESS;
+}
+
+int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
+{
+	unsigned long res;
+	int rc = H_SUCCESS;
+
+	/* Check if we have an ICP */
+	if (!vcpu->arch.icp || !vcpu->kvm->arch.xics)
+		return H_HARDWARE;
+
+	switch (req) {
+	case H_XIRR:
+		res = h_xirr(vcpu);
+		kvmppc_set_gpr(vcpu, 4, res);
+		break;
+	case H_CPPR:
+		h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4));
+		break;
+	case H_EOI:
+		rc = h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
+		break;
+	case H_IPI:
+		rc = h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4),
+			   kvmppc_get_gpr(vcpu, 5));
+		break;
+	}
+
+	return rc;
+}
+
+
+/* -- Initialisation code etc. -- */
+
+static int xics_debug_show(struct seq_file *m, void *private)
+{
+	struct kvmppc_xics *xics = m->private;
+	struct kvm *kvm = xics->kvm;
+	struct kvm_vcpu *vcpu;
+	int icsid, i;
+
+	if (!kvm)
+		return 0;
+
+	seq_printf(m, "=========\nICP state\n=========\n");
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		struct kvmppc_icp *icp = vcpu->arch.icp;
+		union kvmppc_icp_state state;
+
+		if (!icp)
+			continue;
+
+		state.raw = ACCESS_ONCE(icp->state.raw);
+		seq_printf(m, "cpu server %#x XIRR:%#x PPRI:%#x CPPR:%#x "
+			   "MFRR:%#x OUT:%d NR:%d\n", vcpu->vcpu_id, state.xisr,
+			   state.pending_pri, state.cppr, state.mfrr,
+			   state.out_ee, state.need_resend);
+	}
+
+	for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) {
+		struct kvmppc_ics *ics = xics->ics[icsid];
+
+		if (!ics)
+			continue;
+
+		seq_printf(m, "=========\nICS state for ICS 0x%x\n=========\n",
+			   icsid);
+
+		mutex_lock(&ics->lock);
+
+		for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+			struct ics_irq_state *irq = &ics->irq_state[i];
+
+			seq_printf(m, "irq 0x%06x: server %#x prio %#x save"
+				   " prio %#x asserted %d resend %d masked"
+				   " pending %d\n",
+				   irq->number, irq->server, irq->priority,
+				   irq->saved_priority, irq->asserted,
+				   irq->resend, irq->masked_pending);
+
+		}
+		mutex_unlock(&ics->lock);
+	}
+	return 0;
+}
+
+static int xics_debug_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, xics_debug_show, inode->i_private);
+}
+
+static const struct file_operations xics_debug_fops = {
+	.open = xics_debug_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static void xics_debugfs_init(struct kvmppc_xics *xics)
+{
+	char *name;
+
+	name = kasprintf(GFP_KERNEL, "kvm-xics-%p", xics);
+	if (!name) {
+		pr_err("%s: no memory for name\n", __func__);
+		return;
+	}
+
+	xics->dentry = debugfs_create_file(name, S_IRUGO, powerpc_debugfs_root,
+					   xics, &xics_debug_fops);
+
+	pr_debug("%s: created %s\n", __func__, name);
+	kfree(name);
+}
+
+static struct kvmppc_ics *kvmppc_xics_create_ics(struct kvmppc_xics *xics,
+						 int irq)
+{
+	struct kvmppc_ics *ics;
+	int i, icsid;
+
+	icsid = irq >> KVMPPC_XICS_ICS_SHIFT;
+
+	mutex_lock(&xics->kvm->lock);
+
+	/* ICS already exists - somebody else got here first */
+	if (xics->ics[icsid])
+		goto out;
+
+	/* Create the ICS */
+	ics = kzalloc(sizeof(struct kvmppc_ics), GFP_KERNEL);
+	if (!ics)
+		goto out;
+
+	mutex_init(&ics->lock);
+	ics->icsid = icsid;
+
+	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+		ics->irq_state[i].number = (icsid << KVMPPC_XICS_ICS_SHIFT) | i;
+		ics->irq_state[i].priority = MASKED;
+		ics->irq_state[i].saved_priority = MASKED;
+	}
+	smp_wmb();
+	xics->ics[icsid] = ics;
+
+	if (icsid > xics->max_icsid)
+		xics->max_icsid = icsid;
+
+ out:
+	mutex_unlock(&xics->kvm->lock);
+	return xics->ics[icsid];
+}
+
+int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_icp *icp;
+
+	icp = kzalloc(sizeof(struct kvmppc_icp), GFP_KERNEL);
+	if (!icp)
+		return -ENOMEM;
+
+	icp->vcpu = vcpu;
+	icp->state.mfrr = MASKED;
+	icp->state.pending_pri = MASKED;
+	vcpu->arch.icp = icp;
+
+	XICS_DBG("created server for vcpu %d\n", vcpu->vcpu_id);
+
+	return 0;
+}
+
+void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu)
+{
+	if (!vcpu->arch.icp)
+		return;
+	kfree(vcpu->arch.icp);
+	vcpu->arch.icp = NULL;
+}
+
+void kvmppc_xics_free(struct kvm *kvm)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	int i;
+
+	if (!xics)
+		return;
+
+	lockdep_assert_held(&kvm->lock);
+
+	debugfs_remove(xics->dentry);
+
+	if (xics->kvm) {
+		xics->kvm->arch.xics = NULL;
+		xics->kvm = NULL;
+	}
+
+	for (i = 0; i <= xics->max_icsid; i++) {
+		if (xics->ics[i])
+			kfree(xics->ics[i]);
+	}
+	kfree(xics);
+}
+
+static int kvm_xics_get_sources(struct kvm *kvm, struct kvm_irq_sources *srcs)
+{
+	int ret = 0;
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *irqp;
+	u64 __user *ubufp;
+	u16 idx;
+	u64 val;
+	long int i, irq, nirq;
+
+	irq = srcs->irq;
+	ubufp = srcs->irqbuf;
+
+	while (srcs->nr_irqs > 0 && !ret) {
+		ics = kvmppc_xics_find_ics(xics, irq, &idx);
+		if (!ics)
+			return -ENOENT;
+		nirq = KVMPPC_XICS_IRQ_PER_ICS - idx;
+		if (nirq > srcs->nr_irqs)
+			nirq = srcs->nr_irqs;
+		srcs->nr_irqs -= nirq;
+		irq += nirq;
+
+		irqp = &ics->irq_state[idx];
+		mutex_lock(&ics->lock);
+		for (i = 0; i < nirq; ++i, ++irqp, ++ubufp) {
+			ret = -ENOENT;
+			if (!irqp->exists)
+				break;
+			val = irqp->server;
+			val |= ((u64)irqp->priority << KVM_IRQ_PRIORITY_SHIFT);
+			if (irqp->priority == MASKED)
+				val |= KVM_IRQ_MASKED;
+			if (irqp->asserted)
+				val |= KVM_IRQ_LEVEL_SENSITIVE |
+					KVM_IRQ_PENDING;
+			else if (irqp->masked_pending || irqp->resend)
+				val |= KVM_IRQ_PENDING;
+			ret = -EFAULT;
+			if (__put_user(val, ubufp))
+				break;
+			ret = 0;
+		}
+		mutex_unlock(&ics->lock);
+	}
+
+	return ret;
+}
+
+/*
+ * KVM_IRQCHIP_SET_SOURCES backend: program srcs->nr_irqs sources starting
+ * at srcs->irq from consecutive u64 entries of srcs->irqbuf, creating ICS
+ * blocks on demand.  The buffer cursor is never rewound, so one request
+ * may span several ICSes.  Returns 0, -ENOENT, -ENOMEM or -EFAULT.
+ */
+static int kvm_xics_set_sources(struct kvm *kvm, struct kvm_irq_sources *srcs)
+{
+	int ret = 0;
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *irqp;
+	u64 __user *ubufp = srcs->irqbuf;
+	u16 idx;
+	u64 val;
+	long int i, nirq, irq = srcs->irq;
+
+	if (irq < KVMPPC_XICS_FIRST_IRQ ||
+	    irq + srcs->nr_irqs > KVMPPC_XICS_NR_IRQS)
+		return -ENOENT;
+
+	while (srcs->nr_irqs > 0 && !ret) {
+		ics = kvmppc_xics_find_ics(xics, irq, &idx);
+		if (!ics) {
+			ics = kvmppc_xics_create_ics(xics, irq);
+			if (!ics)
+				return -ENOMEM;
+		}
+		nirq = KVMPPC_XICS_IRQ_PER_ICS - idx;
+		if (nirq > srcs->nr_irqs)
+			nirq = srcs->nr_irqs;
+		srcs->nr_irqs -= nirq;
+		irq += nirq;
+
+		irqp = &ics->irq_state[idx];
+		for (i = 0; i < nirq; ++i, ++irqp, ++ubufp) {
+			ret = -EFAULT;
+			if (__get_user(val, ubufp))
+				break;
+			ret = 0;
+
+			mutex_lock(&ics->lock);
+			irqp->server = val & KVM_IRQ_SERVER_MASK;
+			irqp->priority = val >> KVM_IRQ_PRIORITY_SHIFT;
+			irqp->resend = 0;
+			irqp->masked_pending = 0;
+			irqp->asserted = (val & KVM_IRQ_PENDING) &&
+					 (val & KVM_IRQ_LEVEL_SENSITIVE);
+			irqp->exists = 1;
+			mutex_unlock(&ics->lock);
+
+			if (val & KVM_IRQ_PENDING)
+				icp_deliver_irq(xics, NULL, irqp->number);
+		}
+	}
+
+	return ret;
+}
+
+/* -- ioctls -- */
+
+/*
+ * Create the XICS state for a VM.  kvm->lock is released on every exit
+ * path.  Returns 0, -EEXIST if the VM already has one, or -ENOMEM.
+ */
+int kvmppc_xics_create(struct kvm *kvm, struct kvm_irqchip_args *args)
+{
+	struct kvmppc_xics *xics;
+	int rc = -EEXIST;
+
+	mutex_lock(&kvm->lock);
+	if (kvm->arch.xics)	/* already there: must still drop the lock */
+		goto out;
+	rc = -ENOMEM;
+	xics = kzalloc(sizeof(*xics), GFP_KERNEL);
+	if (!xics)
+		goto out;
+
+	xics->kvm = kvm;
+	kvm->arch.xics = xics;
+	xics_debugfs_init(xics);
+	rc = 0;
+out:
+	mutex_unlock(&kvm->lock);
+	return rc;
+}
+
+static int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args)
+{
+	struct kvmppc_xics *xics;
+
+	/* locking against multiple callers? */
+
+	xics = kvm->arch.xics;
+	if (!xics)
+		return -ENODEV;
+
+	switch (args->level) {
+	case KVM_INTERRUPT_SET:
+	case KVM_INTERRUPT_SET_LEVEL:
+	case KVM_INTERRUPT_UNSET:
+		ics_deliver_irq(xics, args->irq, args->level);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+int kvmppc_xics_ioctl(struct kvm *kvm, unsigned ioctl, unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+	int rc;
+
+	BUILD_BUG_ON(sizeof(union kvmppc_icp_state) != sizeof(unsigned long));
+
+	switch (ioctl) {
+	case KVM_IRQ_LINE: {
+		struct kvm_irq_level args;
+
+		rc = -EFAULT;
+		if (copy_from_user(&args, argp, sizeof(args)))
+			break;
+		rc = kvm_vm_ioctl_xics_irq(kvm, &args);
+		break;
+	}
+
+	case KVM_IRQCHIP_GET_SOURCES: {
+		struct kvm_irq_sources sources;
+
+		rc = -EFAULT;
+		if (copy_from_user(&sources, argp, sizeof(sources)))
+			break;
+		if (!access_ok(VERIFY_WRITE, sources.irqbuf,
+			       sources.nr_irqs * sizeof(u64)))
+			break;
+		rc = kvm_xics_get_sources(kvm, &sources);
+		break;
+	}
+
+	case KVM_IRQCHIP_SET_SOURCES: {
+		struct kvm_irq_sources sources;
+
+		rc = -EFAULT;
+		if (copy_from_user(&sources, argp, sizeof(sources)))
+			break;
+		if (!access_ok(VERIFY_READ, sources.irqbuf,
+			       sources.nr_irqs * sizeof(u64)))
+			break;
+		rc = kvm_xics_set_sources(kvm, &sources);
+		break;
+	}
+
+	default:
+		rc = -ENOTTY;
+		break;
+	}
+
+	return rc;
+}
diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h
new file mode 100644
index 0000000..0e20a51
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_xics.h
@@ -0,0 +1,111 @@ 
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _KVM_PPC_BOOK3S_XICS_H
+#define _KVM_PPC_BOOK3S_XICS_H
+
+/*
+ * We use a two-level tree to store interrupt source information.
+ * There are up to 1024 ICS nodes, each of which can represent
+ * 1024 sources.
+ */
+#define KVMPPC_XICS_MAX_ICS_ID	1023
+#define KVMPPC_XICS_ICS_SHIFT	10
+#define KVMPPC_XICS_IRQ_PER_ICS	(1 << KVMPPC_XICS_ICS_SHIFT)
+#define KVMPPC_XICS_SRC_MASK	(KVMPPC_XICS_IRQ_PER_ICS - 1)
+
+/*
+ * Interrupt source numbers below this are reserved, for example
+ * 0 is "no interrupt", and 2 is used for IPIs.
+ */
+#define KVMPPC_XICS_FIRST_IRQ	16
+#define KVMPPC_XICS_NR_IRQS	((KVMPPC_XICS_MAX_ICS_ID + 1) * KVMPPC_XICS_IRQ_PER_ICS)
+
+/* Priority value to use for disabling an interrupt */
+#define MASKED	0xff
+
+/* State for one irq source */
+struct ics_irq_state {
+	u32 number;
+	u32 server;
+	u8  priority;
+	u8  saved_priority; /* currently unused */
+	u8  resend;
+	u8  masked_pending;
+	u8  asserted; /* Only for LSI */
+	u8  exists;
+};
+
+/* Atomic ICP state, updated with a single compare & swap */
+union kvmppc_icp_state {
+	unsigned long raw;
+	struct {
+		u8 out_ee : 1;
+		u8 need_resend : 1;
+		u8 cppr;
+		u8 mfrr;
+		u8 pending_pri;
+		u32 xisr;
+	};
+};
+
+/* One bit per ICS */
+#define ICP_RESEND_MAP_SIZE	(KVMPPC_XICS_MAX_ICS_ID / BITS_PER_LONG + 1)
+
+struct kvmppc_icp {
+	struct kvm_vcpu *vcpu;
+	union kvmppc_icp_state state;
+	unsigned long resend_map[ICP_RESEND_MAP_SIZE];
+};
+
+struct kvmppc_ics {
+	struct mutex lock;
+	u16 icsid;
+	struct ics_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS];
+};
+
+struct kvmppc_xics {
+	struct kvm *kvm;
+	struct dentry *dentry;
+	u32 max_icsid;
+	struct kvmppc_ics *ics[KVMPPC_XICS_MAX_ICS_ID + 1];
+};
+
+static inline struct kvmppc_icp *kvmppc_xics_find_server(struct kvm *kvm,
+							 u32 nr)
+{
+	struct kvm_vcpu *vcpu = NULL;
+	int i;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		if (nr == vcpu->vcpu_id)
+			return vcpu->arch.icp;
+	}
+	return NULL;
+}
+
+static inline struct kvmppc_ics *kvmppc_xics_find_ics(struct kvmppc_xics *xics,
+						      u32 irq, u16 *source)
+{
+	u32 icsid = irq >> KVMPPC_XICS_ICS_SHIFT;
+	u16 src = irq & KVMPPC_XICS_SRC_MASK;
+	struct kvmppc_ics *ics;
+
+	if (source)
+		*source = src;
+	if (icsid > KVMPPC_XICS_MAX_ICS_ID)
+		return NULL;
+	ics = xics->ics[icsid];
+	if (!ics)
+		return NULL;
+	return ics;
+}
+
+
+#endif /* _KVM_PPC_BOOK3S_XICS_H */
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 1772883..3bcc030 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -383,6 +383,7 @@  int kvm_dev_ioctl_check_extension(long ext)
 		break;
 #ifdef CONFIG_PPC_BOOK3S_64
 	case KVM_CAP_PPC_GET_SMMU_INFO:
+	case KVM_CAP_IRQCHIP_ARGS:
 		r = 1;
 		break;
 #endif
@@ -1002,6 +1003,28 @@  long kvm_arch_vm_ioctl(struct file *filp,
 		r = kvm_vm_ioctl_rtas_define_token(kvm, argp);
 		break;
 	}
+	case KVM_IRQ_LINE:
+	case KVM_IRQCHIP_GET_SOURCES:
+	case KVM_IRQCHIP_SET_SOURCES: {
+		struct kvm *kvm = filp->private_data;
+
+		r = -ENOTTY;
+		if (kvmppc_xics_enabled(kvm))
+			r = kvmppc_xics_ioctl(kvm, ioctl, arg);
+		break;
+	}
+	case KVM_CREATE_IRQCHIP_ARGS: {
+		struct kvm *kvm = filp->private_data;
+		struct kvm_irqchip_args args;
+
+		r = -EFAULT;
+		if (copy_from_user(&args, argp, sizeof(args)))
+			break;
+		r = -EINVAL;
+		if (args.type == KVM_IRQCHIP_TYPE_XICS)
+			r = kvmppc_xics_create(kvm, &args);
+		break;
+	}
 #endif /* CONFIG_PPC_BOOK3S_64 */
 	default:
 		r = -ENOTTY;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 1e2fda0..25d73c0 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -115,6 +115,7 @@  struct kvm_irq_level {
 	 * ACPI gsi notion of irq.
 	 * For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47..
 	 * For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23..
+	 * On powerpc SPAPR, the ICS source number; level is KVM_INTERRUPT_*.
 	 */
 	union {
 		__u32 irq;
@@ -146,6 +147,15 @@  struct kvm_pit_config {
 
 #define KVM_PIT_SPEAKER_DUMMY     1
 
+/* for KVM_CREATE_IRQCHIP_ARGS */
+struct kvm_irqchip_args {
+	__u64 type;
+	__u64 param;
+};
+
+/* values for type */
+#define KVM_IRQCHIP_TYPE_XICS	1	/* Power server external intr ctrler */
+
 #define KVM_EXIT_UNKNOWN          0
 #define KVM_EXIT_EXCEPTION        1
 #define KVM_EXIT_IO               2
@@ -663,6 +673,7 @@  struct kvm_ppc_smmu_info {
 #define KVM_CAP_S390_CSS_SUPPORT 85
 #define KVM_CAP_PPC_EPR 86
 #define KVM_CAP_PPC_RTAS 87
+#define KVM_CAP_IRQCHIP_ARGS 88
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -805,6 +816,21 @@  struct kvm_msi {
 	__u8  pad[16];
 };
 
+struct kvm_irq_sources {
+	__u32 irq;
+	__u32 nr_irqs;
+	__u64 __user *irqbuf;
+};
+
+/* irqbuf entries are laid out like this: */
+#define KVM_IRQ_SERVER_SHIFT	0
+#define KVM_IRQ_SERVER_MASK	0xffffffffULL
+#define KVM_IRQ_PRIORITY_SHIFT	32
+#define KVM_IRQ_PRIORITY_MASK	0xff
+#define KVM_IRQ_LEVEL_SENSITIVE	(1ULL << 40)
+#define KVM_IRQ_MASKED		(1ULL << 41)
+#define KVM_IRQ_PENDING		(1ULL << 42)
+
 /*
  * ioctls for VM fds
  */
@@ -892,6 +918,9 @@  struct kvm_s390_ucas_mapping {
 #define KVM_PPC_GET_HTAB_FD	  _IOW(KVMIO,  0xaa, struct kvm_get_htab_fd)
 /* Available with KVM_CAP_PPC_RTAS */
 #define KVM_PPC_RTAS_DEFINE_TOKEN _IOW(KVMIO,  0xab, struct kvm_rtas_token_args)
+#define KVM_CREATE_IRQCHIP_ARGS   _IOW(KVMIO,  0xac, struct kvm_irqchip_args)
+#define KVM_IRQCHIP_GET_SOURCES	  _IOW(KVMIO,  0xad, struct kvm_irq_sources)
+#define KVM_IRQCHIP_SET_SOURCES	  _IOW(KVMIO,  0xae, struct kvm_irq_sources)
 
 /*
  * ioctls for vcpu fds