Patchwork [18/26] Implement the PAPR (pSeries) virtualized interrupt controller (xics)

login
register
mail settings
Submitter David Gibson
Date March 16, 2011, 4:56 a.m.
Message ID <1300251423-6715-19-git-send-email-david@gibson.dropbear.id.au>
Download mbox | patch
Permalink /patch/87168/
State New
Headers show

Comments

David Gibson - March 16, 2011, 4:56 a.m.
PAPR defines an interrupt control architecture which is logically divided
into ICS (Interrupt Control Presentation, each unit is responsible for
presenting interrupts to a particular "interrupt server", i.e. CPU) and
ICS (Interrupt Control Source, each unit responsible for one or more
hardware interrupts as numbered globally across the system).  All PAPR
virtual IO devices expect to deliver interrupts via this mechanism.  In
Linux, this interrupt controller system is handled by the "xics" driver.

On pSeries systems, access to the interrupt controller is virtualized via
hypercalls and RTAS methods.  However, the virtualized interface is very
similar to the underlying interrupt controller hardware, and similar PICs
exist un-virtualized in some other systems.

This patch implements both the ICP and ICS sides of the PAPR interrupt
controller.  For now, only the hypercall virtualized interface is provided,
however it would be relatively straightforward to graft an emulated
register interface onto the underlying interrupt logic if we want to add
a machine with a hardware ICS/ICP system in the future.

There are some limitations in this implementation: it is assumed for now
that only one instance of the ICS exists, although a full xics system can
have several, each responsible for a different group of hardware irqs.
ICP/ICS can handle both level-sensitve (LSI) and message signalled (MSI)
interrupt inputs.  For now, this implementation supports only MSI
interrupts, since that is used by PAPR virtual IO devices.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: David Gibson <dwg@au1.ibm.com>
---
 Makefile.target |    2 +-
 hw/spapr.c      |   26 +++
 hw/spapr.h      |    2 +
 hw/xics.c       |  528 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/xics.h       |   13 ++
 5 files changed, 570 insertions(+), 1 deletions(-)
 create mode 100644 hw/xics.c
 create mode 100644 hw/xics.h
Alexander Graf - March 16, 2011, 3:47 p.m.
On 03/16/2011 05:56 AM, David Gibson wrote:
> PAPR defines an interrupt control architecture which is logically divided
> into ICS (Interrupt Control Presentation, each unit is responsible for
> presenting interrupts to a particular "interrupt server", i.e. CPU) and
> ICS (Interrupt Control Source, each unit responsible for one or more
> hardware interrupts as numbered globally across the system).  All PAPR
> virtual IO devices expect to deliver interrupts via this mechanism.  In
> Linux, this interrupt controller system is handled by the "xics" driver.
>
> On pSeries systems, access to the interrupt controller is virtualized via
> hypercalls and RTAS methods.  However, the virtualized interface is very
> similar to the underlying interrupt controller hardware, and similar PICs
> exist un-virtualized in some other systems.
>
> This patch implements both the ICP and ICS sides of the PAPR interrupt
> controller.  For now, only the hypercall virtualized interface is provided,
> however it would be relatively straightforward to graft an emulated
> register interface onto the underlying interrupt logic if we want to add
> a machine with a hardware ICS/ICP system in the future.
>
> There are some limitations in this implementation: it is assumed for now
> that only one instance of the ICS exists, although a full xics system can
> have several, each responsible for a different group of hardware irqs.
> ICP/ICS can handle both level-sensitve (LSI) and message signalled (MSI)
> interrupt inputs.  For now, this implementation supports only MSI
> interrupts, since that is used by PAPR virtual IO devices.
>
> Signed-off-by: Paul Mackerras<paulus@samba.org>
> Signed-off-by: David Gibson<dwg@au1.ibm.com>
> ---
>   Makefile.target |    2 +-
>   hw/spapr.c      |   26 +++
>   hw/spapr.h      |    2 +
>   hw/xics.c       |  528 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>   hw/xics.h       |   13 ++
>   5 files changed, 570 insertions(+), 1 deletions(-)
>   create mode 100644 hw/xics.c
>   create mode 100644 hw/xics.h
>
> diff --git a/Makefile.target b/Makefile.target
> index e333225..2b0588e 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -233,7 +233,7 @@ obj-ppc-y += ppc_oldworld.o
>   obj-ppc-y += ppc_newworld.o
>   # IBM pSeries (sPAPR)
>   obj-ppc-y += spapr.o spapr_hcall.o spapr_rtas.o spapr_vio.o
> -obj-ppc-y += spapr_vty.o
> +obj-ppc-y += xics.o spapr_vty.o
>   # PowerPC 4xx boards
>   obj-ppc-y += ppc4xx_devs.o ppc4xx_pci.o ppc405_uc.o ppc405_boards.o
>   obj-ppc-y += ppc440.o ppc440_bamboo.o
> diff --git a/hw/spapr.c b/hw/spapr.c
> index 23f493a..be30def 100644
> --- a/hw/spapr.c
> +++ b/hw/spapr.c
> @@ -34,6 +34,7 @@
>
>   #include "hw/spapr.h"
>   #include "hw/spapr_vio.h"
> +#include "hw/xics.h"
>
>   #include<libfdt.h>
>
> @@ -62,6 +63,7 @@ static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
>       uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
>       uint32_t pft_size_prop[] = {0, cpu_to_be32(hash_shift)};
>       char hypertas_prop[] = "hcall-pft\0hcall-term\0hcall-dabr";
> +    uint32_t interrupt_server_ranges_prop[] = {0, cpu_to_be32(smp_cpus)};
>       int i;
>       char *modelname;
>       int ret;
> @@ -120,6 +122,7 @@ static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
>
>       for (i = 0; i<  smp_cpus; i++) {
>           CPUState *env = envs[i];
> +        uint32_t gserver_prop[] = {cpu_to_be32(i), 0}; /* HACK! */
>           char *nodename;
>           uint32_t segs[] = {cpu_to_be32(28), cpu_to_be32(40),
>                              0xffffffff, 0xffffffff};
> @@ -147,6 +150,9 @@ static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
>           _FDT((fdt_property(fdt, "ibm,pft-size", pft_size_prop, sizeof(pft_size_prop))));
>           _FDT((fdt_property_string(fdt, "status", "okay")));
>           _FDT((fdt_property(fdt, "64-bit", NULL, 0)));
> +        _FDT((fdt_property_cell(fdt, "ibm,ppc-interrupt-server#s", i)));
> +        _FDT((fdt_property(fdt, "ibm,ppc-interrupt-gserver#s",
> +                           gserver_prop, sizeof(gserver_prop))));
>
>           if (envs[i]->mmu_model&  POWERPC_MMU_1TSEG) {
>               _FDT((fdt_property(fdt, "ibm,processor-segment-sizes",
> @@ -168,6 +174,20 @@ static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
>
>       _FDT((fdt_end_node(fdt)));
>
> +    /* interrupt controller */
> +    _FDT((fdt_begin_node(fdt, "interrupt-controller@0")));
> +
> +    _FDT((fdt_property_string(fdt, "device_type",
> +                              "PowerPC-External-Interrupt-Presentation")));
> +    _FDT((fdt_property_string(fdt, "compatible", "IBM,ppc-xicp")));
> +    _FDT((fdt_property_cell(fdt, "reg", 0)));
> +    _FDT((fdt_property(fdt, "interrupt-controller", NULL, 0)));
> +    _FDT((fdt_property(fdt, "ibm,interrupt-server-ranges",
> +                       interrupt_server_ranges_prop,
> +                       sizeof(interrupt_server_ranges_prop))));
> +
> +    _FDT((fdt_end_node(fdt)));
> +
>       /* vdevice */
>       _FDT((fdt_begin_node(fdt, "vdevice")));
>
> @@ -175,6 +195,8 @@ static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
>       _FDT((fdt_property_string(fdt, "compatible", "IBM,vdevice")));
>       _FDT((fdt_property_cell(fdt, "#address-cells", 0x1)));
>       _FDT((fdt_property_cell(fdt, "#size-cells", 0x0)));
> +    _FDT((fdt_property_cell(fdt, "#interrupt-cells", 0x2)));
> +    _FDT((fdt_property(fdt, "interrupt-controller", NULL, 0)));
>
>       _FDT((fdt_end_node(fdt)));
>
> @@ -290,6 +312,10 @@ static void ppc_spapr_init(ram_addr_t ram_size,
>       }
>       qemu_free(filename);
>
> +    /* Set up Interrupt Controller */
> +    spapr->icp = xics_system_init(smp_cpus,&env, MAX_SERIAL_PORTS);
> +
> +    /* Set up VIO bus */
>       spapr->vio_bus = spapr_vio_bus_init();
>
>       for (i = 0; i<  MAX_SERIAL_PORTS; i++) {
> diff --git a/hw/spapr.h b/hw/spapr.h
> index 7a7c319..4b54c22 100644
> --- a/hw/spapr.h
> +++ b/hw/spapr.h
> @@ -2,9 +2,11 @@
>   #define __HW_SPAPR_H__
>
>   struct VIOsPAPRBus;
> +struct icp_state;
>
>   typedef struct sPAPREnvironment {
>       struct VIOsPAPRBus *vio_bus;
> +    struct icp_state *icp;
>   } sPAPREnvironment;
>
>   #define H_SUCCESS         0
> diff --git a/hw/xics.c b/hw/xics.c
> new file mode 100644
> index 0000000..46e778a
> --- /dev/null
> +++ b/hw/xics.c
> @@ -0,0 +1,528 @@
> +#include "hw.h"
> +#include "hw/spapr.h"
> +#include "hw/xics.h"
> +
> +#include<pthread.h>
> +
> +/*
> + * ICP: Presentation layer
> + */
> +
> +struct icp_server_state {
> +    uint32_t cppr :8;
> +    uint32_t xisr :24;
> +    uint8_t pending_priority;
> +    uint8_t mfrr;
> +    qemu_irq output;
> +    pthread_mutex_t lock;
> +};
> +
> +struct ics_state;
> +
> +struct icp_state {
> +    long nr_servers;
> +    struct icp_server_state *ss;
> +    struct ics_state *ics;
> +};
> +
> +static void ics_reject(struct ics_state *ics, int nr);
> +static void ics_resend(struct ics_state *ics);
> +static void ics_eoi(struct ics_state *ics, int nr);
> +
> +static void icp_check_ipi(struct icp_state *icp, int server)
> +{
> +    struct icp_server_state *ss = icp->ss + server;
> +
> +    if (ss->xisr&&  (ss->pending_priority<= ss->mfrr)) {
> +        return;
> +    }
> +
> +    if (ss->xisr) {
> +        ics_reject(icp->ics, ss->xisr);
> +    }
> +
> +    ss->xisr = XICS_IPI;
> +    ss->pending_priority = ss->mfrr;
> +    qemu_irq_raise(ss->output);
> +}
> +
> +static void icp_resend(struct icp_state *icp, int server)
> +{
> +    struct icp_server_state *ss = icp->ss + server;
> +
> +    if (ss->mfrr<  ss->cppr) {
> +        icp_check_ipi(icp, server);
> +    }
> +    ics_resend(icp->ics);
> +}
> +
> +static void icp_set_cppr(struct icp_state *icp, int server, uint8_t cppr)
> +{
> +    struct icp_server_state *ss = icp->ss + server;
> +    uint8_t old_cppr;
> +    uint32_t old_xisr;
> +
> +    pthread_mutex_lock(&ss->lock);
> +    old_cppr = ss->cppr;
> +    ss->cppr = cppr;
> +
> +    if (cppr<  old_cppr) {
> +        if (ss->xisr&&  (cppr<= ss->pending_priority)) {
> +            old_xisr = ss->xisr;
> +            ss->xisr = 0;
> +            qemu_irq_lower(ss->output);
> +            ics_reject(icp->ics, old_xisr);
> +        }
> +    } else {
> +        if (!ss->xisr) {
> +            icp_resend(icp, server);
> +        }
> +    }
> +    pthread_mutex_unlock(&ss->lock);
> +}
> +
> +static void icp_set_mfrr(struct icp_state *icp, int nr, uint8_t mfrr)
> +{
> +    struct icp_server_state *ss = icp->ss + nr;
> +
> +    pthread_mutex_lock(&ss->lock);
> +
> +    ss->mfrr = mfrr;
> +    if (mfrr<  ss->cppr) {
> +        icp_check_ipi(icp, nr);
> +    }
> +
> +    pthread_mutex_unlock(&ss->lock);
> +}
> +
> +static uint32_t icp_accept(struct icp_server_state *ss)
> +{
> +    uint32_t xirr;
> +
> +    pthread_mutex_lock(&ss->lock);
> +    qemu_irq_lower(ss->output);
> +    xirr = ss->cppr<<  24 | ss->xisr;
> +    ss->xisr = 0;
> +    ss->cppr = ss->pending_priority;
> +    pthread_mutex_unlock(&ss->lock);
> +    return xirr;
> +}
> +
> +static void icp_eoi(struct icp_state *icp, int server, uint32_t xirr)
> +{
> +    struct icp_server_state *ss = icp->ss + server;
> +
> +    ics_eoi(icp->ics, xirr&  0xffffff);
> +    /* Send EOI ->  ICS */
> +    ss->cppr = xirr>>  24;
> +    if (!ss->xisr) {
> +        icp_resend(icp, server);
> +    }
> +}
> +
> +static void icp_irq(struct icp_state *icp, int server, int nr, uint8_t priority)
> +{
> +    struct icp_server_state *ss = icp->ss + server;
> +
> +    pthread_mutex_lock(&ss->lock);
> +
> +    if ((priority>= ss->cppr)
> +        || (ss->xisr&&  (ss->pending_priority<= priority))) {
> +        ics_reject(icp->ics, nr);
> +    } else {
> +        if (ss->xisr) {
> +            ics_reject(icp->ics, ss->xisr);
> +        }
> +        ss->xisr = nr;
> +        ss->pending_priority = priority;
> +        qemu_irq_raise(ss->output);
> +    }
> +
> +    pthread_mutex_unlock(&ss->lock);
> +}
> +
> +/*
> + * ICS: Source layer
> + */
> +
> +struct ics_irq_state {
> +    int server;
> +    uint8_t priority;
> +    uint8_t saved_priority;
> +    /* int pending :1; */
> +    /* int presented :1; */
> +    int rejected :1;
> +    int masked_pending :1;
> +};
> +
> +struct ics_state {
> +    int nr_irqs;
> +    int offset;
> +    qemu_irq *qirqs;
> +    struct ics_irq_state *irqs;
> +    struct icp_state *icp;
> +};
> +
> +static int ics_valid_irq(struct ics_state *ics, uint32_t nr)
> +{
> +    return (nr>= ics->offset)
> +&&  (nr<  (ics->offset + ics->nr_irqs));
> +}
> +
> +static void ics_set_irq_msi(void *opaque, int nr, int val)
> +{
> +    struct ics_state *ics = (struct ics_state *)opaque;
> +    struct ics_irq_state *irq = ics->irqs + nr;
> +
> +    if (val) {
> +        if (irq->priority == 0xff) {
> +            irq->masked_pending = 1;
> +            /* masked pending */ ;
> +        } else  {
> +            icp_irq(ics->icp, irq->server, nr + ics->offset, irq->priority);
> +        }
> +    }
> +}
> +
> +static void ics_reject_msi(struct ics_state *ics, int nr)
> +{
> +    struct ics_irq_state *irq = ics->irqs + nr - ics->offset;
> +
> +    irq->rejected = 1;
> +}
> +
> +static void ics_resend_msi(struct ics_state *ics)
> +{
> +    int i;
> +
> +    for (i = 0; i<  ics->nr_irqs; i++) {
> +        struct ics_irq_state *irq = ics->irqs + i;
> +
> +        /* FIXME: filter by server#? */
> +        if (irq->rejected) {
> +            irq->rejected = 0;
> +            if (irq->priority != 0xff) {
> +                icp_irq(ics->icp, irq->server, i + ics->offset, irq->priority);
> +            }
> +        }
> +    }
> +}
> +
> +static void ics_write_xive_msi(struct ics_state *ics, int nr, int server,
> +                               uint8_t priority)
> +{
> +    struct ics_irq_state *irq = ics->irqs + nr;
> +
> +    irq->server = server;
> +    irq->priority = priority;
> +
> +    if (!irq->masked_pending || (priority = 0xff)) {
> +        return;
> +    }
> +
> +    irq->masked_pending = 0;
> +    icp_irq(ics->icp, server, nr + ics->offset, priority);
> +}
> +
> +/* static void ics_recheck_irq(struct ics_state *ics, int nr) */
> +/* { */
> +/*     struct ics_irq_state *irq = xics->irqs + (nr - xics->offset); */
> +
> +/*     if (irq->pending&&  (irq->priority != 0xff)) { */
> +/*      irq->presented = 1; */
> +/*      icp_irq(xicp->ss + irq->server, nr + ics->offset, irq->priority); */
> +/*     } */
> +/* } */
> +
> +/* static void ics_set_irq(void *opaque, int nr, int val) */
> +/* { */
> +/*     struct ics_state *ics = (struct ics_state *)opaque; */
> +/*     struct ics_irq_state *irq = ics->irqs + nr; */
> +
> +/*     irq->pending = val; */
> +/*     ics_recheck_irq(ics, nr); */
> +/* } */
> +
> +/* static void ics_reject(int nr) */
> +/* { */
> +/*     struct ics_irq_state *irq = xics->irqs + (nr - xics->offset); */
> +
> +/*     assert(irq->presented); */
> +/*     irq->rejected = 1; */
> +/*     irq->presented = 0; */
> +/* } */
> +
> +/* static void ics_eoi(int nr) */
> +/* { */
> +/*     struct ics_irq_state *irq = xics->irqs + (nr - xics->offset); */
> +
> +/*     assert(irq->presented); */
> +/*     irq->presented = 0; */
> +/*     irq->rejected = 0; */
> +/*     ics_recheck_irq(xics, nr); */
> +/* } */
> +
> +/* static void ics_resend_irq(struct ics_state *ics, int nr, */
> +/*                            struct icp_server_state *ss) */
> +/* { */
> +/*     struct ics_irq_state *irq = ics->irqs + (nr - ics->offset); */
> +
> +/*     if (!irq->rejected) */
> +/*         return; /\* Not rejected, so no need to resend *\/ */
> +
> +/*     if (ss != (xicp->ss + irq->server)) */
> +/*         return; /\* Not for this server, so don't resend *\/ */
> +
> +/*     ics_recheck_irq(ics, nr); */
> +/* } */
> +
> +/* static void ics_resend(struct icp_server_state *ss) */
> +/* { */
> +/*     int i; */
> +
> +/*     for (i = 0; i<  xics->nr_irqs; i++) */
> +/*         ics_resend_irq(xics, nr, ss); */
> +/* } */

Why is all this commented out? Better #if 0 it all away. Or even better, 
don't include it in the patch - unless you think the code is crucial and 
to be activated soon.

> +
> +static void ics_reject(struct ics_state *ics, int nr)
> +{
> +    ics_reject_msi(ics, nr);
> +}
> +
> +static void ics_resend(struct ics_state *ics)
> +{
> +    ics_resend_msi(ics);
> +}
> +
> +static void ics_eoi(struct ics_state *ics, int nr)
> +{
> +}
> +
> +/*
> + * Exported functions
> + */
> +
> +qemu_irq xics_find_qirq(struct icp_state *icp, int irq)
> +{
> +    if ((irq<  icp->ics->offset)
> +        || (irq>= (icp->ics->offset + icp->ics->nr_irqs))) {
> +        return NULL;
> +    }
> +
> +    return icp->ics->qirqs[irq - icp->ics->offset];
> +}
> +
> +static target_ulong h_cppr(CPUState *env, sPAPREnvironment *spapr,
> +                           target_ulong opcode, target_ulong *args)
> +{
> +    target_ulong cppr = args[0];
> +
> +    icp_set_cppr(spapr->icp, env->cpu_index, cppr);
> +    return H_SUCCESS;
> +}
> +
> +static target_ulong h_ipi(CPUState *env, sPAPREnvironment *spapr,
> +                          target_ulong opcode, target_ulong *args)
> +{
> +    target_ulong server = args[0];
> +    target_ulong mfrr = args[1];
> +
> +    if (server>= spapr->icp->nr_servers) {
> +        return H_PARAMETER;
> +    }
> +
> +    icp_set_mfrr(spapr->icp, server, mfrr);
> +    return H_SUCCESS;
> +
> +}
> +
> +static target_ulong h_xirr(CPUState *env, sPAPREnvironment *spapr,
> +                           target_ulong opcode, target_ulong *args)
> +{
> +    uint32_t xirr = icp_accept(spapr->icp->ss + env->cpu_index);
> +
> +    args[0] = xirr;
> +    return H_SUCCESS;
> +}
> +
> +static target_ulong h_eoi(CPUState *env, sPAPREnvironment *spapr,
> +                          target_ulong opcode, target_ulong *args)
> +{
> +    target_ulong xirr = args[0];
> +
> +    icp_eoi(spapr->icp, env->cpu_index, xirr);
> +    return H_SUCCESS;
> +}
> +
> +static void rtas_set_xive(sPAPREnvironment *spapr, uint32_t token,
> +                          uint32_t nargs, target_ulong args,
> +                          uint32_t nret, target_ulong rets)
> +{
> +    struct ics_state *ics = spapr->icp->ics;
> +    uint32_t nr, server, priority;
> +
> +    if ((nargs != 3) || (nret != 1)) {
> +        rtas_st(rets, 0, -3);
> +        return;
> +    }
> +
> +    nr = rtas_ld(args, 0);
> +    server = rtas_ld(args, 1);
> +    priority = rtas_ld(args, 2);
> +
> +    if (!ics_valid_irq(ics, nr) || (server>= ics->icp->nr_servers)
> +        || (priority>  0xff)) {
> +        rtas_st(rets, 0, -3);
> +        return;
> +    }
> +
> +    ics_write_xive_msi(ics, nr - ics->offset, server, priority);
> +
> +    rtas_st(rets, 0, 0); /* Success */
> +}
> +
> +static void rtas_get_xive(sPAPREnvironment *spapr, uint32_t token,
> +                          uint32_t nargs, target_ulong args,
> +                          uint32_t nret, target_ulong rets)
> +{
> +    struct ics_state *ics = spapr->icp->ics;
> +    uint32_t nr;
> +
> +    if ((nargs != 1) || (nret != 3)) {
> +        rtas_st(rets, 0, -3);
> +        return;
> +    }
> +
> +    nr = rtas_ld(args, 0);
> +
> +    if (!ics_valid_irq(ics, nr)) {
> +        rtas_st(rets, 0, -3);
> +        return;
> +    }
> +
> +    rtas_st(rets, 0, 0); /* Success */
> +    rtas_st(rets, 1, ics->irqs[nr - ics->offset].server);
> +    rtas_st(rets, 2, ics->irqs[nr - ics->offset].priority);
> +}
> +
> +static void rtas_int_off(sPAPREnvironment *spapr, uint32_t token,
> +                         uint32_t nargs, target_ulong args,
> +                         uint32_t nret, target_ulong rets)
> +{
> +    struct ics_state *ics = spapr->icp->ics;
> +    uint32_t nr;
> +
> +    if ((nargs != 1) || (nret != 1)) {
> +        rtas_st(rets, 0, -3);
> +        return;
> +    }
> +
> +    nr = rtas_ld(args, 0);
> +
> +    if (!ics_valid_irq(ics, nr)) {
> +        rtas_st(rets, 0, -3);
> +        return;
> +    }
> +
> +    /* This is a NOP for now, since the described PAPR semantics don't
> +     * seem to gel with what Linux does */
> +#if 0
> +    struct ics_irq_state *irq = xics->irqs + (nr - xics->offset);
> +
> +    irq->saved_priority = irq->priority;
> +    ics_write_xive_msi(xics, nr - xics->offset, irq->server, 0xff);
> +#endif
> +
> +    rtas_st(rets, 0, 0); /* Success */
> +}
> +
> +static void rtas_int_on(sPAPREnvironment *spapr, uint32_t token,
> +                        uint32_t nargs, target_ulong args,
> +                        uint32_t nret, target_ulong rets)
> +{
> +    struct ics_state *ics = spapr->icp->ics;
> +    uint32_t nr;
> +
> +    if ((nargs != 1) || (nret != 1)) {
> +        rtas_st(rets, 0, -3);
> +        return;
> +    }
> +
> +    nr = rtas_ld(args, 0);
> +
> +    if (!ics_valid_irq(ics, nr)) {
> +        rtas_st(rets, 0, -3);
> +        return;
> +    }
> +
> +    /* This is a NOP for now, since the described PAPR semantics don't
> +     * seem to gel with what Linux does */
> +#if 0
> +    struct ics_irq_state *irq = xics->irqs + (nr - xics->offset);
> +
> +    ics_write_xive_msi(xics, nr - xics->offset,
> +                       irq->server, irq->saved_priority);
> +#endif
> +
> +    rtas_st(rets, 0, 0); /* Success */
> +}
> +
> +struct icp_state *xics_system_init(int nr_servers, CPUState *servers[],
> +                                   int nr_irqs)
> +{
> +    int i;
> +    struct icp_state *icp;
> +    struct ics_state *ics;
> +
> +    icp = qemu_mallocz(sizeof(*icp));
> +    icp->nr_servers = nr_servers;
> +    icp->ss = qemu_mallocz(nr_servers * sizeof(struct icp_server_state));
> +
> +    for (i = 0; i<  nr_servers; i++) {
> +        servers[i]->cpu_index = i;
> +
> +        switch (PPC_INPUT(servers[i])) {
> +        case PPC_FLAGS_INPUT_POWER7:
> +            icp->ss[i].output = servers[i]->irq_inputs[POWER7_INPUT_INT];
> +            break;
> +
> +        case PPC_FLAGS_INPUT_970:
> +            icp->ss[i].output = servers[i]->irq_inputs[PPC970_INPUT_INT];
> +            break;
> +
> +        default:
> +            hw_error("XICS interrupt model does not support this CPU bus model\n");
> +            exit(1);
> +        }
> +
> +        icp->ss[i].mfrr = 0xff;
> +        pthread_mutex_init(&icp->ss[i].lock, NULL);
> +    }
> +
> +    ics = qemu_mallocz(sizeof(*ics));
> +    ics->nr_irqs = nr_irqs;
> +    ics->offset = 16;
> +    ics->irqs = qemu_mallocz(nr_irqs * sizeof(struct ics_irq_state));
> +
> +    icp->ics = ics;
> +    ics->icp = icp;
> +
> +    for (i = 0; i<  nr_irqs; i++) {
> +        ics->irqs[i].priority = 0xff;
> +        ics->irqs[i].saved_priority = 0xff;
> +    }
> +
> +    ics->qirqs = qemu_allocate_irqs(ics_set_irq_msi, ics, nr_irqs);
> +
> +    spapr_register_hypercall(H_CPPR, h_cppr);
> +    spapr_register_hypercall(H_IPI, h_ipi);
> +    spapr_register_hypercall(H_XIRR, h_xirr);
> +    spapr_register_hypercall(H_EOI, h_eoi);
> +
> +    spapr_rtas_register("ibm,set-xive", rtas_set_xive);
> +    spapr_rtas_register("ibm,get-xive", rtas_get_xive);
> +    spapr_rtas_register("ibm,int-off", rtas_int_off);
> +    spapr_rtas_register("ibm,int-on", rtas_int_on);
> +
> +    return icp;
> +}
> diff --git a/hw/xics.h b/hw/xics.h
> new file mode 100644
> index 0000000..e55f5f1
> --- /dev/null
> +++ b/hw/xics.h

Header missing

> @@ -0,0 +1,13 @@
> +#if !defined(__XICS_H__)
> +#define __XICS_H__
> +
> +#define XICS_IPI        0x2
> +
> +struct icp_state;
> +
> +qemu_irq xics_find_qirq(struct icp_state *icp, int irq);
> +
> +struct icp_state *xics_system_init(int nr_servers, CPUState *servers[],
> +                                   int nr_irqs);
> +
> +#endif /* __XICS_H__ */

Alex
Anthony Liguori - March 16, 2011, 10:16 p.m.
On 03/15/2011 11:56 PM, David Gibson wrote:
> PAPR defines an interrupt control architecture which is logically divided
> into ICS (Interrupt Control Presentation, each unit is responsible for
> presenting interrupts to a particular "interrupt server", i.e. CPU) and
> ICS (Interrupt Control Source, each unit responsible for one or more
> hardware interrupts as numbered globally across the system).  All PAPR
> virtual IO devices expect to deliver interrupts via this mechanism.  In
> Linux, this interrupt controller system is handled by the "xics" driver.
>
> On pSeries systems, access to the interrupt controller is virtualized via
> hypercalls and RTAS methods.  However, the virtualized interface is very
> similar to the underlying interrupt controller hardware, and similar PICs
> exist un-virtualized in some other systems.
>
> This patch implements both the ICP and ICS sides of the PAPR interrupt
> controller.  For now, only the hypercall virtualized interface is provided,
> however it would be relatively straightforward to graft an emulated
> register interface onto the underlying interrupt logic if we want to add
> a machine with a hardware ICS/ICP system in the future.
>
> There are some limitations in this implementation: it is assumed for now
> that only one instance of the ICS exists, although a full xics system can
> have several, each responsible for a different group of hardware irqs.
> ICP/ICS can handle both level-sensitve (LSI) and message signalled (MSI)
> interrupt inputs.  For now, this implementation supports only MSI
> interrupts, since that is used by PAPR virtual IO devices.
>
> Signed-off-by: Paul Mackerras<paulus@samba.org>
> Signed-off-by: David Gibson<dwg@au1.ibm.com>
> ---
>   Makefile.target |    2 +-
>   hw/spapr.c      |   26 +++
>   hw/spapr.h      |    2 +
>   hw/xics.c       |  528 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>   hw/xics.h       |   13 ++
>   5 files changed, 570 insertions(+), 1 deletions(-)
>   create mode 100644 hw/xics.c
>   create mode 100644 hw/xics.h
>
> diff --git a/Makefile.target b/Makefile.target
> index e333225..2b0588e 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -233,7 +233,7 @@ obj-ppc-y += ppc_oldworld.o
>   obj-ppc-y += ppc_newworld.o
>   # IBM pSeries (sPAPR)
>   obj-ppc-y += spapr.o spapr_hcall.o spapr_rtas.o spapr_vio.o
> -obj-ppc-y += spapr_vty.o
> +obj-ppc-y += xics.o spapr_vty.o
>   # PowerPC 4xx boards
>   obj-ppc-y += ppc4xx_devs.o ppc4xx_pci.o ppc405_uc.o ppc405_boards.o
>   obj-ppc-y += ppc440.o ppc440_bamboo.o
> diff --git a/hw/spapr.c b/hw/spapr.c
> index 23f493a..be30def 100644
> --- a/hw/spapr.c
> +++ b/hw/spapr.c
> @@ -34,6 +34,7 @@
>
>   #include "hw/spapr.h"
>   #include "hw/spapr_vio.h"
> +#include "hw/xics.h"
>
>   #include<libfdt.h>
>
> @@ -62,6 +63,7 @@ static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
>       uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
>       uint32_t pft_size_prop[] = {0, cpu_to_be32(hash_shift)};
>       char hypertas_prop[] = "hcall-pft\0hcall-term\0hcall-dabr";
> +    uint32_t interrupt_server_ranges_prop[] = {0, cpu_to_be32(smp_cpus)};
>       int i;
>       char *modelname;
>       int ret;
> @@ -120,6 +122,7 @@ static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
>
>       for (i = 0; i<  smp_cpus; i++) {
>           CPUState *env = envs[i];
> +        uint32_t gserver_prop[] = {cpu_to_be32(i), 0}; /* HACK! */
>           char *nodename;
>           uint32_t segs[] = {cpu_to_be32(28), cpu_to_be32(40),
>                              0xffffffff, 0xffffffff};
> @@ -147,6 +150,9 @@ static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
>           _FDT((fdt_property(fdt, "ibm,pft-size", pft_size_prop, sizeof(pft_size_prop))));
>           _FDT((fdt_property_string(fdt, "status", "okay")));
>           _FDT((fdt_property(fdt, "64-bit", NULL, 0)));
> +        _FDT((fdt_property_cell(fdt, "ibm,ppc-interrupt-server#s", i)));
> +        _FDT((fdt_property(fdt, "ibm,ppc-interrupt-gserver#s",
> +                           gserver_prop, sizeof(gserver_prop))));
>
>           if (envs[i]->mmu_model&  POWERPC_MMU_1TSEG) {
>               _FDT((fdt_property(fdt, "ibm,processor-segment-sizes",
> @@ -168,6 +174,20 @@ static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
>
>       _FDT((fdt_end_node(fdt)));
>
> +    /* interrupt controller */
> +    _FDT((fdt_begin_node(fdt, "interrupt-controller@0")));
> +
> +    _FDT((fdt_property_string(fdt, "device_type",
> +                              "PowerPC-External-Interrupt-Presentation")));
> +    _FDT((fdt_property_string(fdt, "compatible", "IBM,ppc-xicp")));
> +    _FDT((fdt_property_cell(fdt, "reg", 0)));
> +    _FDT((fdt_property(fdt, "interrupt-controller", NULL, 0)));
> +    _FDT((fdt_property(fdt, "ibm,interrupt-server-ranges",
> +                       interrupt_server_ranges_prop,
> +                       sizeof(interrupt_server_ranges_prop))));
> +
> +    _FDT((fdt_end_node(fdt)));
> +
>       /* vdevice */
>       _FDT((fdt_begin_node(fdt, "vdevice")));
>
> @@ -175,6 +195,8 @@ static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
>       _FDT((fdt_property_string(fdt, "compatible", "IBM,vdevice")));
>       _FDT((fdt_property_cell(fdt, "#address-cells", 0x1)));
>       _FDT((fdt_property_cell(fdt, "#size-cells", 0x0)));
> +    _FDT((fdt_property_cell(fdt, "#interrupt-cells", 0x2)));
> +    _FDT((fdt_property(fdt, "interrupt-controller", NULL, 0)));
>
>       _FDT((fdt_end_node(fdt)));
>
> @@ -290,6 +312,10 @@ static void ppc_spapr_init(ram_addr_t ram_size,
>       }
>       qemu_free(filename);
>
> +    /* Set up Interrupt Controller */
> +    spapr->icp = xics_system_init(smp_cpus,&env, MAX_SERIAL_PORTS);
> +
> +    /* Set up VIO bus */
>       spapr->vio_bus = spapr_vio_bus_init();
>
>       for (i = 0; i<  MAX_SERIAL_PORTS; i++) {
> diff --git a/hw/spapr.h b/hw/spapr.h
> index 7a7c319..4b54c22 100644
> --- a/hw/spapr.h
> +++ b/hw/spapr.h
> @@ -2,9 +2,11 @@
>   #define __HW_SPAPR_H__
>
>   struct VIOsPAPRBus;
> +struct icp_state;
>
>   typedef struct sPAPREnvironment {
>       struct VIOsPAPRBus *vio_bus;
> +    struct icp_state *icp;
>   } sPAPREnvironment;
>
>   #define H_SUCCESS         0
> diff --git a/hw/xics.c b/hw/xics.c
> new file mode 100644
> index 0000000..46e778a
> --- /dev/null
> +++ b/hw/xics.c
> @@ -0,0 +1,528 @@

Copyright.

> +#include "hw.h"
> +#include "hw/spapr.h"
> +#include "hw/xics.h"
> +
> +#include<pthread.h>

This isn't needed and it'll break the Windows build.   We carry a global 
mutex whenever QEMU code executes.

> +/*
> + * ICP: Presentation layer
> + */
> +
> +struct icp_server_state {
> +    uint32_t cppr :8;
> +    uint32_t xisr :24;

No real reason to use bitfields here.

> +    uint8_t pending_priority;
> +    uint8_t mfrr;
> +    qemu_irq output;
> +    pthread_mutex_t lock;
> +};
> +
> +struct ics_state;
> +
> +struct icp_state {
> +    long nr_servers;
> +    struct icp_server_state *ss;
> +    struct ics_state *ics;
> +};
> +
> +static void ics_reject(struct ics_state *ics, int nr);
> +static void ics_resend(struct ics_state *ics);
> +static void ics_eoi(struct ics_state *ics, int nr);
> +
> +static void icp_check_ipi(struct icp_state *icp, int server)
> +{
> +    struct icp_server_state *ss = icp->ss + server;
> +
> +    if (ss->xisr&&  (ss->pending_priority<= ss->mfrr)) {
> +        return;
> +    }
> +
> +    if (ss->xisr) {
> +        ics_reject(icp->ics, ss->xisr);
> +    }
> +
> +    ss->xisr = XICS_IPI;
> +    ss->pending_priority = ss->mfrr;
> +    qemu_irq_raise(ss->output);
> +}
> +
> +static void icp_resend(struct icp_state *icp, int server)
> +{
> +    struct icp_server_state *ss = icp->ss + server;
> +
> +    if (ss->mfrr<  ss->cppr) {
> +        icp_check_ipi(icp, server);
> +    }
> +    ics_resend(icp->ics);
> +}
> +
> +static void icp_set_cppr(struct icp_state *icp, int server, uint8_t cppr)
> +{
> +    struct icp_server_state *ss = icp->ss + server;
> +    uint8_t old_cppr;
> +    uint32_t old_xisr;
> +
> +    pthread_mutex_lock(&ss->lock);
> +    old_cppr = ss->cppr;
> +    ss->cppr = cppr;
> +
> +    if (cppr<  old_cppr) {
> +        if (ss->xisr&&  (cppr<= ss->pending_priority)) {
> +            old_xisr = ss->xisr;
> +            ss->xisr = 0;
> +            qemu_irq_lower(ss->output);
> +            ics_reject(icp->ics, old_xisr);
> +        }
> +    } else {
> +        if (!ss->xisr) {
> +            icp_resend(icp, server);
> +        }
> +    }
> +    pthread_mutex_unlock(&ss->lock);
> +}
> +
> +static void icp_set_mfrr(struct icp_state *icp, int nr, uint8_t mfrr)
> +{
> +    struct icp_server_state *ss = icp->ss + nr;
> +
> +    pthread_mutex_lock(&ss->lock);
> +
> +    ss->mfrr = mfrr;
> +    if (mfrr<  ss->cppr) {
> +        icp_check_ipi(icp, nr);
> +    }
> +
> +    pthread_mutex_unlock(&ss->lock);
> +}
> +
> +static uint32_t icp_accept(struct icp_server_state *ss)
> +{
> +    uint32_t xirr;
> +
> +    pthread_mutex_lock(&ss->lock);
> +    qemu_irq_lower(ss->output);
> +    xirr = ss->cppr<<  24 | ss->xisr;
> +    ss->xisr = 0;
> +    ss->cppr = ss->pending_priority;
> +    pthread_mutex_unlock(&ss->lock);
> +    return xirr;
> +}
> +
> +static void icp_eoi(struct icp_state *icp, int server, uint32_t xirr)
> +{
> +    struct icp_server_state *ss = icp->ss + server;
> +
> +    ics_eoi(icp->ics, xirr&  0xffffff);
> +    /* Send EOI ->  ICS */
> +    ss->cppr = xirr>>  24;
> +    if (!ss->xisr) {
> +        icp_resend(icp, server);
> +    }
> +}
> +
> +static void icp_irq(struct icp_state *icp, int server, int nr, uint8_t priority)
> +{
> +    struct icp_server_state *ss = icp->ss + server;
> +
> +    pthread_mutex_lock(&ss->lock);
> +
> +    if ((priority>= ss->cppr)
> +        || (ss->xisr&&  (ss->pending_priority<= priority))) {
> +        ics_reject(icp->ics, nr);
> +    } else {
> +        if (ss->xisr) {
> +            ics_reject(icp->ics, ss->xisr);
> +        }
> +        ss->xisr = nr;
> +        ss->pending_priority = priority;
> +        qemu_irq_raise(ss->output);
> +    }
> +
> +    pthread_mutex_unlock(&ss->lock);
> +}
> +
> +/*
> + * ICS: Source layer
> + */
> +
> +struct ics_irq_state {
> +    int server;
> +    uint8_t priority;
> +    uint8_t saved_priority;
> +    /* int pending :1; */
> +    /* int presented :1; */
> +    int rejected :1;
> +    int masked_pending :1;
> +};
> +
> +struct ics_state {
> +    int nr_irqs;
> +    int offset;
> +    qemu_irq *qirqs;
> +    struct ics_irq_state *irqs;
> +    struct icp_state *icp;
> +};
> +
> +static int ics_valid_irq(struct ics_state *ics, uint32_t nr)
> +{
> +    return (nr>= ics->offset)
> +&&  (nr<  (ics->offset + ics->nr_irqs));
> +}
> +
> +static void ics_set_irq_msi(void *opaque, int nr, int val)
> +{
> +    struct ics_state *ics = (struct ics_state *)opaque;
> +    struct ics_irq_state *irq = ics->irqs + nr;
> +
> +    if (val) {
> +        if (irq->priority == 0xff) {
> +            irq->masked_pending = 1;
> +            /* masked pending */ ;
> +        } else  {
> +            icp_irq(ics->icp, irq->server, nr + ics->offset, irq->priority);
> +        }
> +    }
> +}
> +
> +static void ics_reject_msi(struct ics_state *ics, int nr)
> +{
> +    struct ics_irq_state *irq = ics->irqs + nr - ics->offset;
> +
> +    irq->rejected = 1;
> +}
> +
> +static void ics_resend_msi(struct ics_state *ics)
> +{
> +    int i;
> +
> +    for (i = 0; i<  ics->nr_irqs; i++) {
> +        struct ics_irq_state *irq = ics->irqs + i;
> +
> +        /* FIXME: filter by server#? */
> +        if (irq->rejected) {
> +            irq->rejected = 0;
> +            if (irq->priority != 0xff) {
> +                icp_irq(ics->icp, irq->server, i + ics->offset, irq->priority);
> +            }
> +        }
> +    }
> +}
> +
> +static void ics_write_xive_msi(struct ics_state *ics, int nr, int server,
> +                               uint8_t priority)
> +{
> +    struct ics_irq_state *irq = ics->irqs + nr;
> +
> +    irq->server = server;
> +    irq->priority = priority;
> +
> +    if (!irq->masked_pending || (priority = 0xff)) {
> +        return;
> +    }
> +
> +    irq->masked_pending = 0;
> +    icp_irq(ics->icp, server, nr + ics->offset, priority);
> +}
> +
> +/* static void ics_recheck_irq(struct ics_state *ics, int nr) */

This is a pretty ugly way to comment out code.  At least use an #if 0.

Regards,

Anthony Liguori
David Gibson - March 17, 2011, 1:29 a.m.
On Wed, Mar 16, 2011 at 04:47:11PM +0100, Alexander Graf wrote:
> On 03/16/2011 05:56 AM, David Gibson wrote:
[snip]
> >+/* static void ics_resend(struct icp_server_state *ss) */
> >+/* { */
> >+/*     int i; */
> >+
> >+/*     for (i = 0; i<  xics->nr_irqs; i++) */
> >+/*         ics_resend_irq(xics, nr, ss); */
> >+/* } */
> 
> Why is all this commented out? Better #if 0 it all away. Or even
> better, don't include it in the patch - unless you think the code is
> crucial and to be activated soon.

Hrm, it was supposed to implement level (rather than message)
interrupts on XICS.  But I think its bitrotted since I commented it
out.  Removed.

> >diff --git a/hw/xics.h b/hw/xics.h
> >new file mode 100644
> >index 0000000..e55f5f1
> >--- /dev/null
> >+++ b/hw/xics.h
> 
> Header missing

I'm not sure what you mean by this
David Gibson - March 17, 2011, 1:34 a.m.
On Wed, Mar 16, 2011 at 05:16:07PM -0500, Anthony Liguori wrote:
> On 03/15/2011 11:56 PM, David Gibson wrote:
[snip]
> >+#include<pthread.h>
> 
> This isn't needed and it'll break the Windows build.   We carry a
> global mutex whenever QEMU code executes.

Good point, I wrote this before I realized all the qemu code was serialized.

> >+/*
> >+ * ICP: Presentation layer
> >+ */
> >+
> >+struct icp_server_state {
> >+    uint32_t cppr :8;
> >+    uint32_t xisr :24;
> 
> No real reason to use bitfields here.

Well.. in the hardware xics implementation, CPPR and XISR are
considered fields of the one 32-bit register, XIRR.  Matching that is
why I have the bitfield.
Alexander Graf - March 17, 2011, 7:37 a.m.
On 17.03.2011, at 02:29, David Gibson <david@gibson.dropbear.id.au> wrote:

> On Wed, Mar 16, 2011 at 04:47:11PM +0100, Alexander Graf wrote:
>> On 03/16/2011 05:56 AM, David Gibson wrote:
> [snip]
>>> +/* static void ics_resend(struct icp_server_state *ss) */
>>> +/* { */
>>> +/*     int i; */
>>> +
>>> +/*     for (i = 0; i<  xics->nr_irqs; i++) */
>>> +/*         ics_resend_irq(xics, nr, ss); */
>>> +/* } */
>> 
>> Why is all this commented out? Better #if 0 it all away. Or even
>> better, don't include it in the patch - unless you think the code is
>> crucial and to be activated soon.
> 
> Hrm, it was supposed to implement level (rather than message)
> interrupts on XICS.  But I think its bitrotted since I commented it
> out.  Removed.
> 
>>> diff --git a/hw/xics.h b/hw/xics.h
>>> new file mode 100644
>>> index 0000000..e55f5f1
>>> --- /dev/null
>>> +++ b/hw/xics.h
>> 
>> Header missing
> 
> I'm not sure what you mean by this

Every source file should have a license/copyright header ;)

Alex

>
Anthony Liguori - March 17, 2011, 1:13 p.m.
On 03/16/2011 08:34 PM, David Gibson wrote:
>
>>> +/*
>>> + * ICP: Presentation layer
>>> + */
>>> +
>>> +struct icp_server_state {
>>> +    uint32_t cppr :8;
>>> +    uint32_t xisr :24;
>> No real reason to use bitfields here.
> Well.. in the hardware xics implementation, CPPR and XISR are
> considered fields of the one 32-bit register, XIRR.  Matching that is
> why I have the bitfield.

Bitfields don't work well with the way we save device state.

Regards,

Anthony Liguori
David Gibson - March 23, 2011, 3:48 a.m.
On Thu, Mar 17, 2011 at 08:13:27AM -0500, Anthony Liguori wrote:
> On 03/16/2011 08:34 PM, David Gibson wrote:
> >
> >>>+/*
> >>>+ * ICP: Presentation layer
> >>>+ */
> >>>+
> >>>+struct icp_server_state {
> >>>+    uint32_t cppr :8;
> >>>+    uint32_t xisr :24;
> >>No real reason to use bitfields here.
> >Well.. in the hardware xics implementation, CPPR and XISR are
> >considered fields of the one 32-bit register, XIRR.  Matching that is
> >why I have the bitfield.
> 
> Bitfields don't work well with the way we save device state.

Good point.  In fact, I think I even hit that when I did some
preliminary looking at adding partition save/migration support to the
pseries stuff.  Bitfields removed in the next version.

Patch

diff --git a/Makefile.target b/Makefile.target
index e333225..2b0588e 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -233,7 +233,7 @@  obj-ppc-y += ppc_oldworld.o
 obj-ppc-y += ppc_newworld.o
 # IBM pSeries (sPAPR)
 obj-ppc-y += spapr.o spapr_hcall.o spapr_rtas.o spapr_vio.o
-obj-ppc-y += spapr_vty.o
+obj-ppc-y += xics.o spapr_vty.o
 # PowerPC 4xx boards
 obj-ppc-y += ppc4xx_devs.o ppc4xx_pci.o ppc405_uc.o ppc405_boards.o
 obj-ppc-y += ppc440.o ppc440_bamboo.o
diff --git a/hw/spapr.c b/hw/spapr.c
index 23f493a..be30def 100644
--- a/hw/spapr.c
+++ b/hw/spapr.c
@@ -34,6 +34,7 @@ 
 
 #include "hw/spapr.h"
 #include "hw/spapr_vio.h"
+#include "hw/xics.h"
 
 #include <libfdt.h>
 
@@ -62,6 +63,7 @@  static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
     uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
     uint32_t pft_size_prop[] = {0, cpu_to_be32(hash_shift)};
     char hypertas_prop[] = "hcall-pft\0hcall-term\0hcall-dabr";
+    uint32_t interrupt_server_ranges_prop[] = {0, cpu_to_be32(smp_cpus)};
     int i;
     char *modelname;
     int ret;
@@ -120,6 +122,7 @@  static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
 
     for (i = 0; i < smp_cpus; i++) {
         CPUState *env = envs[i];
+        uint32_t gserver_prop[] = {cpu_to_be32(i), 0}; /* HACK! */
         char *nodename;
         uint32_t segs[] = {cpu_to_be32(28), cpu_to_be32(40),
                            0xffffffff, 0xffffffff};
@@ -147,6 +150,9 @@  static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
         _FDT((fdt_property(fdt, "ibm,pft-size", pft_size_prop, sizeof(pft_size_prop))));
         _FDT((fdt_property_string(fdt, "status", "okay")));
         _FDT((fdt_property(fdt, "64-bit", NULL, 0)));
+        _FDT((fdt_property_cell(fdt, "ibm,ppc-interrupt-server#s", i)));
+        _FDT((fdt_property(fdt, "ibm,ppc-interrupt-gserver#s", 
+                           gserver_prop, sizeof(gserver_prop))));
 
         if (envs[i]->mmu_model & POWERPC_MMU_1TSEG) {
             _FDT((fdt_property(fdt, "ibm,processor-segment-sizes",
@@ -168,6 +174,20 @@  static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
 
     _FDT((fdt_end_node(fdt)));
 
+    /* interrupt controller */ 
+    _FDT((fdt_begin_node(fdt, "interrupt-controller@0")));
+
+    _FDT((fdt_property_string(fdt, "device_type",
+                              "PowerPC-External-Interrupt-Presentation")));
+    _FDT((fdt_property_string(fdt, "compatible", "IBM,ppc-xicp")));
+    _FDT((fdt_property_cell(fdt, "reg", 0)));    
+    _FDT((fdt_property(fdt, "interrupt-controller", NULL, 0)));
+    _FDT((fdt_property(fdt, "ibm,interrupt-server-ranges",
+                       interrupt_server_ranges_prop,
+                       sizeof(interrupt_server_ranges_prop))));
+
+    _FDT((fdt_end_node(fdt)));
+   
     /* vdevice */
     _FDT((fdt_begin_node(fdt, "vdevice")));
 
@@ -175,6 +195,8 @@  static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
     _FDT((fdt_property_string(fdt, "compatible", "IBM,vdevice")));
     _FDT((fdt_property_cell(fdt, "#address-cells", 0x1)));
     _FDT((fdt_property_cell(fdt, "#size-cells", 0x0)));
+    _FDT((fdt_property_cell(fdt, "#interrupt-cells", 0x2)));
+    _FDT((fdt_property(fdt, "interrupt-controller", NULL, 0)));
     
     _FDT((fdt_end_node(fdt)));
 
@@ -290,6 +312,10 @@  static void ppc_spapr_init(ram_addr_t ram_size,
     }
     qemu_free(filename);
 
+    /* Set up Interrupt Controller */
+    spapr->icp = xics_system_init(smp_cpus, &env, MAX_SERIAL_PORTS);
+
+    /* Set up VIO bus */
     spapr->vio_bus = spapr_vio_bus_init();
 
     for (i = 0; i < MAX_SERIAL_PORTS; i++) {
diff --git a/hw/spapr.h b/hw/spapr.h
index 7a7c319..4b54c22 100644
--- a/hw/spapr.h
+++ b/hw/spapr.h
@@ -2,9 +2,11 @@ 
 #define __HW_SPAPR_H__
 
 struct VIOsPAPRBus;
+struct icp_state;
 
 typedef struct sPAPREnvironment {
     struct VIOsPAPRBus *vio_bus;
+    struct icp_state *icp;
 } sPAPREnvironment;
 
 #define H_SUCCESS         0
diff --git a/hw/xics.c b/hw/xics.c
new file mode 100644
index 0000000..46e778a
--- /dev/null
+++ b/hw/xics.c
@@ -0,0 +1,528 @@ 
+#include "hw.h"
+#include "hw/spapr.h"
+#include "hw/xics.h"
+
+#include <pthread.h>
+
+/*
+ * ICP: Presentation layer
+ */
+
+struct icp_server_state {
+    uint32_t cppr :8;
+    uint32_t xisr :24;
+    uint8_t pending_priority;
+    uint8_t mfrr;
+    qemu_irq output;
+    pthread_mutex_t lock;
+};
+
+struct ics_state;
+
+struct icp_state {
+    long nr_servers;
+    struct icp_server_state *ss;
+    struct ics_state *ics;
+};
+
+static void ics_reject(struct ics_state *ics, int nr);
+static void ics_resend(struct ics_state *ics);
+static void ics_eoi(struct ics_state *ics, int nr);
+
+static void icp_check_ipi(struct icp_state *icp, int server)
+{
+    struct icp_server_state *ss = icp->ss + server;
+    
+    if (ss->xisr && (ss->pending_priority <= ss->mfrr)) {
+        return;
+    }
+
+    if (ss->xisr) {
+        ics_reject(icp->ics, ss->xisr);
+    }
+
+    ss->xisr = XICS_IPI;
+    ss->pending_priority = ss->mfrr;
+    qemu_irq_raise(ss->output);
+}
+
+static void icp_resend(struct icp_state *icp, int server)
+{
+    struct icp_server_state *ss = icp->ss + server;
+
+    if (ss->mfrr < ss->cppr) {
+        icp_check_ipi(icp, server);
+    }
+    ics_resend(icp->ics);
+}
+
+static void icp_set_cppr(struct icp_state *icp, int server, uint8_t cppr)
+{
+    struct icp_server_state *ss = icp->ss + server;
+    uint8_t old_cppr;
+    uint32_t old_xisr;
+
+    pthread_mutex_lock(&ss->lock);
+    old_cppr = ss->cppr;
+    ss->cppr = cppr;
+
+    if (cppr < old_cppr) {
+        if (ss->xisr && (cppr <= ss->pending_priority)) {
+            old_xisr = ss->xisr;
+            ss->xisr = 0;
+            qemu_irq_lower(ss->output);
+            ics_reject(icp->ics, old_xisr);
+        }
+    } else {
+        if (!ss->xisr) {
+            icp_resend(icp, server);
+        }
+    }
+    pthread_mutex_unlock(&ss->lock);
+}
+
+static void icp_set_mfrr(struct icp_state *icp, int nr, uint8_t mfrr)
+{
+    struct icp_server_state *ss = icp->ss + nr;
+
+    pthread_mutex_lock(&ss->lock);
+
+    ss->mfrr = mfrr;
+    if (mfrr < ss->cppr) {
+        icp_check_ipi(icp, nr);
+    }
+
+    pthread_mutex_unlock(&ss->lock);
+}
+
+static uint32_t icp_accept(struct icp_server_state *ss)
+{
+    uint32_t xirr;
+
+    pthread_mutex_lock(&ss->lock);
+    qemu_irq_lower(ss->output);
+    xirr = ss->cppr << 24 | ss->xisr;
+    ss->xisr = 0;
+    ss->cppr = ss->pending_priority;
+    pthread_mutex_unlock(&ss->lock);
+    return xirr;
+}
+
+static void icp_eoi(struct icp_state *icp, int server, uint32_t xirr)
+{
+    struct icp_server_state *ss = icp->ss + server;
+
+    ics_eoi(icp->ics, xirr & 0xffffff);
+    /* Send EOI -> ICS */
+    ss->cppr = xirr >> 24;
+    if (!ss->xisr) {
+        icp_resend(icp, server);
+    }
+}
+
+static void icp_irq(struct icp_state *icp, int server, int nr, uint8_t priority)
+{
+    struct icp_server_state *ss = icp->ss + server;
+
+    pthread_mutex_lock(&ss->lock);
+
+    if ((priority >= ss->cppr)
+        || (ss->xisr && (ss->pending_priority <= priority))) {
+        ics_reject(icp->ics, nr);
+    } else {
+        if (ss->xisr) {
+            ics_reject(icp->ics, ss->xisr);
+        }
+        ss->xisr = nr;
+        ss->pending_priority = priority;
+        qemu_irq_raise(ss->output);
+    }
+
+    pthread_mutex_unlock(&ss->lock);
+}
+
+/*
+ * ICS: Source layer
+ */
+
+struct ics_irq_state {
+    int server;
+    uint8_t priority;
+    uint8_t saved_priority;
+    /* int pending :1; */
+    /* int presented :1; */
+    int rejected :1;
+    int masked_pending :1;
+};
+
+struct ics_state {
+    int nr_irqs;
+    int offset;
+    qemu_irq *qirqs;
+    struct ics_irq_state *irqs;
+    struct icp_state *icp;
+};
+
+static int ics_valid_irq(struct ics_state *ics, uint32_t nr)
+{
+    return (nr >= ics->offset)
+        && (nr < (ics->offset + ics->nr_irqs));
+}
+
+static void ics_set_irq_msi(void *opaque, int nr, int val)
+{
+    struct ics_state *ics = (struct ics_state *)opaque;
+    struct ics_irq_state *irq = ics->irqs + nr;
+
+    if (val) {
+        if (irq->priority == 0xff) {
+            irq->masked_pending = 1;
+            /* masked pending */ ;
+        } else  {
+            icp_irq(ics->icp, irq->server, nr + ics->offset, irq->priority);
+        }
+    }
+}
+
+static void ics_reject_msi(struct ics_state *ics, int nr)
+{
+    struct ics_irq_state *irq = ics->irqs + nr - ics->offset;
+
+    irq->rejected = 1;
+}
+
+static void ics_resend_msi(struct ics_state *ics)
+{
+    int i;
+
+    for (i = 0; i < ics->nr_irqs; i++) {
+        struct ics_irq_state *irq = ics->irqs + i;
+
+        /* FIXME: filter by server#? */
+        if (irq->rejected) {
+            irq->rejected = 0;
+            if (irq->priority != 0xff) {
+                icp_irq(ics->icp, irq->server, i + ics->offset, irq->priority);
+            }
+        }
+    }
+}
+
+static void ics_write_xive_msi(struct ics_state *ics, int nr, int server,
+                               uint8_t priority)
+{
+    struct ics_irq_state *irq = ics->irqs + nr;
+
+    irq->server = server;
+    irq->priority = priority;
+
+    if (!irq->masked_pending || (priority = 0xff)) {
+        return;
+    }
+
+    irq->masked_pending = 0;
+    icp_irq(ics->icp, server, nr + ics->offset, priority);
+}
+
+/* static void ics_recheck_irq(struct ics_state *ics, int nr) */
+/* { */
+/*     struct ics_irq_state *irq = xics->irqs + (nr - xics->offset); */
+
+/*     if (irq->pending && (irq->priority != 0xff)) { */
+/*      irq->presented = 1; */
+/*      icp_irq(xicp->ss + irq->server, nr + ics->offset, irq->priority); */
+/*     } */
+/* } */
+
+/* static void ics_set_irq(void *opaque, int nr, int val) */
+/* { */
+/*     struct ics_state *ics = (struct ics_state *)opaque; */
+/*     struct ics_irq_state *irq = ics->irqs + nr; */
+
+/*     irq->pending = val; */
+/*     ics_recheck_irq(ics, nr); */
+/* } */
+
+/* static void ics_reject(int nr) */
+/* { */
+/*     struct ics_irq_state *irq = xics->irqs + (nr - xics->offset); */
+
+/*     assert(irq->presented); */
+/*     irq->rejected = 1; */
+/*     irq->presented = 0; */
+/* } */
+
+/* static void ics_eoi(int nr) */
+/* { */
+/*     struct ics_irq_state *irq = xics->irqs + (nr - xics->offset); */
+
+/*     assert(irq->presented); */
+/*     irq->presented = 0; */
+/*     irq->rejected = 0; */
+/*     ics_recheck_irq(xics, nr); */
+/* } */
+
+/* static void ics_resend_irq(struct ics_state *ics, int nr, */
+/*                            struct icp_server_state *ss) */
+/* { */
+/*     struct ics_irq_state *irq = ics->irqs + (nr - ics->offset); */
+
+/*     if (!irq->rejected) */
+/*         return; /\* Not rejected, so no need to resend *\/ */
+
+/*     if (ss != (xicp->ss + irq->server)) */
+/*         return; /\* Not for this server, so don't resend *\/ */
+
+/*     ics_recheck_irq(ics, nr); */
+/* } */
+
+/* static void ics_resend(struct icp_server_state *ss) */
+/* { */
+/*     int i; */
+
+/*     for (i = 0; i < xics->nr_irqs; i++) */
+/*         ics_resend_irq(xics, nr, ss); */
+/* } */
+
+static void ics_reject(struct ics_state *ics, int nr)
+{
+    ics_reject_msi(ics, nr);
+}
+
+static void ics_resend(struct ics_state *ics)
+{
+    ics_resend_msi(ics);
+}
+
+static void ics_eoi(struct ics_state *ics, int nr)
+{
+}
+
+/*
+ * Exported functions
+ */
+
+qemu_irq xics_find_qirq(struct icp_state *icp, int irq)
+{
+    if ((irq < icp->ics->offset)
+        || (irq >= (icp->ics->offset + icp->ics->nr_irqs))) {
+        return NULL;
+    }
+
+    return icp->ics->qirqs[irq - icp->ics->offset];
+}
+
+static target_ulong h_cppr(CPUState *env, sPAPREnvironment *spapr,
+                           target_ulong opcode, target_ulong *args)
+{
+    target_ulong cppr = args[0];
+
+    icp_set_cppr(spapr->icp, env->cpu_index, cppr);
+    return H_SUCCESS;
+}
+
+static target_ulong h_ipi(CPUState *env, sPAPREnvironment *spapr,
+                          target_ulong opcode, target_ulong *args)
+{
+    target_ulong server = args[0];
+    target_ulong mfrr = args[1];
+
+    if (server >= spapr->icp->nr_servers) {
+        return H_PARAMETER;
+    }
+
+    icp_set_mfrr(spapr->icp, server, mfrr);
+    return H_SUCCESS;
+
+}
+
+static target_ulong h_xirr(CPUState *env, sPAPREnvironment *spapr,
+                           target_ulong opcode, target_ulong *args)
+{
+    uint32_t xirr = icp_accept(spapr->icp->ss + env->cpu_index);
+
+    args[0] = xirr;
+    return H_SUCCESS;
+}
+
+static target_ulong h_eoi(CPUState *env, sPAPREnvironment *spapr,
+                          target_ulong opcode, target_ulong *args)
+{
+    target_ulong xirr = args[0];
+
+    icp_eoi(spapr->icp, env->cpu_index, xirr);
+    return H_SUCCESS;
+}
+
+static void rtas_set_xive(sPAPREnvironment *spapr, uint32_t token,
+                          uint32_t nargs, target_ulong args,
+                          uint32_t nret, target_ulong rets)
+{
+    struct ics_state *ics = spapr->icp->ics;
+    uint32_t nr, server, priority;
+
+    if ((nargs != 3) || (nret != 1)) {
+        rtas_st(rets, 0, -3);
+        return;
+    }
+
+    nr = rtas_ld(args, 0);
+    server = rtas_ld(args, 1);
+    priority = rtas_ld(args, 2);
+
+    if (!ics_valid_irq(ics, nr) || (server >= ics->icp->nr_servers)
+        || (priority > 0xff)) {
+        rtas_st(rets, 0, -3);
+        return;
+    }
+
+    ics_write_xive_msi(ics, nr - ics->offset, server, priority);
+
+    rtas_st(rets, 0, 0); /* Success */
+}
+
+static void rtas_get_xive(sPAPREnvironment *spapr, uint32_t token,
+                          uint32_t nargs, target_ulong args,
+                          uint32_t nret, target_ulong rets)
+{
+    struct ics_state *ics = spapr->icp->ics;
+    uint32_t nr;
+
+    if ((nargs != 1) || (nret != 3)) {
+        rtas_st(rets, 0, -3);
+        return;
+    }
+
+    nr = rtas_ld(args, 0);
+
+    if (!ics_valid_irq(ics, nr)) {
+        rtas_st(rets, 0, -3);
+        return;
+    }
+
+    rtas_st(rets, 0, 0); /* Success */
+    rtas_st(rets, 1, ics->irqs[nr - ics->offset].server);
+    rtas_st(rets, 2, ics->irqs[nr - ics->offset].priority);
+}
+
+static void rtas_int_off(sPAPREnvironment *spapr, uint32_t token,
+                         uint32_t nargs, target_ulong args,
+                         uint32_t nret, target_ulong rets)
+{
+    struct ics_state *ics = spapr->icp->ics;
+    uint32_t nr;
+
+    if ((nargs != 1) || (nret != 1)) {
+        rtas_st(rets, 0, -3);
+        return;
+    }
+
+    nr = rtas_ld(args, 0);
+
+    if (!ics_valid_irq(ics, nr)) {
+        rtas_st(rets, 0, -3);
+        return;
+    }
+
+    /* This is a NOP for now, since the described PAPR semantics don't
+     * seem to gel with what Linux does */
+#if 0
+    struct ics_irq_state *irq = xics->irqs + (nr - xics->offset);
+
+    irq->saved_priority = irq->priority;
+    ics_write_xive_msi(xics, nr - xics->offset, irq->server, 0xff);
+#endif
+
+    rtas_st(rets, 0, 0); /* Success */
+}
+
+static void rtas_int_on(sPAPREnvironment *spapr, uint32_t token,
+                        uint32_t nargs, target_ulong args,
+                        uint32_t nret, target_ulong rets)
+{
+    struct ics_state *ics = spapr->icp->ics;
+    uint32_t nr;
+
+    if ((nargs != 1) || (nret != 1)) {
+        rtas_st(rets, 0, -3);
+        return;
+    }
+
+    nr = rtas_ld(args, 0);
+
+    if (!ics_valid_irq(ics, nr)) {
+        rtas_st(rets, 0, -3);
+        return;
+    }
+
+    /* This is a NOP for now, since the described PAPR semantics don't
+     * seem to gel with what Linux does */
+#if 0
+    struct ics_irq_state *irq = xics->irqs + (nr - xics->offset);
+
+    ics_write_xive_msi(xics, nr - xics->offset,
+                       irq->server, irq->saved_priority);
+#endif
+
+    rtas_st(rets, 0, 0); /* Success */
+}
+
+struct icp_state *xics_system_init(int nr_servers, CPUState *servers[],
+                                   int nr_irqs)
+{
+    int i;
+    struct icp_state *icp;
+    struct ics_state *ics;
+
+    icp = qemu_mallocz(sizeof(*icp));
+    icp->nr_servers = nr_servers;
+    icp->ss = qemu_mallocz(nr_servers * sizeof(struct icp_server_state));
+
+    for (i = 0; i < nr_servers; i++) {
+        servers[i]->cpu_index = i;
+
+        switch (PPC_INPUT(servers[i])) {
+        case PPC_FLAGS_INPUT_POWER7:
+            icp->ss[i].output = servers[i]->irq_inputs[POWER7_INPUT_INT];
+            break;
+
+        case PPC_FLAGS_INPUT_970:
+            icp->ss[i].output = servers[i]->irq_inputs[PPC970_INPUT_INT];
+            break;
+
+        default:
+            hw_error("XICS interrupt model does not support this CPU bus model\n");
+            exit(1);
+        }
+
+        icp->ss[i].mfrr = 0xff;
+        pthread_mutex_init(&icp->ss[i].lock, NULL);
+    }
+
+    ics = qemu_mallocz(sizeof(*ics));
+    ics->nr_irqs = nr_irqs;
+    ics->offset = 16;
+    ics->irqs = qemu_mallocz(nr_irqs * sizeof(struct ics_irq_state));
+
+    icp->ics = ics;
+    ics->icp = icp;
+
+    for (i = 0; i < nr_irqs; i++) {
+        ics->irqs[i].priority = 0xff;
+        ics->irqs[i].saved_priority = 0xff;
+    }
+
+    ics->qirqs = qemu_allocate_irqs(ics_set_irq_msi, ics, nr_irqs);
+
+    spapr_register_hypercall(H_CPPR, h_cppr);
+    spapr_register_hypercall(H_IPI, h_ipi);
+    spapr_register_hypercall(H_XIRR, h_xirr);
+    spapr_register_hypercall(H_EOI, h_eoi);
+
+    spapr_rtas_register("ibm,set-xive", rtas_set_xive);
+    spapr_rtas_register("ibm,get-xive", rtas_get_xive);
+    spapr_rtas_register("ibm,int-off", rtas_int_off);
+    spapr_rtas_register("ibm,int-on", rtas_int_on);
+
+    return icp;
+}
diff --git a/hw/xics.h b/hw/xics.h
new file mode 100644
index 0000000..e55f5f1
--- /dev/null
+++ b/hw/xics.h
@@ -0,0 +1,13 @@ 
+#if !defined(__XICS_H__)
+#define __XICS_H__
+
+#define XICS_IPI        0x2
+
+struct icp_state;
+
+qemu_irq xics_find_qirq(struct icp_state *icp, int irq);
+
+struct icp_state *xics_system_init(int nr_servers, CPUState *servers[],
+                                   int nr_irqs);
+
+#endif /* __XICS_H__ */