diff mbox series

[10/25] spapr: add MMIO handlers for the XIVE interrupt sources

Message ID 20171123132955.1261-11-clg@kaod.org
State New
Headers show
Series spapr: Guest exploitation of the XIVE interrupt controller (POWER9) | expand

Commit Message

Cédric Le Goater Nov. 23, 2017, 1:29 p.m. UTC
Each interrupt source is associated with a two bit state machine
called an Event State Buffer (ESB). The bits are named "P" (pending)
and "Q" (queued) and can be controlled by MMIO. It is used to trigger
events. See code for more details on the states and transitions.

The MMIO space for the ESB translation is 512GB large on baremetal
(powernv) systems and the BAR depends on the chip id. In our model for
the sPAPR machine, we choose to only map a sub memory region for the
provisionned IRQ numbers and to use the mapping address of chip 0 on a
real system. The OS will get the address of the MMIO page of the ESB
entry associated with an IRQ using the H_INT_GET_SOURCE_INFO hcall.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 hw/intc/spapr_xive.c        | 268 +++++++++++++++++++++++++++++++++++++++++++-
 include/hw/ppc/spapr_xive.h |   8 ++
 2 files changed, 275 insertions(+), 1 deletion(-)

Comments

David Gibson Nov. 28, 2017, 6:38 a.m. UTC | #1
On Thu, Nov 23, 2017 at 02:29:40PM +0100, Cédric Le Goater wrote:
> Each interrupt source is associated with a two bit state machine
> called an Event State Buffer (ESB). The bits are named "P" (pending)
> and "Q" (queued) and can be controlled by MMIO. It is used to trigger
> events. See code for more details on the states and transitions.
> 
> The MMIO space for the ESB translation is 512GB large on baremetal
> (powernv) systems and the BAR depends on the chip id. In our model for
> the sPAPR machine, we choose to only map a sub memory region for the
> provisionned IRQ numbers and to use the mapping address of chip 0 on a
> real system. The OS will get the address of the MMIO page of the ESB
> entry associated with an IRQ using the H_INT_GET_SOURCE_INFO hcall.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  hw/intc/spapr_xive.c        | 268 +++++++++++++++++++++++++++++++++++++++++++-
>  include/hw/ppc/spapr_xive.h |   8 ++
>  2 files changed, 275 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> index 66c533fb1d78..f45f50fd017e 100644
> --- a/hw/intc/spapr_xive.c
> +++ b/hw/intc/spapr_xive.c
> @@ -32,6 +32,216 @@ static void spapr_xive_irq(sPAPRXive *xive, int lisn)
>  }
>  
>  /*
> + * "magic" Event State Buffer (ESB) MMIO offsets.
> + *
> + * Each interrupt source has a 2-bit state machine called ESB
> + * which can be controlled by MMIO. It's made of 2 bits, P and
> + * Q. P indicates that an interrupt is pending (has been sent
> + * to a queue and is waiting for an EOI). Q indicates that the
> + * interrupt has been triggered while pending.
> + *
> + * This acts as a coalescing mechanism in order to guarantee
> + * that a given interrupt only occurs at most once in a queue.
> + *
> + * When doing an EOI, the Q bit will indicate if the interrupt
> + * needs to be re-triggered.
> + *
> + * The following offsets into the ESB MMIO allow to read or
> + * manipulate the PQ bits. They must be used with an 8-bytes
> + * load instruction. They all return the previous state of the
> + * interrupt (atomically).
> + *
> + * Additionally, some ESB pages support doing an EOI via a
> + * store at 0 and some ESBs support doing a trigger via a
> + * separate trigger page.
> + */
> +#define XIVE_ESB_GET            0x800
> +#define XIVE_ESB_SET_PQ_00      0xc00
> +#define XIVE_ESB_SET_PQ_01      0xd00
> +#define XIVE_ESB_SET_PQ_10      0xe00
> +#define XIVE_ESB_SET_PQ_11      0xf00
> +
> +#define XIVE_ESB_VAL_P          0x2
> +#define XIVE_ESB_VAL_Q          0x1
> +
> +#define XIVE_ESB_RESET          0x0
> +#define XIVE_ESB_PENDING        XIVE_ESB_VAL_P
> +#define XIVE_ESB_QUEUED         (XIVE_ESB_VAL_P | XIVE_ESB_VAL_Q)
> +#define XIVE_ESB_OFF            XIVE_ESB_VAL_Q
> +
> +static uint8_t spapr_xive_pq_get(sPAPRXive *xive, uint32_t lisn)
> +{
> +    uint32_t byte = lisn / 4;
> +    uint32_t bit  = (lisn % 4) * 2;
> +
> +    assert(byte < xive->sbe_size);
> +
> +    return (xive->sbe[byte] >> bit) & 0x3;
> +}
> +
> +static uint8_t spapr_xive_pq_set(sPAPRXive *xive, uint32_t lisn, uint8_t pq)
> +{
> +    uint32_t byte = lisn / 4;
> +    uint32_t bit  = (lisn % 4) * 2;
> +    uint8_t old, new;
> +
> +    assert(byte < xive->sbe_size);
> +
> +    old = xive->sbe[byte];
> +
> +    new = xive->sbe[byte] & ~(0x3 << bit);
> +    new |= (pq & 0x3) << bit;
> +
> +    xive->sbe[byte] = new;
> +
> +    return (old >> bit) & 0x3;
> +}
> +
> +static bool spapr_xive_pq_eoi(sPAPRXive *xive, uint32_t lisn)
> +{
> +    uint8_t old_pq = spapr_xive_pq_get(xive, lisn);
> +
> +    switch (old_pq) {
> +    case XIVE_ESB_RESET:
> +        spapr_xive_pq_set(xive, lisn, XIVE_ESB_RESET);
> +        return false;
> +    case XIVE_ESB_PENDING:
> +        spapr_xive_pq_set(xive, lisn, XIVE_ESB_RESET);
> +        return false;
> +    case XIVE_ESB_QUEUED:
> +        spapr_xive_pq_set(xive, lisn, XIVE_ESB_PENDING);
> +        return true;
> +    case XIVE_ESB_OFF:
> +        spapr_xive_pq_set(xive, lisn, XIVE_ESB_OFF);
> +        return false;
> +    default:
> +         g_assert_not_reached();
> +    }
> +}
> +
> +static bool spapr_xive_pq_trigger(sPAPRXive *xive, uint32_t lisn)
> +{
> +    uint8_t old_pq = spapr_xive_pq_get(xive, lisn);
> +
> +    switch (old_pq) {
> +    case XIVE_ESB_RESET:
> +        spapr_xive_pq_set(xive, lisn, XIVE_ESB_PENDING);
> +        return true;
> +    case XIVE_ESB_PENDING:
> +        spapr_xive_pq_set(xive, lisn, XIVE_ESB_QUEUED);
> +        return true;
> +    case XIVE_ESB_QUEUED:
> +        spapr_xive_pq_set(xive, lisn, XIVE_ESB_QUEUED);
> +        return true;
> +    case XIVE_ESB_OFF:
> +        spapr_xive_pq_set(xive, lisn, XIVE_ESB_OFF);
> +        return false;
> +    default:
> +         g_assert_not_reached();
> +    }
> +}
> +
> +/*
> + * XIVE Interrupt Source MMIOs
> + */
> +static void spapr_xive_source_eoi(sPAPRXive *xive, uint32_t lisn)
> +{
> +    if (spapr_xive_irq_is_lsi(xive, lisn)) {
> +        xive->status[lisn] &= ~XIVE_STATUS_SENT;
> +    }
> +}
> +
> +/* TODO: handle second page
> + *
> + * Some HW use a separate page for trigger. We only support the case
> + * in which the trigger can be done in the same page as the EOI.
> + */
> +static uint64_t spapr_xive_esb_read(void *opaque, hwaddr addr, unsigned size)
> +{
> +    sPAPRXive *xive = SPAPR_XIVE(opaque);
> +    uint32_t offset = addr & 0xF00;
> +    uint32_t lisn = addr >> xive->esb_shift;
> +    XiveIVE *ive;
> +    uint64_t ret = -1;
> +
> +    ive = spapr_xive_get_ive(xive, lisn);
> +    if (!ive || !(ive->w & IVE_VALID))  {
> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid LISN %d\n", lisn);
> +        goto out;
> +    }
> +
> +    switch (offset) {
> +    case 0:
> +        spapr_xive_source_eoi(xive, lisn);

Hrm.  I don't love that you're dealing with clearing that LSI bit
here, but setting it at a different level.

The state machines are doing my head in a bit, is there any way
you could derive the STATUS_SENT bit from the PQ bits?

> +        /* return TRUE or FALSE depending on PQ value */
> +        ret = spapr_xive_pq_eoi(xive, lisn);
> +        break;
> +
> +    case XIVE_ESB_GET:
> +        ret = spapr_xive_pq_get(xive, lisn);
> +        break;
> +
> +    case XIVE_ESB_SET_PQ_00:
> +    case XIVE_ESB_SET_PQ_01:
> +    case XIVE_ESB_SET_PQ_10:
> +    case XIVE_ESB_SET_PQ_11:
> +        ret = spapr_xive_pq_set(xive, lisn, (offset >> 8) & 0x3);
> +        break;
> +    default:
> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid ESB addr %d\n", offset);
> +    }
> +
> +out:
> +    return ret;
> +}
> +
> +static void spapr_xive_esb_write(void *opaque, hwaddr addr,
> +                           uint64_t value, unsigned size)
> +{
> +    sPAPRXive *xive = SPAPR_XIVE(opaque);
> +    uint32_t offset = addr & 0xF00;
> +    uint32_t lisn = addr >> xive->esb_shift;
> +    XiveIVE *ive;
> +    bool notify = false;
> +
> +    ive = spapr_xive_get_ive(xive, lisn);
> +    if (!ive || !(ive->w & IVE_VALID))  {
> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid LISN %d\n", lisn);
> +        return;
> +    }
> +
> +    switch (offset) {
> +    case 0:
> +        /* TODO: should we trigger even if the IVE is masked ? */
> +        notify = spapr_xive_pq_trigger(xive, lisn);
> +        break;
> +    default:
> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid ESB write addr %d\n",
> +                      offset);
> +        return;
> +    }
> +
> +    if (notify && !(ive->w & IVE_MASKED)) {
> +        qemu_irq_pulse(xive->qirqs[lisn]);
> +    }
> +}
> +
> +static const MemoryRegionOps spapr_xive_esb_ops = {
> +    .read = spapr_xive_esb_read,
> +    .write = spapr_xive_esb_write,
> +    .endianness = DEVICE_BIG_ENDIAN,
> +    .valid = {
> +        .min_access_size = 8,
> +        .max_access_size = 8,
> +    },
> +    .impl = {
> +        .min_access_size = 8,
> +        .max_access_size = 8,
> +    },
> +};
> +
> +/*
>   * XIVE Interrupt Source
>   */
>  static void spapr_xive_source_set_irq_msi(sPAPRXive *xive, int lisn, int val)
> @@ -70,6 +280,33 @@ static void spapr_xive_source_set_irq(void *opaque, int lisn, int val)
>  /*
>   * Main XIVE object
>   */
> +#define P9_MMIO_BASE     0x006000000000000ull
> +
> +/* VC BAR contains set translations for the ESBs and the EQs. */
> +#define VC_BAR_DEFAULT   0x10000000000ull
> +#define VC_BAR_SIZE      0x08000000000ull
> +#define ESB_SHIFT        16 /* One 64k page. OPAL has two */
> +
> +static uint64_t spapr_xive_esb_default_read(void *p, hwaddr offset,
> +                                            unsigned size)
> +{
> +    qemu_log_mask(LOG_UNIMP, "%s: 0x%" HWADDR_PRIx " [%u]\n",
> +                  __func__, offset, size);
> +    return 0;
> +}
> +
> +static void spapr_xive_esb_default_write(void *opaque, hwaddr offset,
> +                                         uint64_t value, unsigned size)
> +{
> +    qemu_log_mask(LOG_UNIMP, "%s: 0x%" HWADDR_PRIx " <- 0x%" PRIx64 " [%u]\n",
> +                  __func__, offset, value, size);
> +}
> +
> +static const MemoryRegionOps spapr_xive_esb_default_ops = {
> +    .read = spapr_xive_esb_default_read,
> +    .write = spapr_xive_esb_default_write,
> +    .endianness = DEVICE_BIG_ENDIAN,

I think you should at least have a valid access size field here.

> +};
>  
>  void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon)
>  {
> @@ -77,14 +314,19 @@ void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon)
>  
>      for (i = 0; i < xive->nr_irqs; i++) {
>          XiveIVE *ive = &xive->ivt[i];
> +        uint8_t pq;
>  
>          if (!(ive->w & IVE_VALID)) {
>              continue;
>          }
>  
> -        monitor_printf(mon, "  %4x %s %s %08x %08x\n", i,
> +        pq = spapr_xive_pq_get(xive, i);
> +
> +        monitor_printf(mon, "  %4x %s %s %c%c %08x %08x\n", i,
>                         spapr_xive_irq_is_lsi(xive, i) ? "LSI" : "MSI",
>                         ive->w & IVE_MASKED ? "M" : " ",
> +                       pq & XIVE_ESB_VAL_P ? 'P' : '-',
> +                       pq & XIVE_ESB_VAL_Q ? 'Q' : '-',
>                         (int) GETFIELD(IVE_EQ_INDEX, ive->w),
>                         (int) GETFIELD(IVE_EQ_DATA, ive->w));
>      }
> @@ -104,6 +346,9 @@ void spapr_xive_reset(void *dev)
>              ive->w |= IVE_MASKED;
>          }
>      }
> +
> +    /* SBEs are initialized to 0b01 which corresponds to "ints off" */
> +    memset(xive->sbe, 0x55, xive->sbe_size);
>  }
>  
>  static void spapr_xive_realize(DeviceState *dev, Error **errp)
> @@ -123,6 +368,26 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
>      /* Allocate the IVT (Interrupt Virtualization Table) */
>      xive->ivt = g_malloc0(xive->nr_irqs * sizeof(XiveIVE));
>  
> +    /* Allocate SBEs (State Bit Entry). 2 bits, so 4 entries per byte */
> +    xive->sbe_size = DIV_ROUND_UP(xive->nr_irqs, 4);
> +    xive->sbe = g_malloc0(xive->sbe_size);
> +
> +    /* VC BAR. That's the full window but we will only map the
> +     * subregions in use. */
> +    xive->esb_base = (P9_MMIO_BASE | VC_BAR_DEFAULT);
> +    xive->esb_shift = ESB_SHIFT;

Any point to having this as a variable, if it's always the same size?

> +
> +    /* Install default memory region handlers to log bogus access */
> +    memory_region_init_io(&xive->esb_mr, NULL, &spapr_xive_esb_default_ops,
> +                          NULL, "xive.esb.full", VC_BAR_SIZE);
> +    sysbus_init_mmio(SYS_BUS_DEVICE(dev), &xive->esb_mr);
> +
> +    /* Install the ESB memory region in the overall one */
> +    memory_region_init_io(&xive->esb_iomem, OBJECT(xive), &spapr_xive_esb_ops,
> +                          xive, "xive.esb",
> +                          (1ull << xive->esb_shift) * xive->nr_irqs);
> +    memory_region_add_subregion(&xive->esb_mr, 0, &xive->esb_iomem);

Is there a benegit to to having these nested regions, rather than just
validating the lisn in the read/write functions?

>      qemu_register_reset(spapr_xive_reset, dev);
>  }
>  
> @@ -152,6 +417,7 @@ static const VMStateDescription vmstate_spapr_xive = {
>          VMSTATE_STRUCT_VARRAY_UINT32_ALLOC(ivt, sPAPRXive, nr_irqs, 1,
>                                             vmstate_spapr_xive_ive, XiveIVE),
>          VMSTATE_VBUFFER_UINT32(status, sPAPRXive, 1, NULL, nr_irqs),
> +        VMSTATE_VBUFFER_UINT32(sbe, sPAPRXive, 1, NULL, sbe_size),
>          VMSTATE_END_OF_LIST()
>      },
>  };
> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
> index 6a799cdaba66..84c910e62e56 100644
> --- a/include/hw/ppc/spapr_xive.h
> +++ b/include/hw/ppc/spapr_xive.h
> @@ -42,6 +42,14 @@ struct sPAPRXive {
>  
>      /* XIVE internal tables */
>      XiveIVE      *ivt;
> +    uint8_t      *sbe;
> +    uint32_t     sbe_size;

sbe_size is derivable from nr_irqs, so I don't think there's a point
to storing it separately .

> +
> +    /* ESB memory region */
> +    uint32_t     esb_shift;
> +    hwaddr       esb_base;
> +    MemoryRegion esb_mr;
> +    MemoryRegion esb_iomem;
>  };
>  
>  static inline bool spapr_xive_irq_is_lsi(sPAPRXive *xive, int lisn)
Cédric Le Goater Nov. 28, 2017, 6:33 p.m. UTC | #2
On 11/28/2017 06:38 AM, David Gibson wrote:
> On Thu, Nov 23, 2017 at 02:29:40PM +0100, Cédric Le Goater wrote:
>> Each interrupt source is associated with a two bit state machine
>> called an Event State Buffer (ESB). The bits are named "P" (pending)
>> and "Q" (queued) and can be controlled by MMIO. It is used to trigger
>> events. See code for more details on the states and transitions.
>>
>> The MMIO space for the ESB translation is 512GB large on baremetal
>> (powernv) systems and the BAR depends on the chip id. In our model for
>> the sPAPR machine, we choose to only map a sub memory region for the
>> provisionned IRQ numbers and to use the mapping address of chip 0 on a
>> real system. The OS will get the address of the MMIO page of the ESB
>> entry associated with an IRQ using the H_INT_GET_SOURCE_INFO hcall.
>>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
>>  hw/intc/spapr_xive.c        | 268 +++++++++++++++++++++++++++++++++++++++++++-
>>  include/hw/ppc/spapr_xive.h |   8 ++
>>  2 files changed, 275 insertions(+), 1 deletion(-)
>>
>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
>> index 66c533fb1d78..f45f50fd017e 100644
>> --- a/hw/intc/spapr_xive.c
>> +++ b/hw/intc/spapr_xive.c
>> @@ -32,6 +32,216 @@ static void spapr_xive_irq(sPAPRXive *xive, int lisn)
>>  }
>>  
>>  /*
>> + * "magic" Event State Buffer (ESB) MMIO offsets.
>> + *
>> + * Each interrupt source has a 2-bit state machine called ESB
>> + * which can be controlled by MMIO. It's made of 2 bits, P and
>> + * Q. P indicates that an interrupt is pending (has been sent
>> + * to a queue and is waiting for an EOI). Q indicates that the
>> + * interrupt has been triggered while pending.
>> + *
>> + * This acts as a coalescing mechanism in order to guarantee
>> + * that a given interrupt only occurs at most once in a queue.
>> + *
>> + * When doing an EOI, the Q bit will indicate if the interrupt
>> + * needs to be re-triggered.
>> + *
>> + * The following offsets into the ESB MMIO allow to read or
>> + * manipulate the PQ bits. They must be used with an 8-bytes
>> + * load instruction. They all return the previous state of the
>> + * interrupt (atomically).
>> + *
>> + * Additionally, some ESB pages support doing an EOI via a
>> + * store at 0 and some ESBs support doing a trigger via a
>> + * separate trigger page.
>> + */
>> +#define XIVE_ESB_GET            0x800
>> +#define XIVE_ESB_SET_PQ_00      0xc00
>> +#define XIVE_ESB_SET_PQ_01      0xd00
>> +#define XIVE_ESB_SET_PQ_10      0xe00
>> +#define XIVE_ESB_SET_PQ_11      0xf00
>> +
>> +#define XIVE_ESB_VAL_P          0x2
>> +#define XIVE_ESB_VAL_Q          0x1
>> +
>> +#define XIVE_ESB_RESET          0x0
>> +#define XIVE_ESB_PENDING        XIVE_ESB_VAL_P
>> +#define XIVE_ESB_QUEUED         (XIVE_ESB_VAL_P | XIVE_ESB_VAL_Q)
>> +#define XIVE_ESB_OFF            XIVE_ESB_VAL_Q
>> +
>> +static uint8_t spapr_xive_pq_get(sPAPRXive *xive, uint32_t lisn)
>> +{
>> +    uint32_t byte = lisn / 4;
>> +    uint32_t bit  = (lisn % 4) * 2;
>> +
>> +    assert(byte < xive->sbe_size);
>> +
>> +    return (xive->sbe[byte] >> bit) & 0x3;
>> +}
>> +
>> +static uint8_t spapr_xive_pq_set(sPAPRXive *xive, uint32_t lisn, uint8_t pq)
>> +{
>> +    uint32_t byte = lisn / 4;
>> +    uint32_t bit  = (lisn % 4) * 2;
>> +    uint8_t old, new;
>> +
>> +    assert(byte < xive->sbe_size);
>> +
>> +    old = xive->sbe[byte];
>> +
>> +    new = xive->sbe[byte] & ~(0x3 << bit);
>> +    new |= (pq & 0x3) << bit;
>> +
>> +    xive->sbe[byte] = new;
>> +
>> +    return (old >> bit) & 0x3;
>> +}
>> +
>> +static bool spapr_xive_pq_eoi(sPAPRXive *xive, uint32_t lisn)
>> +{
>> +    uint8_t old_pq = spapr_xive_pq_get(xive, lisn);
>> +
>> +    switch (old_pq) {
>> +    case XIVE_ESB_RESET:
>> +        spapr_xive_pq_set(xive, lisn, XIVE_ESB_RESET);
>> +        return false;
>> +    case XIVE_ESB_PENDING:
>> +        spapr_xive_pq_set(xive, lisn, XIVE_ESB_RESET);
>> +        return false;
>> +    case XIVE_ESB_QUEUED:
>> +        spapr_xive_pq_set(xive, lisn, XIVE_ESB_PENDING);
>> +        return true;
>> +    case XIVE_ESB_OFF:
>> +        spapr_xive_pq_set(xive, lisn, XIVE_ESB_OFF);
>> +        return false;
>> +    default:
>> +         g_assert_not_reached();
>> +    }
>> +}
>> +
>> +static bool spapr_xive_pq_trigger(sPAPRXive *xive, uint32_t lisn)
>> +{
>> +    uint8_t old_pq = spapr_xive_pq_get(xive, lisn);
>> +
>> +    switch (old_pq) {
>> +    case XIVE_ESB_RESET:
>> +        spapr_xive_pq_set(xive, lisn, XIVE_ESB_PENDING);
>> +        return true;
>> +    case XIVE_ESB_PENDING:
>> +        spapr_xive_pq_set(xive, lisn, XIVE_ESB_QUEUED);
>> +        return true;
>> +    case XIVE_ESB_QUEUED:
>> +        spapr_xive_pq_set(xive, lisn, XIVE_ESB_QUEUED);
>> +        return true;
>> +    case XIVE_ESB_OFF:
>> +        spapr_xive_pq_set(xive, lisn, XIVE_ESB_OFF);
>> +        return false;
>> +    default:
>> +         g_assert_not_reached();
>> +    }
>> +}
>> +
>> +/*
>> + * XIVE Interrupt Source MMIOs
>> + */
>> +static void spapr_xive_source_eoi(sPAPRXive *xive, uint32_t lisn)
>> +{
>> +    if (spapr_xive_irq_is_lsi(xive, lisn)) {
>> +        xive->status[lisn] &= ~XIVE_STATUS_SENT;
>> +    }
>> +}
>> +
>> +/* TODO: handle second page
>> + *
>> + * Some HW use a separate page for trigger. We only support the case
>> + * in which the trigger can be done in the same page as the EOI.
>> + */
>> +static uint64_t spapr_xive_esb_read(void *opaque, hwaddr addr, unsigned size)
>> +{
>> +    sPAPRXive *xive = SPAPR_XIVE(opaque);
>> +    uint32_t offset = addr & 0xF00;
>> +    uint32_t lisn = addr >> xive->esb_shift;
>> +    XiveIVE *ive;
>> +    uint64_t ret = -1;
>> +
>> +    ive = spapr_xive_get_ive(xive, lisn);
>> +    if (!ive || !(ive->w & IVE_VALID))  {
>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid LISN %d\n", lisn);
>> +        goto out;
>> +    }
>> +
>> +    switch (offset) {
>> +    case 0:
>> +        spapr_xive_source_eoi(xive, lisn);
> 
> Hrm.  I don't love that you're dealing with clearing that LSI bit
> here, but setting it at a different level.
> 
> The state machines are doing my head in a bit, is there any way
> you could derive the STATUS_SENT bit from the PQ bits?

Yes. I should. 

I am also lacking a guest driver to exercise these LSIs so I didn't
pay a lot of attention to level interrupts. Any idea ?

>> +        /* return TRUE or FALSE depending on PQ value */
>> +        ret = spapr_xive_pq_eoi(xive, lisn);
>> +        break;
>> +
>> +    case XIVE_ESB_GET:
>> +        ret = spapr_xive_pq_get(xive, lisn);
>> +        break;
>> +
>> +    case XIVE_ESB_SET_PQ_00:
>> +    case XIVE_ESB_SET_PQ_01:
>> +    case XIVE_ESB_SET_PQ_10:
>> +    case XIVE_ESB_SET_PQ_11:
>> +        ret = spapr_xive_pq_set(xive, lisn, (offset >> 8) & 0x3);
>> +        break;
>> +    default:
>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid ESB addr %d\n", offset);
>> +    }
>> +
>> +out:
>> +    return ret;
>> +}
>> +
>> +static void spapr_xive_esb_write(void *opaque, hwaddr addr,
>> +                           uint64_t value, unsigned size)
>> +{
>> +    sPAPRXive *xive = SPAPR_XIVE(opaque);
>> +    uint32_t offset = addr & 0xF00;
>> +    uint32_t lisn = addr >> xive->esb_shift;
>> +    XiveIVE *ive;
>> +    bool notify = false;
>> +
>> +    ive = spapr_xive_get_ive(xive, lisn);
>> +    if (!ive || !(ive->w & IVE_VALID))  {
>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid LISN %d\n", lisn);
>> +        return;
>> +    }
>> +
>> +    switch (offset) {
>> +    case 0:
>> +        /* TODO: should we trigger even if the IVE is masked ? */
>> +        notify = spapr_xive_pq_trigger(xive, lisn);
>> +        break;
>> +    default:
>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid ESB write addr %d\n",
>> +                      offset);
>> +        return;
>> +    }
>> +
>> +    if (notify && !(ive->w & IVE_MASKED)) {
>> +        qemu_irq_pulse(xive->qirqs[lisn]);
>> +    }
>> +}
>> +
>> +static const MemoryRegionOps spapr_xive_esb_ops = {
>> +    .read = spapr_xive_esb_read,
>> +    .write = spapr_xive_esb_write,
>> +    .endianness = DEVICE_BIG_ENDIAN,
>> +    .valid = {
>> +        .min_access_size = 8,
>> +        .max_access_size = 8,
>> +    },
>> +    .impl = {
>> +        .min_access_size = 8,
>> +        .max_access_size = 8,
>> +    },
>> +};
>> +
>> +/*
>>   * XIVE Interrupt Source
>>   */
>>  static void spapr_xive_source_set_irq_msi(sPAPRXive *xive, int lisn, int val)
>> @@ -70,6 +280,33 @@ static void spapr_xive_source_set_irq(void *opaque, int lisn, int val)
>>  /*
>>   * Main XIVE object
>>   */
>> +#define P9_MMIO_BASE     0x006000000000000ull
>> +
>> +/* VC BAR contains set translations for the ESBs and the EQs. */
>> +#define VC_BAR_DEFAULT   0x10000000000ull
>> +#define VC_BAR_SIZE      0x08000000000ull
>> +#define ESB_SHIFT        16 /* One 64k page. OPAL has two */
>> +
>> +static uint64_t spapr_xive_esb_default_read(void *p, hwaddr offset,
>> +                                            unsigned size)
>> +{
>> +    qemu_log_mask(LOG_UNIMP, "%s: 0x%" HWADDR_PRIx " [%u]\n",
>> +                  __func__, offset, size);
>> +    return 0;
>> +}
>> +
>> +static void spapr_xive_esb_default_write(void *opaque, hwaddr offset,
>> +                                         uint64_t value, unsigned size)
>> +{
>> +    qemu_log_mask(LOG_UNIMP, "%s: 0x%" HWADDR_PRIx " <- 0x%" PRIx64 " [%u]\n",
>> +                  __func__, offset, value, size);
>> +}
>> +
>> +static const MemoryRegionOps spapr_xive_esb_default_ops = {
>> +    .read = spapr_xive_esb_default_read,
>> +    .write = spapr_xive_esb_default_write,
>> +    .endianness = DEVICE_BIG_ENDIAN,
> 
> I think you should at least have a valid access size field here.

yes. 

>> +};
>>  
>>  void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon)
>>  {
>> @@ -77,14 +314,19 @@ void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon)
>>  
>>      for (i = 0; i < xive->nr_irqs; i++) {
>>          XiveIVE *ive = &xive->ivt[i];
>> +        uint8_t pq;
>>  
>>          if (!(ive->w & IVE_VALID)) {
>>              continue;
>>          }
>>  
>> -        monitor_printf(mon, "  %4x %s %s %08x %08x\n", i,
>> +        pq = spapr_xive_pq_get(xive, i);
>> +
>> +        monitor_printf(mon, "  %4x %s %s %c%c %08x %08x\n", i,
>>                         spapr_xive_irq_is_lsi(xive, i) ? "LSI" : "MSI",
>>                         ive->w & IVE_MASKED ? "M" : " ",
>> +                       pq & XIVE_ESB_VAL_P ? 'P' : '-',
>> +                       pq & XIVE_ESB_VAL_Q ? 'Q' : '-',
>>                         (int) GETFIELD(IVE_EQ_INDEX, ive->w),
>>                         (int) GETFIELD(IVE_EQ_DATA, ive->w));
>>      }
>> @@ -104,6 +346,9 @@ void spapr_xive_reset(void *dev)
>>              ive->w |= IVE_MASKED;
>>          }
>>      }
>> +
>> +    /* SBEs are initialized to 0b01 which corresponds to "ints off" */
>> +    memset(xive->sbe, 0x55, xive->sbe_size);
>>  }
>>  
>>  static void spapr_xive_realize(DeviceState *dev, Error **errp)
>> @@ -123,6 +368,26 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
>>      /* Allocate the IVT (Interrupt Virtualization Table) */
>>      xive->ivt = g_malloc0(xive->nr_irqs * sizeof(XiveIVE));
>>  
>> +    /* Allocate SBEs (State Bit Entry). 2 bits, so 4 entries per byte */
>> +    xive->sbe_size = DIV_ROUND_UP(xive->nr_irqs, 4);
>> +    xive->sbe = g_malloc0(xive->sbe_size);
>> +
>> +    /* VC BAR. That's the full window but we will only map the
>> +     * subregions in use. */
>> +    xive->esb_base = (P9_MMIO_BASE | VC_BAR_DEFAULT);
>> +    xive->esb_shift = ESB_SHIFT;
> 
> Any point to having this as a variable, if it's always the same size?

Well, it is related to the page size and we could have a HW configuration 
with two pages, with one specific page for trigger. But we don't need to 
model this, yet.  

The 'esb_shift' field is used in different places. I would rather keep it.
>> +
>> +    /* Install default memory region handlers to log bogus access */
>> +    memory_region_init_io(&xive->esb_mr, NULL, &spapr_xive_esb_default_ops,
>> +                          NULL, "xive.esb.full", VC_BAR_SIZE);
>> +    sysbus_init_mmio(SYS_BUS_DEVICE(dev), &xive->esb_mr);
>> +
>> +    /* Install the ESB memory region in the overall one */
>> +    memory_region_init_io(&xive->esb_iomem, OBJECT(xive), &spapr_xive_esb_ops,
>> +                          xive, "xive.esb",
>> +                          (1ull << xive->esb_shift) * xive->nr_irqs);
>> +    memory_region_add_subregion(&xive->esb_mr, 0, &xive->esb_iomem);
> 
> Is there a benegit to to having these nested regions, rather than just
> validating the lisn in the read/write functions?

No. this was to represent the full ESB space of the system but we can 
reduce it to one region. This is not a problem I think.

>>      qemu_register_reset(spapr_xive_reset, dev);
>>  }
>>  
>> @@ -152,6 +417,7 @@ static const VMStateDescription vmstate_spapr_xive = {
>>          VMSTATE_STRUCT_VARRAY_UINT32_ALLOC(ivt, sPAPRXive, nr_irqs, 1,
>>                                             vmstate_spapr_xive_ive, XiveIVE),
>>          VMSTATE_VBUFFER_UINT32(status, sPAPRXive, 1, NULL, nr_irqs),
>> +        VMSTATE_VBUFFER_UINT32(sbe, sPAPRXive, 1, NULL, sbe_size),
>>          VMSTATE_END_OF_LIST()
>>      },
>>  };
>> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
>> index 6a799cdaba66..84c910e62e56 100644
>> --- a/include/hw/ppc/spapr_xive.h
>> +++ b/include/hw/ppc/spapr_xive.h
>> @@ -42,6 +42,14 @@ struct sPAPRXive {
>>  
>>      /* XIVE internal tables */
>>      XiveIVE      *ivt;
>> +    uint8_t      *sbe;
>> +    uint32_t     sbe_size;
> 
> sbe_size is derivable from nr_irqs, so I don't think there's a point
> to storing it separately .

I needed the value for the vmstate macros. 

Thanks,

C. 

 
>> +
>> +    /* ESB memory region */
>> +    uint32_t     esb_shift;
>> +    hwaddr       esb_base;
>> +    MemoryRegion esb_mr;
>> +    MemoryRegion esb_iomem;
>>  };
>>  
>>  static inline bool spapr_xive_irq_is_lsi(sPAPRXive *xive, int lisn)
>
David Gibson Nov. 29, 2017, 4:59 a.m. UTC | #3
On Tue, Nov 28, 2017 at 06:33:06PM +0000, Cédric Le Goater wrote:
> On 11/28/2017 06:38 AM, David Gibson wrote:
> > On Thu, Nov 23, 2017 at 02:29:40PM +0100, Cédric Le Goater wrote:
> >> Each interrupt source is associated with a two bit state machine
> >> called an Event State Buffer (ESB). The bits are named "P" (pending)
> >> and "Q" (queued) and can be controlled by MMIO. It is used to trigger
> >> events. See code for more details on the states and transitions.
> >>
> >> The MMIO space for the ESB translation is 512GB large on baremetal
> >> (powernv) systems and the BAR depends on the chip id. In our model for
> >> the sPAPR machine, we choose to only map a sub memory region for the
> >> provisionned IRQ numbers and to use the mapping address of chip 0 on a
> >> real system. The OS will get the address of the MMIO page of the ESB
> >> entry associated with an IRQ using the H_INT_GET_SOURCE_INFO hcall.
> >>
> >> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >> ---
> >>  hw/intc/spapr_xive.c        | 268 +++++++++++++++++++++++++++++++++++++++++++-
> >>  include/hw/ppc/spapr_xive.h |   8 ++
> >>  2 files changed, 275 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> >> index 66c533fb1d78..f45f50fd017e 100644
> >> --- a/hw/intc/spapr_xive.c
> >> +++ b/hw/intc/spapr_xive.c
> >> @@ -32,6 +32,216 @@ static void spapr_xive_irq(sPAPRXive *xive, int lisn)
> >>  }
> >>  
> >>  /*
> >> + * "magic" Event State Buffer (ESB) MMIO offsets.
> >> + *
> >> + * Each interrupt source has a 2-bit state machine called ESB
> >> + * which can be controlled by MMIO. It's made of 2 bits, P and
> >> + * Q. P indicates that an interrupt is pending (has been sent
> >> + * to a queue and is waiting for an EOI). Q indicates that the
> >> + * interrupt has been triggered while pending.
> >> + *
> >> + * This acts as a coalescing mechanism in order to guarantee
> >> + * that a given interrupt only occurs at most once in a queue.
> >> + *
> >> + * When doing an EOI, the Q bit will indicate if the interrupt
> >> + * needs to be re-triggered.
> >> + *
> >> + * The following offsets into the ESB MMIO allow to read or
> >> + * manipulate the PQ bits. They must be used with an 8-bytes
> >> + * load instruction. They all return the previous state of the
> >> + * interrupt (atomically).
> >> + *
> >> + * Additionally, some ESB pages support doing an EOI via a
> >> + * store at 0 and some ESBs support doing a trigger via a
> >> + * separate trigger page.
> >> + */
> >> +#define XIVE_ESB_GET            0x800
> >> +#define XIVE_ESB_SET_PQ_00      0xc00
> >> +#define XIVE_ESB_SET_PQ_01      0xd00
> >> +#define XIVE_ESB_SET_PQ_10      0xe00
> >> +#define XIVE_ESB_SET_PQ_11      0xf00
> >> +
> >> +#define XIVE_ESB_VAL_P          0x2
> >> +#define XIVE_ESB_VAL_Q          0x1
> >> +
> >> +#define XIVE_ESB_RESET          0x0
> >> +#define XIVE_ESB_PENDING        XIVE_ESB_VAL_P
> >> +#define XIVE_ESB_QUEUED         (XIVE_ESB_VAL_P | XIVE_ESB_VAL_Q)
> >> +#define XIVE_ESB_OFF            XIVE_ESB_VAL_Q
> >> +
> >> +static uint8_t spapr_xive_pq_get(sPAPRXive *xive, uint32_t lisn)
> >> +{
> >> +    uint32_t byte = lisn / 4;
> >> +    uint32_t bit  = (lisn % 4) * 2;
> >> +
> >> +    assert(byte < xive->sbe_size);
> >> +
> >> +    return (xive->sbe[byte] >> bit) & 0x3;
> >> +}
> >> +
> >> +static uint8_t spapr_xive_pq_set(sPAPRXive *xive, uint32_t lisn, uint8_t pq)
> >> +{
> >> +    uint32_t byte = lisn / 4;
> >> +    uint32_t bit  = (lisn % 4) * 2;
> >> +    uint8_t old, new;
> >> +
> >> +    assert(byte < xive->sbe_size);
> >> +
> >> +    old = xive->sbe[byte];
> >> +
> >> +    new = xive->sbe[byte] & ~(0x3 << bit);
> >> +    new |= (pq & 0x3) << bit;
> >> +
> >> +    xive->sbe[byte] = new;
> >> +
> >> +    return (old >> bit) & 0x3;
> >> +}
> >> +
> >> +static bool spapr_xive_pq_eoi(sPAPRXive *xive, uint32_t lisn)
> >> +{
> >> +    uint8_t old_pq = spapr_xive_pq_get(xive, lisn);
> >> +
> >> +    switch (old_pq) {
> >> +    case XIVE_ESB_RESET:
> >> +        spapr_xive_pq_set(xive, lisn, XIVE_ESB_RESET);
> >> +        return false;
> >> +    case XIVE_ESB_PENDING:
> >> +        spapr_xive_pq_set(xive, lisn, XIVE_ESB_RESET);
> >> +        return false;
> >> +    case XIVE_ESB_QUEUED:
> >> +        spapr_xive_pq_set(xive, lisn, XIVE_ESB_PENDING);
> >> +        return true;
> >> +    case XIVE_ESB_OFF:
> >> +        spapr_xive_pq_set(xive, lisn, XIVE_ESB_OFF);
> >> +        return false;
> >> +    default:
> >> +         g_assert_not_reached();
> >> +    }
> >> +}
> >> +
> >> +static bool spapr_xive_pq_trigger(sPAPRXive *xive, uint32_t lisn)
> >> +{
> >> +    uint8_t old_pq = spapr_xive_pq_get(xive, lisn);
> >> +
> >> +    switch (old_pq) {
> >> +    case XIVE_ESB_RESET:
> >> +        spapr_xive_pq_set(xive, lisn, XIVE_ESB_PENDING);
> >> +        return true;
> >> +    case XIVE_ESB_PENDING:
> >> +        spapr_xive_pq_set(xive, lisn, XIVE_ESB_QUEUED);
> >> +        return true;
> >> +    case XIVE_ESB_QUEUED:
> >> +        spapr_xive_pq_set(xive, lisn, XIVE_ESB_QUEUED);
> >> +        return true;
> >> +    case XIVE_ESB_OFF:
> >> +        spapr_xive_pq_set(xive, lisn, XIVE_ESB_OFF);
> >> +        return false;
> >> +    default:
> >> +         g_assert_not_reached();
> >> +    }
> >> +}
> >> +
> >> +/*
> >> + * XIVE Interrupt Source MMIOs
> >> + */
> >> +static void spapr_xive_source_eoi(sPAPRXive *xive, uint32_t lisn)
> >> +{
> >> +    if (spapr_xive_irq_is_lsi(xive, lisn)) {
> >> +        xive->status[lisn] &= ~XIVE_STATUS_SENT;
> >> +    }
> >> +}
> >> +
> >> +/* TODO: handle second page
> >> + *
> >> + * Some HW use a separate page for trigger. We only support the case
> >> + * in which the trigger can be done in the same page as the EOI.
> >> + */
> >> +static uint64_t spapr_xive_esb_read(void *opaque, hwaddr addr, unsigned size)
> >> +{
> >> +    sPAPRXive *xive = SPAPR_XIVE(opaque);
> >> +    uint32_t offset = addr & 0xF00;
> >> +    uint32_t lisn = addr >> xive->esb_shift;
> >> +    XiveIVE *ive;
> >> +    uint64_t ret = -1;
> >> +
> >> +    ive = spapr_xive_get_ive(xive, lisn);
> >> +    if (!ive || !(ive->w & IVE_VALID))  {
> >> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid LISN %d\n", lisn);
> >> +        goto out;
> >> +    }
> >> +
> >> +    switch (offset) {
> >> +    case 0:
> >> +        spapr_xive_source_eoi(xive, lisn);
> > 
> > Hrm.  I don't love that you're dealing with clearing that LSI bit
> > here, but setting it at a different level.
> > 
> > The state machines are doing my head in a bit, is there any way
> > you could derive the STATUS_SENT bit from the PQ bits?
> 
> Yes. I should. 
> 
> I am also lacking a guest driver to exercise these LSIs so I didn't
> pay a lot of attention to level interrupts. Any idea ?

How about an old-school emulated PCI device?  Maybe rtl8139?

> >> +        /* return TRUE or FALSE depending on PQ value */
> >> +        ret = spapr_xive_pq_eoi(xive, lisn);
> >> +        break;
> >> +
> >> +    case XIVE_ESB_GET:
> >> +        ret = spapr_xive_pq_get(xive, lisn);
> >> +        break;
> >> +
> >> +    case XIVE_ESB_SET_PQ_00:
> >> +    case XIVE_ESB_SET_PQ_01:
> >> +    case XIVE_ESB_SET_PQ_10:
> >> +    case XIVE_ESB_SET_PQ_11:
> >> +        ret = spapr_xive_pq_set(xive, lisn, (offset >> 8) & 0x3);
> >> +        break;
> >> +    default:
> >> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid ESB addr %d\n", offset);
> >> +    }
> >> +
> >> +out:
> >> +    return ret;
> >> +}
> >> +
> >> +static void spapr_xive_esb_write(void *opaque, hwaddr addr,
> >> +                           uint64_t value, unsigned size)
> >> +{
> >> +    sPAPRXive *xive = SPAPR_XIVE(opaque);
> >> +    uint32_t offset = addr & 0xF00;
> >> +    uint32_t lisn = addr >> xive->esb_shift;
> >> +    XiveIVE *ive;
> >> +    bool notify = false;
> >> +
> >> +    ive = spapr_xive_get_ive(xive, lisn);
> >> +    if (!ive || !(ive->w & IVE_VALID))  {
> >> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid LISN %d\n", lisn);
> >> +        return;
> >> +    }
> >> +
> >> +    switch (offset) {
> >> +    case 0:
> >> +        /* TODO: should we trigger even if the IVE is masked ? */
> >> +        notify = spapr_xive_pq_trigger(xive, lisn);
> >> +        break;
> >> +    default:
> >> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid ESB write addr %d\n",
> >> +                      offset);
> >> +        return;
> >> +    }
> >> +
> >> +    if (notify && !(ive->w & IVE_MASKED)) {
> >> +        qemu_irq_pulse(xive->qirqs[lisn]);
> >> +    }
> >> +}
> >> +
> >> +static const MemoryRegionOps spapr_xive_esb_ops = {
> >> +    .read = spapr_xive_esb_read,
> >> +    .write = spapr_xive_esb_write,
> >> +    .endianness = DEVICE_BIG_ENDIAN,
> >> +    .valid = {
> >> +        .min_access_size = 8,
> >> +        .max_access_size = 8,
> >> +    },
> >> +    .impl = {
> >> +        .min_access_size = 8,
> >> +        .max_access_size = 8,
> >> +    },
> >> +};
> >> +
> >> +/*
> >>   * XIVE Interrupt Source
> >>   */
> >>  static void spapr_xive_source_set_irq_msi(sPAPRXive *xive, int lisn, int val)
> >> @@ -70,6 +280,33 @@ static void spapr_xive_source_set_irq(void *opaque, int lisn, int val)
> >>  /*
> >>   * Main XIVE object
> >>   */
> >> +#define P9_MMIO_BASE     0x006000000000000ull
> >> +
> >> +/* VC BAR contains set translations for the ESBs and the EQs. */
> >> +#define VC_BAR_DEFAULT   0x10000000000ull
> >> +#define VC_BAR_SIZE      0x08000000000ull
> >> +#define ESB_SHIFT        16 /* One 64k page. OPAL has two */
> >> +
> >> +static uint64_t spapr_xive_esb_default_read(void *p, hwaddr offset,
> >> +                                            unsigned size)
> >> +{
> >> +    qemu_log_mask(LOG_UNIMP, "%s: 0x%" HWADDR_PRIx " [%u]\n",
> >> +                  __func__, offset, size);
> >> +    return 0;
> >> +}
> >> +
> >> +static void spapr_xive_esb_default_write(void *opaque, hwaddr offset,
> >> +                                         uint64_t value, unsigned size)
> >> +{
> >> +    qemu_log_mask(LOG_UNIMP, "%s: 0x%" HWADDR_PRIx " <- 0x%" PRIx64 " [%u]\n",
> >> +                  __func__, offset, value, size);
> >> +}
> >> +
> >> +static const MemoryRegionOps spapr_xive_esb_default_ops = {
> >> +    .read = spapr_xive_esb_default_read,
> >> +    .write = spapr_xive_esb_default_write,
> >> +    .endianness = DEVICE_BIG_ENDIAN,
> > 
> > I think you should at least have a valid access size field here.
> 
> yes. 
> 
> >> +};
> >>  
> >>  void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon)
> >>  {
> >> @@ -77,14 +314,19 @@ void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon)
> >>  
> >>      for (i = 0; i < xive->nr_irqs; i++) {
> >>          XiveIVE *ive = &xive->ivt[i];
> >> +        uint8_t pq;
> >>  
> >>          if (!(ive->w & IVE_VALID)) {
> >>              continue;
> >>          }
> >>  
> >> -        monitor_printf(mon, "  %4x %s %s %08x %08x\n", i,
> >> +        pq = spapr_xive_pq_get(xive, i);
> >> +
> >> +        monitor_printf(mon, "  %4x %s %s %c%c %08x %08x\n", i,
> >>                         spapr_xive_irq_is_lsi(xive, i) ? "LSI" : "MSI",
> >>                         ive->w & IVE_MASKED ? "M" : " ",
> >> +                       pq & XIVE_ESB_VAL_P ? 'P' : '-',
> >> +                       pq & XIVE_ESB_VAL_Q ? 'Q' : '-',
> >>                         (int) GETFIELD(IVE_EQ_INDEX, ive->w),
> >>                         (int) GETFIELD(IVE_EQ_DATA, ive->w));
> >>      }
> >> @@ -104,6 +346,9 @@ void spapr_xive_reset(void *dev)
> >>              ive->w |= IVE_MASKED;
> >>          }
> >>      }
> >> +
> >> +    /* SBEs are initialized to 0b01 which corresponds to "ints off" */
> >> +    memset(xive->sbe, 0x55, xive->sbe_size);
> >>  }
> >>  
> >>  static void spapr_xive_realize(DeviceState *dev, Error **errp)
> >> @@ -123,6 +368,26 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
> >>      /* Allocate the IVT (Interrupt Virtualization Table) */
> >>      xive->ivt = g_malloc0(xive->nr_irqs * sizeof(XiveIVE));
> >>  
> >> +    /* Allocate SBEs (State Bit Entry). 2 bits, so 4 entries per byte */
> >> +    xive->sbe_size = DIV_ROUND_UP(xive->nr_irqs, 4);
> >> +    xive->sbe = g_malloc0(xive->sbe_size);
> >> +
> >> +    /* VC BAR. That's the full window but we will only map the
> >> +     * subregions in use. */
> >> +    xive->esb_base = (P9_MMIO_BASE | VC_BAR_DEFAULT);
> >> +    xive->esb_shift = ESB_SHIFT;
> > 
> > Any point to having this as a variable, if it's always the same size?
> 
> Well, it is related to the page size and we could have a HW configuration 
> with two pages, with one specific page for trigger. But we don't need to 
> model this, yet.  
> 
> The 'esb_shift' field is used in different places. I would rather
> keep it.

Hm, ok.

> >> +
> >> +    /* Install default memory region handlers to log bogus access */
> >> +    memory_region_init_io(&xive->esb_mr, NULL, &spapr_xive_esb_default_ops,
> >> +                          NULL, "xive.esb.full", VC_BAR_SIZE);
> >> +    sysbus_init_mmio(SYS_BUS_DEVICE(dev), &xive->esb_mr);
> >> +
> >> +    /* Install the ESB memory region in the overall one */
> >> +    memory_region_init_io(&xive->esb_iomem, OBJECT(xive), &spapr_xive_esb_ops,
> >> +                          xive, "xive.esb",
> >> +                          (1ull << xive->esb_shift) * xive->nr_irqs);
> >> +    memory_region_add_subregion(&xive->esb_mr, 0, &xive->esb_iomem);
> > 
> > Is there a benegit to to having these nested regions, rather than just
> > validating the lisn in the read/write functions?
> 
> No. this was to represent the full ESB space of the system but we can 
> reduce it to one region. This is not a problem I think.
> 
> >>      qemu_register_reset(spapr_xive_reset, dev);
> >>  }
> >>  
> >> @@ -152,6 +417,7 @@ static const VMStateDescription vmstate_spapr_xive = {
> >>          VMSTATE_STRUCT_VARRAY_UINT32_ALLOC(ivt, sPAPRXive, nr_irqs, 1,
> >>                                             vmstate_spapr_xive_ive, XiveIVE),
> >>          VMSTATE_VBUFFER_UINT32(status, sPAPRXive, 1, NULL, nr_irqs),
> >> +        VMSTATE_VBUFFER_UINT32(sbe, sPAPRXive, 1, NULL, sbe_size),
> >>          VMSTATE_END_OF_LIST()
> >>      },
> >>  };
> >> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
> >> index 6a799cdaba66..84c910e62e56 100644
> >> --- a/include/hw/ppc/spapr_xive.h
> >> +++ b/include/hw/ppc/spapr_xive.h
> >> @@ -42,6 +42,14 @@ struct sPAPRXive {
> >>  
> >>      /* XIVE internal tables */
> >>      XiveIVE      *ivt;
> >> +    uint8_t      *sbe;
> >> +    uint32_t     sbe_size;
> > 
> > sbe_size is derivable from nr_irqs, so I don't think there's a point
> > to storing it separately .
> 
> I needed the value for the vmstate macros.

Ah, right.


> 
> Thanks,
> 
> C. 
> 
>  
> >> +
> >> +    /* ESB memory region */
> >> +    uint32_t     esb_shift;
> >> +    hwaddr       esb_base;
> >> +    MemoryRegion esb_mr;
> >> +    MemoryRegion esb_iomem;
> >>  };
> >>  
> >>  static inline bool spapr_xive_irq_is_lsi(sPAPRXive *xive, int lisn)
> > 
>
Cédric Le Goater Nov. 29, 2017, 1:56 p.m. UTC | #4
>>>> +    switch (offset) {
>>>> +    case 0:
>>>> +        spapr_xive_source_eoi(xive, lisn);
>>>
>>> Hrm.  I don't love that you're dealing with clearing that LSI bit
>>> here, but setting it at a different level.
>>>
>>> The state machines are doing my head in a bit, is there any way
>>> you could derive the STATUS_SENT bit from the PQ bits?
>>
>> Yes. I should. 
>>
>> I am also lacking a guest driver to exercise these LSIs so I didn't
>> pay a lot of attention to level interrupts. Any idea ?
> 
> How about an old-school emulated PCI device?  Maybe rtl8139?

Perfect. The current model is working but I will see how I can 
improve it to use the PQ bits instead.

I also found a couple of issues on the way. 

We do need the "#interrupt-cells" and "interrupt-controller" 
properties. They are missing from the XIVE sPAPR specs but there
is no other way to find the parent controller for the LSIs ... 
I have re-asked the pHyp team to include them in the specs and 
fixed the QEMU model.
 
Linux thinks the interrupt type is an "edge" and not a "level" one :
  
  (initramfs) cat /proc/interrupts 
             CPU0       
   16:          0  XIVE-IPI    0 Edge      IPI
   17:         14  XIVE-IRQ 4100 Edge      enp0s0
   18:          0  XIVE-IRQ 4097 Edge      RAS_HOTPLUG
   19:          0  XIVE-IRQ 4096 Edge      RAS_EPOW
   20:         20  XIVE-IRQ 4098 Edge      hvc_console

and XIVE complains :

  [    8.319970] xive: Interrupt 17 (HW 0x1004) type mismatch, Linux says Edge, FW says Level

I am digging this one.

Thanks.

C.
Cédric Le Goater Nov. 29, 2017, 4:23 p.m. UTC | #5
On 11/29/2017 02:56 PM, Cédric Le Goater wrote:
>>>>> +    switch (offset) {
>>>>> +    case 0:
>>>>> +        spapr_xive_source_eoi(xive, lisn);
>>>>
>>>> Hrm.  I don't love that you're dealing with clearing that LSI bit
>>>> here, but setting it at a different level.
>>>>
>>>> The state machines are doing my head in a bit, is there any way
>>>> you could derive the STATUS_SENT bit from the PQ bits?
>>>
>>> Yes. I should. 
>>>
>>> I am also lacking a guest driver to exercise these LSIs so I didn't
>>> pay a lot of attention to level interrupts. Any idea ?
>>
>> How about an old-school emulated PCI device?  Maybe rtl8139?
> 
> Perfect. The current model is working but I will see how I can 
> improve it to use the PQ bits instead.

Using the PQ bits is simplifying the model but we still have to 
maintain an array to store the IRQ type. 

There are 3 unused bits in the IVE descriptor, bits[1-3]:  

  #define IVE_VALID       PPC_BIT(0)
  #define IVE_EQ_BLOCK    PPC_BITMASK(4, 7)        /* Destination EQ block# */
  #define IVE_EQ_INDEX    PPC_BITMASK(8, 31)       /* Destination EQ index */
  #define IVE_MASKED      PPC_BIT(32)              /* Masked */
  #define IVE_EQ_DATA     PPC_BITMASK(33, 63)      /* Data written to the EQ */

We could hijack one of them to store the LSI type and get rid of 
the type array. Would you object to that ? 

C.
David Gibson Nov. 30, 2017, 4:26 a.m. UTC | #6
On Wed, Nov 29, 2017 at 02:56:39PM +0100, Cédric Le Goater wrote:
> >>>> +    switch (offset) {
> >>>> +    case 0:
> >>>> +        spapr_xive_source_eoi(xive, lisn);
> >>>
> >>> Hrm.  I don't love that you're dealing with clearing that LSI bit
> >>> here, but setting it at a different level.
> >>>
> >>> The state machines are doing my head in a bit, is there any way
> >>> you could derive the STATUS_SENT bit from the PQ bits?
> >>
> >> Yes. I should. 
> >>
> >> I am also lacking a guest driver to exercise these LSIs so I didn't
> >> pay a lot of attention to level interrupts. Any idea ?
> > 
> > How about an old-school emulated PCI device?  Maybe rtl8139?
> 
> Perfect. The current model is working but I will see how I can 
> improve it to use the PQ bits instead.
> 
> I also found a couple of issues on the way. 
> 
> We do need the "#interrupt-cells" and "interrupt-controller" 
> properties. They are missing from the XIVE sPAPR specs but there
> is no other way to find the parent controller for the LSIs ... 
> I have re-asked the pHyp team to include them in the specs and 
> fixed the QEMU model.

Told ya so :).

> Linux thinks the interrupt type is an "edge" and not a "level" one :

Right "edge" and message interrupts work basically the same way.

>   (initramfs) cat /proc/interrupts 
>              CPU0       
>    16:          0  XIVE-IPI    0 Edge      IPI
>    17:         14  XIVE-IRQ 4100 Edge      enp0s0
>    18:          0  XIVE-IRQ 4097 Edge      RAS_HOTPLUG
>    19:          0  XIVE-IRQ 4096 Edge      RAS_EPOW
>    20:         20  XIVE-IRQ 4098 Edge      hvc_console
> 
> and XIVE complains :
> 
>   [    8.319970] xive: Interrupt 17 (HW 0x1004) type mismatch, Linux says Edge, FW says Level
> 
> I am digging this one.
> 
> Thanks.
> 
> C.
>
David Gibson Nov. 30, 2017, 4:28 a.m. UTC | #7
On Wed, Nov 29, 2017 at 05:23:25PM +0100, Cédric Le Goater wrote:
> On 11/29/2017 02:56 PM, Cédric Le Goater wrote:
> >>>>> +    switch (offset) {
> >>>>> +    case 0:
> >>>>> +        spapr_xive_source_eoi(xive, lisn);
> >>>>
> >>>> Hrm.  I don't love that you're dealing with clearing that LSI bit
> >>>> here, but setting it at a different level.
> >>>>
> >>>> The state machines are doing my head in a bit, is there any way
> >>>> you could derive the STATUS_SENT bit from the PQ bits?
> >>>
> >>> Yes. I should. 
> >>>
> >>> I am also lacking a guest driver to exercise these LSIs so I didn't
> >>> pay a lot of attention to level interrupts. Any idea ?
> >>
> >> How about an old-school emulated PCI device?  Maybe rtl8139?
> > 
> > Perfect. The current model is working but I will see how I can 
> > improve it to use the PQ bits instead.
> 
> Using the PQ bits is simplifying the model but we still have to 
> maintain an array to store the IRQ type. 
> 
> There are 3 unused bits in the IVE descriptor, bits[1-3]:  
> 
>   #define IVE_VALID       PPC_BIT(0)
>   #define IVE_EQ_BLOCK    PPC_BITMASK(4, 7)        /* Destination EQ block# */
>   #define IVE_EQ_INDEX    PPC_BITMASK(8, 31)       /* Destination EQ index */
>   #define IVE_MASKED      PPC_BIT(32)              /* Masked */
>   #define IVE_EQ_DATA     PPC_BITMASK(33, 63)      /* Data written to the EQ */
> 
> We could hijack one of them to store the LSI type and get rid of 
> the type array. Would you object to that ?

Hrm.  These IVE bits are architected, aren't they?  In which case I'd
be wary of stealing a reserved bit in case of future extensions.

I'm wondering if we want another word / structure for storing
non-architected, implementation specific flags or info.

How does this work at the hardware level?  Presumbly the actual
hardware components don't communicate with the XIVE to request edge or
level.  So how does it know?  Specific ranges for LSIs?  If that we
should probably do the same.
Cédric Le Goater Nov. 30, 2017, 3:40 p.m. UTC | #8
On 11/30/2017 04:26 AM, David Gibson wrote:
> On Wed, Nov 29, 2017 at 02:56:39PM +0100, Cédric Le Goater wrote:
>>>>>> +    switch (offset) {
>>>>>> +    case 0:
>>>>>> +        spapr_xive_source_eoi(xive, lisn);
>>>>>
>>>>> Hrm.  I don't love that you're dealing with clearing that LSI bit
>>>>> here, but setting it at a different level.
>>>>>
>>>>> The state machines are doing my head in a bit, is there any way
>>>>> you could derive the STATUS_SENT bit from the PQ bits?
>>>>
>>>> Yes. I should. 
>>>>
>>>> I am also lacking a guest driver to exercise these LSIs so I didn't
>>>> pay a lot of attention to level interrupts. Any idea ?
>>>
>>> How about an old-school emulated PCI device?  Maybe rtl8139?
>>
>> Perfect. The current model is working but I will see how I can 
>> improve it to use the PQ bits instead.
>>
>> I also found a couple of issues on the way. 
>>
>> We do need the "#interrupt-cells" and "interrupt-controller" 
>> properties. They are missing from the XIVE sPAPR specs but there
>> is no other way to find the parent controller for the LSIs ... 
>> I have re-asked the pHyp team to include them in the specs and 
>> fixed the QEMU model.
> 
> Told ya so :).

I believed you ! I just needed a test case :)

C.
Cédric Le Goater Nov. 30, 2017, 4:05 p.m. UTC | #9
On 11/30/2017 04:28 AM, David Gibson wrote:
> On Wed, Nov 29, 2017 at 05:23:25PM +0100, Cédric Le Goater wrote:
>> On 11/29/2017 02:56 PM, Cédric Le Goater wrote:
>>>>>>> +    switch (offset) {
>>>>>>> +    case 0:
>>>>>>> +        spapr_xive_source_eoi(xive, lisn);
>>>>>>
>>>>>> Hrm.  I don't love that you're dealing with clearing that LSI bit
>>>>>> here, but setting it at a different level.
>>>>>>
>>>>>> The state machines are doing my head in a bit, is there any way
>>>>>> you could derive the STATUS_SENT bit from the PQ bits?
>>>>>
>>>>> Yes. I should. 
>>>>>
>>>>> I am also lacking a guest driver to exercise these LSIs so I didn't
>>>>> pay a lot of attention to level interrupts. Any idea ?
>>>>
>>>> How about an old-school emulated PCI device?  Maybe rtl8139?
>>>
>>> Perfect. The current model is working but I will see how I can 
>>> improve it to use the PQ bits instead.
>>
>> Using the PQ bits is simplifying the model but we still have to 
>> maintain an array to store the IRQ type. 
>>
>> There are 3 unused bits in the IVE descriptor, bits[1-3]:  
>>
>>   #define IVE_VALID       PPC_BIT(0)
>>   #define IVE_EQ_BLOCK    PPC_BITMASK(4, 7)        /* Destination EQ block# */
>>   #define IVE_EQ_INDEX    PPC_BITMASK(8, 31)       /* Destination EQ index */
>>   #define IVE_MASKED      PPC_BIT(32)              /* Masked */
>>   #define IVE_EQ_DATA     PPC_BITMASK(33, 63)      /* Data written to the EQ */
>>
>> We could hijack one of them to store the LSI type and get rid of 
>> the type array. Would you object to that ?
> 
> Hrm.  These IVE bits are architected, aren't they?  

Yes and unused.

> In which case I'd
> be wary of stealing a reserved bit in case of future extensions.
> 
> I'm wondering if we want another word / structure for storing
> non-architected, implementation specific flags or info.

That's what is done with the status array. As migration will put 
pressure on future changes, may be, we should go for an extra 
word for these model non-architected needs.
 
> How does this work at the hardware level?  Presumbly the actual
> hardware components don't communicate with the XIVE to request edge or
> level.  

No.

> So how does it know?  Specific ranges for LSIs? 
 
When OPAL allocates interrupt numbers for a device, it records 
their type to handle the EOI a little differently for LSIs. 
For sPAPR, it is really the same, the hcall H_INT_GET_SOURCE_INFO 
gives the interrupt type to the guest and the EOI is handled 
with a specific load. A part from that, the LSI interrupts follow 
the same path as the MSI.
 
The device controller must have some extra logic to handle the 
level for these interrupts but I am no expert in the domain. 

> If that we should probably do the same.

Modeling the LSI level with the PQ bits looks fine. We just need 
to store the IRQ type information somewhere under the XIVE object. 
We can keep it in a byte array or a bitmap to reduce the size. 
But if we foresee additional state to store, we might want to use 
the byte array directly.

Thanks,

C.
Benjamin Herrenschmidt Dec. 2, 2017, 2:23 p.m. UTC | #10
On Tue, 2017-11-28 at 17:38 +1100, David Gibson wrote:
> Hrm.  I don't love that you're dealing with clearing that LSI bit
> here, but setting it at a different level.
> 
> The state machines are doing my head in a bit, is there any way
> you could derive the STATUS_SENT bit from the PQ bits?

Yeah it should be...

So you should normally need only one extra bit of state for LSI which
is whether it's asserted or not and no extra bit of state for MSIs.

P is basically "sent". Q is whether another event has been queued up
(and is only meaningful for MSIs though the 01 combination will mask
LSIs too).

The state logic should be for MSIs on event:

	- if PQ=01 ignore (masked)
	- if P=1, set Q and finish
	- set P=1 and forward event to IVE

For EOI (load and store):

	- if PQ=01 ignore
	- P=Q, Q=0
	- (storeEOI only) if new P=1, forward event to IVE
 
For LSIs, and "event" is whenever the state is asserted, and Q is
meaningless, so basically on every change of state or ESB:

	- if PQ=01 ignore (masked)
	- if P=1 finish
	- set P=1 and forward event to IVE

For EOI (load and store):

	- if PQ=01 ignore
	- clear P
	- re-evaluate as above if asserted

Cheers,
Ben.
Benjamin Herrenschmidt Dec. 2, 2017, 2:28 p.m. UTC | #11
On Wed, 2017-11-29 at 17:23 +0100, Cédric Le Goater wrote:
> On 11/29/2017 02:56 PM, Cédric Le Goater wrote:
> > > > > > +    switch (offset) {
> > > > > > +    case 0:
> > > > > > +        spapr_xive_source_eoi(xive, lisn);
> > > > > 
> > > > > Hrm.  I don't love that you're dealing with clearing that LSI bit
> > > > > here, but setting it at a different level.
> > > > > 
> > > > > The state machines are doing my head in a bit, is there any way
> > > > > you could derive the STATUS_SENT bit from the PQ bits?
> > > > 
> > > > Yes. I should. 
> > > > 
> > > > I am also lacking a guest driver to exercise these LSIs so I didn't
> > > > pay a lot of attention to level interrupts. Any idea ?
> > > 
> > > How about an old-school emulated PCI device?  Maybe rtl8139?
> > 
> > Perfect. The current model is working but I will see how I can 
> > improve it to use the PQ bits instead.
> 
> Using the PQ bits is simplifying the model but we still have to 
> maintain an array to store the IRQ type. 
> 
> There are 3 unused bits in the IVE descriptor, bits[1-3]:  
> 
>   #define IVE_VALID       PPC_BIT(0)
>   #define IVE_EQ_BLOCK    PPC_BITMASK(4, 7)        /* Destination EQ block# */
>   #define IVE_EQ_INDEX    PPC_BITMASK(8, 31)       /* Destination EQ index */
>   #define IVE_MASKED      PPC_BIT(32)              /* Masked */
>   #define IVE_EQ_DATA     PPC_BITMASK(33, 63)      /* Data written to the EQ */
> 
> We could hijack one of them to store the LSI type and get rid of 
> the type array. Would you object to that ? 

This won't work well if/when you implement a real HW XIVE.

Another option is to have different source objects for LSIs and MSIs.

Cheers,
Ben.
> 
> C.
Benjamin Herrenschmidt Dec. 2, 2017, 2:33 p.m. UTC | #12
On Thu, 2017-11-30 at 15:28 +1100, David Gibson wrote:
> 
> How does this work at the hardware level?  Presumbly the actual
> hardware components don't communicate with the XIVE to request edge or
> level.  So how does it know?  Specific ranges for LSIs?  If that we
> should probably do the same.

So the source controller and the IVE are separate. The source
controller sends an internal MMIO to the IVE for "translating" the
event into a queue etc...

The IVE only see "events" which are effectively state transitions of
the P bit of the source.

The LSI vs MSI difference is thus entirely a property of the source
HW.

All the XIVE "Generic" built-in sources (the ones you can trigger with
an MMIO, which we use in KVM for all the IPIs and virtual interrupts)
are MSIs.

You find 2 kind of blocks of LSIs in the chip, the one PSI block which
has a handful or two of LSI sources for random "stuff" (LPC
interrupt(s), i2c interrupts etc..) and the LSI blocks which are in
each PHB.

So the PHB has basically two different bits of logic, one for LSIs and
one for MSIs. Their HW state machine is different.

In fact in the PHB and the PSI, I think, there's even an MMIO backdoor
register that allows you to see the "state" of the LSI (asserted).

Cheers,
Ben.
Cédric Le Goater Dec. 2, 2017, 2:47 p.m. UTC | #13
On 12/02/2017 03:28 PM, Benjamin Herrenschmidt wrote:
> On Wed, 2017-11-29 at 17:23 +0100, Cédric Le Goater wrote:
>> On 11/29/2017 02:56 PM, Cédric Le Goater wrote:
>>>>>>> +    switch (offset) {
>>>>>>> +    case 0:
>>>>>>> +        spapr_xive_source_eoi(xive, lisn);
>>>>>>
>>>>>> Hrm.  I don't love that you're dealing with clearing that LSI bit
>>>>>> here, but setting it at a different level.
>>>>>>
>>>>>> The state machines are doing my head in a bit, is there any way
>>>>>> you could derive the STATUS_SENT bit from the PQ bits?
>>>>>
>>>>> Yes. I should. 
>>>>>
>>>>> I am also lacking a guest driver to exercise these LSIs so I didn't
>>>>> pay a lot of attention to level interrupts. Any idea ?
>>>>
>>>> How about an old-school emulated PCI device?  Maybe rtl8139?
>>>
>>> Perfect. The current model is working but I will see how I can 
>>> improve it to use the PQ bits instead.
>>
>> Using the PQ bits is simplifying the model but we still have to 
>> maintain an array to store the IRQ type. 
>>
>> There are 3 unused bits in the IVE descriptor, bits[1-3]:  
>>
>>   #define IVE_VALID       PPC_BIT(0)
>>   #define IVE_EQ_BLOCK    PPC_BITMASK(4, 7)        /* Destination EQ block# */
>>   #define IVE_EQ_INDEX    PPC_BITMASK(8, 31)       /* Destination EQ index */
>>   #define IVE_MASKED      PPC_BIT(32)              /* Masked */
>>   #define IVE_EQ_DATA     PPC_BITMASK(33, 63)      /* Data written to the EQ */
>>
>> We could hijack one of them to store the LSI type and get rid of 
>> the type array. Would you object to that ? 
> 
> This won't work well if/when you implement a real HW XIVE.
> 
> Another option is to have different source objects for LSIs and MSIs.

yes. Like for the PHB3 in PowerNV or in OPAL.

I will need to complexify the model a bit more with multiple source 
support like we did for PowerNV but that might be interesting for 
pass-through.

Thanks,

C.
diff mbox series

Patch

diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
index 66c533fb1d78..f45f50fd017e 100644
--- a/hw/intc/spapr_xive.c
+++ b/hw/intc/spapr_xive.c
@@ -32,6 +32,216 @@  static void spapr_xive_irq(sPAPRXive *xive, int lisn)
 }
 
 /*
+ * "magic" Event State Buffer (ESB) MMIO offsets.
+ *
+ * Each interrupt source has a 2-bit state machine called ESB
+ * which can be controlled by MMIO. It's made of 2 bits, P and
+ * Q. P indicates that an interrupt is pending (has been sent
+ * to a queue and is waiting for an EOI). Q indicates that the
+ * interrupt has been triggered while pending.
+ *
+ * This acts as a coalescing mechanism in order to guarantee
+ * that a given interrupt only occurs at most once in a queue.
+ *
+ * When doing an EOI, the Q bit will indicate if the interrupt
+ * needs to be re-triggered.
+ *
+ * The following offsets into the ESB MMIO allow to read or
+ * manipulate the PQ bits. They must be used with an 8-bytes
+ * load instruction. They all return the previous state of the
+ * interrupt (atomically).
+ *
+ * Additionally, some ESB pages support doing an EOI via a
+ * store at 0 and some ESBs support doing a trigger via a
+ * separate trigger page.
+ */
+#define XIVE_ESB_GET            0x800
+#define XIVE_ESB_SET_PQ_00      0xc00
+#define XIVE_ESB_SET_PQ_01      0xd00
+#define XIVE_ESB_SET_PQ_10      0xe00
+#define XIVE_ESB_SET_PQ_11      0xf00
+
+#define XIVE_ESB_VAL_P          0x2
+#define XIVE_ESB_VAL_Q          0x1
+
+#define XIVE_ESB_RESET          0x0
+#define XIVE_ESB_PENDING        XIVE_ESB_VAL_P
+#define XIVE_ESB_QUEUED         (XIVE_ESB_VAL_P | XIVE_ESB_VAL_Q)
+#define XIVE_ESB_OFF            XIVE_ESB_VAL_Q
+
+static uint8_t spapr_xive_pq_get(sPAPRXive *xive, uint32_t lisn)
+{
+    uint32_t byte = lisn / 4;
+    uint32_t bit  = (lisn % 4) * 2;
+
+    assert(byte < xive->sbe_size);
+
+    return (xive->sbe[byte] >> bit) & 0x3;
+}
+
+static uint8_t spapr_xive_pq_set(sPAPRXive *xive, uint32_t lisn, uint8_t pq)
+{
+    uint32_t byte = lisn / 4;
+    uint32_t bit  = (lisn % 4) * 2;
+    uint8_t old, new;
+
+    assert(byte < xive->sbe_size);
+
+    old = xive->sbe[byte];
+
+    new = xive->sbe[byte] & ~(0x3 << bit);
+    new |= (pq & 0x3) << bit;
+
+    xive->sbe[byte] = new;
+
+    return (old >> bit) & 0x3;
+}
+
+static bool spapr_xive_pq_eoi(sPAPRXive *xive, uint32_t lisn)
+{
+    uint8_t old_pq = spapr_xive_pq_get(xive, lisn);
+
+    switch (old_pq) {
+    case XIVE_ESB_RESET:
+        spapr_xive_pq_set(xive, lisn, XIVE_ESB_RESET);
+        return false;
+    case XIVE_ESB_PENDING:
+        spapr_xive_pq_set(xive, lisn, XIVE_ESB_RESET);
+        return false;
+    case XIVE_ESB_QUEUED:
+        spapr_xive_pq_set(xive, lisn, XIVE_ESB_PENDING);
+        return true;
+    case XIVE_ESB_OFF:
+        spapr_xive_pq_set(xive, lisn, XIVE_ESB_OFF);
+        return false;
+    default:
+         g_assert_not_reached();
+    }
+}
+
+static bool spapr_xive_pq_trigger(sPAPRXive *xive, uint32_t lisn)
+{
+    uint8_t old_pq = spapr_xive_pq_get(xive, lisn);
+
+    switch (old_pq) {
+    case XIVE_ESB_RESET:
+        spapr_xive_pq_set(xive, lisn, XIVE_ESB_PENDING);
+        return true;
+    case XIVE_ESB_PENDING:
+        spapr_xive_pq_set(xive, lisn, XIVE_ESB_QUEUED);
+        return true;
+    case XIVE_ESB_QUEUED:
+        spapr_xive_pq_set(xive, lisn, XIVE_ESB_QUEUED);
+        return true;
+    case XIVE_ESB_OFF:
+        spapr_xive_pq_set(xive, lisn, XIVE_ESB_OFF);
+        return false;
+    default:
+         g_assert_not_reached();
+    }
+}
+
+/*
+ * XIVE Interrupt Source MMIOs
+ */
+static void spapr_xive_source_eoi(sPAPRXive *xive, uint32_t lisn)
+{
+    if (spapr_xive_irq_is_lsi(xive, lisn)) {
+        xive->status[lisn] &= ~XIVE_STATUS_SENT;
+    }
+}
+
+/* TODO: handle second page
+ *
+ * Some HW use a separate page for trigger. We only support the case
+ * in which the trigger can be done in the same page as the EOI.
+ */
+static uint64_t spapr_xive_esb_read(void *opaque, hwaddr addr, unsigned size)
+{
+    sPAPRXive *xive = SPAPR_XIVE(opaque);
+    uint32_t offset = addr & 0xF00;
+    uint32_t lisn = addr >> xive->esb_shift;
+    XiveIVE *ive;
+    uint64_t ret = -1;
+
+    ive = spapr_xive_get_ive(xive, lisn);
+    if (!ive || !(ive->w & IVE_VALID))  {
+        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid LISN %d\n", lisn);
+        goto out;
+    }
+
+    switch (offset) {
+    case 0:
+        spapr_xive_source_eoi(xive, lisn);
+
+        /* return TRUE or FALSE depending on PQ value */
+        ret = spapr_xive_pq_eoi(xive, lisn);
+        break;
+
+    case XIVE_ESB_GET:
+        ret = spapr_xive_pq_get(xive, lisn);
+        break;
+
+    case XIVE_ESB_SET_PQ_00:
+    case XIVE_ESB_SET_PQ_01:
+    case XIVE_ESB_SET_PQ_10:
+    case XIVE_ESB_SET_PQ_11:
+        ret = spapr_xive_pq_set(xive, lisn, (offset >> 8) & 0x3);
+        break;
+    default:
+        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid ESB addr %d\n", offset);
+    }
+
+out:
+    return ret;
+}
+
+static void spapr_xive_esb_write(void *opaque, hwaddr addr,
+                           uint64_t value, unsigned size)
+{
+    sPAPRXive *xive = SPAPR_XIVE(opaque);
+    uint32_t offset = addr & 0xF00;
+    uint32_t lisn = addr >> xive->esb_shift;
+    XiveIVE *ive;
+    bool notify = false;
+
+    ive = spapr_xive_get_ive(xive, lisn);
+    if (!ive || !(ive->w & IVE_VALID))  {
+        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid LISN %d\n", lisn);
+        return;
+    }
+
+    switch (offset) {
+    case 0:
+        /* TODO: should we trigger even if the IVE is masked ? */
+        notify = spapr_xive_pq_trigger(xive, lisn);
+        break;
+    default:
+        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid ESB write addr %d\n",
+                      offset);
+        return;
+    }
+
+    if (notify && !(ive->w & IVE_MASKED)) {
+        qemu_irq_pulse(xive->qirqs[lisn]);
+    }
+}
+
+static const MemoryRegionOps spapr_xive_esb_ops = {
+    .read = spapr_xive_esb_read,
+    .write = spapr_xive_esb_write,
+    .endianness = DEVICE_BIG_ENDIAN,
+    .valid = {
+        .min_access_size = 8,
+        .max_access_size = 8,
+    },
+    .impl = {
+        .min_access_size = 8,
+        .max_access_size = 8,
+    },
+};
+
+/*
  * XIVE Interrupt Source
  */
 static void spapr_xive_source_set_irq_msi(sPAPRXive *xive, int lisn, int val)
@@ -70,6 +280,33 @@  static void spapr_xive_source_set_irq(void *opaque, int lisn, int val)
 /*
  * Main XIVE object
  */
+#define P9_MMIO_BASE     0x006000000000000ull
+
+/* VC BAR contains set translations for the ESBs and the EQs. */
+#define VC_BAR_DEFAULT   0x10000000000ull
+#define VC_BAR_SIZE      0x08000000000ull
+#define ESB_SHIFT        16 /* One 64k page. OPAL has two */
+
+static uint64_t spapr_xive_esb_default_read(void *p, hwaddr offset,
+                                            unsigned size)
+{
+    qemu_log_mask(LOG_UNIMP, "%s: 0x%" HWADDR_PRIx " [%u]\n",
+                  __func__, offset, size);
+    return 0;
+}
+
+static void spapr_xive_esb_default_write(void *opaque, hwaddr offset,
+                                         uint64_t value, unsigned size)
+{
+    qemu_log_mask(LOG_UNIMP, "%s: 0x%" HWADDR_PRIx " <- 0x%" PRIx64 " [%u]\n",
+                  __func__, offset, value, size);
+}
+
+static const MemoryRegionOps spapr_xive_esb_default_ops = {
+    .read = spapr_xive_esb_default_read,
+    .write = spapr_xive_esb_default_write,
+    .endianness = DEVICE_BIG_ENDIAN,
+};
 
 void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon)
 {
@@ -77,14 +314,19 @@  void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon)
 
     for (i = 0; i < xive->nr_irqs; i++) {
         XiveIVE *ive = &xive->ivt[i];
+        uint8_t pq;
 
         if (!(ive->w & IVE_VALID)) {
             continue;
         }
 
-        monitor_printf(mon, "  %4x %s %s %08x %08x\n", i,
+        pq = spapr_xive_pq_get(xive, i);
+
+        monitor_printf(mon, "  %4x %s %s %c%c %08x %08x\n", i,
                        spapr_xive_irq_is_lsi(xive, i) ? "LSI" : "MSI",
                        ive->w & IVE_MASKED ? "M" : " ",
+                       pq & XIVE_ESB_VAL_P ? 'P' : '-',
+                       pq & XIVE_ESB_VAL_Q ? 'Q' : '-',
                        (int) GETFIELD(IVE_EQ_INDEX, ive->w),
                        (int) GETFIELD(IVE_EQ_DATA, ive->w));
     }
@@ -104,6 +346,9 @@  void spapr_xive_reset(void *dev)
             ive->w |= IVE_MASKED;
         }
     }
+
+    /* SBEs are initialized to 0b01 which corresponds to "ints off" */
+    memset(xive->sbe, 0x55, xive->sbe_size);
 }
 
 static void spapr_xive_realize(DeviceState *dev, Error **errp)
@@ -123,6 +368,26 @@  static void spapr_xive_realize(DeviceState *dev, Error **errp)
     /* Allocate the IVT (Interrupt Virtualization Table) */
     xive->ivt = g_malloc0(xive->nr_irqs * sizeof(XiveIVE));
 
+    /* Allocate SBEs (State Bit Entry). 2 bits, so 4 entries per byte */
+    xive->sbe_size = DIV_ROUND_UP(xive->nr_irqs, 4);
+    xive->sbe = g_malloc0(xive->sbe_size);
+
+    /* VC BAR. That's the full window but we will only map the
+     * subregions in use. */
+    xive->esb_base = (P9_MMIO_BASE | VC_BAR_DEFAULT);
+    xive->esb_shift = ESB_SHIFT;
+
+    /* Install default memory region handlers to log bogus access */
+    memory_region_init_io(&xive->esb_mr, NULL, &spapr_xive_esb_default_ops,
+                          NULL, "xive.esb.full", VC_BAR_SIZE);
+    sysbus_init_mmio(SYS_BUS_DEVICE(dev), &xive->esb_mr);
+
+    /* Install the ESB memory region in the overall one */
+    memory_region_init_io(&xive->esb_iomem, OBJECT(xive), &spapr_xive_esb_ops,
+                          xive, "xive.esb",
+                          (1ull << xive->esb_shift) * xive->nr_irqs);
+    memory_region_add_subregion(&xive->esb_mr, 0, &xive->esb_iomem);
+
     qemu_register_reset(spapr_xive_reset, dev);
 }
 
@@ -152,6 +417,7 @@  static const VMStateDescription vmstate_spapr_xive = {
         VMSTATE_STRUCT_VARRAY_UINT32_ALLOC(ivt, sPAPRXive, nr_irqs, 1,
                                            vmstate_spapr_xive_ive, XiveIVE),
         VMSTATE_VBUFFER_UINT32(status, sPAPRXive, 1, NULL, nr_irqs),
+        VMSTATE_VBUFFER_UINT32(sbe, sPAPRXive, 1, NULL, sbe_size),
         VMSTATE_END_OF_LIST()
     },
 };
diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
index 6a799cdaba66..84c910e62e56 100644
--- a/include/hw/ppc/spapr_xive.h
+++ b/include/hw/ppc/spapr_xive.h
@@ -42,6 +42,14 @@  struct sPAPRXive {
 
     /* XIVE internal tables */
     XiveIVE      *ivt;
+    uint8_t      *sbe;
+    uint32_t     sbe_size;
+
+    /* ESB memory region */
+    uint32_t     esb_shift;
+    hwaddr       esb_base;
+    MemoryRegion esb_mr;
+    MemoryRegion esb_iomem;
 };
 
 static inline bool spapr_xive_irq_is_lsi(sPAPRXive *xive, int lisn)