diff mbox series

[v3,06/35] spapr/xive: introduce a XIVE interrupt presenter model

Message ID 20180419124331.3915-7-clg@kaod.org
State New
Headers show
Series ppc: support for the XIVE interrupt controller (POWER9) | expand

Commit Message

Cédric Le Goater April 19, 2018, 12:43 p.m. UTC
The XIVE presenter engine uses a set of registers to handle priority
management and interrupt acknowledgment among other things. The most
important ones being :

  - Interrupt Priority Register (PIPR)
  - Interrupt Pending Buffer (IPB)
  - Current Processor Priority (CPPR)
  - Notification Source Register (NSR)

There is one set of registers per level of privilege, four in all :
HW, HV pool, OS and User. These are called rings. All registers are
accessible through a specific MMIO region called the Thread Interrupt
Management Areas (TIMA) but, depending on the privilege level of the
CPU, the view of the TIMA is filtered. The sPAPR machine runs at the
OS privilege and therefore can only accesses the OS and the User
rings. The others are for hypervisor levels.

The CPU interrupt state is modeled with a XiveNVT object which stores
the values of the different registers. The different TIMA views are
mapped at the same address for each CPU and 'current_cpu' is used to
retrieve the XiveNVT holding the ring registers.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---

 Changes since v2 :

 - introduced the XiveFabric interface

 hw/intc/spapr_xive.c        |  25 ++++
 hw/intc/xive.c              | 279 ++++++++++++++++++++++++++++++++++++++++++++
 include/hw/ppc/spapr_xive.h |   5 +
 include/hw/ppc/xive.h       |  31 +++++
 include/hw/ppc/xive_regs.h  |  84 +++++++++++++
 5 files changed, 424 insertions(+)

Comments

David Gibson April 26, 2018, 7:11 a.m. UTC | #1
On Thu, Apr 19, 2018 at 02:43:02PM +0200, Cédric Le Goater wrote:
> The XIVE presenter engine uses a set of registers to handle priority
> management and interrupt acknowledgment among other things. The most
> important ones being :
> 
>   - Interrupt Priority Register (PIPR)
>   - Interrupt Pending Buffer (IPB)
>   - Current Processor Priority (CPPR)
>   - Notification Source Register (NSR)
> 
> There is one set of registers per level of privilege, four in all :
> HW, HV pool, OS and User. These are called rings. All registers are
> accessible through a specific MMIO region called the Thread Interrupt
> Management Areas (TIMA) but, depending on the privilege level of the
> CPU, the view of the TIMA is filtered. The sPAPR machine runs at the
> OS privilege and therefore can only accesses the OS and the User
> rings. The others are for hypervisor levels.
> 
> The CPU interrupt state is modeled with a XiveNVT object which stores
> the values of the different registers. The different TIMA views are
> mapped at the same address for each CPU and 'current_cpu' is used to
> retrieve the XiveNVT holding the ring registers.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
> 
>  Changes since v2 :
> 
>  - introduced the XiveFabric interface
> 
>  hw/intc/spapr_xive.c        |  25 ++++
>  hw/intc/xive.c              | 279 ++++++++++++++++++++++++++++++++++++++++++++
>  include/hw/ppc/spapr_xive.h |   5 +
>  include/hw/ppc/xive.h       |  31 +++++
>  include/hw/ppc/xive_regs.h  |  84 +++++++++++++
>  5 files changed, 424 insertions(+)
> 
> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> index 90cde8a4082d..f07832bf0a00 100644
> --- a/hw/intc/spapr_xive.c
> +++ b/hw/intc/spapr_xive.c
> @@ -13,6 +13,7 @@
>  #include "target/ppc/cpu.h"
>  #include "sysemu/cpus.h"
>  #include "monitor/monitor.h"
> +#include "hw/ppc/spapr.h"
>  #include "hw/ppc/spapr_xive.h"
>  #include "hw/ppc/xive.h"
>  #include "hw/ppc/xive_regs.h"
> @@ -95,6 +96,22 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
>  
>      /* Allocate the Interrupt Virtualization Table */
>      xive->ivt = g_new0(XiveIVE, xive->nr_irqs);
> +
> +    /* The Thread Interrupt Management Area has the same address for
> +     * each chip. On sPAPR, we only need to expose the User and OS
> +     * level views of the TIMA.
> +     */
> +    xive->tm_base = XIVE_TM_BASE;

The constant should probably have PAPR in the name somewhere, since
it's just for PAPR machines (same for the ESB mappings, actually).

> +
> +    memory_region_init_io(&xive->tm_mmio_user, OBJECT(xive),
> +                          &xive_tm_user_ops, xive, "xive.tima.user",
> +                          1ull << TM_SHIFT);
> +    sysbus_init_mmio(SYS_BUS_DEVICE(dev), &xive->tm_mmio_user);
> +
> +    memory_region_init_io(&xive->tm_mmio_os, OBJECT(xive),
> +                          &xive_tm_os_ops, xive, "xive.tima.os",
> +                          1ull << TM_SHIFT);
> +    sysbus_init_mmio(SYS_BUS_DEVICE(dev), &xive->tm_mmio_os);
>  }
>  
>  static XiveIVE *spapr_xive_get_ive(XiveFabric *xf, uint32_t lisn)
> @@ -104,6 +121,13 @@ static XiveIVE *spapr_xive_get_ive(XiveFabric *xf, uint32_t lisn)
>      return lisn < xive->nr_irqs ? &xive->ivt[lisn] : NULL;
>  }
>  
> +static XiveNVT *spapr_xive_get_nvt(XiveFabric *xf, uint32_t server)
> +{
> +    PowerPCCPU *cpu = spapr_find_cpu(server);
> +
> +    return cpu ? XIVE_NVT(cpu->intc) : NULL;
> +}

So this is a bit of a tangent, but I've been thinking of implementing
a scheme where there's an opaque pointer in the cpu structure for the
use of the machine.  I'm planning for that to replace the intc pointer
(which isn't really used directly by the cpu). That would allow us to
have spapr put a structure there and have both xics and xive pointers
which could be useful later on.

I think we'd need something similar to correctly handle migration of
the VPA state, which is currently horribly broken.

> +
>  static const VMStateDescription vmstate_spapr_xive_ive = {
>      .name = TYPE_SPAPR_XIVE "/ive",
>      .version_id = 1,
> @@ -143,6 +167,7 @@ static void spapr_xive_class_init(ObjectClass *klass, void *data)
>      dc->vmsd = &vmstate_spapr_xive;
>  
>      xfc->get_ive = spapr_xive_get_ive;
> +    xfc->get_nvt = spapr_xive_get_nvt;
>  }
>  
>  static const TypeInfo spapr_xive_info = {
> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> index dccad0318834..5691bb9474e4 100644
> --- a/hw/intc/xive.c
> +++ b/hw/intc/xive.c
> @@ -14,7 +14,278 @@
>  #include "sysemu/cpus.h"
>  #include "sysemu/dma.h"
>  #include "monitor/monitor.h"
> +#include "hw/ppc/xics.h" /* for ICP_PROP_CPU */
>  #include "hw/ppc/xive.h"
> +#include "hw/ppc/xive_regs.h"
> +
> +/*
> + * XIVE Interrupt Presenter
> + */
> +
> +static uint64_t xive_nvt_accept(XiveNVT *nvt)
> +{
> +    return 0;
> +}
> +
> +static void xive_nvt_set_cppr(XiveNVT *nvt, uint8_t cppr)
> +{
> +    if (cppr > XIVE_PRIORITY_MAX) {
> +        cppr = 0xff;
> +    }
> +
> +    nvt->ring_os[TM_CPPR] = cppr;

Surely this needs to recheck if we should be interrupting the cpu?

> +}
> +
> +/*
> + * OS Thread Interrupt Management Area MMIO
> + */
> +static uint64_t xive_tm_read_special(XiveNVT *nvt, hwaddr offset,
> +                                           unsigned size)
> +{
> +    uint64_t ret = -1;
> +
> +    if (offset == TM_SPC_ACK_OS_REG && size == 2) {
> +        ret = xive_nvt_accept(nvt);
> +    } else {
> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid TIMA read @%"
> +                      HWADDR_PRIx" size %d\n", offset, size);
> +    }
> +
> +    return ret;
> +}
> +
> +#define TM_RING(offset) ((offset) & 0xf0)
> +
> +static uint64_t xive_tm_os_read(void *opaque, hwaddr offset,
> +                                      unsigned size)
> +{
> +    PowerPCCPU *cpu = POWERPC_CPU(current_cpu);

So, as I said on a previous version of this, we can actually correctly
represent different mappings in different cpu spaces, by exploiting
cpu->as and not just having them all point to &address_space_memory.

> +    XiveNVT *nvt = XIVE_NVT(cpu->intc);
> +    uint64_t ret = -1;
> +    int i;
> +
> +    if (offset >= TM_SPC_ACK_EBB) {
> +        return xive_tm_read_special(nvt, offset, size);
> +    }
> +
> +    if (TM_RING(offset) != TM_QW1_OS) {
> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid access to non-OS ring @%"
> +                      HWADDR_PRIx"\n", offset);
> +        return ret;

Just return -1 would be clearer here;

> +    }
> +
> +    ret = 0;
> +    for (i = 0; i < size; i++) {
> +        ret |= (uint64_t) nvt->regs[offset + i] << (8 * (size - i - 1));
> +    }
> +
> +    return ret;
> +}
> +
> +static bool xive_tm_is_readonly(uint8_t offset)
> +{
> +    return offset != TM_QW1_OS + TM_CPPR;
> +}
> +
> +static void xive_tm_write_special(XiveNVT *nvt, hwaddr offset,
> +                                        uint64_t value, unsigned size)
> +{
> +    /* TODO: support TM_SPC_SET_OS_PENDING */
> +
> +    /* TODO: support TM_SPC_ACK_OS_EL */
> +}
> +
> +static void xive_tm_os_write(void *opaque, hwaddr offset,
> +                                   uint64_t value, unsigned size)
> +{
> +    PowerPCCPU *cpu = POWERPC_CPU(current_cpu);
> +    XiveNVT *nvt = XIVE_NVT(cpu->intc);
> +    int i;
> +
> +    if (offset >= TM_SPC_ACK_EBB) {
> +        xive_tm_write_special(nvt, offset, value, size);
> +        return;
> +    }
> +
> +    if (TM_RING(offset) != TM_QW1_OS) {

Why have this if you have separate OS and user regions as you appear
to do below?

Or to look at it another way, shouldn't it be possible to make the
read/write accessors the same for the OS and user rings?

> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid access to non-OS ring @%"
> +                      HWADDR_PRIx"\n", offset);
> +        return;
> +    }
> +
> +    switch (size) {
> +    case 1:
> +        if (offset == TM_QW1_OS + TM_CPPR) {
> +            xive_nvt_set_cppr(nvt, value & 0xff);
> +        }
> +        break;
> +    case 4:
> +    case 8:
> +        for (i = 0; i < size; i++) {
> +            if (!xive_tm_is_readonly(offset + i)) {
> +                nvt->regs[offset + i] = (value >> (8 * (size - i - 1))) & 0xff;
> +            }
> +        }
> +        break;
> +    default:
> +        g_assert_not_reached();
> +    }
> +}
> +
> +const MemoryRegionOps xive_tm_os_ops = {
> +    .read = xive_tm_os_read,
> +    .write = xive_tm_os_write,
> +    .endianness = DEVICE_BIG_ENDIAN,
> +    .valid = {
> +        .min_access_size = 1,
> +        .max_access_size = 8,
> +    },
> +    .impl = {
> +        .min_access_size = 1,
> +        .max_access_size = 8,
> +    },
> +};
> +
> +/*
> + * User Thread Interrupt Management Area MMIO
> + */
> +
> +static uint64_t xive_tm_user_read(void *opaque, hwaddr offset,
> +                                        unsigned size)
> +{
> +    qemu_log_mask(LOG_UNIMP, "XIVE: invalid access to User TIMA @%"
> +                  HWADDR_PRIx"\n", offset);
> +    return -1;
> +}
> +
> +static void xive_tm_user_write(void *opaque, hwaddr offset,
> +                                     uint64_t value, unsigned size)
> +{
> +    qemu_log_mask(LOG_UNIMP, "XIVE: invalid access to User TIMA @%"
> +                  HWADDR_PRIx"\n", offset);
> +}
> +
> +
> +const MemoryRegionOps xive_tm_user_ops = {
> +    .read = xive_tm_user_read,
> +    .write = xive_tm_user_write,
> +    .endianness = DEVICE_BIG_ENDIAN,
> +    .valid = {
> +        .min_access_size = 1,
> +        .max_access_size = 8,
> +    },
> +    .impl = {
> +        .min_access_size = 1,
> +        .max_access_size = 8,
> +    },
> +};
> +
> +static char *xive_nvt_ring_print(uint8_t *ring)
> +{
> +    uint32_t w2 = be32_to_cpu(*((uint32_t *) &ring[TM_WORD2]));
> +
> +    return g_strdup_printf("%02x  %02x   %02x  %02x    %02x   "
> +                   "%02x  %02x  %02x   %08x",
> +                   ring[TM_NSR], ring[TM_CPPR], ring[TM_IPB], ring[TM_LSMFB],
> +                   ring[TM_ACK_CNT], ring[TM_INC], ring[TM_AGE], ring[TM_PIPR],
> +                   w2);
> +}
> +
> +void xive_nvt_pic_print_info(XiveNVT *nvt, Monitor *mon)
> +{
> +    int cpu_index = nvt->cs ? nvt->cs->cpu_index : -1;
> +    char *s;
> +
> +    monitor_printf(mon, "CPU[%04x]: QW    NSR CPPR IPB LSMFB ACK# INC AGE PIPR"
> +                   " W2\n", cpu_index);
> +
> +    s = xive_nvt_ring_print(&nvt->regs[TM_QW1_OS]);
> +    monitor_printf(mon, "CPU[%04x]: OS    %s\n", cpu_index, s);
> +    g_free(s);
> +    s = xive_nvt_ring_print(&nvt->regs[TM_QW0_USER]);
> +    monitor_printf(mon, "CPU[%04x]: USER  %s\n", cpu_index, s);
> +    g_free(s);
> +}
> +
> +static void xive_nvt_reset(void *dev)
> +{
> +    XiveNVT *nvt = XIVE_NVT(dev);
> +
> +    memset(nvt->regs, 0, sizeof(nvt->regs));
> +}
> +
> +static void xive_nvt_realize(DeviceState *dev, Error **errp)
> +{
> +    XiveNVT *nvt = XIVE_NVT(dev);
> +    PowerPCCPU *cpu;
> +    CPUPPCState *env;
> +    Object *obj;
> +    Error *err = NULL;
> +
> +    obj = object_property_get_link(OBJECT(dev), ICP_PROP_CPU, &err);

Please get rid of the remaining "ICP" naming in the xive code.

> +    if (!obj) {
> +        error_propagate(errp, err);
> +        error_prepend(errp, "required link '" ICP_PROP_CPU "' not found: ");
> +        return;
> +    }
> +
> +    cpu = POWERPC_CPU(obj);
> +    nvt->cs = CPU(obj);
> +
> +    env = &cpu->env;
> +    switch (PPC_INPUT(env)) {
> +    case PPC_FLAGS_INPUT_POWER7:
> +        nvt->output = env->irq_inputs[POWER7_INPUT_INT];
> +        break;
> +
> +    default:
> +        error_setg(errp, "XIVE interrupt controller does not support "
> +                   "this CPU bus model");
> +        return;
> +    }
> +
> +    qemu_register_reset(xive_nvt_reset, dev);

If this is a sysbus device, which I think it is, you shouldn't need to
explicitly register a reset handler.  Instead you can set a device
reset handler which will be called with the reset.

> +}
> +
> +static void xive_nvt_unrealize(DeviceState *dev, Error **errp)
> +{
> +    qemu_unregister_reset(xive_nvt_reset, dev);
> +}
> +
> +static void xive_nvt_init(Object *obj)
> +{
> +    XiveNVT *nvt = XIVE_NVT(obj);
> +
> +    nvt->ring_os = &nvt->regs[TM_QW1_OS];

The ring_os field is basically pointless, being just an offset into a
structure you already have.  A macro or inline would be a better idea.

> +}
> +
> +static const VMStateDescription vmstate_xive_nvt = {
> +    .name = TYPE_XIVE_NVT,
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_BUFFER(regs, XiveNVT),
> +        VMSTATE_END_OF_LIST()
> +    },
> +};
> +
> +static void xive_nvt_class_init(ObjectClass *klass, void *data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +
> +    dc->realize = xive_nvt_realize;
> +    dc->unrealize = xive_nvt_unrealize;
> +    dc->desc = "XIVE Interrupt Presenter";
> +    dc->vmsd = &vmstate_xive_nvt;
> +}
> +
> +static const TypeInfo xive_nvt_info = {
> +    .name          = TYPE_XIVE_NVT,
> +    .parent        = TYPE_DEVICE,
> +    .instance_size = sizeof(XiveNVT),
> +    .instance_init = xive_nvt_init,
> +    .class_init    = xive_nvt_class_init,
> +};
>  
>  /*
>   * XIVE Fabric
> @@ -27,6 +298,13 @@ XiveIVE *xive_fabric_get_ive(XiveFabric *xf, uint32_t lisn)
>      return xfc->get_ive(xf, lisn);
>  }
>  
> +XiveNVT *xive_fabric_get_nvt(XiveFabric *xf, uint32_t server)
> +{
> +    XiveFabricClass *xfc = XIVE_FABRIC_GET_CLASS(xf);
> +
> +    return xfc->get_nvt(xf, server);
> +}
> +
>  static void xive_fabric_route(XiveFabric *xf, int lisn)
>  {
>  
> @@ -418,6 +696,7 @@ static void xive_register_types(void)
>  {
>      type_register_static(&xive_source_info);
>      type_register_static(&xive_fabric_info);
> +    type_register_static(&xive_nvt_info);
>  }
>  
>  type_init(xive_register_types)
> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
> index 4538c622b60a..25d78eec884d 100644
> --- a/include/hw/ppc/spapr_xive.h
> +++ b/include/hw/ppc/spapr_xive.h
> @@ -25,6 +25,11 @@ typedef struct sPAPRXive {
>      /* Routing table */
>      XiveIVE      *ivt;
>      uint32_t     nr_irqs;
> +
> +    /* TIMA memory regions */
> +    hwaddr       tm_base;
> +    MemoryRegion tm_mmio_user;
> +    MemoryRegion tm_mmio_os;
>  } sPAPRXive;
>  
>  bool spapr_xive_irq_enable(sPAPRXive *xive, uint32_t lisn, bool lsi);
> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
> index 57295715a4a5..1a2da610d91c 100644
> --- a/include/hw/ppc/xive.h
> +++ b/include/hw/ppc/xive.h
> @@ -20,6 +20,7 @@ typedef struct XiveFabric XiveFabric;
>   */
>  
>  #define XIVE_VC_BASE   0x0006010000000000ull
> +#define XIVE_TM_BASE   0x0006030203180000ull
>  
>  /*
>   * XIVE Interrupt Source
> @@ -155,6 +156,34 @@ static inline void xive_source_irq_set(XiveSource *xsrc, uint32_t srcno,
>  }
>  
>  /*
> + * XIVE Interrupt Presenter
> + */
> +
> +#define TYPE_XIVE_NVT "xive-nvt"
> +#define XIVE_NVT(obj) OBJECT_CHECK(XiveNVT, (obj), TYPE_XIVE_NVT)
> +
> +#define TM_RING_COUNT           4
> +#define TM_RING_SIZE            0x10
> +
> +typedef struct XiveNVT {
> +    DeviceState parent_obj;
> +
> +    CPUState  *cs;
> +    qemu_irq  output;
> +
> +    /* Thread interrupt Management (TM) registers */
> +    uint8_t   regs[TM_RING_COUNT * TM_RING_SIZE];
> +
> +    /* Shortcuts to rings */
> +    uint8_t   *ring_os;
> +} XiveNVT;
> +
> +extern const MemoryRegionOps xive_tm_user_ops;
> +extern const MemoryRegionOps xive_tm_os_ops;
> +
> +void xive_nvt_pic_print_info(XiveNVT *nvt, Monitor *mon);
> +
> +/*
>   * XIVE Fabric
>   */
>  
> @@ -175,8 +204,10 @@ typedef struct XiveFabricClass {
>      void (*notify)(XiveFabric *xf, uint32_t lisn);
>  
>      XiveIVE *(*get_ive)(XiveFabric *xf, uint32_t lisn);
> +    XiveNVT *(*get_nvt)(XiveFabric *xf, uint32_t server);
>  } XiveFabricClass;
>  
>  XiveIVE *xive_fabric_get_ive(XiveFabric *xf, uint32_t lisn);
> +XiveNVT *xive_fabric_get_nvt(XiveFabric *xf, uint32_t server);
>  
>  #endif /* PPC_XIVE_H */
> diff --git a/include/hw/ppc/xive_regs.h b/include/hw/ppc/xive_regs.h
> index 5903f29eb789..f2e2a1ac8f6e 100644
> --- a/include/hw/ppc/xive_regs.h
> +++ b/include/hw/ppc/xive_regs.h
> @@ -10,6 +10,88 @@
>  #ifndef _PPC_XIVE_REGS_H
>  #define _PPC_XIVE_REGS_H
>  
> +#define TM_SHIFT                16
> +
> +/* TM register offsets */
> +#define TM_QW0_USER             0x000 /* All rings */
> +#define TM_QW1_OS               0x010 /* Ring 0..2 */
> +#define TM_QW2_HV_POOL          0x020 /* Ring 0..1 */
> +#define TM_QW3_HV_PHYS          0x030 /* Ring 0..1 */
> +
> +/* Byte offsets inside a QW             QW0 QW1 QW2 QW3 */
> +#define TM_NSR                  0x0  /*  +   +   -   +  */
> +#define TM_CPPR                 0x1  /*  -   +   -   +  */
> +#define TM_IPB                  0x2  /*  -   +   +   +  */
> +#define TM_LSMFB                0x3  /*  -   +   +   +  */
> +#define TM_ACK_CNT              0x4  /*  -   +   -   -  */
> +#define TM_INC                  0x5  /*  -   +   -   +  */
> +#define TM_AGE                  0x6  /*  -   +   -   +  */
> +#define TM_PIPR                 0x7  /*  -   +   -   +  */
> +
> +#define TM_WORD0                0x0
> +#define TM_WORD1                0x4
> +
> +/*
> + * QW word 2 contains the valid bit at the top and other fields
> + * depending on the QW.
> + */
> +#define TM_WORD2                0x8
> +#define   TM_QW0W2_VU           PPC_BIT32(0)
> +#define   TM_QW0W2_LOGIC_SERV   PPC_BITMASK32(1, 31) /* XX 2,31 ? */
> +#define   TM_QW1W2_VO           PPC_BIT32(0)
> +#define   TM_QW1W2_OS_CAM       PPC_BITMASK32(8, 31)
> +#define   TM_QW2W2_VP           PPC_BIT32(0)
> +#define   TM_QW2W2_POOL_CAM     PPC_BITMASK32(8, 31)
> +#define   TM_QW3W2_VT           PPC_BIT32(0)
> +#define   TM_QW3W2_LP           PPC_BIT32(6)
> +#define   TM_QW3W2_LE           PPC_BIT32(7)
> +#define   TM_QW3W2_T            PPC_BIT32(31)
> +
> +/*
> + * In addition to normal loads to "peek" and writes (only when invalid)
> + * using 4 and 8 bytes accesses, the above registers support these
> + * "special" byte operations:
> + *
> + *   - Byte load from QW0[NSR] - User level NSR (EBB)
> + *   - Byte store to QW0[NSR] - User level NSR (EBB)
> + *   - Byte load/store to QW1[CPPR] and QW3[CPPR] - CPPR access
> + *   - Byte load from QW3[TM_WORD2] - Read VT||00000||LP||LE on thrd 0
> + *                                    otherwise VT||0000000
> + *   - Byte store to QW3[TM_WORD2] - Set VT bit (and LP/LE if present)
> + *
> + * Then we have all these "special" CI ops at these offset that trigger
> + * all sorts of side effects:
> + */
> +#define TM_SPC_ACK_EBB          0x800   /* Load8 ack EBB to reg*/
> +#define TM_SPC_ACK_OS_REG       0x810   /* Load16 ack OS irq to reg */
> +#define TM_SPC_PUSH_USR_CTX     0x808   /* Store32 Push/Validate user context */
> +#define TM_SPC_PULL_USR_CTX     0x808   /* Load32 Pull/Invalidate user
> +                                         * context */
> +#define TM_SPC_SET_OS_PENDING   0x812   /* Store8 Set OS irq pending bit */
> +#define TM_SPC_PULL_OS_CTX      0x818   /* Load32/Load64 Pull/Invalidate OS
> +                                         * context to reg */
> +#define TM_SPC_PULL_POOL_CTX    0x828   /* Load32/Load64 Pull/Invalidate Pool
> +                                         * context to reg*/
> +#define TM_SPC_ACK_HV_REG       0x830   /* Load16 ack HV irq to reg */
> +#define TM_SPC_PULL_USR_CTX_OL  0xc08   /* Store8 Pull/Inval usr ctx to odd
> +                                         * line */
> +#define TM_SPC_ACK_OS_EL        0xc10   /* Store8 ack OS irq to even line */
> +#define TM_SPC_ACK_HV_POOL_EL   0xc20   /* Store8 ack HV evt pool to even
> +                                         * line */
> +#define TM_SPC_ACK_HV_EL        0xc30   /* Store8 ack HV irq to even line */
> +/* XXX more... */
> +
> +/* NSR fields for the various QW ack types */
> +#define TM_QW0_NSR_EB           PPC_BIT8(0)
> +#define TM_QW1_NSR_EO           PPC_BIT8(0)
> +#define TM_QW3_NSR_HE           PPC_BITMASK8(0, 1)
> +#define  TM_QW3_NSR_HE_NONE     0
> +#define  TM_QW3_NSR_HE_POOL     1
> +#define  TM_QW3_NSR_HE_PHYS     2
> +#define  TM_QW3_NSR_HE_LSI      3
> +#define TM_QW3_NSR_I            PPC_BIT8(2)
> +#define TM_QW3_NSR_GRP_LVL      PPC_BIT8(3, 7)
> +
>  /* IVE/EAS
>   *
>   * One per interrupt source. Targets that interrupt to a given EQ
> @@ -30,4 +112,6 @@ typedef struct XiveIVE {
>  #define IVE_EQ_DATA     PPC_BITMASK(33, 63)      /* Data written to the EQ */
>  } XiveIVE;
>  
> +#define XIVE_PRIORITY_MAX  7
> +
>  #endif /* _INTC_XIVE_INTERNAL_H */
Cédric Le Goater April 26, 2018, 9:27 a.m. UTC | #2
On 04/26/2018 09:11 AM, David Gibson wrote:
> On Thu, Apr 19, 2018 at 02:43:02PM +0200, Cédric Le Goater wrote:
>> The XIVE presenter engine uses a set of registers to handle priority
>> management and interrupt acknowledgment among other things. The most
>> important ones being :
>>
>>   - Interrupt Priority Register (PIPR)
>>   - Interrupt Pending Buffer (IPB)
>>   - Current Processor Priority (CPPR)
>>   - Notification Source Register (NSR)
>>
>> There is one set of registers per level of privilege, four in all :
>> HW, HV pool, OS and User. These are called rings. All registers are
>> accessible through a specific MMIO region called the Thread Interrupt
>> Management Areas (TIMA) but, depending on the privilege level of the
>> CPU, the view of the TIMA is filtered. The sPAPR machine runs at the
>> OS privilege and therefore can only accesses the OS and the User
>> rings. The others are for hypervisor levels.
>>
>> The CPU interrupt state is modeled with a XiveNVT object which stores
>> the values of the different registers. The different TIMA views are
>> mapped at the same address for each CPU and 'current_cpu' is used to
>> retrieve the XiveNVT holding the ring registers.
>>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
>>
>>  Changes since v2 :
>>
>>  - introduced the XiveFabric interface
>>
>>  hw/intc/spapr_xive.c        |  25 ++++
>>  hw/intc/xive.c              | 279 ++++++++++++++++++++++++++++++++++++++++++++
>>  include/hw/ppc/spapr_xive.h |   5 +
>>  include/hw/ppc/xive.h       |  31 +++++
>>  include/hw/ppc/xive_regs.h  |  84 +++++++++++++
>>  5 files changed, 424 insertions(+)
>>
>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
>> index 90cde8a4082d..f07832bf0a00 100644
>> --- a/hw/intc/spapr_xive.c
>> +++ b/hw/intc/spapr_xive.c
>> @@ -13,6 +13,7 @@
>>  #include "target/ppc/cpu.h"
>>  #include "sysemu/cpus.h"
>>  #include "monitor/monitor.h"
>> +#include "hw/ppc/spapr.h"
>>  #include "hw/ppc/spapr_xive.h"
>>  #include "hw/ppc/xive.h"
>>  #include "hw/ppc/xive_regs.h"
>> @@ -95,6 +96,22 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
>>  
>>      /* Allocate the Interrupt Virtualization Table */
>>      xive->ivt = g_new0(XiveIVE, xive->nr_irqs);
>> +
>> +    /* The Thread Interrupt Management Area has the same address for
>> +     * each chip. On sPAPR, we only need to expose the User and OS
>> +     * level views of the TIMA.
>> +     */
>> +    xive->tm_base = XIVE_TM_BASE;
> 
> The constant should probably have PAPR in the name somewhere, since
> it's just for PAPR machines (same for the ESB mappings, actually).

ok. 

I have also made 'tm_base' a property, like 'vc_base' for ESBs, in 
case we want to change the value when the guest is instantiated. 
I doubt it but this is an address in the global address space, so 
letting the machine have control is better I think. 
 
> 
>> +
>> +    memory_region_init_io(&xive->tm_mmio_user, OBJECT(xive),
>> +                          &xive_tm_user_ops, xive, "xive.tima.user",
>> +                          1ull << TM_SHIFT);
>> +    sysbus_init_mmio(SYS_BUS_DEVICE(dev), &xive->tm_mmio_user);
>> +
>> +    memory_region_init_io(&xive->tm_mmio_os, OBJECT(xive),
>> +                          &xive_tm_os_ops, xive, "xive.tima.os",
>> +                          1ull << TM_SHIFT);
>> +    sysbus_init_mmio(SYS_BUS_DEVICE(dev), &xive->tm_mmio_os);
>>  }
>>  
>>  static XiveIVE *spapr_xive_get_ive(XiveFabric *xf, uint32_t lisn)
>> @@ -104,6 +121,13 @@ static XiveIVE *spapr_xive_get_ive(XiveFabric *xf, uint32_t lisn)
>>      return lisn < xive->nr_irqs ? &xive->ivt[lisn] : NULL;
>>  }
>>  
>> +static XiveNVT *spapr_xive_get_nvt(XiveFabric *xf, uint32_t server)
>> +{
>> +    PowerPCCPU *cpu = spapr_find_cpu(server);
>> +
>> +    return cpu ? XIVE_NVT(cpu->intc) : NULL;
>> +}
> 
> So this is a bit of a tangent, but I've been thinking of implementing
> a scheme where there's an opaque pointer in the cpu structure for the
> use of the machine.  I'm planning for that to replace the intc pointer
> (which isn't really used directly by the cpu). That would allow us to
> have spapr put a structure there and have both xics and xive pointers
> which could be useful later on.

ok. That should simplify the patchset at the end, in which we need to 
switch the 'intc' pointer. 

> I think we'd need something similar to correctly handle migration of
> the VPA state, which is currently horribly broken.
> 
>> +
>>  static const VMStateDescription vmstate_spapr_xive_ive = {
>>      .name = TYPE_SPAPR_XIVE "/ive",
>>      .version_id = 1,
>> @@ -143,6 +167,7 @@ static void spapr_xive_class_init(ObjectClass *klass, void *data)
>>      dc->vmsd = &vmstate_spapr_xive;
>>  
>>      xfc->get_ive = spapr_xive_get_ive;
>> +    xfc->get_nvt = spapr_xive_get_nvt;
>>  }
>>  
>>  static const TypeInfo spapr_xive_info = {
>> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
>> index dccad0318834..5691bb9474e4 100644
>> --- a/hw/intc/xive.c
>> +++ b/hw/intc/xive.c
>> @@ -14,7 +14,278 @@
>>  #include "sysemu/cpus.h"
>>  #include "sysemu/dma.h"
>>  #include "monitor/monitor.h"
>> +#include "hw/ppc/xics.h" /* for ICP_PROP_CPU */
>>  #include "hw/ppc/xive.h"
>> +#include "hw/ppc/xive_regs.h"
>> +
>> +/*
>> + * XIVE Interrupt Presenter
>> + */
>> +
>> +static uint64_t xive_nvt_accept(XiveNVT *nvt)
>> +{
>> +    return 0;
>> +}
>> +
>> +static void xive_nvt_set_cppr(XiveNVT *nvt, uint8_t cppr)
>> +{
>> +    if (cppr > XIVE_PRIORITY_MAX) {
>> +        cppr = 0xff;
>> +    }
>> +
>> +    nvt->ring_os[TM_CPPR] = cppr;
> 
> Surely this needs to recheck if we should be interrupting the cpu?

yes. In patch 9, when we introduce the nvt notify routine.

>> +}
>> +
>> +/*
>> + * OS Thread Interrupt Management Area MMIO
>> + */
>> +static uint64_t xive_tm_read_special(XiveNVT *nvt, hwaddr offset,
>> +                                           unsigned size)
>> +{
>> +    uint64_t ret = -1;
>> +
>> +    if (offset == TM_SPC_ACK_OS_REG && size == 2) {
>> +        ret = xive_nvt_accept(nvt);
>> +    } else {
>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid TIMA read @%"
>> +                      HWADDR_PRIx" size %d\n", offset, size);
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>> +#define TM_RING(offset) ((offset) & 0xf0)
>> +
>> +static uint64_t xive_tm_os_read(void *opaque, hwaddr offset,
>> +                                      unsigned size)
>> +{
>> +    PowerPCCPU *cpu = POWERPC_CPU(current_cpu);
> 
> So, as I said on a previous version of this, we can actually correctly
> represent different mappings in different cpu spaces, by exploiting
> cpu->as and not just having them all point to &address_space_memory.

Yes, you did and I haven't studied the question yet. For the next version.

>> +    XiveNVT *nvt = XIVE_NVT(cpu->intc);
>> +    uint64_t ret = -1;
>> +    int i;
>> +
>> +    if (offset >= TM_SPC_ACK_EBB) {
>> +        return xive_tm_read_special(nvt, offset, size);
>> +    }
>> +
>> +    if (TM_RING(offset) != TM_QW1_OS) {
>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid access to non-OS ring @%"
>> +                      HWADDR_PRIx"\n", offset);
>> +        return ret;
> 
> Just return -1 would be clearer here;

ok.

> 
>> +    }
>> +
>> +    ret = 0;
>> +    for (i = 0; i < size; i++) {
>> +        ret |= (uint64_t) nvt->regs[offset + i] << (8 * (size - i - 1));
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>> +static bool xive_tm_is_readonly(uint8_t offset)
>> +{
>> +    return offset != TM_QW1_OS + TM_CPPR;
>> +}
>> +
>> +static void xive_tm_write_special(XiveNVT *nvt, hwaddr offset,
>> +                                        uint64_t value, unsigned size)
>> +{
>> +    /* TODO: support TM_SPC_SET_OS_PENDING */
>> +
>> +    /* TODO: support TM_SPC_ACK_OS_EL */
>> +}
>> +
>> +static void xive_tm_os_write(void *opaque, hwaddr offset,
>> +                                   uint64_t value, unsigned size)
>> +{
>> +    PowerPCCPU *cpu = POWERPC_CPU(current_cpu);
>> +    XiveNVT *nvt = XIVE_NVT(cpu->intc);
>> +    int i;
>> +
>> +    if (offset >= TM_SPC_ACK_EBB) {
>> +        xive_tm_write_special(nvt, offset, value, size);
>> +        return;
>> +    }
>> +
>> +    if (TM_RING(offset) != TM_QW1_OS) {
> 
> Why have this if you have separate OS and user regions as you appear
> to do below?

This is another problem we are trying to solve. 

The registers a CPU can access depends on the TIMA view it is using. 
The OS TIMA view only sees the OS ring registers. The HV view sees all. 

> Or to look at it another way, shouldn't it be possible to make the
> read/write accessors the same for the OS and user rings?

For some parts yes, but the special load/store addresses are different
for each view, the read-only register also. It seemed easier to duplicate.

I think the problem will become clearer (or worse) with pnv which uses 
the HV mode.

>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid access to non-OS ring @%"
>> +                      HWADDR_PRIx"\n", offset);
>> +        return;
>> +    }
>> +
>> +    switch (size) {
>> +    case 1:
>> +        if (offset == TM_QW1_OS + TM_CPPR) {
>> +            xive_nvt_set_cppr(nvt, value & 0xff);
>> +        }
>> +        break;
>> +    case 4:
>> +    case 8:
>> +        for (i = 0; i < size; i++) {
>> +            if (!xive_tm_is_readonly(offset + i)) {
>> +                nvt->regs[offset + i] = (value >> (8 * (size - i - 1))) & 0xff;
>> +            }
>> +        }
>> +        break;
>> +    default:
>> +        g_assert_not_reached();
>> +    }
>> +}
>> +
>> +const MemoryRegionOps xive_tm_os_ops = {
>> +    .read = xive_tm_os_read,
>> +    .write = xive_tm_os_write,
>> +    .endianness = DEVICE_BIG_ENDIAN,
>> +    .valid = {
>> +        .min_access_size = 1,
>> +        .max_access_size = 8,
>> +    },
>> +    .impl = {
>> +        .min_access_size = 1,
>> +        .max_access_size = 8,
>> +    },
>> +};
>> +
>> +/*
>> + * User Thread Interrupt Management Area MMIO
>> + */
>> +
>> +static uint64_t xive_tm_user_read(void *opaque, hwaddr offset,
>> +                                        unsigned size)
>> +{
>> +    qemu_log_mask(LOG_UNIMP, "XIVE: invalid access to User TIMA @%"
>> +                  HWADDR_PRIx"\n", offset);
>> +    return -1;
>> +}
>> +
>> +static void xive_tm_user_write(void *opaque, hwaddr offset,
>> +                                     uint64_t value, unsigned size)
>> +{
>> +    qemu_log_mask(LOG_UNIMP, "XIVE: invalid access to User TIMA @%"
>> +                  HWADDR_PRIx"\n", offset);
>> +}
>> +
>> +
>> +const MemoryRegionOps xive_tm_user_ops = {
>> +    .read = xive_tm_user_read,
>> +    .write = xive_tm_user_write,
>> +    .endianness = DEVICE_BIG_ENDIAN,
>> +    .valid = {
>> +        .min_access_size = 1,
>> +        .max_access_size = 8,
>> +    },
>> +    .impl = {
>> +        .min_access_size = 1,
>> +        .max_access_size = 8,
>> +    },
>> +};
>> +
>> +static char *xive_nvt_ring_print(uint8_t *ring)
>> +{
>> +    uint32_t w2 = be32_to_cpu(*((uint32_t *) &ring[TM_WORD2]));
>> +
>> +    return g_strdup_printf("%02x  %02x   %02x  %02x    %02x   "
>> +                   "%02x  %02x  %02x   %08x",
>> +                   ring[TM_NSR], ring[TM_CPPR], ring[TM_IPB], ring[TM_LSMFB],
>> +                   ring[TM_ACK_CNT], ring[TM_INC], ring[TM_AGE], ring[TM_PIPR],
>> +                   w2);
>> +}
>> +
>> +void xive_nvt_pic_print_info(XiveNVT *nvt, Monitor *mon)
>> +{
>> +    int cpu_index = nvt->cs ? nvt->cs->cpu_index : -1;
>> +    char *s;
>> +
>> +    monitor_printf(mon, "CPU[%04x]: QW    NSR CPPR IPB LSMFB ACK# INC AGE PIPR"
>> +                   " W2\n", cpu_index);
>> +
>> +    s = xive_nvt_ring_print(&nvt->regs[TM_QW1_OS]);
>> +    monitor_printf(mon, "CPU[%04x]: OS    %s\n", cpu_index, s);
>> +    g_free(s);
>> +    s = xive_nvt_ring_print(&nvt->regs[TM_QW0_USER]);
>> +    monitor_printf(mon, "CPU[%04x]: USER  %s\n", cpu_index, s);
>> +    g_free(s);
>> +}
>> +
>> +static void xive_nvt_reset(void *dev)
>> +{
>> +    XiveNVT *nvt = XIVE_NVT(dev);
>> +
>> +    memset(nvt->regs, 0, sizeof(nvt->regs));
>> +}
>> +
>> +static void xive_nvt_realize(DeviceState *dev, Error **errp)
>> +{
>> +    XiveNVT *nvt = XIVE_NVT(dev);
>> +    PowerPCCPU *cpu;
>> +    CPUPPCState *env;
>> +    Object *obj;
>> +    Error *err = NULL;
>> +
>> +    obj = object_property_get_link(OBJECT(dev), ICP_PROP_CPU, &err);
> 
> Please get rid of the remaining "ICP" naming in the xive code.

ok.  I will kill the define.

>> +    if (!obj) {
>> +        error_propagate(errp, err);
>> +        error_prepend(errp, "required link '" ICP_PROP_CPU "' not found: ");
>> +        return;
>> +    }
>> +
>> +    cpu = POWERPC_CPU(obj);
>> +    nvt->cs = CPU(obj);
>> +
>> +    env = &cpu->env;
>> +    switch (PPC_INPUT(env)) {
>> +    case PPC_FLAGS_INPUT_POWER7:
>> +        nvt->output = env->irq_inputs[POWER7_INPUT_INT];
>> +        break;
>> +
>> +    default:
>> +        error_setg(errp, "XIVE interrupt controller does not support "
>> +                   "this CPU bus model");
>> +        return;
>> +    }
>> +
>> +    qemu_register_reset(xive_nvt_reset, dev);
> 
> If this is a sysbus device, which I think it is, 

It is not. The TIMA MMIO region is in the sPAPRXive model but that might 
change if we use cpu->as. I agree it would look better to have a memory
region per cpu.

> you shouldn't need to
> explicitly register a reset handler.  Instead you can set a device
> reset handler which will be called with the reset.
> 
>> +}
>> +
>> +static void xive_nvt_unrealize(DeviceState *dev, Error **errp)
>> +{
>> +    qemu_unregister_reset(xive_nvt_reset, dev);
>> +}
>> +
>> +static void xive_nvt_init(Object *obj)
>> +{
>> +    XiveNVT *nvt = XIVE_NVT(obj);
>> +
>> +    nvt->ring_os = &nvt->regs[TM_QW1_OS];
> 
> The ring_os field is basically pointless, being just an offset into a
> structure you already have.  A macro or inline would be a better idea.

ok. I liked the idea but I agree it's overkill to have an init routine
just for this. I will find something.

>> +}
>> +
>> +static const VMStateDescription vmstate_xive_nvt = {
>> +    .name = TYPE_XIVE_NVT,
>> +    .version_id = 1,
>> +    .minimum_version_id = 1,
>> +    .fields = (VMStateField[]) {
>> +        VMSTATE_BUFFER(regs, XiveNVT),
>> +        VMSTATE_END_OF_LIST()
>> +    },
>> +};
>> +
>> +static void xive_nvt_class_init(ObjectClass *klass, void *data)
>> +{
>> +    DeviceClass *dc = DEVICE_CLASS(klass);
>> +
>> +    dc->realize = xive_nvt_realize;
>> +    dc->unrealize = xive_nvt_unrealize;
>> +    dc->desc = "XIVE Interrupt Presenter";
>> +    dc->vmsd = &vmstate_xive_nvt;
>> +}
>> +
>> +static const TypeInfo xive_nvt_info = {
>> +    .name          = TYPE_XIVE_NVT,
>> +    .parent        = TYPE_DEVICE,
>> +    .instance_size = sizeof(XiveNVT),
>> +    .instance_init = xive_nvt_init,
>> +    .class_init    = xive_nvt_class_init,
>> +};
>>  
>>  /*
>>   * XIVE Fabric
>> @@ -27,6 +298,13 @@ XiveIVE *xive_fabric_get_ive(XiveFabric *xf, uint32_t lisn)
>>      return xfc->get_ive(xf, lisn);
>>  }
>>  
>> +XiveNVT *xive_fabric_get_nvt(XiveFabric *xf, uint32_t server)
>> +{
>> +    XiveFabricClass *xfc = XIVE_FABRIC_GET_CLASS(xf);
>> +
>> +    return xfc->get_nvt(xf, server);
>> +}
>> +
>>  static void xive_fabric_route(XiveFabric *xf, int lisn)
>>  {
>>  
>> @@ -418,6 +696,7 @@ static void xive_register_types(void)
>>  {
>>      type_register_static(&xive_source_info);
>>      type_register_static(&xive_fabric_info);
>> +    type_register_static(&xive_nvt_info);
>>  }
>>  
>>  type_init(xive_register_types)
>> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
>> index 4538c622b60a..25d78eec884d 100644
>> --- a/include/hw/ppc/spapr_xive.h
>> +++ b/include/hw/ppc/spapr_xive.h
>> @@ -25,6 +25,11 @@ typedef struct sPAPRXive {
>>      /* Routing table */
>>      XiveIVE      *ivt;
>>      uint32_t     nr_irqs;
>> +
>> +    /* TIMA memory regions */
>> +    hwaddr       tm_base;
>> +    MemoryRegion tm_mmio_user;
>> +    MemoryRegion tm_mmio_os;
>>  } sPAPRXive;
>>  
>>  bool spapr_xive_irq_enable(sPAPRXive *xive, uint32_t lisn, bool lsi);
>> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
>> index 57295715a4a5..1a2da610d91c 100644
>> --- a/include/hw/ppc/xive.h
>> +++ b/include/hw/ppc/xive.h
>> @@ -20,6 +20,7 @@ typedef struct XiveFabric XiveFabric;
>>   */
>>  
>>  #define XIVE_VC_BASE   0x0006010000000000ull
>> +#define XIVE_TM_BASE   0x0006030203180000ull
>>  
>>  /*
>>   * XIVE Interrupt Source
>> @@ -155,6 +156,34 @@ static inline void xive_source_irq_set(XiveSource *xsrc, uint32_t srcno,
>>  }
>>  
>>  /*
>> + * XIVE Interrupt Presenter
>> + */
>> +
>> +#define TYPE_XIVE_NVT "xive-nvt"
>> +#define XIVE_NVT(obj) OBJECT_CHECK(XiveNVT, (obj), TYPE_XIVE_NVT)
>> +
>> +#define TM_RING_COUNT           4
>> +#define TM_RING_SIZE            0x10
>> +
>> +typedef struct XiveNVT {
>> +    DeviceState parent_obj;
>> +
>> +    CPUState  *cs;
>> +    qemu_irq  output;
>> +
>> +    /* Thread interrupt Management (TM) registers */
>> +    uint8_t   regs[TM_RING_COUNT * TM_RING_SIZE];
>> +
>> +    /* Shortcuts to rings */
>> +    uint8_t   *ring_os;
>> +} XiveNVT;
>> +
>> +extern const MemoryRegionOps xive_tm_user_ops;
>> +extern const MemoryRegionOps xive_tm_os_ops;
>> +
>> +void xive_nvt_pic_print_info(XiveNVT *nvt, Monitor *mon);
>> +
>> +/*
>>   * XIVE Fabric
>>   */
>>  
>> @@ -175,8 +204,10 @@ typedef struct XiveFabricClass {
>>      void (*notify)(XiveFabric *xf, uint32_t lisn);
>>  
>>      XiveIVE *(*get_ive)(XiveFabric *xf, uint32_t lisn);
>> +    XiveNVT *(*get_nvt)(XiveFabric *xf, uint32_t server);
>>  } XiveFabricClass;
>>  
>>  XiveIVE *xive_fabric_get_ive(XiveFabric *xf, uint32_t lisn);
>> +XiveNVT *xive_fabric_get_nvt(XiveFabric *xf, uint32_t server);
>>  
>>  #endif /* PPC_XIVE_H */
>> diff --git a/include/hw/ppc/xive_regs.h b/include/hw/ppc/xive_regs.h
>> index 5903f29eb789..f2e2a1ac8f6e 100644
>> --- a/include/hw/ppc/xive_regs.h
>> +++ b/include/hw/ppc/xive_regs.h
>> @@ -10,6 +10,88 @@
>>  #ifndef _PPC_XIVE_REGS_H
>>  #define _PPC_XIVE_REGS_H
>>  
>> +#define TM_SHIFT                16
>> +
>> +/* TM register offsets */
>> +#define TM_QW0_USER             0x000 /* All rings */
>> +#define TM_QW1_OS               0x010 /* Ring 0..2 */
>> +#define TM_QW2_HV_POOL          0x020 /* Ring 0..1 */
>> +#define TM_QW3_HV_PHYS          0x030 /* Ring 0..1 */
>> +
>> +/* Byte offsets inside a QW             QW0 QW1 QW2 QW3 */
>> +#define TM_NSR                  0x0  /*  +   +   -   +  */
>> +#define TM_CPPR                 0x1  /*  -   +   -   +  */
>> +#define TM_IPB                  0x2  /*  -   +   +   +  */
>> +#define TM_LSMFB                0x3  /*  -   +   +   +  */
>> +#define TM_ACK_CNT              0x4  /*  -   +   -   -  */
>> +#define TM_INC                  0x5  /*  -   +   -   +  */
>> +#define TM_AGE                  0x6  /*  -   +   -   +  */
>> +#define TM_PIPR                 0x7  /*  -   +   -   +  */
>> +
>> +#define TM_WORD0                0x0
>> +#define TM_WORD1                0x4
>> +
>> +/*
>> + * QW word 2 contains the valid bit at the top and other fields
>> + * depending on the QW.
>> + */
>> +#define TM_WORD2                0x8
>> +#define   TM_QW0W2_VU           PPC_BIT32(0)
>> +#define   TM_QW0W2_LOGIC_SERV   PPC_BITMASK32(1, 31) /* XX 2,31 ? */
>> +#define   TM_QW1W2_VO           PPC_BIT32(0)
>> +#define   TM_QW1W2_OS_CAM       PPC_BITMASK32(8, 31)
>> +#define   TM_QW2W2_VP           PPC_BIT32(0)
>> +#define   TM_QW2W2_POOL_CAM     PPC_BITMASK32(8, 31)
>> +#define   TM_QW3W2_VT           PPC_BIT32(0)
>> +#define   TM_QW3W2_LP           PPC_BIT32(6)
>> +#define   TM_QW3W2_LE           PPC_BIT32(7)
>> +#define   TM_QW3W2_T            PPC_BIT32(31)
>> +
>> +/*
>> + * In addition to normal loads to "peek" and writes (only when invalid)
>> + * using 4 and 8 bytes accesses, the above registers support these
>> + * "special" byte operations:
>> + *
>> + *   - Byte load from QW0[NSR] - User level NSR (EBB)
>> + *   - Byte store to QW0[NSR] - User level NSR (EBB)
>> + *   - Byte load/store to QW1[CPPR] and QW3[CPPR] - CPPR access
>> + *   - Byte load from QW3[TM_WORD2] - Read VT||00000||LP||LE on thrd 0
>> + *                                    otherwise VT||0000000
>> + *   - Byte store to QW3[TM_WORD2] - Set VT bit (and LP/LE if present)
>> + *
>> + * Then we have all these "special" CI ops at these offset that trigger
>> + * all sorts of side effects:
>> + */
>> +#define TM_SPC_ACK_EBB          0x800   /* Load8 ack EBB to reg*/
>> +#define TM_SPC_ACK_OS_REG       0x810   /* Load16 ack OS irq to reg */
>> +#define TM_SPC_PUSH_USR_CTX     0x808   /* Store32 Push/Validate user context */
>> +#define TM_SPC_PULL_USR_CTX     0x808   /* Load32 Pull/Invalidate user
>> +                                         * context */
>> +#define TM_SPC_SET_OS_PENDING   0x812   /* Store8 Set OS irq pending bit */
>> +#define TM_SPC_PULL_OS_CTX      0x818   /* Load32/Load64 Pull/Invalidate OS
>> +                                         * context to reg */
>> +#define TM_SPC_PULL_POOL_CTX    0x828   /* Load32/Load64 Pull/Invalidate Pool
>> +                                         * context to reg*/
>> +#define TM_SPC_ACK_HV_REG       0x830   /* Load16 ack HV irq to reg */
>> +#define TM_SPC_PULL_USR_CTX_OL  0xc08   /* Store8 Pull/Inval usr ctx to odd
>> +                                         * line */
>> +#define TM_SPC_ACK_OS_EL        0xc10   /* Store8 ack OS irq to even line */
>> +#define TM_SPC_ACK_HV_POOL_EL   0xc20   /* Store8 ack HV evt pool to even
>> +                                         * line */
>> +#define TM_SPC_ACK_HV_EL        0xc30   /* Store8 ack HV irq to even line */
>> +/* XXX more... */
>> +
>> +/* NSR fields for the various QW ack types */
>> +#define TM_QW0_NSR_EB           PPC_BIT8(0)
>> +#define TM_QW1_NSR_EO           PPC_BIT8(0)
>> +#define TM_QW3_NSR_HE           PPC_BITMASK8(0, 1)
>> +#define  TM_QW3_NSR_HE_NONE     0
>> +#define  TM_QW3_NSR_HE_POOL     1
>> +#define  TM_QW3_NSR_HE_PHYS     2
>> +#define  TM_QW3_NSR_HE_LSI      3
>> +#define TM_QW3_NSR_I            PPC_BIT8(2)
>> +#define TM_QW3_NSR_GRP_LVL      PPC_BIT8(3, 7)
>> +
>>  /* IVE/EAS
>>   *
>>   * One per interrupt source. Targets that interrupt to a given EQ
>> @@ -30,4 +112,6 @@ typedef struct XiveIVE {
>>  #define IVE_EQ_DATA     PPC_BITMASK(33, 63)      /* Data written to the EQ */
>>  } XiveIVE;
>>  
>> +#define XIVE_PRIORITY_MAX  7
>> +
>>  #endif /* _INTC_XIVE_INTERNAL_H */
>
Cédric Le Goater April 26, 2018, 5:15 p.m. UTC | #3
On 04/26/2018 11:27 AM, Cédric Le Goater wrote:
> On 04/26/2018 09:11 AM, David Gibson wrote:
>> On Thu, Apr 19, 2018 at 02:43:02PM +0200, Cédric Le Goater wrote:
>>> The XIVE presenter engine uses a set of registers to handle priority
>>> management and interrupt acknowledgment among other things. The most
>>> important ones being :
>>>
>>>   - Interrupt Priority Register (PIPR)
>>>   - Interrupt Pending Buffer (IPB)
>>>   - Current Processor Priority (CPPR)
>>>   - Notification Source Register (NSR)
>>>
>>> There is one set of registers per level of privilege, four in all :
>>> HW, HV pool, OS and User. These are called rings. All registers are
>>> accessible through a specific MMIO region called the Thread Interrupt
>>> Management Areas (TIMA) but, depending on the privilege level of the
>>> CPU, the view of the TIMA is filtered. The sPAPR machine runs at the
>>> OS privilege and therefore can only accesses the OS and the User
>>> rings. The others are for hypervisor levels.
>>>
>>> The CPU interrupt state is modeled with a XiveNVT object which stores
>>> the values of the different registers. The different TIMA views are
>>> mapped at the same address for each CPU and 'current_cpu' is used to
>>> retrieve the XiveNVT holding the ring registers.
>>>
>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>> ---
>>>
>>>  Changes since v2 :
>>>
>>>  - introduced the XiveFabric interface
>>>
>>>  hw/intc/spapr_xive.c        |  25 ++++
>>>  hw/intc/xive.c              | 279 ++++++++++++++++++++++++++++++++++++++++++++
>>>  include/hw/ppc/spapr_xive.h |   5 +
>>>  include/hw/ppc/xive.h       |  31 +++++
>>>  include/hw/ppc/xive_regs.h  |  84 +++++++++++++
>>>  5 files changed, 424 insertions(+)
>>>
>>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
>>> index 90cde8a4082d..f07832bf0a00 100644
>>> --- a/hw/intc/spapr_xive.c
>>> +++ b/hw/intc/spapr_xive.c
>>> @@ -13,6 +13,7 @@
>>>  #include "target/ppc/cpu.h"
>>>  #include "sysemu/cpus.h"
>>>  #include "monitor/monitor.h"
>>> +#include "hw/ppc/spapr.h"
>>>  #include "hw/ppc/spapr_xive.h"
>>>  #include "hw/ppc/xive.h"
>>>  #include "hw/ppc/xive_regs.h"
>>> @@ -95,6 +96,22 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
>>>  
>>>      /* Allocate the Interrupt Virtualization Table */
>>>      xive->ivt = g_new0(XiveIVE, xive->nr_irqs);
>>> +
>>> +    /* The Thread Interrupt Management Area has the same address for
>>> +     * each chip. On sPAPR, we only need to expose the User and OS
>>> +     * level views of the TIMA.
>>> +     */
>>> +    xive->tm_base = XIVE_TM_BASE;
>>
>> The constant should probably have PAPR in the name somewhere, since
>> it's just for PAPR machines (same for the ESB mappings, actually).
> 
> ok. 
> 
> I have also made 'tm_base' a property, like 'vc_base' for ESBs, in 
> case we want to change the value when the guest is instantiated. 
> I doubt it but this is an address in the global address space, so 
> letting the machine have control is better I think. 
>  
>>
>>> +
>>> +    memory_region_init_io(&xive->tm_mmio_user, OBJECT(xive),
>>> +                          &xive_tm_user_ops, xive, "xive.tima.user",
>>> +                          1ull << TM_SHIFT);
>>> +    sysbus_init_mmio(SYS_BUS_DEVICE(dev), &xive->tm_mmio_user);
>>> +
>>> +    memory_region_init_io(&xive->tm_mmio_os, OBJECT(xive),
>>> +                          &xive_tm_os_ops, xive, "xive.tima.os",
>>> +                          1ull << TM_SHIFT);
>>> +    sysbus_init_mmio(SYS_BUS_DEVICE(dev), &xive->tm_mmio_os);
>>>  }
>>>  
>>>  static XiveIVE *spapr_xive_get_ive(XiveFabric *xf, uint32_t lisn)
>>> @@ -104,6 +121,13 @@ static XiveIVE *spapr_xive_get_ive(XiveFabric *xf, uint32_t lisn)
>>>      return lisn < xive->nr_irqs ? &xive->ivt[lisn] : NULL;
>>>  }
>>>  
>>> +static XiveNVT *spapr_xive_get_nvt(XiveFabric *xf, uint32_t server)
>>> +{
>>> +    PowerPCCPU *cpu = spapr_find_cpu(server);
>>> +
>>> +    return cpu ? XIVE_NVT(cpu->intc) : NULL;
>>> +}
>>
>> So this is a bit of a tangent, but I've been thinking of implementing
>> a scheme where there's an opaque pointer in the cpu structure for the
>> use of the machine.  I'm planning for that to replace the intc pointer
>> (which isn't really used directly by the cpu). That would allow us to
>> have spapr put a structure there and have both xics and xive pointers
>> which could be useful later on.
> 
> ok. That should simplify the patchset at the end, in which we need to 
> switch the 'intc' pointer. 
> 
>> I think we'd need something similar to correctly handle migration of
>> the VPA state, which is currently horribly broken.
>>
>>> +
>>>  static const VMStateDescription vmstate_spapr_xive_ive = {
>>>      .name = TYPE_SPAPR_XIVE "/ive",
>>>      .version_id = 1,
>>> @@ -143,6 +167,7 @@ static void spapr_xive_class_init(ObjectClass *klass, void *data)
>>>      dc->vmsd = &vmstate_spapr_xive;
>>>  
>>>      xfc->get_ive = spapr_xive_get_ive;
>>> +    xfc->get_nvt = spapr_xive_get_nvt;
>>>  }
>>>  
>>>  static const TypeInfo spapr_xive_info = {
>>> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
>>> index dccad0318834..5691bb9474e4 100644
>>> --- a/hw/intc/xive.c
>>> +++ b/hw/intc/xive.c
>>> @@ -14,7 +14,278 @@
>>>  #include "sysemu/cpus.h"
>>>  #include "sysemu/dma.h"
>>>  #include "monitor/monitor.h"
>>> +#include "hw/ppc/xics.h" /* for ICP_PROP_CPU */
>>>  #include "hw/ppc/xive.h"
>>> +#include "hw/ppc/xive_regs.h"
>>> +
>>> +/*
>>> + * XIVE Interrupt Presenter
>>> + */
>>> +
>>> +static uint64_t xive_nvt_accept(XiveNVT *nvt)
>>> +{
>>> +    return 0;
>>> +}
>>> +
>>> +static void xive_nvt_set_cppr(XiveNVT *nvt, uint8_t cppr)
>>> +{
>>> +    if (cppr > XIVE_PRIORITY_MAX) {
>>> +        cppr = 0xff;
>>> +    }
>>> +
>>> +    nvt->ring_os[TM_CPPR] = cppr;
>>
>> Surely this needs to recheck if we should be interrupting the cpu?
> 
> yes. In patch 9, when we introduce the nvt notify routine.
> 
>>> +}
>>> +
>>> +/*
>>> + * OS Thread Interrupt Management Area MMIO
>>> + */
>>> +static uint64_t xive_tm_read_special(XiveNVT *nvt, hwaddr offset,
>>> +                                           unsigned size)
>>> +{
>>> +    uint64_t ret = -1;
>>> +
>>> +    if (offset == TM_SPC_ACK_OS_REG && size == 2) {
>>> +        ret = xive_nvt_accept(nvt);
>>> +    } else {
>>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid TIMA read @%"
>>> +                      HWADDR_PRIx" size %d\n", offset, size);
>>> +    }
>>> +
>>> +    return ret;
>>> +}
>>> +
>>> +#define TM_RING(offset) ((offset) & 0xf0)
>>> +
>>> +static uint64_t xive_tm_os_read(void *opaque, hwaddr offset,
>>> +                                      unsigned size)
>>> +{
>>> +    PowerPCCPU *cpu = POWERPC_CPU(current_cpu);
>>
>> So, as I said on a previous version of this, we can actually correctly
>> represent different mappings in different cpu spaces, by exploiting
>> cpu->as and not just having them all point to &address_space_memory.
> 
> Yes, you did and I haven't studied the question yet. For the next version.
> 
>>> +    XiveNVT *nvt = XIVE_NVT(cpu->intc);
>>> +    uint64_t ret = -1;
>>> +    int i;
>>> +
>>> +    if (offset >= TM_SPC_ACK_EBB) {
>>> +        return xive_tm_read_special(nvt, offset, size);
>>> +    }
>>> +
>>> +    if (TM_RING(offset) != TM_QW1_OS) {
>>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid access to non-OS ring @%"
>>> +                      HWADDR_PRIx"\n", offset);
>>> +        return ret;
>>
>> Just return -1 would be clearer here;
> 
> ok.
> 
>>
>>> +    }
>>> +
>>> +    ret = 0;
>>> +    for (i = 0; i < size; i++) {
>>> +        ret |= (uint64_t) nvt->regs[offset + i] << (8 * (size - i - 1));
>>> +    }
>>> +
>>> +    return ret;
>>> +}
>>> +
>>> +static bool xive_tm_is_readonly(uint8_t offset)
>>> +{
>>> +    return offset != TM_QW1_OS + TM_CPPR;
>>> +}
>>> +
>>> +static void xive_tm_write_special(XiveNVT *nvt, hwaddr offset,
>>> +                                        uint64_t value, unsigned size)
>>> +{
>>> +    /* TODO: support TM_SPC_SET_OS_PENDING */
>>> +
>>> +    /* TODO: support TM_SPC_ACK_OS_EL */
>>> +}
>>> +
>>> +static void xive_tm_os_write(void *opaque, hwaddr offset,
>>> +                                   uint64_t value, unsigned size)
>>> +{
>>> +    PowerPCCPU *cpu = POWERPC_CPU(current_cpu);
>>> +    XiveNVT *nvt = XIVE_NVT(cpu->intc);
>>> +    int i;
>>> +
>>> +    if (offset >= TM_SPC_ACK_EBB) {
>>> +        xive_tm_write_special(nvt, offset, value, size);
>>> +        return;
>>> +    }
>>> +
>>> +    if (TM_RING(offset) != TM_QW1_OS) {
>>
>> Why have this if you have separate OS and user regions as you appear
>> to do below?
> 
> This is another problem we are trying to solve. 
> 
> The registers a CPU can access depends on the TIMA view it is using. 
> The OS TIMA view only sees the OS ring registers. The HV view sees all. 

So, I gave a deeper look at the specs and I understood a little more 
details of the concepts behind. You need to do frequent round-trips 
to this document ...  

These registers are accessible through four aligned pages, each exposing 
a different view of the registers. First page (page address ending 
in 0b00) gives access to the entire context and is reserved for the 
ring 0 security monitor. The second (page address ending in 0b01) 
is for the hypervisor, ring 1. The third (page address ending in 0b10) 
is for the operating system, ring 2. The fourth (page address ending 
in 0b11) is for user level, ring 3.

The sPAPR machine runs at the OS privilege and therefore can only 
accesses the OS and the User rings, 2 and 3. The others are for
hypervisor levels.

I will try to come with a better implementation of the model and
make sure the ring numbers are respected. I am not sure we should 
have only one memory region or four distinct ones with their
own ops. There are some differences in the load/store of each view.

C.


>> Or to look at it another way, shouldn't it be possible to make the
>> read/write accessors the same for the OS and user rings?
> 
> For some parts yes, but the special load/store addresses are different
> for each view, the read-only register also. It seemed easier to duplicate.
> 
> I think the problem will become clearer (or worse) with pnv which uses 
> the HV mode.
>
> 
>>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid access to non-OS ring @%"
>>> +                      HWADDR_PRIx"\n", offset);
>>> +        return;
>>> +    }
>>> +
>>> +    switch (size) {
>>> +    case 1:
>>> +        if (offset == TM_QW1_OS + TM_CPPR) {
>>> +            xive_nvt_set_cppr(nvt, value & 0xff);
>>> +        }
>>> +        break;
>>> +    case 4:
>>> +    case 8:
>>> +        for (i = 0; i < size; i++) {
>>> +            if (!xive_tm_is_readonly(offset + i)) {
>>> +                nvt->regs[offset + i] = (value >> (8 * (size - i - 1))) & 0xff;
>>> +            }
>>> +        }
>>> +        break;
>>> +    default:
>>> +        g_assert_not_reached();
>>> +    }
>>> +}
>>> +
>>> +const MemoryRegionOps xive_tm_os_ops = {
>>> +    .read = xive_tm_os_read,
>>> +    .write = xive_tm_os_write,
>>> +    .endianness = DEVICE_BIG_ENDIAN,
>>> +    .valid = {
>>> +        .min_access_size = 1,
>>> +        .max_access_size = 8,
>>> +    },
>>> +    .impl = {
>>> +        .min_access_size = 1,
>>> +        .max_access_size = 8,
>>> +    },
>>> +};
>>> +
>>> +/*
>>> + * User Thread Interrupt Management Area MMIO
>>> + */
>>> +
>>> +static uint64_t xive_tm_user_read(void *opaque, hwaddr offset,
>>> +                                        unsigned size)
>>> +{
>>> +    qemu_log_mask(LOG_UNIMP, "XIVE: invalid access to User TIMA @%"
>>> +                  HWADDR_PRIx"\n", offset);
>>> +    return -1;
>>> +}
>>> +
>>> +static void xive_tm_user_write(void *opaque, hwaddr offset,
>>> +                                     uint64_t value, unsigned size)
>>> +{
>>> +    qemu_log_mask(LOG_UNIMP, "XIVE: invalid access to User TIMA @%"
>>> +                  HWADDR_PRIx"\n", offset);
>>> +}
>>> +
>>> +
>>> +const MemoryRegionOps xive_tm_user_ops = {
>>> +    .read = xive_tm_user_read,
>>> +    .write = xive_tm_user_write,
>>> +    .endianness = DEVICE_BIG_ENDIAN,
>>> +    .valid = {
>>> +        .min_access_size = 1,
>>> +        .max_access_size = 8,
>>> +    },
>>> +    .impl = {
>>> +        .min_access_size = 1,
>>> +        .max_access_size = 8,
>>> +    },
>>> +};
>>> +
>>> +static char *xive_nvt_ring_print(uint8_t *ring)
>>> +{
>>> +    uint32_t w2 = be32_to_cpu(*((uint32_t *) &ring[TM_WORD2]));
>>> +
>>> +    return g_strdup_printf("%02x  %02x   %02x  %02x    %02x   "
>>> +                   "%02x  %02x  %02x   %08x",
>>> +                   ring[TM_NSR], ring[TM_CPPR], ring[TM_IPB], ring[TM_LSMFB],
>>> +                   ring[TM_ACK_CNT], ring[TM_INC], ring[TM_AGE], ring[TM_PIPR],
>>> +                   w2);
>>> +}
>>> +
>>> +void xive_nvt_pic_print_info(XiveNVT *nvt, Monitor *mon)
>>> +{
>>> +    int cpu_index = nvt->cs ? nvt->cs->cpu_index : -1;
>>> +    char *s;
>>> +
>>> +    monitor_printf(mon, "CPU[%04x]: QW    NSR CPPR IPB LSMFB ACK# INC AGE PIPR"
>>> +                   " W2\n", cpu_index);
>>> +
>>> +    s = xive_nvt_ring_print(&nvt->regs[TM_QW1_OS]);
>>> +    monitor_printf(mon, "CPU[%04x]: OS    %s\n", cpu_index, s);
>>> +    g_free(s);
>>> +    s = xive_nvt_ring_print(&nvt->regs[TM_QW0_USER]);
>>> +    monitor_printf(mon, "CPU[%04x]: USER  %s\n", cpu_index, s);
>>> +    g_free(s);
>>> +}
>>> +
>>> +static void xive_nvt_reset(void *dev)
>>> +{
>>> +    XiveNVT *nvt = XIVE_NVT(dev);
>>> +
>>> +    memset(nvt->regs, 0, sizeof(nvt->regs));
>>> +}
>>> +
>>> +static void xive_nvt_realize(DeviceState *dev, Error **errp)
>>> +{
>>> +    XiveNVT *nvt = XIVE_NVT(dev);
>>> +    PowerPCCPU *cpu;
>>> +    CPUPPCState *env;
>>> +    Object *obj;
>>> +    Error *err = NULL;
>>> +
>>> +    obj = object_property_get_link(OBJECT(dev), ICP_PROP_CPU, &err);
>>
>> Please get rid of the remaining "ICP" naming in the xive code.
> 
> ok.  I will kill the define.
> 
>>> +    if (!obj) {
>>> +        error_propagate(errp, err);
>>> +        error_prepend(errp, "required link '" ICP_PROP_CPU "' not found: ");
>>> +        return;
>>> +    }
>>> +
>>> +    cpu = POWERPC_CPU(obj);
>>> +    nvt->cs = CPU(obj);
>>> +
>>> +    env = &cpu->env;
>>> +    switch (PPC_INPUT(env)) {
>>> +    case PPC_FLAGS_INPUT_POWER7:
>>> +        nvt->output = env->irq_inputs[POWER7_INPUT_INT];
>>> +        break;
>>> +
>>> +    default:
>>> +        error_setg(errp, "XIVE interrupt controller does not support "
>>> +                   "this CPU bus model");
>>> +        return;
>>> +    }
>>> +
>>> +    qemu_register_reset(xive_nvt_reset, dev);
>>
>> If this is a sysbus device, which I think it is, 
> 
> It is not. The TIMA MMIO region is in the sPAPRXive model but that might 
> change if we use cpu->as. I agree it would look better to have a memory
> region per cpu.
> 
>> you shouldn't need to
>> explicitly register a reset handler.  Instead you can set a device
>> reset handler which will be called with the reset.
>>
>>> +}
>>> +
>>> +static void xive_nvt_unrealize(DeviceState *dev, Error **errp)
>>> +{
>>> +    qemu_unregister_reset(xive_nvt_reset, dev);
>>> +}
>>> +
>>> +static void xive_nvt_init(Object *obj)
>>> +{
>>> +    XiveNVT *nvt = XIVE_NVT(obj);
>>> +
>>> +    nvt->ring_os = &nvt->regs[TM_QW1_OS];
>>
>> The ring_os field is basically pointless, being just an offset into a
>> structure you already have.  A macro or inline would be a better idea.
> 
> ok. I liked the idea but I agree it's overkill to have an init routine
> just for this. I will find something.
> 
>>> +}
>>> +
>>> +static const VMStateDescription vmstate_xive_nvt = {
>>> +    .name = TYPE_XIVE_NVT,
>>> +    .version_id = 1,
>>> +    .minimum_version_id = 1,
>>> +    .fields = (VMStateField[]) {
>>> +        VMSTATE_BUFFER(regs, XiveNVT),
>>> +        VMSTATE_END_OF_LIST()
>>> +    },
>>> +};
>>> +
>>> +static void xive_nvt_class_init(ObjectClass *klass, void *data)
>>> +{
>>> +    DeviceClass *dc = DEVICE_CLASS(klass);
>>> +
>>> +    dc->realize = xive_nvt_realize;
>>> +    dc->unrealize = xive_nvt_unrealize;
>>> +    dc->desc = "XIVE Interrupt Presenter";
>>> +    dc->vmsd = &vmstate_xive_nvt;
>>> +}
>>> +
>>> +static const TypeInfo xive_nvt_info = {
>>> +    .name          = TYPE_XIVE_NVT,
>>> +    .parent        = TYPE_DEVICE,
>>> +    .instance_size = sizeof(XiveNVT),
>>> +    .instance_init = xive_nvt_init,
>>> +    .class_init    = xive_nvt_class_init,
>>> +};
>>>  
>>>  /*
>>>   * XIVE Fabric
>>> @@ -27,6 +298,13 @@ XiveIVE *xive_fabric_get_ive(XiveFabric *xf, uint32_t lisn)
>>>      return xfc->get_ive(xf, lisn);
>>>  }
>>>  
>>> +XiveNVT *xive_fabric_get_nvt(XiveFabric *xf, uint32_t server)
>>> +{
>>> +    XiveFabricClass *xfc = XIVE_FABRIC_GET_CLASS(xf);
>>> +
>>> +    return xfc->get_nvt(xf, server);
>>> +}
>>> +
>>>  static void xive_fabric_route(XiveFabric *xf, int lisn)
>>>  {
>>>  
>>> @@ -418,6 +696,7 @@ static void xive_register_types(void)
>>>  {
>>>      type_register_static(&xive_source_info);
>>>      type_register_static(&xive_fabric_info);
>>> +    type_register_static(&xive_nvt_info);
>>>  }
>>>  
>>>  type_init(xive_register_types)
>>> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
>>> index 4538c622b60a..25d78eec884d 100644
>>> --- a/include/hw/ppc/spapr_xive.h
>>> +++ b/include/hw/ppc/spapr_xive.h
>>> @@ -25,6 +25,11 @@ typedef struct sPAPRXive {
>>>      /* Routing table */
>>>      XiveIVE      *ivt;
>>>      uint32_t     nr_irqs;
>>> +
>>> +    /* TIMA memory regions */
>>> +    hwaddr       tm_base;
>>> +    MemoryRegion tm_mmio_user;
>>> +    MemoryRegion tm_mmio_os;
>>>  } sPAPRXive;
>>>  
>>>  bool spapr_xive_irq_enable(sPAPRXive *xive, uint32_t lisn, bool lsi);
>>> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
>>> index 57295715a4a5..1a2da610d91c 100644
>>> --- a/include/hw/ppc/xive.h
>>> +++ b/include/hw/ppc/xive.h
>>> @@ -20,6 +20,7 @@ typedef struct XiveFabric XiveFabric;
>>>   */
>>>  
>>>  #define XIVE_VC_BASE   0x0006010000000000ull
>>> +#define XIVE_TM_BASE   0x0006030203180000ull
>>>  
>>>  /*
>>>   * XIVE Interrupt Source
>>> @@ -155,6 +156,34 @@ static inline void xive_source_irq_set(XiveSource *xsrc, uint32_t srcno,
>>>  }
>>>  
>>>  /*
>>> + * XIVE Interrupt Presenter
>>> + */
>>> +
>>> +#define TYPE_XIVE_NVT "xive-nvt"
>>> +#define XIVE_NVT(obj) OBJECT_CHECK(XiveNVT, (obj), TYPE_XIVE_NVT)
>>> +
>>> +#define TM_RING_COUNT           4
>>> +#define TM_RING_SIZE            0x10
>>> +
>>> +typedef struct XiveNVT {
>>> +    DeviceState parent_obj;
>>> +
>>> +    CPUState  *cs;
>>> +    qemu_irq  output;
>>> +
>>> +    /* Thread interrupt Management (TM) registers */
>>> +    uint8_t   regs[TM_RING_COUNT * TM_RING_SIZE];
>>> +
>>> +    /* Shortcuts to rings */
>>> +    uint8_t   *ring_os;
>>> +} XiveNVT;
>>> +
>>> +extern const MemoryRegionOps xive_tm_user_ops;
>>> +extern const MemoryRegionOps xive_tm_os_ops;
>>> +
>>> +void xive_nvt_pic_print_info(XiveNVT *nvt, Monitor *mon);
>>> +
>>> +/*
>>>   * XIVE Fabric
>>>   */
>>>  
>>> @@ -175,8 +204,10 @@ typedef struct XiveFabricClass {
>>>      void (*notify)(XiveFabric *xf, uint32_t lisn);
>>>  
>>>      XiveIVE *(*get_ive)(XiveFabric *xf, uint32_t lisn);
>>> +    XiveNVT *(*get_nvt)(XiveFabric *xf, uint32_t server);
>>>  } XiveFabricClass;
>>>  
>>>  XiveIVE *xive_fabric_get_ive(XiveFabric *xf, uint32_t lisn);
>>> +XiveNVT *xive_fabric_get_nvt(XiveFabric *xf, uint32_t server);
>>>  
>>>  #endif /* PPC_XIVE_H */
>>> diff --git a/include/hw/ppc/xive_regs.h b/include/hw/ppc/xive_regs.h
>>> index 5903f29eb789..f2e2a1ac8f6e 100644
>>> --- a/include/hw/ppc/xive_regs.h
>>> +++ b/include/hw/ppc/xive_regs.h
>>> @@ -10,6 +10,88 @@
>>>  #ifndef _PPC_XIVE_REGS_H
>>>  #define _PPC_XIVE_REGS_H
>>>  
>>> +#define TM_SHIFT                16
>>> +
>>> +/* TM register offsets */
>>> +#define TM_QW0_USER             0x000 /* All rings */
>>> +#define TM_QW1_OS               0x010 /* Ring 0..2 */
>>> +#define TM_QW2_HV_POOL          0x020 /* Ring 0..1 */
>>> +#define TM_QW3_HV_PHYS          0x030 /* Ring 0..1 */
>>> +
>>> +/* Byte offsets inside a QW             QW0 QW1 QW2 QW3 */
>>> +#define TM_NSR                  0x0  /*  +   +   -   +  */
>>> +#define TM_CPPR                 0x1  /*  -   +   -   +  */
>>> +#define TM_IPB                  0x2  /*  -   +   +   +  */
>>> +#define TM_LSMFB                0x3  /*  -   +   +   +  */
>>> +#define TM_ACK_CNT              0x4  /*  -   +   -   -  */
>>> +#define TM_INC                  0x5  /*  -   +   -   +  */
>>> +#define TM_AGE                  0x6  /*  -   +   -   +  */
>>> +#define TM_PIPR                 0x7  /*  -   +   -   +  */
>>> +
>>> +#define TM_WORD0                0x0
>>> +#define TM_WORD1                0x4
>>> +
>>> +/*
>>> + * QW word 2 contains the valid bit at the top and other fields
>>> + * depending on the QW.
>>> + */
>>> +#define TM_WORD2                0x8
>>> +#define   TM_QW0W2_VU           PPC_BIT32(0)
>>> +#define   TM_QW0W2_LOGIC_SERV   PPC_BITMASK32(1, 31) /* XX 2,31 ? */
>>> +#define   TM_QW1W2_VO           PPC_BIT32(0)
>>> +#define   TM_QW1W2_OS_CAM       PPC_BITMASK32(8, 31)
>>> +#define   TM_QW2W2_VP           PPC_BIT32(0)
>>> +#define   TM_QW2W2_POOL_CAM     PPC_BITMASK32(8, 31)
>>> +#define   TM_QW3W2_VT           PPC_BIT32(0)
>>> +#define   TM_QW3W2_LP           PPC_BIT32(6)
>>> +#define   TM_QW3W2_LE           PPC_BIT32(7)
>>> +#define   TM_QW3W2_T            PPC_BIT32(31)
>>> +
>>> +/*
>>> + * In addition to normal loads to "peek" and writes (only when invalid)
>>> + * using 4 and 8 bytes accesses, the above registers support these
>>> + * "special" byte operations:
>>> + *
>>> + *   - Byte load from QW0[NSR] - User level NSR (EBB)
>>> + *   - Byte store to QW0[NSR] - User level NSR (EBB)
>>> + *   - Byte load/store to QW1[CPPR] and QW3[CPPR] - CPPR access
>>> + *   - Byte load from QW3[TM_WORD2] - Read VT||00000||LP||LE on thrd 0
>>> + *                                    otherwise VT||0000000
>>> + *   - Byte store to QW3[TM_WORD2] - Set VT bit (and LP/LE if present)
>>> + *
>>> + * Then we have all these "special" CI ops at these offset that trigger
>>> + * all sorts of side effects:
>>> + */
>>> +#define TM_SPC_ACK_EBB          0x800   /* Load8 ack EBB to reg*/
>>> +#define TM_SPC_ACK_OS_REG       0x810   /* Load16 ack OS irq to reg */
>>> +#define TM_SPC_PUSH_USR_CTX     0x808   /* Store32 Push/Validate user context */
>>> +#define TM_SPC_PULL_USR_CTX     0x808   /* Load32 Pull/Invalidate user
>>> +                                         * context */
>>> +#define TM_SPC_SET_OS_PENDING   0x812   /* Store8 Set OS irq pending bit */
>>> +#define TM_SPC_PULL_OS_CTX      0x818   /* Load32/Load64 Pull/Invalidate OS
>>> +                                         * context to reg */
>>> +#define TM_SPC_PULL_POOL_CTX    0x828   /* Load32/Load64 Pull/Invalidate Pool
>>> +                                         * context to reg*/
>>> +#define TM_SPC_ACK_HV_REG       0x830   /* Load16 ack HV irq to reg */
>>> +#define TM_SPC_PULL_USR_CTX_OL  0xc08   /* Store8 Pull/Inval usr ctx to odd
>>> +                                         * line */
>>> +#define TM_SPC_ACK_OS_EL        0xc10   /* Store8 ack OS irq to even line */
>>> +#define TM_SPC_ACK_HV_POOL_EL   0xc20   /* Store8 ack HV evt pool to even
>>> +                                         * line */
>>> +#define TM_SPC_ACK_HV_EL        0xc30   /* Store8 ack HV irq to even line */
>>> +/* XXX more... */
>>> +
>>> +/* NSR fields for the various QW ack types */
>>> +#define TM_QW0_NSR_EB           PPC_BIT8(0)
>>> +#define TM_QW1_NSR_EO           PPC_BIT8(0)
>>> +#define TM_QW3_NSR_HE           PPC_BITMASK8(0, 1)
>>> +#define  TM_QW3_NSR_HE_NONE     0
>>> +#define  TM_QW3_NSR_HE_POOL     1
>>> +#define  TM_QW3_NSR_HE_PHYS     2
>>> +#define  TM_QW3_NSR_HE_LSI      3
>>> +#define TM_QW3_NSR_I            PPC_BIT8(2)
>>> +#define TM_QW3_NSR_GRP_LVL      PPC_BIT8(3, 7)
>>> +
>>>  /* IVE/EAS
>>>   *
>>>   * One per interrupt source. Targets that interrupt to a given EQ
>>> @@ -30,4 +112,6 @@ typedef struct XiveIVE {
>>>  #define IVE_EQ_DATA     PPC_BITMASK(33, 63)      /* Data written to the EQ */
>>>  } XiveIVE;
>>>  
>>> +#define XIVE_PRIORITY_MAX  7
>>> +
>>>  #endif /* _INTC_XIVE_INTERNAL_H */
>>
>
Cédric Le Goater May 2, 2018, 7:39 a.m. UTC | #4
>>  
>> +static XiveNVT *spapr_xive_get_nvt(XiveFabric *xf, uint32_t server)
>> +{
>> +    PowerPCCPU *cpu = spapr_find_cpu(server);
>> +
>> +    return cpu ? XIVE_NVT(cpu->intc) : NULL;
>> +}
> 
> So this is a bit of a tangent, but I've been thinking of implementing
> a scheme where there's an opaque pointer in the cpu structure for the
> use of the machine.  I'm planning for that to replace the intc pointer
> (which isn't really used directly by the cpu). That would allow us to
> have spapr put a structure there and have both xics and xive pointers
> which could be useful later on.

Here is a quick try of the idea. Tested on pnv and spapr machines.
I lacked inspiration on the name so I called the object {Machine}Link. 

Thanks,

C.


From 107808feda62c09b2df9a60aba5b30127ffab976 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@kaod.org>
Date: Wed, 2 May 2018 09:24:37 +0200
Subject: [PATCH] ppc: introduce a link Object between the CPU and the machine
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 hw/intc/xics_spapr.c    | 10 +++----
 hw/ppc/pnv.c            | 80 +++++++++++++++++++++++++++++++++++++++++++++++--
 hw/ppc/pnv_core.c       |  2 +-
 hw/ppc/spapr.c          | 77 +++++++++++++++++++++++++++++++++++++++++++++--
 hw/ppc/spapr_cpu_core.c |  5 ++--
 include/hw/ppc/pnv.h    |  3 ++
 include/hw/ppc/spapr.h  |  3 ++
 target/ppc/cpu.h        |  2 +-
 8 files changed, 167 insertions(+), 15 deletions(-)

diff --git a/hw/intc/xics_spapr.c b/hw/intc/xics_spapr.c
index 2e27b92b871a..9cd560bdd093 100644
--- a/hw/intc/xics_spapr.c
+++ b/hw/intc/xics_spapr.c
@@ -44,7 +44,7 @@ static target_ulong h_cppr(PowerPCCPU *cpu, sPAPRMachineState *spapr,
 {
     target_ulong cppr = args[0];
 
-    icp_set_cppr(ICP(cpu->intc), cppr);
+    icp_set_cppr(spapr_link_icp(cpu), cppr);
     return H_SUCCESS;
 }
 
@@ -65,7 +65,7 @@ static target_ulong h_ipi(PowerPCCPU *cpu, sPAPRMachineState *spapr,
 static target_ulong h_xirr(PowerPCCPU *cpu, sPAPRMachineState *spapr,
                            target_ulong opcode, target_ulong *args)
 {
-    uint32_t xirr = icp_accept(ICP(cpu->intc));
+    uint32_t xirr = icp_accept(spapr_link_icp(cpu));
 
     args[0] = xirr;
     return H_SUCCESS;
@@ -74,7 +74,7 @@ static target_ulong h_xirr(PowerPCCPU *cpu, sPAPRMachineState *spapr,
 static target_ulong h_xirr_x(PowerPCCPU *cpu, sPAPRMachineState *spapr,
                              target_ulong opcode, target_ulong *args)
 {
-    uint32_t xirr = icp_accept(ICP(cpu->intc));
+    uint32_t xirr = icp_accept(spapr_link_icp(cpu));
 
     args[0] = xirr;
     args[1] = cpu_get_host_ticks();
@@ -86,7 +86,7 @@ static target_ulong h_eoi(PowerPCCPU *cpu, sPAPRMachineState *spapr,
 {
     target_ulong xirr = args[0];
 
-    icp_eoi(ICP(cpu->intc), xirr);
+    icp_eoi(spapr_link_icp(cpu), xirr);
     return H_SUCCESS;
 }
 
@@ -94,7 +94,7 @@ static target_ulong h_ipoll(PowerPCCPU *cpu, sPAPRMachineState *spapr,
                             target_ulong opcode, target_ulong *args)
 {
     uint32_t mfrr;
-    uint32_t xirr = icp_ipoll(ICP(cpu->intc), &mfrr);
+    uint32_t xirr = icp_ipoll(spapr_link_icp(cpu), &mfrr);
 
     args[0] = xirr;
     args[1] = mfrr;
diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index 031488131629..64c35dfdf427 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -970,6 +970,75 @@ static void pnv_chip_class_init(ObjectClass *klass, void *data)
     dc->desc = "PowerNV Chip";
 }
 
+#define TYPE_PNV_LINK "pnv-link"
+#define PNV_LINK(obj) OBJECT_CHECK(PnvLink, (obj), TYPE_PNV_LINK)
+
+typedef struct PnvLink {
+    DeviceState parent;
+
+    ICPState *icp;
+} PnvLink;
+
+static void pnv_link_realize(DeviceState *dev, Error **errp)
+{
+    PnvMachineState *pnv = PNV_MACHINE(qdev_get_machine());
+    PnvLink *link = PNV_LINK(dev);
+    Object *cpu;
+    Error *local_err = NULL;
+
+    cpu = object_property_get_link(OBJECT(dev), "cpu", &local_err);
+    if (!cpu) {
+        error_propagate(errp, local_err);
+        error_prepend(errp, "required link 'cpu' not found: ");
+        return;
+    }
+
+    link->icp = ICP(icp_create(cpu, TYPE_PNV_ICP, XICS_FABRIC(pnv),
+                               &local_err));
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+}
+
+static void pnv_link_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    dc->realize = pnv_link_realize;
+}
+
+static const TypeInfo pnv_link_info = {
+    .name = TYPE_PNV_LINK,
+    .parent = TYPE_DEVICE,
+    .instance_size = sizeof(PnvLink),
+    .class_init = pnv_link_class_init,
+};
+
+Object *pnv_link_create(Object *cpu, Error **errp)
+{
+    Error *local_err = NULL;
+    Object *obj;
+
+    obj = object_new(TYPE_PNV_LINK);
+    object_property_add_child(cpu, TYPE_PNV_LINK, obj, &error_abort);
+    object_unref(obj);
+    object_property_add_const_link(obj, "cpu", cpu, &error_abort);
+    object_property_set_bool(obj, true, "realized", &local_err);
+    if (local_err) {
+        object_unparent(obj);
+        error_propagate(errp, local_err);
+        obj = NULL;
+    }
+
+    return obj;
+}
+
+ICPState *pnv_link_icp(PowerPCCPU *cpu)
+{
+    return PNV_LINK(cpu->link)->icp;
+}
+
 static ICSState *pnv_ics_get(XICSFabric *xi, int irq)
 {
     PnvMachineState *pnv = PNV_MACHINE(xi);
@@ -1013,7 +1082,7 @@ static ICPState *pnv_icp_get(XICSFabric *xi, int pir)
 {
     PowerPCCPU *cpu = ppc_get_vcpu_by_pir(pir);
 
-    return cpu ? ICP(cpu->intc) : NULL;
+    return cpu ? pnv_link_icp(cpu) : NULL;
 }
 
 static void pnv_pic_print_info(InterruptStatsProvider *obj,
@@ -1026,7 +1095,7 @@ static void pnv_pic_print_info(InterruptStatsProvider *obj,
     CPU_FOREACH(cs) {
         PowerPCCPU *cpu = POWERPC_CPU(cs);
 
-        icp_pic_print_info(ICP(cpu->intc), mon);
+        icp_pic_print_info(pnv_link_icp(cpu), mon);
     }
 
     for (i = 0; i < pnv->num_chips; i++) {
@@ -1142,3 +1211,10 @@ static const TypeInfo types[] = {
 };
 
 DEFINE_TYPES(types)
+
+static void pnv_machine_register_types(void)
+{
+    type_register_static(&pnv_link_info);
+}
+
+type_init(pnv_machine_register_types)
diff --git a/hw/ppc/pnv_core.c b/hw/ppc/pnv_core.c
index cbb64ad9e7e0..96f70ac2df8b 100644
--- a/hw/ppc/pnv_core.c
+++ b/hw/ppc/pnv_core.c
@@ -133,7 +133,7 @@ static void pnv_core_realize_child(Object *child, XICSFabric *xi, Error **errp)
         return;
     }
 
-    cpu->intc = icp_create(child, TYPE_PNV_ICP, xi, &local_err);
+    cpu->link = pnv_link_create(child, &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
         return;
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index b35aff5d811c..d151460dc72c 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1746,7 +1746,7 @@ static int spapr_post_load(void *opaque, int version_id)
         CPUState *cs;
         CPU_FOREACH(cs) {
             PowerPCCPU *cpu = POWERPC_CPU(cs);
-            icp_resend(ICP(cpu->intc));
+            icp_resend(spapr_link_icp(cpu));
         }
     }
 
@@ -3763,6 +3763,76 @@ static void spapr_phb_placement(sPAPRMachineState *spapr, uint32_t index,
     *mmio64 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM64_WIN_SIZE;
 }
 
+
+#define TYPE_SPAPR_LINK "spapr-link"
+#define SPAPR_LINK(obj) OBJECT_CHECK(sPAPRLink, (obj), TYPE_SPAPR_LINK)
+
+typedef struct sPAPRLink {
+    DeviceState parent;
+
+    ICPState *icp;
+} sPAPRLink;
+
+static void spapr_link_realize(DeviceState *dev, Error **errp)
+{
+    sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+    sPAPRLink *link = SPAPR_LINK(dev);
+    Object *cpu;
+    Error *local_err = NULL;
+
+    cpu = object_property_get_link(OBJECT(dev), "cpu", &local_err);
+    if (!cpu) {
+        error_propagate(errp, local_err);
+        error_prepend(errp, "required link 'cpu' not found: ");
+        return;
+    }
+
+    link->icp = ICP(icp_create(cpu, spapr->icp_type, XICS_FABRIC(spapr),
+                               &local_err));
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+}
+
+static void spapr_link_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    dc->realize = spapr_link_realize;
+}
+
+static const TypeInfo spapr_link_info = {
+    .name = TYPE_SPAPR_LINK,
+    .parent = TYPE_DEVICE,
+    .instance_size = sizeof(sPAPRLink),
+    .class_init = spapr_link_class_init,
+};
+
+Object *spapr_link_create(Object *cpu, sPAPRMachineState *spapr, Error **errp)
+{
+    Error *local_err = NULL;
+    Object *obj;
+
+    obj = object_new(TYPE_SPAPR_LINK);
+    object_property_add_child(cpu, TYPE_SPAPR_LINK, obj, &error_abort);
+    object_unref(obj);
+    object_property_add_const_link(obj, "cpu", cpu, &error_abort);
+    object_property_set_bool(obj, true, "realized", &local_err);
+    if (local_err) {
+        object_unparent(obj);
+        error_propagate(errp, local_err);
+        obj = NULL;
+    }
+
+    return obj;
+}
+
+ICPState *spapr_link_icp(PowerPCCPU *cpu)
+{
+    return SPAPR_LINK(cpu->link)->icp;
+}
+
 static ICSState *spapr_ics_get(XICSFabric *dev, int irq)
 {
     sPAPRMachineState *spapr = SPAPR_MACHINE(dev);
@@ -3781,7 +3851,7 @@ static ICPState *spapr_icp_get(XICSFabric *xi, int vcpu_id)
 {
     PowerPCCPU *cpu = spapr_find_cpu(vcpu_id);
 
-    return cpu ? ICP(cpu->intc) : NULL;
+    return cpu ? spapr_link_icp(cpu) : NULL;
 }
 
 #define ICS_IRQ_FREE(ics, srcno)   \
@@ -3923,7 +3993,7 @@ static void spapr_pic_print_info(InterruptStatsProvider *obj,
     CPU_FOREACH(cs) {
         PowerPCCPU *cpu = POWERPC_CPU(cs);
 
-        icp_pic_print_info(ICP(cpu->intc), mon);
+        icp_pic_print_info(spapr_link_icp(cpu), mon);
     }
 
     ics_pic_print_info(spapr->ics, mon);
@@ -4472,6 +4542,7 @@ DEFINE_SPAPR_MACHINE(2_1, "2.1", false);
 static void spapr_machine_register_types(void)
 {
     type_register_static(&spapr_machine_info);
+    type_register_static(&spapr_link_info);
 }
 
 type_init(spapr_machine_register_types)
diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c
index 01dbc6942410..f02a41d011e9 100644
--- a/hw/ppc/spapr_cpu_core.c
+++ b/hw/ppc/spapr_cpu_core.c
@@ -104,7 +104,7 @@ static void spapr_cpu_core_unrealizefn(DeviceState *dev, Error **errp)
         PowerPCCPU *cpu = POWERPC_CPU(cs);
 
         spapr_cpu_destroy(cpu);
-        object_unparent(cpu->intc);
+        object_unparent(cpu->link);
         cpu_remove_sync(cs);
         object_unparent(obj);
     }
@@ -128,8 +128,7 @@ static void spapr_cpu_core_realize_child(Object *child,
         goto error;
     }
 
-    cpu->intc = icp_create(child, spapr->icp_type, XICS_FABRIC(spapr),
-                           &local_err);
+    cpu->link = spapr_link_create(child, spapr, &local_err);
     if (local_err) {
         goto error;
     }
diff --git a/include/hw/ppc/pnv.h b/include/hw/ppc/pnv.h
index 90759240a7b1..f76b3aa16ceb 100644
--- a/include/hw/ppc/pnv.h
+++ b/include/hw/ppc/pnv.h
@@ -137,6 +137,9 @@ typedef struct PnvMachineState {
     Notifier     powerdown_notifier;
 } PnvMachineState;
 
+Object *pnv_link_create(Object *cpu, Error **errp);
+ICPState *pnv_link_icp(PowerPCCPU *cpu);
+
 static inline bool pnv_chip_is_power9(const PnvChip *chip)
 {
     return PNV_CHIP_GET_CLASS(chip)->chip_type == PNV_CHIP_POWER9;
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index d60b7c6d7a8b..5a160f75ac8f 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -803,4 +803,7 @@ void spapr_caps_reset(sPAPRMachineState *spapr);
 void spapr_caps_add_properties(sPAPRMachineClass *smc, Error **errp);
 int spapr_caps_post_migration(sPAPRMachineState *spapr);
 
+Object *spapr_link_create(Object *cpu, sPAPRMachineState *spapr, Error **errp);
+ICPState *spapr_link_icp(PowerPCCPU *cpu);
+
 #endif /* HW_SPAPR_H */
diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index 8c9e03f54d3d..19f43bbe6723 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -1204,7 +1204,7 @@ struct PowerPCCPU {
     int vcpu_id;
     uint32_t compat_pvr;
     PPCVirtualHypervisor *vhyp;
-    Object *intc;
+    Object *link;
     int32_t node_id; /* NUMA node this CPU belongs to */
     PPCHash64Options *hash64_opts;
David Gibson May 3, 2018, 5:35 a.m. UTC | #5
On Thu, Apr 26, 2018 at 11:27:21AM +0200, Cédric Le Goater wrote:
> On 04/26/2018 09:11 AM, David Gibson wrote:
> > On Thu, Apr 19, 2018 at 02:43:02PM +0200, Cédric Le Goater wrote:
> >> The XIVE presenter engine uses a set of registers to handle priority
> >> management and interrupt acknowledgment among other things. The most
> >> important ones being :
> >>
> >>   - Interrupt Priority Register (PIPR)
> >>   - Interrupt Pending Buffer (IPB)
> >>   - Current Processor Priority (CPPR)
> >>   - Notification Source Register (NSR)
> >>
> >> There is one set of registers per level of privilege, four in all :
> >> HW, HV pool, OS and User. These are called rings. All registers are
> >> accessible through a specific MMIO region called the Thread Interrupt
> >> Management Areas (TIMA) but, depending on the privilege level of the
> >> CPU, the view of the TIMA is filtered. The sPAPR machine runs at the
> >> OS privilege and therefore can only accesses the OS and the User
> >> rings. The others are for hypervisor levels.
> >>
> >> The CPU interrupt state is modeled with a XiveNVT object which stores
> >> the values of the different registers. The different TIMA views are
> >> mapped at the same address for each CPU and 'current_cpu' is used to
> >> retrieve the XiveNVT holding the ring registers.
> >>
> >> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >> ---
> >>
> >>  Changes since v2 :
> >>
> >>  - introduced the XiveFabric interface
> >>
> >>  hw/intc/spapr_xive.c        |  25 ++++
> >>  hw/intc/xive.c              | 279 ++++++++++++++++++++++++++++++++++++++++++++
> >>  include/hw/ppc/spapr_xive.h |   5 +
> >>  include/hw/ppc/xive.h       |  31 +++++
> >>  include/hw/ppc/xive_regs.h  |  84 +++++++++++++
> >>  5 files changed, 424 insertions(+)
> >>
> >> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> >> index 90cde8a4082d..f07832bf0a00 100644
> >> --- a/hw/intc/spapr_xive.c
> >> +++ b/hw/intc/spapr_xive.c
> >> @@ -13,6 +13,7 @@
> >>  #include "target/ppc/cpu.h"
> >>  #include "sysemu/cpus.h"
> >>  #include "monitor/monitor.h"
> >> +#include "hw/ppc/spapr.h"
> >>  #include "hw/ppc/spapr_xive.h"
> >>  #include "hw/ppc/xive.h"
> >>  #include "hw/ppc/xive_regs.h"
> >> @@ -95,6 +96,22 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
> >>  
> >>      /* Allocate the Interrupt Virtualization Table */
> >>      xive->ivt = g_new0(XiveIVE, xive->nr_irqs);
> >> +
> >> +    /* The Thread Interrupt Management Area has the same address for
> >> +     * each chip. On sPAPR, we only need to expose the User and OS
> >> +     * level views of the TIMA.
> >> +     */
> >> +    xive->tm_base = XIVE_TM_BASE;
> > 
> > The constant should probably have PAPR in the name somewhere, since
> > it's just for PAPR machines (same for the ESB mappings, actually).
> 
> ok. 
> 
> I have also made 'tm_base' a property, like 'vc_base' for ESBs, in 
> case we want to change the value when the guest is instantiated. 
> I doubt it but this is an address in the global address space, so 
> letting the machine have control is better I think.

I agree.

> >> +
> >> +    memory_region_init_io(&xive->tm_mmio_user, OBJECT(xive),
> >> +                          &xive_tm_user_ops, xive, "xive.tima.user",
> >> +                          1ull << TM_SHIFT);
> >> +    sysbus_init_mmio(SYS_BUS_DEVICE(dev), &xive->tm_mmio_user);
> >> +
> >> +    memory_region_init_io(&xive->tm_mmio_os, OBJECT(xive),
> >> +                          &xive_tm_os_ops, xive, "xive.tima.os",
> >> +                          1ull << TM_SHIFT);
> >> +    sysbus_init_mmio(SYS_BUS_DEVICE(dev), &xive->tm_mmio_os);
> >>  }
> >>  
> >>  static XiveIVE *spapr_xive_get_ive(XiveFabric *xf, uint32_t lisn)
> >> @@ -104,6 +121,13 @@ static XiveIVE *spapr_xive_get_ive(XiveFabric *xf, uint32_t lisn)
> >>      return lisn < xive->nr_irqs ? &xive->ivt[lisn] : NULL;
> >>  }
> >>  
> >> +static XiveNVT *spapr_xive_get_nvt(XiveFabric *xf, uint32_t server)
> >> +{
> >> +    PowerPCCPU *cpu = spapr_find_cpu(server);
> >> +
> >> +    return cpu ? XIVE_NVT(cpu->intc) : NULL;
> >> +}
> > 
> > So this is a bit of a tangent, but I've been thinking of implementing
> > a scheme where there's an opaque pointer in the cpu structure for the
> > use of the machine.  I'm planning for that to replace the intc pointer
> > (which isn't really used directly by the cpu). That would allow us to
> > have spapr put a structure there and have both xics and xive pointers
> > which could be useful later on.
> 
> ok. That should simplify the patchset at the end, in which we need to 
> switch the 'intc' pointer. 
> 
> > I think we'd need something similar to correctly handle migration of
> > the VPA state, which is currently horribly broken.
> > 
> >> +
> >>  static const VMStateDescription vmstate_spapr_xive_ive = {
> >>      .name = TYPE_SPAPR_XIVE "/ive",
> >>      .version_id = 1,
> >> @@ -143,6 +167,7 @@ static void spapr_xive_class_init(ObjectClass *klass, void *data)
> >>      dc->vmsd = &vmstate_spapr_xive;
> >>  
> >>      xfc->get_ive = spapr_xive_get_ive;
> >> +    xfc->get_nvt = spapr_xive_get_nvt;
> >>  }
> >>  
> >>  static const TypeInfo spapr_xive_info = {
> >> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> >> index dccad0318834..5691bb9474e4 100644
> >> --- a/hw/intc/xive.c
> >> +++ b/hw/intc/xive.c
> >> @@ -14,7 +14,278 @@
> >>  #include "sysemu/cpus.h"
> >>  #include "sysemu/dma.h"
> >>  #include "monitor/monitor.h"
> >> +#include "hw/ppc/xics.h" /* for ICP_PROP_CPU */
> >>  #include "hw/ppc/xive.h"
> >> +#include "hw/ppc/xive_regs.h"
> >> +
> >> +/*
> >> + * XIVE Interrupt Presenter
> >> + */
> >> +
> >> +static uint64_t xive_nvt_accept(XiveNVT *nvt)
> >> +{
> >> +    return 0;
> >> +}
> >> +
> >> +static void xive_nvt_set_cppr(XiveNVT *nvt, uint8_t cppr)
> >> +{
> >> +    if (cppr > XIVE_PRIORITY_MAX) {
> >> +        cppr = 0xff;
> >> +    }
> >> +
> >> +    nvt->ring_os[TM_CPPR] = cppr;
> > 
> > Surely this needs to recheck if we should be interrupting the cpu?
> 
> yes. In patch 9, when we introduce the nvt notify routine.

Ok.

> >> +}
> >> +
> >> +/*
> >> + * OS Thread Interrupt Management Area MMIO
> >> + */
> >> +static uint64_t xive_tm_read_special(XiveNVT *nvt, hwaddr offset,
> >> +                                           unsigned size)
> >> +{
> >> +    uint64_t ret = -1;
> >> +
> >> +    if (offset == TM_SPC_ACK_OS_REG && size == 2) {
> >> +        ret = xive_nvt_accept(nvt);
> >> +    } else {
> >> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid TIMA read @%"
> >> +                      HWADDR_PRIx" size %d\n", offset, size);
> >> +    }
> >> +
> >> +    return ret;
> >> +}
> >> +
> >> +#define TM_RING(offset) ((offset) & 0xf0)
> >> +
> >> +static uint64_t xive_tm_os_read(void *opaque, hwaddr offset,
> >> +                                      unsigned size)
> >> +{
> >> +    PowerPCCPU *cpu = POWERPC_CPU(current_cpu);
> > 
> > So, as I said on a previous version of this, we can actually correctly
> > represent different mappings in different cpu spaces, by exploiting
> > cpu->as and not just having them all point to &address_space_memory.
> 
> Yes, you did and I haven't studied the question yet. For the next version.

So, it's possible that using the cpu->as thing will be more trouble
that it's worth.  I am a little concerned about using current_cpu
though.  First, will it work with KVM with kernel_irqchip=off - the
cpus are running truly concurrently, but we still need to work out
who's poking at the TIMA.  Second, are there any cases where we might
need to trip this "on behalf of" a specific cpu that's not the current
one.

> >> +    XiveNVT *nvt = XIVE_NVT(cpu->intc);
> >> +    uint64_t ret = -1;
> >> +    int i;
> >> +
> >> +    if (offset >= TM_SPC_ACK_EBB) {
> >> +        return xive_tm_read_special(nvt, offset, size);
> >> +    }
> >> +
> >> +    if (TM_RING(offset) != TM_QW1_OS) {
> >> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid access to non-OS ring @%"
> >> +                      HWADDR_PRIx"\n", offset);
> >> +        return ret;
> > 
> > Just return -1 would be clearer here;
> 
> ok.
> 
> > 
> >> +    }
> >> +
> >> +    ret = 0;
> >> +    for (i = 0; i < size; i++) {
> >> +        ret |= (uint64_t) nvt->regs[offset + i] << (8 * (size - i - 1));
> >> +    }
> >> +
> >> +    return ret;
> >> +}
> >> +
> >> +static bool xive_tm_is_readonly(uint8_t offset)
> >> +{
> >> +    return offset != TM_QW1_OS + TM_CPPR;
> >> +}
> >> +
> >> +static void xive_tm_write_special(XiveNVT *nvt, hwaddr offset,
> >> +                                        uint64_t value, unsigned size)
> >> +{
> >> +    /* TODO: support TM_SPC_SET_OS_PENDING */
> >> +
> >> +    /* TODO: support TM_SPC_ACK_OS_EL */
> >> +}
> >> +
> >> +static void xive_tm_os_write(void *opaque, hwaddr offset,
> >> +                                   uint64_t value, unsigned size)
> >> +{
> >> +    PowerPCCPU *cpu = POWERPC_CPU(current_cpu);
> >> +    XiveNVT *nvt = XIVE_NVT(cpu->intc);
> >> +    int i;
> >> +
> >> +    if (offset >= TM_SPC_ACK_EBB) {
> >> +        xive_tm_write_special(nvt, offset, value, size);
> >> +        return;
> >> +    }
> >> +
> >> +    if (TM_RING(offset) != TM_QW1_OS) {
> > 
> > Why have this if you have separate OS and user regions as you appear
> > to do below?
> 
> This is another problem we are trying to solve. 
> 
> The registers a CPU can access depends on the TIMA view it is using. 
> The OS TIMA view only sees the OS ring registers. The HV view sees all. 
> 
> > Or to look at it another way, shouldn't it be possible to make the
> > read/write accessors the same for the OS and user rings?
> 
> For some parts yes, but the special load/store addresses are different
> for each view, the read-only register also. It seemed easier to duplicate.
> 
> I think the problem will become clearer (or worse) with pnv which uses 
> the HV mode.

Oh.  I had the impression that each ring had a basically identical set
of registers and you just had access to the region for your ring and
the ones below.  Are you saying instead it's basically a single block
of registers with various different privilege levels for each of them?

[snip]
> >> +}
> >> +
> >> +static void xive_nvt_unrealize(DeviceState *dev, Error **errp)
> >> +{
> >> +    qemu_unregister_reset(xive_nvt_reset, dev);
> >> +}
> >> +
> >> +static void xive_nvt_init(Object *obj)
> >> +{
> >> +    XiveNVT *nvt = XIVE_NVT(obj);
> >> +
> >> +    nvt->ring_os = &nvt->regs[TM_QW1_OS];
> > 
> > The ring_os field is basically pointless, being just an offset into a
> > structure you already have.  A macro or inline would be a better idea.
> 
> ok. I liked the idea but I agree it's overkill to have an init routine
> just for this. I will find something.

That too, but it's also something that looks like an optimization but
isn't, which is bad practice.  On modern cpus math is cheap (and this
is just a trivial offset), memory accesses are expensive.  You're
essentially caching this offset - raising all the usual invalidation
questions for a cache - when caching it is *more* expensive than just
computing it every time.
David Gibson May 3, 2018, 5:39 a.m. UTC | #6
On Thu, Apr 26, 2018 at 07:15:29PM +0200, Cédric Le Goater wrote:
> On 04/26/2018 11:27 AM, Cédric Le Goater wrote:
> > On 04/26/2018 09:11 AM, David Gibson wrote:
> >> On Thu, Apr 19, 2018 at 02:43:02PM +0200, Cédric Le Goater wrote:
[snip]
> >>> +static void xive_tm_os_write(void *opaque, hwaddr offset,
> >>> +                                   uint64_t value, unsigned size)
> >>> +{
> >>> +    PowerPCCPU *cpu = POWERPC_CPU(current_cpu);
> >>> +    XiveNVT *nvt = XIVE_NVT(cpu->intc);
> >>> +    int i;
> >>> +
> >>> +    if (offset >= TM_SPC_ACK_EBB) {
> >>> +        xive_tm_write_special(nvt, offset, value, size);
> >>> +        return;
> >>> +    }
> >>> +
> >>> +    if (TM_RING(offset) != TM_QW1_OS) {
> >>
> >> Why have this if you have separate OS and user regions as you appear
> >> to do below?
> > 
> > This is another problem we are trying to solve. 
> > 
> > The registers a CPU can access depends on the TIMA view it is using. 
> > The OS TIMA view only sees the OS ring registers. The HV view sees all. 
> 
> So, I gave a deeper look at the specs and I understood a little more 
> details of the concepts behind. You need to do frequent round-trips 
> to this document ...  
> 
> These registers are accessible through four aligned pages, each exposing 
> a different view of the registers. First page (page address ending 
> in 0b00) gives access to the entire context and is reserved for the 
> ring 0 security monitor. The second (page address ending in 0b01) 
> is for the hypervisor, ring 1. The third (page address ending in 0b10) 
> is for the operating system, ring 2. The fourth (page address ending 
> in 0b11) is for user level, ring 3.
> 
> The sPAPR machine runs at the OS privilege and therefore can only 
> accesses the OS and the User rings, 2 and 3. The others are for
> hypervisor levels.

Ok, that much is what I thought.  What I'm less clear on is what each
page looks like compared to the others.  Previously I thought each one
had the same registers, just manipulating the corresponding ring.  Are
you saying instead that each ring's page basically has a subset of the
registers in the next most privileged page?

> I will try to come with a better implementation of the model and
> make sure the ring numbers are respected. I am not sure we should 
> have only one memory region or four distinct ones with their
> own ops. There are some differences in the load/store of each view.

Right.  I'm not clear at this point if that's for good reasons, or
just because IBM's hardware designers don't seem to have gotten the
hang of Don't Repeat Yourself.
David Gibson May 3, 2018, 5:43 a.m. UTC | #7
On Wed, May 02, 2018 at 09:39:44AM +0200, Cédric Le Goater wrote:
> >>  
> >> +static XiveNVT *spapr_xive_get_nvt(XiveFabric *xf, uint32_t server)
> >> +{
> >> +    PowerPCCPU *cpu = spapr_find_cpu(server);
> >> +
> >> +    return cpu ? XIVE_NVT(cpu->intc) : NULL;
> >> +}
> > 
> > So this is a bit of a tangent, but I've been thinking of implementing
> > a scheme where there's an opaque pointer in the cpu structure for the
> > use of the machine.  I'm planning for that to replace the intc pointer
> > (which isn't really used directly by the cpu). That would allow us to
> > have spapr put a structure there and have both xics and xive pointers
> > which could be useful later on.
> 
> Here is a quick try of the idea. Tested on pnv and spapr machines.
> I lacked inspiration on the name so I called the object
> {Machine}Link.

This is a bit overkill compared to what I had in mind.  I don't think
the thing we're pointing to has to be a fully realized QOM object.  I
was just going to replace the Object * with a void *, that it's up to
the machine to interpret.

I'm also wondering about restricting this idea to vhyp platforms.  The
idea is that for physical-esque machines the cpu really does (or
should) know how things are connected to it.  It's the abstraction of
the paravirt platform that makes it fuzzy.  In which case I'd see it
as the "opaque" pointer that goes along with the vhyp function
pointers.
Cédric Le Goater May 3, 2018, 2:42 p.m. UTC | #8
On 05/03/2018 07:43 AM, David Gibson wrote:
> On Wed, May 02, 2018 at 09:39:44AM +0200, Cédric Le Goater wrote:
>>>>  
>>>> +static XiveNVT *spapr_xive_get_nvt(XiveFabric *xf, uint32_t server)
>>>> +{
>>>> +    PowerPCCPU *cpu = spapr_find_cpu(server);
>>>> +
>>>> +    return cpu ? XIVE_NVT(cpu->intc) : NULL;
>>>> +}
>>>
>>> So this is a bit of a tangent, but I've been thinking of implementing
>>> a scheme where there's an opaque pointer in the cpu structure for the
>>> use of the machine.  I'm planning for that to replace the intc pointer
>>> (which isn't really used directly by the cpu). That would allow us to
>>> have spapr put a structure there and have both xics and xive pointers
>>> which could be useful later on.
>>
>> Here is a quick try of the idea. Tested on pnv and spapr machines.
>> I lacked inspiration on the name so I called the object
>> {Machine}Link.
> 
> This is a bit overkill compared to what I had in mind.  I don't think
> the thing we're pointing to has to be a fully realized QOM object. 

Yes, it is quite a bit of code for a simple struct.

> I was just going to replace the Object * with a void *, that it's up to
> the machine to interpret.

So the machine would just g_malloc0 a custom struct for each CPU, filling
it out depending on the configuration/needs ? 
 
> I'm also wondering about restricting this idea to vhyp platforms.  

OK. 

> The idea is that for physical-esque machines the cpu really does (or
> should) know how things are connected to it.  

yes. P9 does not have a XICS interrupt controller on PowerNV.

> It's the abstraction of
> the paravirt platform that makes it fuzzy.  In which case I'd see it
> as the "opaque" pointer that goes along with the vhyp function
> pointers.

I will take a look for the intc pointer as xive needs an extra one.

C.
Cédric Le Goater May 3, 2018, 3:10 p.m. UTC | #9
On 05/03/2018 07:39 AM, David Gibson wrote:
> On Thu, Apr 26, 2018 at 07:15:29PM +0200, Cédric Le Goater wrote:
>> On 04/26/2018 11:27 AM, Cédric Le Goater wrote:
>>> On 04/26/2018 09:11 AM, David Gibson wrote:
>>>> On Thu, Apr 19, 2018 at 02:43:02PM +0200, Cédric Le Goater wrote:
> [snip]
>>>>> +static void xive_tm_os_write(void *opaque, hwaddr offset,
>>>>> +                                   uint64_t value, unsigned size)
>>>>> +{
>>>>> +    PowerPCCPU *cpu = POWERPC_CPU(current_cpu);
>>>>> +    XiveNVT *nvt = XIVE_NVT(cpu->intc);
>>>>> +    int i;
>>>>> +
>>>>> +    if (offset >= TM_SPC_ACK_EBB) {
>>>>> +        xive_tm_write_special(nvt, offset, value, size);
>>>>> +        return;
>>>>> +    }
>>>>> +
>>>>> +    if (TM_RING(offset) != TM_QW1_OS) {
>>>>
>>>> Why have this if you have separate OS and user regions as you appear
>>>> to do below?
>>>
>>> This is another problem we are trying to solve. 
>>>
>>> The registers a CPU can access depends on the TIMA view it is using. 
>>> The OS TIMA view only sees the OS ring registers. The HV view sees all. 
>>
>> So, I gave a deeper look at the specs and I understood a little more 
>> details of the concepts behind. You need to do frequent round-trips 
>> to this document ...  
>>
>> These registers are accessible through four aligned pages, each exposing 
>> a different view of the registers. First page (page address ending 
>> in 0b00) gives access to the entire context and is reserved for the 
>> ring 0 security monitor. The second (page address ending in 0b01) 
>> is for the hypervisor, ring 1. The third (page address ending in 0b10) 
>> is for the operating system, ring 2. The fourth (page address ending 
>> in 0b11) is for user level, ring 3.
>>
>> The sPAPR machine runs at the OS privilege and therefore can only 
>> accesses the OS and the User rings, 2 and 3. The others are for
>> hypervisor levels.
> 
> Ok, that much is what I thought.  What I'm less clear on is what each
> page looks like compared to the others.  Previously I thought each one
> had the same registers, 

yes.

> just manipulating the corresponding ring.  

no. 

> Are you saying instead that each ring's page basically has a subset 
> of the registers in the next most privileged page?

That's the idea. 

The registers are defined as follow :

	QW-0 User      
	QW-1 O/S      
	QW-2 Pool   
	QW-3 Physical 

and the pages :

- 0006030203180000 security monitor 
  can access all registers 

- 0006030203190000 hv
  can access all registers minus the secure regs

- 00060302031a0000 os
  can access some of the OS (QW1) and User (QW0) registers
 
- 00060302031b0000 user
  can access NSR reg of User (QW0) registers

On sPAPR, we can remap the os/user pages to some other base address 
but we should keep the same page offset.


>> I will try to come with a better implementation of the model and
>> make sure the ring numbers are respected. I am not sure we should 
>> have only one memory region or four distinct ones with their
>> own ops. There are some differences in the load/store of each view.
> 
> Right.  I'm not clear at this point if that's for good reasons, or
> just because IBM's hardware designers don't seem to have gotten the
> hang of Don't Repeat Yourself.
>
Cédric Le Goater May 3, 2018, 4:06 p.m. UTC | #10
On 05/03/2018 07:35 AM, David Gibson wrote:
> On Thu, Apr 26, 2018 at 11:27:21AM +0200, Cédric Le Goater wrote:
>> On 04/26/2018 09:11 AM, David Gibson wrote:
>>> On Thu, Apr 19, 2018 at 02:43:02PM +0200, Cédric Le Goater wrote:
>>>> The XIVE presenter engine uses a set of registers to handle priority
>>>> management and interrupt acknowledgment among other things. The most
>>>> important ones being :
>>>>
>>>>   - Interrupt Priority Register (PIPR)
>>>>   - Interrupt Pending Buffer (IPB)
>>>>   - Current Processor Priority (CPPR)
>>>>   - Notification Source Register (NSR)
>>>>
>>>> There is one set of registers per level of privilege, four in all :
>>>> HW, HV pool, OS and User. These are called rings. All registers are
>>>> accessible through a specific MMIO region called the Thread Interrupt
>>>> Management Areas (TIMA) but, depending on the privilege level of the
>>>> CPU, the view of the TIMA is filtered. The sPAPR machine runs at the
>>>> OS privilege and therefore can only accesses the OS and the User
>>>> rings. The others are for hypervisor levels.
>>>>
>>>> The CPU interrupt state is modeled with a XiveNVT object which stores
>>>> the values of the different registers. The different TIMA views are
>>>> mapped at the same address for each CPU and 'current_cpu' is used to
>>>> retrieve the XiveNVT holding the ring registers.
>>>>
>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>>> ---
>>>>
>>>>  Changes since v2 :
>>>>
>>>>  - introduced the XiveFabric interface
>>>>
>>>>  hw/intc/spapr_xive.c        |  25 ++++
>>>>  hw/intc/xive.c              | 279 ++++++++++++++++++++++++++++++++++++++++++++
>>>>  include/hw/ppc/spapr_xive.h |   5 +
>>>>  include/hw/ppc/xive.h       |  31 +++++
>>>>  include/hw/ppc/xive_regs.h  |  84 +++++++++++++
>>>>  5 files changed, 424 insertions(+)
>>>>
>>>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
>>>> index 90cde8a4082d..f07832bf0a00 100644
>>>> --- a/hw/intc/spapr_xive.c
>>>> +++ b/hw/intc/spapr_xive.c
>>>> @@ -13,6 +13,7 @@
>>>>  #include "target/ppc/cpu.h"
>>>>  #include "sysemu/cpus.h"
>>>>  #include "monitor/monitor.h"
>>>> +#include "hw/ppc/spapr.h"
>>>>  #include "hw/ppc/spapr_xive.h"
>>>>  #include "hw/ppc/xive.h"
>>>>  #include "hw/ppc/xive_regs.h"
>>>> @@ -95,6 +96,22 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
>>>>  
>>>>      /* Allocate the Interrupt Virtualization Table */
>>>>      xive->ivt = g_new0(XiveIVE, xive->nr_irqs);
>>>> +
>>>> +    /* The Thread Interrupt Management Area has the same address for
>>>> +     * each chip. On sPAPR, we only need to expose the User and OS
>>>> +     * level views of the TIMA.
>>>> +     */
>>>> +    xive->tm_base = XIVE_TM_BASE;
>>>
>>> The constant should probably have PAPR in the name somewhere, since
>>> it's just for PAPR machines (same for the ESB mappings, actually).
>>
>> ok. 
>>
>> I have also made 'tm_base' a property, like 'vc_base' for ESBs, in 
>> case we want to change the value when the guest is instantiated. 
>> I doubt it but this is an address in the global address space, so 
>> letting the machine have control is better I think.
> 
> I agree.
> 
>>>> +
>>>> +    memory_region_init_io(&xive->tm_mmio_user, OBJECT(xive),
>>>> +                          &xive_tm_user_ops, xive, "xive.tima.user",
>>>> +                          1ull << TM_SHIFT);
>>>> +    sysbus_init_mmio(SYS_BUS_DEVICE(dev), &xive->tm_mmio_user);
>>>> +
>>>> +    memory_region_init_io(&xive->tm_mmio_os, OBJECT(xive),
>>>> +                          &xive_tm_os_ops, xive, "xive.tima.os",
>>>> +                          1ull << TM_SHIFT);
>>>> +    sysbus_init_mmio(SYS_BUS_DEVICE(dev), &xive->tm_mmio_os);
>>>>  }
>>>>  
>>>>  static XiveIVE *spapr_xive_get_ive(XiveFabric *xf, uint32_t lisn)
>>>> @@ -104,6 +121,13 @@ static XiveIVE *spapr_xive_get_ive(XiveFabric *xf, uint32_t lisn)
>>>>      return lisn < xive->nr_irqs ? &xive->ivt[lisn] : NULL;
>>>>  }
>>>>  
>>>> +static XiveNVT *spapr_xive_get_nvt(XiveFabric *xf, uint32_t server)
>>>> +{
>>>> +    PowerPCCPU *cpu = spapr_find_cpu(server);
>>>> +
>>>> +    return cpu ? XIVE_NVT(cpu->intc) : NULL;
>>>> +}
>>>
>>> So this is a bit of a tangent, but I've been thinking of implementing
>>> a scheme where there's an opaque pointer in the cpu structure for the
>>> use of the machine.  I'm planning for that to replace the intc pointer
>>> (which isn't really used directly by the cpu). That would allow us to
>>> have spapr put a structure there and have both xics and xive pointers
>>> which could be useful later on.
>>
>> ok. That should simplify the patchset at the end, in which we need to 
>> switch the 'intc' pointer. 
>>
>>> I think we'd need something similar to correctly handle migration of
>>> the VPA state, which is currently horribly broken.
>>>
>>>> +
>>>>  static const VMStateDescription vmstate_spapr_xive_ive = {
>>>>      .name = TYPE_SPAPR_XIVE "/ive",
>>>>      .version_id = 1,
>>>> @@ -143,6 +167,7 @@ static void spapr_xive_class_init(ObjectClass *klass, void *data)
>>>>      dc->vmsd = &vmstate_spapr_xive;
>>>>  
>>>>      xfc->get_ive = spapr_xive_get_ive;
>>>> +    xfc->get_nvt = spapr_xive_get_nvt;
>>>>  }
>>>>  
>>>>  static const TypeInfo spapr_xive_info = {
>>>> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
>>>> index dccad0318834..5691bb9474e4 100644
>>>> --- a/hw/intc/xive.c
>>>> +++ b/hw/intc/xive.c
>>>> @@ -14,7 +14,278 @@
>>>>  #include "sysemu/cpus.h"
>>>>  #include "sysemu/dma.h"
>>>>  #include "monitor/monitor.h"
>>>> +#include "hw/ppc/xics.h" /* for ICP_PROP_CPU */
>>>>  #include "hw/ppc/xive.h"
>>>> +#include "hw/ppc/xive_regs.h"
>>>> +
>>>> +/*
>>>> + * XIVE Interrupt Presenter
>>>> + */
>>>> +
>>>> +static uint64_t xive_nvt_accept(XiveNVT *nvt)
>>>> +{
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static void xive_nvt_set_cppr(XiveNVT *nvt, uint8_t cppr)
>>>> +{
>>>> +    if (cppr > XIVE_PRIORITY_MAX) {
>>>> +        cppr = 0xff;
>>>> +    }
>>>> +
>>>> +    nvt->ring_os[TM_CPPR] = cppr;
>>>
>>> Surely this needs to recheck if we should be interrupting the cpu?
>>
>> yes. In patch 9, when we introduce the nvt notify routine.
> 
> Ok.
> 
>>>> +}
>>>> +
>>>> +/*
>>>> + * OS Thread Interrupt Management Area MMIO
>>>> + */
>>>> +static uint64_t xive_tm_read_special(XiveNVT *nvt, hwaddr offset,
>>>> +                                           unsigned size)
>>>> +{
>>>> +    uint64_t ret = -1;
>>>> +
>>>> +    if (offset == TM_SPC_ACK_OS_REG && size == 2) {
>>>> +        ret = xive_nvt_accept(nvt);
>>>> +    } else {
>>>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid TIMA read @%"
>>>> +                      HWADDR_PRIx" size %d\n", offset, size);
>>>> +    }
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +#define TM_RING(offset) ((offset) & 0xf0)
>>>> +
>>>> +static uint64_t xive_tm_os_read(void *opaque, hwaddr offset,
>>>> +                                      unsigned size)
>>>> +{
>>>> +    PowerPCCPU *cpu = POWERPC_CPU(current_cpu);
>>>
>>> So, as I said on a previous version of this, we can actually correctly
>>> represent different mappings in different cpu spaces, by exploiting
>>> cpu->as and not just having them all point to &address_space_memory.
>>
>> Yes, you did and I haven't studied the question yet. For the next version.
> 
> So, it's possible that using the cpu->as thing will be more trouble
> that it's worth. 

One of the trouble is the number of memory regions to use, one per cpu, 
and the KVM support. Having a single region is much easier. 

> I am a little concerned about using current_cpu though.  
> First, will it work with KVM with kernel_irqchip=off - the
> cpus are running truly concurrently,

FWIW, I didn't see any issue yet while stressing. 

> but we still need to work out who's poking at the TIMA.  

I understand. The registers are accessed by the current cpu to set the 
CPPR and to ack an interrupt. But when we route an event, we also access 
and modify the registers. Do you suggest some locking ? I am not sure
how are protected the TIMA region accesses vs. the routing, which is 
necessarily initiated by an ESB MMIO though.

> Second, are there any cases where we might
> need to trip this "on behalf of" a specific cpu that's not the current
> one.

ah. yes. sort of :) only in powernv, when the xive is reseted (and when 
dumping the state for debug).

The IC has a way to access indirectly the registers of a HW thread. 
It, first, sets the PC_TCTXT_INDIR_THRDID register with the PIR of 
the targeted thread and then loads on the indirect TIMA can be done 
as if it was the current thread. The indirect TIMA is mapped 4 pages 
after the  IC BAR.

The resulting memory region op is a little ugly and might need 
some rework : 

static uint64_t xive_tm_hv_read(void *opaque, hwaddr offset,
                                 unsigned size)
{
    PowerPCCPU **cpuptr = opaque;
    PowerPCCPU *cpu = *cpuptr ? *cpuptr : POWERPC_CPU(current_cpu);
    ...


>>>> +    XiveNVT *nvt = XIVE_NVT(cpu->intc);
>>>> +    uint64_t ret = -1;
>>>> +    int i;
>>>> +
>>>> +    if (offset >= TM_SPC_ACK_EBB) {
>>>> +        return xive_tm_read_special(nvt, offset, size);
>>>> +    }
>>>> +
>>>> +    if (TM_RING(offset) != TM_QW1_OS) {
>>>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid access to non-OS ring @%"
>>>> +                      HWADDR_PRIx"\n", offset);
>>>> +        return ret;
>>>
>>> Just return -1 would be clearer here;
>>
>> ok.
>>
>>>
>>>> +    }
>>>> +
>>>> +    ret = 0;
>>>> +    for (i = 0; i < size; i++) {
>>>> +        ret |= (uint64_t) nvt->regs[offset + i] << (8 * (size - i - 1));
>>>> +    }
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static bool xive_tm_is_readonly(uint8_t offset)
>>>> +{
>>>> +    return offset != TM_QW1_OS + TM_CPPR;
>>>> +}
>>>> +
>>>> +static void xive_tm_write_special(XiveNVT *nvt, hwaddr offset,
>>>> +                                        uint64_t value, unsigned size)
>>>> +{
>>>> +    /* TODO: support TM_SPC_SET_OS_PENDING */
>>>> +
>>>> +    /* TODO: support TM_SPC_ACK_OS_EL */
>>>> +}
>>>> +
>>>> +static void xive_tm_os_write(void *opaque, hwaddr offset,
>>>> +                                   uint64_t value, unsigned size)
>>>> +{
>>>> +    PowerPCCPU *cpu = POWERPC_CPU(current_cpu);
>>>> +    XiveNVT *nvt = XIVE_NVT(cpu->intc);
>>>> +    int i;
>>>> +
>>>> +    if (offset >= TM_SPC_ACK_EBB) {
>>>> +        xive_tm_write_special(nvt, offset, value, size);
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    if (TM_RING(offset) != TM_QW1_OS) {
>>>
>>> Why have this if you have separate OS and user regions as you appear
>>> to do below?
>>
>> This is another problem we are trying to solve. 
>>
>> The registers a CPU can access depends on the TIMA view it is using. 
>> The OS TIMA view only sees the OS ring registers. The HV view sees all. 
>>
>>> Or to look at it another way, shouldn't it be possible to make the
>>> read/write accessors the same for the OS and user rings?
>>
>> For some parts yes, but the special load/store addresses are different
>> for each view, the read-only register also. It seemed easier to duplicate.
>>
>> I think the problem will become clearer (or worse) with pnv which uses 
>> the HV mode.
> 
> Oh.  I had the impression that each ring had a basically identical set
> of registers and you just had access to the region for your ring and
> the ones below.  Are you saying instead it's basically a single block
> of registers with various different privilege levels for each of them?

yes. I think I answered this question more clearly in a previous email.

> [snip]
>>>> +}
>>>> +
>>>> +static void xive_nvt_unrealize(DeviceState *dev, Error **errp)
>>>> +{
>>>> +    qemu_unregister_reset(xive_nvt_reset, dev);
>>>> +}
>>>> +
>>>> +static void xive_nvt_init(Object *obj)
>>>> +{
>>>> +    XiveNVT *nvt = XIVE_NVT(obj);
>>>> +
>>>> +    nvt->ring_os = &nvt->regs[TM_QW1_OS];
>>>
>>> The ring_os field is basically pointless, being just an offset into a
>>> structure you already have.  A macro or inline would be a better idea.
>>
>> ok. I liked the idea but I agree it's overkill to have an init routine
>> just for this. I will find something.
> 
> That too, but it's also something that looks like an optimization but
> isn't, which is bad practice.  On modern cpus math is cheap (and this
> is just a trivial offset), memory accesses are expensive.  You're
> essentially caching this offset - raising all the usual invalidation
> questions for a cache - when caching it is *more* expensive than just
> computing it every time.

ok. removing this offset was a good opportunity to generalize the 
routing algorithm and use a 'ring' parameter in all routines. Same 
for the accept path. 


C.
David Gibson May 4, 2018, 4:44 a.m. UTC | #11
On Thu, May 03, 2018 at 05:10:48PM +0200, Cédric Le Goater wrote:
> On 05/03/2018 07:39 AM, David Gibson wrote:
> > On Thu, Apr 26, 2018 at 07:15:29PM +0200, Cédric Le Goater wrote:
> >> On 04/26/2018 11:27 AM, Cédric Le Goater wrote:
> >>> On 04/26/2018 09:11 AM, David Gibson wrote:
> >>>> On Thu, Apr 19, 2018 at 02:43:02PM +0200, Cédric Le Goater wrote:
> > [snip]
> >>>>> +static void xive_tm_os_write(void *opaque, hwaddr offset,
> >>>>> +                                   uint64_t value, unsigned size)
> >>>>> +{
> >>>>> +    PowerPCCPU *cpu = POWERPC_CPU(current_cpu);
> >>>>> +    XiveNVT *nvt = XIVE_NVT(cpu->intc);
> >>>>> +    int i;
> >>>>> +
> >>>>> +    if (offset >= TM_SPC_ACK_EBB) {
> >>>>> +        xive_tm_write_special(nvt, offset, value, size);
> >>>>> +        return;
> >>>>> +    }
> >>>>> +
> >>>>> +    if (TM_RING(offset) != TM_QW1_OS) {
> >>>>
> >>>> Why have this if you have separate OS and user regions as you appear
> >>>> to do below?
> >>>
> >>> This is another problem we are trying to solve. 
> >>>
> >>> The registers a CPU can access depends on the TIMA view it is using. 
> >>> The OS TIMA view only sees the OS ring registers. The HV view sees all. 
> >>
> >> So, I gave a deeper look at the specs and I understood a little more 
> >> details of the concepts behind. You need to do frequent round-trips 
> >> to this document ...  
> >>
> >> These registers are accessible through four aligned pages, each exposing 
> >> a different view of the registers. First page (page address ending 
> >> in 0b00) gives access to the entire context and is reserved for the 
> >> ring 0 security monitor. The second (page address ending in 0b01) 
> >> is for the hypervisor, ring 1. The third (page address ending in 0b10) 
> >> is for the operating system, ring 2. The fourth (page address ending 
> >> in 0b11) is for user level, ring 3.
> >>
> >> The sPAPR machine runs at the OS privilege and therefore can only 
> >> accesses the OS and the User rings, 2 and 3. The others are for
> >> hypervisor levels.
> > 
> > Ok, that much is what I thought.  What I'm less clear on is what each
> > page looks like compared to the others.  Previously I thought each one
> > had the same registers, 
> 
> yes.
> 
> > just manipulating the corresponding ring.  
> 
> no. 
> 
> > Are you saying instead that each ring's page basically has a subset 
> > of the registers in the next most privileged page?
> 
> That's the idea. 

Ah, ok.

> The registers are defined as follow :
> 
> 	QW-0 User      
> 	QW-1 O/S      
> 	QW-2 Pool   
> 	QW-3 Physical 
> 
> and the pages :
> 
> - 0006030203180000 security monitor 
>   can access all registers 
> 
> - 0006030203190000 hv
>   can access all registers minus the secure regs
> 
> - 00060302031a0000 os
>   can access some of the OS (QW1) and User (QW0) registers
>  
> - 00060302031b0000 user
>   can access NSR reg of User (QW0) registers

I can see two reasonable ways of doing this:

A)

Have a single set of read/write functions.  These implement all the
registers but take a "privilege level" parameter which controls which
will actually work.  Those could then be wired up in one of two ways:

  A1) Single memory region.  The accessor derives the priv level from
  the relevant address bits, before masking it down to a single
  register page.  Then, as above

  A2) Multiple memory regions with the same accessor functions but
  different opaque pointer.  The accessor gets the priv level from
  its opaque pointer, then the address is just within a single ring's
  page.

B)

Separate memory regions with separate accessors.  The ring-0 accessor
implements the ring-0 registers, then calls the ring-1 accessor
function for everything else.  ring-1 calls ring-2 and so forth.

> On sPAPR, we can remap the os/user pages to some other base address 
> but we should keep the same page offset.

Sure.

> 
> 
> >> I will try to come with a better implementation of the model and
> >> make sure the ring numbers are respected. I am not sure we should 
> >> have only one memory region or four distinct ones with their
> >> own ops. There are some differences in the load/store of each view.
> > 
> > Right.  I'm not clear at this point if that's for good reasons, or
> > just because IBM's hardware designers don't seem to have gotten the
> > hang of Don't Repeat Yourself.
> > 
>
David Gibson May 4, 2018, 4:51 a.m. UTC | #12
On Thu, May 03, 2018 at 06:06:14PM +0200, Cédric Le Goater wrote:
> On 05/03/2018 07:35 AM, David Gibson wrote:
> > On Thu, Apr 26, 2018 at 11:27:21AM +0200, Cédric Le Goater wrote:
> >> On 04/26/2018 09:11 AM, David Gibson wrote:
> >>> On Thu, Apr 19, 2018 at 02:43:02PM +0200, Cédric Le Goater wrote:
> >>>> The XIVE presenter engine uses a set of registers to handle priority
> >>>> management and interrupt acknowledgment among other things. The most
> >>>> important ones being :
> >>>>
> >>>>   - Interrupt Priority Register (PIPR)
> >>>>   - Interrupt Pending Buffer (IPB)
> >>>>   - Current Processor Priority (CPPR)
> >>>>   - Notification Source Register (NSR)
> >>>>
> >>>> There is one set of registers per level of privilege, four in all :
> >>>> HW, HV pool, OS and User. These are called rings. All registers are
> >>>> accessible through a specific MMIO region called the Thread Interrupt
> >>>> Management Areas (TIMA) but, depending on the privilege level of the
> >>>> CPU, the view of the TIMA is filtered. The sPAPR machine runs at the
> >>>> OS privilege and therefore can only accesses the OS and the User
> >>>> rings. The others are for hypervisor levels.
> >>>>
> >>>> The CPU interrupt state is modeled with a XiveNVT object which stores
> >>>> the values of the different registers. The different TIMA views are
> >>>> mapped at the same address for each CPU and 'current_cpu' is used to
> >>>> retrieve the XiveNVT holding the ring registers.
> >>>>
> >>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >>>> ---
> >>>>
> >>>>  Changes since v2 :
> >>>>
> >>>>  - introduced the XiveFabric interface
> >>>>
> >>>>  hw/intc/spapr_xive.c        |  25 ++++
> >>>>  hw/intc/xive.c              | 279 ++++++++++++++++++++++++++++++++++++++++++++
> >>>>  include/hw/ppc/spapr_xive.h |   5 +
> >>>>  include/hw/ppc/xive.h       |  31 +++++
> >>>>  include/hw/ppc/xive_regs.h  |  84 +++++++++++++
> >>>>  5 files changed, 424 insertions(+)
> >>>>
> >>>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> >>>> index 90cde8a4082d..f07832bf0a00 100644
> >>>> --- a/hw/intc/spapr_xive.c
> >>>> +++ b/hw/intc/spapr_xive.c
> >>>> @@ -13,6 +13,7 @@
> >>>>  #include "target/ppc/cpu.h"
> >>>>  #include "sysemu/cpus.h"
> >>>>  #include "monitor/monitor.h"
> >>>> +#include "hw/ppc/spapr.h"
> >>>>  #include "hw/ppc/spapr_xive.h"
> >>>>  #include "hw/ppc/xive.h"
> >>>>  #include "hw/ppc/xive_regs.h"
> >>>> @@ -95,6 +96,22 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
> >>>>  
> >>>>      /* Allocate the Interrupt Virtualization Table */
> >>>>      xive->ivt = g_new0(XiveIVE, xive->nr_irqs);
> >>>> +
> >>>> +    /* The Thread Interrupt Management Area has the same address for
> >>>> +     * each chip. On sPAPR, we only need to expose the User and OS
> >>>> +     * level views of the TIMA.
> >>>> +     */
> >>>> +    xive->tm_base = XIVE_TM_BASE;
> >>>
> >>> The constant should probably have PAPR in the name somewhere, since
> >>> it's just for PAPR machines (same for the ESB mappings, actually).
> >>
> >> ok. 
> >>
> >> I have also made 'tm_base' a property, like 'vc_base' for ESBs, in 
> >> case we want to change the value when the guest is instantiated. 
> >> I doubt it but this is an address in the global address space, so 
> >> letting the machine have control is better I think.
> > 
> > I agree.
> > 
> >>>> +
> >>>> +    memory_region_init_io(&xive->tm_mmio_user, OBJECT(xive),
> >>>> +                          &xive_tm_user_ops, xive, "xive.tima.user",
> >>>> +                          1ull << TM_SHIFT);
> >>>> +    sysbus_init_mmio(SYS_BUS_DEVICE(dev), &xive->tm_mmio_user);
> >>>> +
> >>>> +    memory_region_init_io(&xive->tm_mmio_os, OBJECT(xive),
> >>>> +                          &xive_tm_os_ops, xive, "xive.tima.os",
> >>>> +                          1ull << TM_SHIFT);
> >>>> +    sysbus_init_mmio(SYS_BUS_DEVICE(dev), &xive->tm_mmio_os);
> >>>>  }
> >>>>  
> >>>>  static XiveIVE *spapr_xive_get_ive(XiveFabric *xf, uint32_t lisn)
> >>>> @@ -104,6 +121,13 @@ static XiveIVE *spapr_xive_get_ive(XiveFabric *xf, uint32_t lisn)
> >>>>      return lisn < xive->nr_irqs ? &xive->ivt[lisn] : NULL;
> >>>>  }
> >>>>  
> >>>> +static XiveNVT *spapr_xive_get_nvt(XiveFabric *xf, uint32_t server)
> >>>> +{
> >>>> +    PowerPCCPU *cpu = spapr_find_cpu(server);
> >>>> +
> >>>> +    return cpu ? XIVE_NVT(cpu->intc) : NULL;
> >>>> +}
> >>>
> >>> So this is a bit of a tangent, but I've been thinking of implementing
> >>> a scheme where there's an opaque pointer in the cpu structure for the
> >>> use of the machine.  I'm planning for that to replace the intc pointer
> >>> (which isn't really used directly by the cpu). That would allow us to
> >>> have spapr put a structure there and have both xics and xive pointers
> >>> which could be useful later on.
> >>
> >> ok. That should simplify the patchset at the end, in which we need to 
> >> switch the 'intc' pointer. 
> >>
> >>> I think we'd need something similar to correctly handle migration of
> >>> the VPA state, which is currently horribly broken.
> >>>
> >>>> +
> >>>>  static const VMStateDescription vmstate_spapr_xive_ive = {
> >>>>      .name = TYPE_SPAPR_XIVE "/ive",
> >>>>      .version_id = 1,
> >>>> @@ -143,6 +167,7 @@ static void spapr_xive_class_init(ObjectClass *klass, void *data)
> >>>>      dc->vmsd = &vmstate_spapr_xive;
> >>>>  
> >>>>      xfc->get_ive = spapr_xive_get_ive;
> >>>> +    xfc->get_nvt = spapr_xive_get_nvt;
> >>>>  }
> >>>>  
> >>>>  static const TypeInfo spapr_xive_info = {
> >>>> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> >>>> index dccad0318834..5691bb9474e4 100644
> >>>> --- a/hw/intc/xive.c
> >>>> +++ b/hw/intc/xive.c
> >>>> @@ -14,7 +14,278 @@
> >>>>  #include "sysemu/cpus.h"
> >>>>  #include "sysemu/dma.h"
> >>>>  #include "monitor/monitor.h"
> >>>> +#include "hw/ppc/xics.h" /* for ICP_PROP_CPU */
> >>>>  #include "hw/ppc/xive.h"
> >>>> +#include "hw/ppc/xive_regs.h"
> >>>> +
> >>>> +/*
> >>>> + * XIVE Interrupt Presenter
> >>>> + */
> >>>> +
> >>>> +static uint64_t xive_nvt_accept(XiveNVT *nvt)
> >>>> +{
> >>>> +    return 0;
> >>>> +}
> >>>> +
> >>>> +static void xive_nvt_set_cppr(XiveNVT *nvt, uint8_t cppr)
> >>>> +{
> >>>> +    if (cppr > XIVE_PRIORITY_MAX) {
> >>>> +        cppr = 0xff;
> >>>> +    }
> >>>> +
> >>>> +    nvt->ring_os[TM_CPPR] = cppr;
> >>>
> >>> Surely this needs to recheck if we should be interrupting the cpu?
> >>
> >> yes. In patch 9, when we introduce the nvt notify routine.
> > 
> > Ok.
> > 
> >>>> +}
> >>>> +
> >>>> +/*
> >>>> + * OS Thread Interrupt Management Area MMIO
> >>>> + */
> >>>> +static uint64_t xive_tm_read_special(XiveNVT *nvt, hwaddr offset,
> >>>> +                                           unsigned size)
> >>>> +{
> >>>> +    uint64_t ret = -1;
> >>>> +
> >>>> +    if (offset == TM_SPC_ACK_OS_REG && size == 2) {
> >>>> +        ret = xive_nvt_accept(nvt);
> >>>> +    } else {
> >>>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid TIMA read @%"
> >>>> +                      HWADDR_PRIx" size %d\n", offset, size);
> >>>> +    }
> >>>> +
> >>>> +    return ret;
> >>>> +}
> >>>> +
> >>>> +#define TM_RING(offset) ((offset) & 0xf0)
> >>>> +
> >>>> +static uint64_t xive_tm_os_read(void *opaque, hwaddr offset,
> >>>> +                                      unsigned size)
> >>>> +{
> >>>> +    PowerPCCPU *cpu = POWERPC_CPU(current_cpu);
> >>>
> >>> So, as I said on a previous version of this, we can actually correctly
> >>> represent different mappings in different cpu spaces, by exploiting
> >>> cpu->as and not just having them all point to &address_space_memory.
> >>
> >> Yes, you did and I haven't studied the question yet. For the next version.
> > 
> > So, it's possible that using the cpu->as thing will be more trouble
> > that it's worth. 
> 
> One of the trouble is the number of memory regions to use, one per cpu, 

Well, we're already going to have an NVT object for each cpu, yes?  So
a memory region per-cpu doesn't seem like a big stretch.

> and the KVM support.

And I really don't see how the memory regions impacts KVM.

> Having a single region is much easier. 
> 
> > I am a little concerned about using current_cpu though.  
> > First, will it work with KVM with kernel_irqchip=off - the
> > cpus are running truly concurrently,
> 
> FWIW, I didn't see any issue yet while stressing. 

Ok.

> > but we still need to work out who's poking at the TIMA.  
> 
> I understand. The registers are accessed by the current cpu to set the 
> CPPR and to ack an interrupt. But when we route an event, we also access 
> and modify the registers. Do you suggest some locking ? I am not sure
> how are protected the TIMA region accesses vs. the routing, which is 
> necessarily initiated by an ESB MMIO though.

Locking isn't really the issue.  I mean, we do need locking, but the
BQL should provide that.  The issue is what exactly does "current"
mean in the context of multiple concurrently running cpus.  Does it
always mean what we need it to mean in every context we might call
this from.

> > Second, are there any cases where we might
> > need to trip this "on behalf of" a specific cpu that's not the current
> > one.
> 
> ah. yes. sort of :) only in powernv, when the xive is reseted (and when 
> dumping the state for debug).
> 
> The IC has a way to access indirectly the registers of a HW thread. 
> It, first, sets the PC_TCTXT_INDIR_THRDID register with the PIR of 
> the targeted thread and then loads on the indirect TIMA can be done 
> as if it was the current thread. The indirect TIMA is mapped 4 pages 
> after the  IC BAR.
> 
> The resulting memory region op is a little ugly and might need 
> some rework : 
> 
> static uint64_t xive_tm_hv_read(void *opaque, hwaddr offset,
>                                  unsigned size)
> {
>     PowerPCCPU **cpuptr = opaque;
>     PowerPCCPU *cpu = *cpuptr ? *cpuptr : POWERPC_CPU(current_cpu);
>     ...
> 
> 
> >>>> +    XiveNVT *nvt = XIVE_NVT(cpu->intc);
> >>>> +    uint64_t ret = -1;
> >>>> +    int i;
> >>>> +
> >>>> +    if (offset >= TM_SPC_ACK_EBB) {
> >>>> +        return xive_tm_read_special(nvt, offset, size);
> >>>> +    }
> >>>> +
> >>>> +    if (TM_RING(offset) != TM_QW1_OS) {
> >>>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid access to non-OS ring @%"
> >>>> +                      HWADDR_PRIx"\n", offset);
> >>>> +        return ret;
> >>>
> >>> Just return -1 would be clearer here;
> >>
> >> ok.
> >>
> >>>
> >>>> +    }
> >>>> +
> >>>> +    ret = 0;
> >>>> +    for (i = 0; i < size; i++) {
> >>>> +        ret |= (uint64_t) nvt->regs[offset + i] << (8 * (size - i - 1));
> >>>> +    }
> >>>> +
> >>>> +    return ret;
> >>>> +}
> >>>> +
> >>>> +static bool xive_tm_is_readonly(uint8_t offset)
> >>>> +{
> >>>> +    return offset != TM_QW1_OS + TM_CPPR;
> >>>> +}
> >>>> +
> >>>> +static void xive_tm_write_special(XiveNVT *nvt, hwaddr offset,
> >>>> +                                        uint64_t value, unsigned size)
> >>>> +{
> >>>> +    /* TODO: support TM_SPC_SET_OS_PENDING */
> >>>> +
> >>>> +    /* TODO: support TM_SPC_ACK_OS_EL */
> >>>> +}
> >>>> +
> >>>> +static void xive_tm_os_write(void *opaque, hwaddr offset,
> >>>> +                                   uint64_t value, unsigned size)
> >>>> +{
> >>>> +    PowerPCCPU *cpu = POWERPC_CPU(current_cpu);
> >>>> +    XiveNVT *nvt = XIVE_NVT(cpu->intc);
> >>>> +    int i;
> >>>> +
> >>>> +    if (offset >= TM_SPC_ACK_EBB) {
> >>>> +        xive_tm_write_special(nvt, offset, value, size);
> >>>> +        return;
> >>>> +    }
> >>>> +
> >>>> +    if (TM_RING(offset) != TM_QW1_OS) {
> >>>
> >>> Why have this if you have separate OS and user regions as you appear
> >>> to do below?
> >>
> >> This is another problem we are trying to solve. 
> >>
> >> The registers a CPU can access depends on the TIMA view it is using. 
> >> The OS TIMA view only sees the OS ring registers. The HV view sees all. 
> >>
> >>> Or to look at it another way, shouldn't it be possible to make the
> >>> read/write accessors the same for the OS and user rings?
> >>
> >> For some parts yes, but the special load/store addresses are different
> >> for each view, the read-only register also. It seemed easier to duplicate.
> >>
> >> I think the problem will become clearer (or worse) with pnv which uses 
> >> the HV mode.
> > 
> > Oh.  I had the impression that each ring had a basically identical set
> > of registers and you just had access to the region for your ring and
> > the ones below.  Are you saying instead it's basically a single block
> > of registers with various different privilege levels for each of them?
> 
> yes. I think I answered this question more clearly in a previous email.
> 
> > [snip]
> >>>> +}
> >>>> +
> >>>> +static void xive_nvt_unrealize(DeviceState *dev, Error **errp)
> >>>> +{
> >>>> +    qemu_unregister_reset(xive_nvt_reset, dev);
> >>>> +}
> >>>> +
> >>>> +static void xive_nvt_init(Object *obj)
> >>>> +{
> >>>> +    XiveNVT *nvt = XIVE_NVT(obj);
> >>>> +
> >>>> +    nvt->ring_os = &nvt->regs[TM_QW1_OS];
> >>>
> >>> The ring_os field is basically pointless, being just an offset into a
> >>> structure you already have.  A macro or inline would be a better idea.
> >>
> >> ok. I liked the idea but I agree it's overkill to have an init routine
> >> just for this. I will find something.
> > 
> > That too, but it's also something that looks like an optimization but
> > isn't, which is bad practice.  On modern cpus math is cheap (and this
> > is just a trivial offset), memory accesses are expensive.  You're
> > essentially caching this offset - raising all the usual invalidation
> > questions for a cache - when caching it is *more* expensive than just
> > computing it every time.
> 
> ok. removing this offset was a good opportunity to generalize the 
> routing algorithm and use a 'ring' parameter in all routines. Same 
> for the accept path. 
> 
> 
> C.
>
Cédric Le Goater May 4, 2018, 1:11 p.m. UTC | #13
On 05/04/2018 06:51 AM, David Gibson wrote:
> On Thu, May 03, 2018 at 06:06:14PM +0200, Cédric Le Goater wrote:
>> On 05/03/2018 07:35 AM, David Gibson wrote:
>>> On Thu, Apr 26, 2018 at 11:27:21AM +0200, Cédric Le Goater wrote:
>>>> On 04/26/2018 09:11 AM, David Gibson wrote:
>>>>> On Thu, Apr 19, 2018 at 02:43:02PM +0200, Cédric Le Goater wrote:
>>>>>> The XIVE presenter engine uses a set of registers to handle priority
>>>>>> management and interrupt acknowledgment among other things. The most
>>>>>> important ones being :
>>>>>>
>>>>>>   - Interrupt Priority Register (PIPR)
>>>>>>   - Interrupt Pending Buffer (IPB)
>>>>>>   - Current Processor Priority (CPPR)
>>>>>>   - Notification Source Register (NSR)
>>>>>>
>>>>>> There is one set of registers per level of privilege, four in all :
>>>>>> HW, HV pool, OS and User. These are called rings. All registers are
>>>>>> accessible through a specific MMIO region called the Thread Interrupt
>>>>>> Management Areas (TIMA) but, depending on the privilege level of the
>>>>>> CPU, the view of the TIMA is filtered. The sPAPR machine runs at the
>>>>>> OS privilege and therefore can only accesses the OS and the User
>>>>>> rings. The others are for hypervisor levels.
>>>>>>
>>>>>> The CPU interrupt state is modeled with a XiveNVT object which stores
>>>>>> the values of the different registers. The different TIMA views are
>>>>>> mapped at the same address for each CPU and 'current_cpu' is used to
>>>>>> retrieve the XiveNVT holding the ring registers.
>>>>>>
>>>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>>>>> ---
>>>>>>
>>>>>>  Changes since v2 :
>>>>>>
>>>>>>  - introduced the XiveFabric interface
>>>>>>
>>>>>>  hw/intc/spapr_xive.c        |  25 ++++
>>>>>>  hw/intc/xive.c              | 279 ++++++++++++++++++++++++++++++++++++++++++++
>>>>>>  include/hw/ppc/spapr_xive.h |   5 +
>>>>>>  include/hw/ppc/xive.h       |  31 +++++
>>>>>>  include/hw/ppc/xive_regs.h  |  84 +++++++++++++
>>>>>>  5 files changed, 424 insertions(+)
>>>>>>
>>>>>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
>>>>>> index 90cde8a4082d..f07832bf0a00 100644
>>>>>> --- a/hw/intc/spapr_xive.c
>>>>>> +++ b/hw/intc/spapr_xive.c
>>>>>> @@ -13,6 +13,7 @@
>>>>>>  #include "target/ppc/cpu.h"
>>>>>>  #include "sysemu/cpus.h"
>>>>>>  #include "monitor/monitor.h"
>>>>>> +#include "hw/ppc/spapr.h"
>>>>>>  #include "hw/ppc/spapr_xive.h"
>>>>>>  #include "hw/ppc/xive.h"
>>>>>>  #include "hw/ppc/xive_regs.h"
>>>>>> @@ -95,6 +96,22 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
>>>>>>  
>>>>>>      /* Allocate the Interrupt Virtualization Table */
>>>>>>      xive->ivt = g_new0(XiveIVE, xive->nr_irqs);
>>>>>> +
>>>>>> +    /* The Thread Interrupt Management Area has the same address for
>>>>>> +     * each chip. On sPAPR, we only need to expose the User and OS
>>>>>> +     * level views of the TIMA.
>>>>>> +     */
>>>>>> +    xive->tm_base = XIVE_TM_BASE;
>>>>>
>>>>> The constant should probably have PAPR in the name somewhere, since
>>>>> it's just for PAPR machines (same for the ESB mappings, actually).
>>>>
>>>> ok. 
>>>>
>>>> I have also made 'tm_base' a property, like 'vc_base' for ESBs, in 
>>>> case we want to change the value when the guest is instantiated. 
>>>> I doubt it but this is an address in the global address space, so 
>>>> letting the machine have control is better I think.
>>>
>>> I agree.
>>>
>>>>>> +
>>>>>> +    memory_region_init_io(&xive->tm_mmio_user, OBJECT(xive),
>>>>>> +                          &xive_tm_user_ops, xive, "xive.tima.user",
>>>>>> +                          1ull << TM_SHIFT);
>>>>>> +    sysbus_init_mmio(SYS_BUS_DEVICE(dev), &xive->tm_mmio_user);
>>>>>> +
>>>>>> +    memory_region_init_io(&xive->tm_mmio_os, OBJECT(xive),
>>>>>> +                          &xive_tm_os_ops, xive, "xive.tima.os",
>>>>>> +                          1ull << TM_SHIFT);
>>>>>> +    sysbus_init_mmio(SYS_BUS_DEVICE(dev), &xive->tm_mmio_os);
>>>>>>  }
>>>>>>  
>>>>>>  static XiveIVE *spapr_xive_get_ive(XiveFabric *xf, uint32_t lisn)
>>>>>> @@ -104,6 +121,13 @@ static XiveIVE *spapr_xive_get_ive(XiveFabric *xf, uint32_t lisn)
>>>>>>      return lisn < xive->nr_irqs ? &xive->ivt[lisn] : NULL;
>>>>>>  }
>>>>>>  
>>>>>> +static XiveNVT *spapr_xive_get_nvt(XiveFabric *xf, uint32_t server)
>>>>>> +{
>>>>>> +    PowerPCCPU *cpu = spapr_find_cpu(server);
>>>>>> +
>>>>>> +    return cpu ? XIVE_NVT(cpu->intc) : NULL;
>>>>>> +}
>>>>>
>>>>> So this is a bit of a tangent, but I've been thinking of implementing
>>>>> a scheme where there's an opaque pointer in the cpu structure for the
>>>>> use of the machine.  I'm planning for that to replace the intc pointer
>>>>> (which isn't really used directly by the cpu). That would allow us to
>>>>> have spapr put a structure there and have both xics and xive pointers
>>>>> which could be useful later on.
>>>>
>>>> ok. That should simplify the patchset at the end, in which we need to 
>>>> switch the 'intc' pointer. 
>>>>
>>>>> I think we'd need something similar to correctly handle migration of
>>>>> the VPA state, which is currently horribly broken.
>>>>>
>>>>>> +
>>>>>>  static const VMStateDescription vmstate_spapr_xive_ive = {
>>>>>>      .name = TYPE_SPAPR_XIVE "/ive",
>>>>>>      .version_id = 1,
>>>>>> @@ -143,6 +167,7 @@ static void spapr_xive_class_init(ObjectClass *klass, void *data)
>>>>>>      dc->vmsd = &vmstate_spapr_xive;
>>>>>>  
>>>>>>      xfc->get_ive = spapr_xive_get_ive;
>>>>>> +    xfc->get_nvt = spapr_xive_get_nvt;
>>>>>>  }
>>>>>>  
>>>>>>  static const TypeInfo spapr_xive_info = {
>>>>>> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
>>>>>> index dccad0318834..5691bb9474e4 100644
>>>>>> --- a/hw/intc/xive.c
>>>>>> +++ b/hw/intc/xive.c
>>>>>> @@ -14,7 +14,278 @@
>>>>>>  #include "sysemu/cpus.h"
>>>>>>  #include "sysemu/dma.h"
>>>>>>  #include "monitor/monitor.h"
>>>>>> +#include "hw/ppc/xics.h" /* for ICP_PROP_CPU */
>>>>>>  #include "hw/ppc/xive.h"
>>>>>> +#include "hw/ppc/xive_regs.h"
>>>>>> +
>>>>>> +/*
>>>>>> + * XIVE Interrupt Presenter
>>>>>> + */
>>>>>> +
>>>>>> +static uint64_t xive_nvt_accept(XiveNVT *nvt)
>>>>>> +{
>>>>>> +    return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static void xive_nvt_set_cppr(XiveNVT *nvt, uint8_t cppr)
>>>>>> +{
>>>>>> +    if (cppr > XIVE_PRIORITY_MAX) {
>>>>>> +        cppr = 0xff;
>>>>>> +    }
>>>>>> +
>>>>>> +    nvt->ring_os[TM_CPPR] = cppr;
>>>>>
>>>>> Surely this needs to recheck if we should be interrupting the cpu?
>>>>
>>>> yes. In patch 9, when we introduce the nvt notify routine.
>>>
>>> Ok.
>>>
>>>>>> +}
>>>>>> +
>>>>>> +/*
>>>>>> + * OS Thread Interrupt Management Area MMIO
>>>>>> + */
>>>>>> +static uint64_t xive_tm_read_special(XiveNVT *nvt, hwaddr offset,
>>>>>> +                                           unsigned size)
>>>>>> +{
>>>>>> +    uint64_t ret = -1;
>>>>>> +
>>>>>> +    if (offset == TM_SPC_ACK_OS_REG && size == 2) {
>>>>>> +        ret = xive_nvt_accept(nvt);
>>>>>> +    } else {
>>>>>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid TIMA read @%"
>>>>>> +                      HWADDR_PRIx" size %d\n", offset, size);
>>>>>> +    }
>>>>>> +
>>>>>> +    return ret;
>>>>>> +}
>>>>>> +
>>>>>> +#define TM_RING(offset) ((offset) & 0xf0)
>>>>>> +
>>>>>> +static uint64_t xive_tm_os_read(void *opaque, hwaddr offset,
>>>>>> +                                      unsigned size)
>>>>>> +{
>>>>>> +    PowerPCCPU *cpu = POWERPC_CPU(current_cpu);
>>>>>
>>>>> So, as I said on a previous version of this, we can actually correctly
>>>>> represent different mappings in different cpu spaces, by exploiting
>>>>> cpu->as and not just having them all point to &address_space_memory.
>>>>
>>>> Yes, you did and I haven't studied the question yet. For the next version.
>>>
>>> So, it's possible that using the cpu->as thing will be more trouble
>>> that it's worth. 
>>
>> One of the trouble is the number of memory regions to use, one per cpu, 
> 
> Well, we're already going to have an NVT object for each cpu, yes?  So
> a memory region per-cpu doesn't seem like a big stretch.
> 
>> and the KVM support.
> 
> And I really don't see how the memory regions impacts KVM.

The TIMA is setup when the KVM device is initialized using some specific 
ioctl to get an fd on a MMIO region from the host. It is then passed to 
the guest as a 'ram_device', same for the ESBs. 

This is not a common region.
 
>> Having a single region is much easier. 
>>
>>> I am a little concerned about using current_cpu though.  
>>> First, will it work with KVM with kernel_irqchip=off - the
>>> cpus are running truly concurrently,
>>
>> FWIW, I didn't see any issue yet while stressing. 
> 
> Ok.
> 
>>> but we still need to work out who's poking at the TIMA.  
>>
>> I understand. The registers are accessed by the current cpu to set the 
>> CPPR and to ack an interrupt. But when we route an event, we also access 
>> and modify the registers. Do you suggest some locking ? I am not sure
>> how are protected the TIMA region accesses vs. the routing, which is 
>> necessarily initiated by an ESB MMIO though.
> 
> Locking isn't really the issue.  I mean, we do need locking, but the
> BQL should provide that.  The issue is what exactly does "current"
> mean in the context of multiple concurrently running cpus.  Does it
> always mean what we need it to mean in every context we might call
> this from.

I would say so.

C.

>>> Second, are there any cases where we might
>>> need to trip this "on behalf of" a specific cpu that's not the current
>>> one.
>>
>> ah. yes. sort of :) only in powernv, when the xive is reseted (and when 
>> dumping the state for debug).
>>
>> The IC has a way to access indirectly the registers of a HW thread. 
>> It, first, sets the PC_TCTXT_INDIR_THRDID register with the PIR of 
>> the targeted thread and then loads on the indirect TIMA can be done 
>> as if it was the current thread. The indirect TIMA is mapped 4 pages 
>> after the  IC BAR.
>>
>> The resulting memory region op is a little ugly and might need 
>> some rework : 
>>
>> static uint64_t xive_tm_hv_read(void *opaque, hwaddr offset,
>>                                  unsigned size)
>> {
>>     PowerPCCPU **cpuptr = opaque;
>>     PowerPCCPU *cpu = *cpuptr ? *cpuptr : POWERPC_CPU(current_cpu);
>>     ...
>>
>>
>>>>>> +    XiveNVT *nvt = XIVE_NVT(cpu->intc);
>>>>>> +    uint64_t ret = -1;
>>>>>> +    int i;
>>>>>> +
>>>>>> +    if (offset >= TM_SPC_ACK_EBB) {
>>>>>> +        return xive_tm_read_special(nvt, offset, size);
>>>>>> +    }
>>>>>> +
>>>>>> +    if (TM_RING(offset) != TM_QW1_OS) {
>>>>>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid access to non-OS ring @%"
>>>>>> +                      HWADDR_PRIx"\n", offset);
>>>>>> +        return ret;
>>>>>
>>>>> Just return -1 would be clearer here;
>>>>
>>>> ok.
>>>>
>>>>>
>>>>>> +    }
>>>>>> +
>>>>>> +    ret = 0;
>>>>>> +    for (i = 0; i < size; i++) {
>>>>>> +        ret |= (uint64_t) nvt->regs[offset + i] << (8 * (size - i - 1));
>>>>>> +    }
>>>>>> +
>>>>>> +    return ret;
>>>>>> +}
>>>>>> +
>>>>>> +static bool xive_tm_is_readonly(uint8_t offset)
>>>>>> +{
>>>>>> +    return offset != TM_QW1_OS + TM_CPPR;
>>>>>> +}
>>>>>> +
>>>>>> +static void xive_tm_write_special(XiveNVT *nvt, hwaddr offset,
>>>>>> +                                        uint64_t value, unsigned size)
>>>>>> +{
>>>>>> +    /* TODO: support TM_SPC_SET_OS_PENDING */
>>>>>> +
>>>>>> +    /* TODO: support TM_SPC_ACK_OS_EL */
>>>>>> +}
>>>>>> +
>>>>>> +static void xive_tm_os_write(void *opaque, hwaddr offset,
>>>>>> +                                   uint64_t value, unsigned size)
>>>>>> +{
>>>>>> +    PowerPCCPU *cpu = POWERPC_CPU(current_cpu);
>>>>>> +    XiveNVT *nvt = XIVE_NVT(cpu->intc);
>>>>>> +    int i;
>>>>>> +
>>>>>> +    if (offset >= TM_SPC_ACK_EBB) {
>>>>>> +        xive_tm_write_special(nvt, offset, value, size);
>>>>>> +        return;
>>>>>> +    }
>>>>>> +
>>>>>> +    if (TM_RING(offset) != TM_QW1_OS) {
>>>>>
>>>>> Why have this if you have separate OS and user regions as you appear
>>>>> to do below?
>>>>
>>>> This is another problem we are trying to solve. 
>>>>
>>>> The registers a CPU can access depends on the TIMA view it is using. 
>>>> The OS TIMA view only sees the OS ring registers. The HV view sees all. 
>>>>
>>>>> Or to look at it another way, shouldn't it be possible to make the
>>>>> read/write accessors the same for the OS and user rings?
>>>>
>>>> For some parts yes, but the special load/store addresses are different
>>>> for each view, the read-only register also. It seemed easier to duplicate.
>>>>
>>>> I think the problem will become clearer (or worse) with pnv which uses 
>>>> the HV mode.
>>>
>>> Oh.  I had the impression that each ring had a basically identical set
>>> of registers and you just had access to the region for your ring and
>>> the ones below.  Are you saying instead it's basically a single block
>>> of registers with various different privilege levels for each of them?
>>
>> yes. I think I answered this question more clearly in a previous email.
>>
>>> [snip]
>>>>>> +}
>>>>>> +
>>>>>> +static void xive_nvt_unrealize(DeviceState *dev, Error **errp)
>>>>>> +{
>>>>>> +    qemu_unregister_reset(xive_nvt_reset, dev);
>>>>>> +}
>>>>>> +
>>>>>> +static void xive_nvt_init(Object *obj)
>>>>>> +{
>>>>>> +    XiveNVT *nvt = XIVE_NVT(obj);
>>>>>> +
>>>>>> +    nvt->ring_os = &nvt->regs[TM_QW1_OS];
>>>>>
>>>>> The ring_os field is basically pointless, being just an offset into a
>>>>> structure you already have.  A macro or inline would be a better idea.
>>>>
>>>> ok. I liked the idea but I agree it's overkill to have an init routine
>>>> just for this. I will find something.
>>>
>>> That too, but it's also something that looks like an optimization but
>>> isn't, which is bad practice.  On modern cpus math is cheap (and this
>>> is just a trivial offset), memory accesses are expensive.  You're
>>> essentially caching this offset - raising all the usual invalidation
>>> questions for a cache - when caching it is *more* expensive than just
>>> computing it every time.
>>
>> ok. removing this offset was a good opportunity to generalize the 
>> routing algorithm and use a 'ring' parameter in all routines. Same 
>> for the accept path. 
>>
>>
>> C.
>>
>
Cédric Le Goater May 4, 2018, 2:15 p.m. UTC | #14
On 05/04/2018 06:44 AM, David Gibson wrote:
> On Thu, May 03, 2018 at 05:10:48PM +0200, Cédric Le Goater wrote:
>> On 05/03/2018 07:39 AM, David Gibson wrote:
>>> On Thu, Apr 26, 2018 at 07:15:29PM +0200, Cédric Le Goater wrote:
>>>> On 04/26/2018 11:27 AM, Cédric Le Goater wrote:
>>>>> On 04/26/2018 09:11 AM, David Gibson wrote:
>>>>>> On Thu, Apr 19, 2018 at 02:43:02PM +0200, Cédric Le Goater wrote:
>>> [snip]
>>>>>>> +static void xive_tm_os_write(void *opaque, hwaddr offset,
>>>>>>> +                                   uint64_t value, unsigned size)
>>>>>>> +{
>>>>>>> +    PowerPCCPU *cpu = POWERPC_CPU(current_cpu);
>>>>>>> +    XiveNVT *nvt = XIVE_NVT(cpu->intc);
>>>>>>> +    int i;
>>>>>>> +
>>>>>>> +    if (offset >= TM_SPC_ACK_EBB) {
>>>>>>> +        xive_tm_write_special(nvt, offset, value, size);
>>>>>>> +        return;
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    if (TM_RING(offset) != TM_QW1_OS) {
>>>>>>
>>>>>> Why have this if you have separate OS and user regions as you appear
>>>>>> to do below?
>>>>>
>>>>> This is another problem we are trying to solve. 
>>>>>
>>>>> The registers a CPU can access depends on the TIMA view it is using. 
>>>>> The OS TIMA view only sees the OS ring registers. The HV view sees all. 
>>>>
>>>> So, I gave a deeper look at the specs and I understood a little more 
>>>> details of the concepts behind. You need to do frequent round-trips 
>>>> to this document ...  
>>>>
>>>> These registers are accessible through four aligned pages, each exposing 
>>>> a different view of the registers. First page (page address ending 
>>>> in 0b00) gives access to the entire context and is reserved for the 
>>>> ring 0 security monitor. The second (page address ending in 0b01) 
>>>> is for the hypervisor, ring 1. The third (page address ending in 0b10) 
>>>> is for the operating system, ring 2. The fourth (page address ending 
>>>> in 0b11) is for user level, ring 3.
>>>>
>>>> The sPAPR machine runs at the OS privilege and therefore can only 
>>>> accesses the OS and the User rings, 2 and 3. The others are for
>>>> hypervisor levels.
>>>
>>> Ok, that much is what I thought.  What I'm less clear on is what each
>>> page looks like compared to the others.  Previously I thought each one
>>> had the same registers, 
>>
>> yes.
>>
>>> just manipulating the corresponding ring.  
>>
>> no. 
>>
>>> Are you saying instead that each ring's page basically has a subset 
>>> of the registers in the next most privileged page?
>>
>> That's the idea. 
> 
> Ah, ok.
> 
>> The registers are defined as follow :
>>
>> 	QW-0 User      
>> 	QW-1 O/S      
>> 	QW-2 Pool   
>> 	QW-3 Physical 
>>
>> and the pages :
>>
>> - 0006030203180000 security monitor 
>>   can access all registers 
>>
>> - 0006030203190000 hv
>>   can access all registers minus the secure regs
>>
>> - 00060302031a0000 os
>>   can access some of the OS (QW1) and User (QW0) registers
>>  
>> - 00060302031b0000 user
>>   can access NSR reg of User (QW0) registers
> 
> I can see two reasonable ways of doing this:
> 
> A)
> 
> Have a single set of read/write functions.  These implement all the
> registers but take a "privilege level" parameter which controls which
> will actually work.  Those could then be wired up in one of two ways:
> 
>   A1) Single memory region.  The accessor derives the priv level from
>   the relevant address bits, before masking it down to a single
>   register page.  Then, as above

Yes. That's the goal behind the page ordering :

page address ending in 0b00 : ring 0, security monitor 
page address ending in 0b01 : ring 1, hypervisor 
page address ending in 0b10 : ring 2, operating system  
page address ending in 0b11 : ring 3, user level

I don't why the registers are ordered the other way around though.

That's would be the direction to take for the emulated mode, I think.
It covers well the PowerNV (4 pages) and the sPAPR case (2 pages), 
in each case, the machine IC controller decides how much pages to map.
The memory region ops do the rest.

For KVM, we need to populate the VMA with the host TIMA page associated 
with ring 2 (OS) and then ring 3 (USER). 

This option looks better overall. I will see how ugly it gets with the implementation.

C.


>   A2) Multiple memory regions with the same accessor functions but
>   different opaque pointer.  The accessor gets the priv level from
>   its opaque pointer, then the address is just within a single ring's
>   page.
>
> B)
> 
> Separate memory regions with separate accessors.  The ring-0 accessor
> implements the ring-0 registers, then calls the ring-1 accessor
> function for everything else.  ring-1 calls ring-2 and so forth.
>
>> On sPAPR, we can remap the os/user pages to some other base address 
>> but we should keep the same page offset.
> 
> Sure.
> 
>>
>>
>>>> I will try to come with a better implementation of the model and
>>>> make sure the ring numbers are respected. I am not sure we should 
>>>> have only one memory region or four distinct ones with their
>>>> own ops. There are some differences in the load/store of each view.
>>>
>>> Right.  I'm not clear at this point if that's for good reasons, or
>>> just because IBM's hardware designers don't seem to have gotten the
>>> hang of Don't Repeat Yourself.
>>>
>>
>
David Gibson May 5, 2018, 4:27 a.m. UTC | #15
On Fri, May 04, 2018 at 03:11:57PM +0200, Cédric Le Goater wrote:
> On 05/04/2018 06:51 AM, David Gibson wrote:
> > On Thu, May 03, 2018 at 06:06:14PM +0200, Cédric Le Goater wrote:
> >> On 05/03/2018 07:35 AM, David Gibson wrote:
> >>> On Thu, Apr 26, 2018 at 11:27:21AM +0200, Cédric Le Goater wrote:
> >>>> On 04/26/2018 09:11 AM, David Gibson wrote:
> >>>>> On Thu, Apr 19, 2018 at 02:43:02PM +0200, Cédric Le Goater wrote:
> >>>>>> The XIVE presenter engine uses a set of registers to handle priority
> >>>>>> management and interrupt acknowledgment among other things. The most
> >>>>>> important ones being :
> >>>>>>
> >>>>>>   - Interrupt Priority Register (PIPR)
> >>>>>>   - Interrupt Pending Buffer (IPB)
> >>>>>>   - Current Processor Priority (CPPR)
> >>>>>>   - Notification Source Register (NSR)
> >>>>>>
> >>>>>> There is one set of registers per level of privilege, four in all :
> >>>>>> HW, HV pool, OS and User. These are called rings. All registers are
> >>>>>> accessible through a specific MMIO region called the Thread Interrupt
> >>>>>> Management Areas (TIMA) but, depending on the privilege level of the
> >>>>>> CPU, the view of the TIMA is filtered. The sPAPR machine runs at the
> >>>>>> OS privilege and therefore can only accesses the OS and the User
> >>>>>> rings. The others are for hypervisor levels.
> >>>>>>
> >>>>>> The CPU interrupt state is modeled with a XiveNVT object which stores
> >>>>>> the values of the different registers. The different TIMA views are
> >>>>>> mapped at the same address for each CPU and 'current_cpu' is used to
> >>>>>> retrieve the XiveNVT holding the ring registers.
> >>>>>>
> >>>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >>>>>> ---
> >>>>>>
> >>>>>>  Changes since v2 :
> >>>>>>
> >>>>>>  - introduced the XiveFabric interface
> >>>>>>
> >>>>>>  hw/intc/spapr_xive.c        |  25 ++++
> >>>>>>  hw/intc/xive.c              | 279 ++++++++++++++++++++++++++++++++++++++++++++
> >>>>>>  include/hw/ppc/spapr_xive.h |   5 +
> >>>>>>  include/hw/ppc/xive.h       |  31 +++++
> >>>>>>  include/hw/ppc/xive_regs.h  |  84 +++++++++++++
> >>>>>>  5 files changed, 424 insertions(+)
> >>>>>>
> >>>>>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> >>>>>> index 90cde8a4082d..f07832bf0a00 100644
> >>>>>> --- a/hw/intc/spapr_xive.c
> >>>>>> +++ b/hw/intc/spapr_xive.c
> >>>>>> @@ -13,6 +13,7 @@
> >>>>>>  #include "target/ppc/cpu.h"
> >>>>>>  #include "sysemu/cpus.h"
> >>>>>>  #include "monitor/monitor.h"
> >>>>>> +#include "hw/ppc/spapr.h"
> >>>>>>  #include "hw/ppc/spapr_xive.h"
> >>>>>>  #include "hw/ppc/xive.h"
> >>>>>>  #include "hw/ppc/xive_regs.h"
> >>>>>> @@ -95,6 +96,22 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
> >>>>>>  
> >>>>>>      /* Allocate the Interrupt Virtualization Table */
> >>>>>>      xive->ivt = g_new0(XiveIVE, xive->nr_irqs);
> >>>>>> +
> >>>>>> +    /* The Thread Interrupt Management Area has the same address for
> >>>>>> +     * each chip. On sPAPR, we only need to expose the User and OS
> >>>>>> +     * level views of the TIMA.
> >>>>>> +     */
> >>>>>> +    xive->tm_base = XIVE_TM_BASE;
> >>>>>
> >>>>> The constant should probably have PAPR in the name somewhere, since
> >>>>> it's just for PAPR machines (same for the ESB mappings, actually).
> >>>>
> >>>> ok. 
> >>>>
> >>>> I have also made 'tm_base' a property, like 'vc_base' for ESBs, in 
> >>>> case we want to change the value when the guest is instantiated. 
> >>>> I doubt it but this is an address in the global address space, so 
> >>>> letting the machine have control is better I think.
> >>>
> >>> I agree.
> >>>
> >>>>>> +
> >>>>>> +    memory_region_init_io(&xive->tm_mmio_user, OBJECT(xive),
> >>>>>> +                          &xive_tm_user_ops, xive, "xive.tima.user",
> >>>>>> +                          1ull << TM_SHIFT);
> >>>>>> +    sysbus_init_mmio(SYS_BUS_DEVICE(dev), &xive->tm_mmio_user);
> >>>>>> +
> >>>>>> +    memory_region_init_io(&xive->tm_mmio_os, OBJECT(xive),
> >>>>>> +                          &xive_tm_os_ops, xive, "xive.tima.os",
> >>>>>> +                          1ull << TM_SHIFT);
> >>>>>> +    sysbus_init_mmio(SYS_BUS_DEVICE(dev), &xive->tm_mmio_os);
> >>>>>>  }
> >>>>>>  
> >>>>>>  static XiveIVE *spapr_xive_get_ive(XiveFabric *xf, uint32_t lisn)
> >>>>>> @@ -104,6 +121,13 @@ static XiveIVE *spapr_xive_get_ive(XiveFabric *xf, uint32_t lisn)
> >>>>>>      return lisn < xive->nr_irqs ? &xive->ivt[lisn] : NULL;
> >>>>>>  }
> >>>>>>  
> >>>>>> +static XiveNVT *spapr_xive_get_nvt(XiveFabric *xf, uint32_t server)
> >>>>>> +{
> >>>>>> +    PowerPCCPU *cpu = spapr_find_cpu(server);
> >>>>>> +
> >>>>>> +    return cpu ? XIVE_NVT(cpu->intc) : NULL;
> >>>>>> +}
> >>>>>
> >>>>> So this is a bit of a tangent, but I've been thinking of implementing
> >>>>> a scheme where there's an opaque pointer in the cpu structure for the
> >>>>> use of the machine.  I'm planning for that to replace the intc pointer
> >>>>> (which isn't really used directly by the cpu). That would allow us to
> >>>>> have spapr put a structure there and have both xics and xive pointers
> >>>>> which could be useful later on.
> >>>>
> >>>> ok. That should simplify the patchset at the end, in which we need to 
> >>>> switch the 'intc' pointer. 
> >>>>
> >>>>> I think we'd need something similar to correctly handle migration of
> >>>>> the VPA state, which is currently horribly broken.
> >>>>>
> >>>>>> +
> >>>>>>  static const VMStateDescription vmstate_spapr_xive_ive = {
> >>>>>>      .name = TYPE_SPAPR_XIVE "/ive",
> >>>>>>      .version_id = 1,
> >>>>>> @@ -143,6 +167,7 @@ static void spapr_xive_class_init(ObjectClass *klass, void *data)
> >>>>>>      dc->vmsd = &vmstate_spapr_xive;
> >>>>>>  
> >>>>>>      xfc->get_ive = spapr_xive_get_ive;
> >>>>>> +    xfc->get_nvt = spapr_xive_get_nvt;
> >>>>>>  }
> >>>>>>  
> >>>>>>  static const TypeInfo spapr_xive_info = {
> >>>>>> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> >>>>>> index dccad0318834..5691bb9474e4 100644
> >>>>>> --- a/hw/intc/xive.c
> >>>>>> +++ b/hw/intc/xive.c
> >>>>>> @@ -14,7 +14,278 @@
> >>>>>>  #include "sysemu/cpus.h"
> >>>>>>  #include "sysemu/dma.h"
> >>>>>>  #include "monitor/monitor.h"
> >>>>>> +#include "hw/ppc/xics.h" /* for ICP_PROP_CPU */
> >>>>>>  #include "hw/ppc/xive.h"
> >>>>>> +#include "hw/ppc/xive_regs.h"
> >>>>>> +
> >>>>>> +/*
> >>>>>> + * XIVE Interrupt Presenter
> >>>>>> + */
> >>>>>> +
> >>>>>> +static uint64_t xive_nvt_accept(XiveNVT *nvt)
> >>>>>> +{
> >>>>>> +    return 0;
> >>>>>> +}
> >>>>>> +
> >>>>>> +static void xive_nvt_set_cppr(XiveNVT *nvt, uint8_t cppr)
> >>>>>> +{
> >>>>>> +    if (cppr > XIVE_PRIORITY_MAX) {
> >>>>>> +        cppr = 0xff;
> >>>>>> +    }
> >>>>>> +
> >>>>>> +    nvt->ring_os[TM_CPPR] = cppr;
> >>>>>
> >>>>> Surely this needs to recheck if we should be interrupting the cpu?
> >>>>
> >>>> yes. In patch 9, when we introduce the nvt notify routine.
> >>>
> >>> Ok.
> >>>
> >>>>>> +}
> >>>>>> +
> >>>>>> +/*
> >>>>>> + * OS Thread Interrupt Management Area MMIO
> >>>>>> + */
> >>>>>> +static uint64_t xive_tm_read_special(XiveNVT *nvt, hwaddr offset,
> >>>>>> +                                           unsigned size)
> >>>>>> +{
> >>>>>> +    uint64_t ret = -1;
> >>>>>> +
> >>>>>> +    if (offset == TM_SPC_ACK_OS_REG && size == 2) {
> >>>>>> +        ret = xive_nvt_accept(nvt);
> >>>>>> +    } else {
> >>>>>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid TIMA read @%"
> >>>>>> +                      HWADDR_PRIx" size %d\n", offset, size);
> >>>>>> +    }
> >>>>>> +
> >>>>>> +    return ret;
> >>>>>> +}
> >>>>>> +
> >>>>>> +#define TM_RING(offset) ((offset) & 0xf0)
> >>>>>> +
> >>>>>> +static uint64_t xive_tm_os_read(void *opaque, hwaddr offset,
> >>>>>> +                                      unsigned size)
> >>>>>> +{
> >>>>>> +    PowerPCCPU *cpu = POWERPC_CPU(current_cpu);
> >>>>>
> >>>>> So, as I said on a previous version of this, we can actually correctly
> >>>>> represent different mappings in different cpu spaces, by exploiting
> >>>>> cpu->as and not just having them all point to &address_space_memory.
> >>>>
> >>>> Yes, you did and I haven't studied the question yet. For the next version.
> >>>
> >>> So, it's possible that using the cpu->as thing will be more trouble
> >>> that it's worth. 
> >>
> >> One of the trouble is the number of memory regions to use, one per cpu, 
> > 
> > Well, we're already going to have an NVT object for each cpu, yes?  So
> > a memory region per-cpu doesn't seem like a big stretch.
> > 
> >> and the KVM support.
> > 
> > And I really don't see how the memory regions impacts KVM.
> 
> The TIMA is setup when the KVM device is initialized using some specific 
> ioctl to get an fd on a MMIO region from the host. It is then passed to 
> the guest as a 'ram_device', same for the ESBs. 

Ah, good point.

> This is not a common region.

I'm not sure what you mean by that.

> >> Having a single region is much easier. 
> >>
> >>> I am a little concerned about using current_cpu though.  
> >>> First, will it work with KVM with kernel_irqchip=off - the
> >>> cpus are running truly concurrently,
> >>
> >> FWIW, I didn't see any issue yet while stressing. 
> > 
> > Ok.
> > 
> >>> but we still need to work out who's poking at the TIMA.  
> >>
> >> I understand. The registers are accessed by the current cpu to set the 
> >> CPPR and to ack an interrupt. But when we route an event, we also access 
> >> and modify the registers. Do you suggest some locking ? I am not sure
> >> how are protected the TIMA region accesses vs. the routing, which is 
> >> necessarily initiated by an ESB MMIO though.
> > 
> > Locking isn't really the issue.  I mean, we do need locking, but the
> > BQL should provide that.  The issue is what exactly does "current"
> > mean in the context of multiple concurrently running cpus.  Does it
> > always mean what we need it to mean in every context we might call
> > this from.
> 
> I would say so.

Ok.
Cédric Le Goater May 9, 2018, 7:27 a.m. UTC | #16
On 05/05/2018 06:27 AM, David Gibson wrote:
> On Fri, May 04, 2018 at 03:11:57PM +0200, Cédric Le Goater wrote:
>> On 05/04/2018 06:51 AM, David Gibson wrote:
>>> On Thu, May 03, 2018 at 06:06:14PM +0200, Cédric Le Goater wrote:
>>>> On 05/03/2018 07:35 AM, David Gibson wrote:
>>>>> On Thu, Apr 26, 2018 at 11:27:21AM +0200, Cédric Le Goater wrote:
>>>>>> On 04/26/2018 09:11 AM, David Gibson wrote:
>>>>>>> On Thu, Apr 19, 2018 at 02:43:02PM +0200, Cédric Le Goater wrote:
>>>>>>>> The XIVE presenter engine uses a set of registers to handle priority
>>>>>>>> management and interrupt acknowledgment among other things. The most
>>>>>>>> important ones being :
>>>>>>>>
>>>>>>>>   - Interrupt Priority Register (PIPR)
>>>>>>>>   - Interrupt Pending Buffer (IPB)
>>>>>>>>   - Current Processor Priority (CPPR)
>>>>>>>>   - Notification Source Register (NSR)
>>>>>>>>
>>>>>>>> There is one set of registers per level of privilege, four in all :
>>>>>>>> HW, HV pool, OS and User. These are called rings. All registers are
>>>>>>>> accessible through a specific MMIO region called the Thread Interrupt
>>>>>>>> Management Areas (TIMA) but, depending on the privilege level of the
>>>>>>>> CPU, the view of the TIMA is filtered. The sPAPR machine runs at the
>>>>>>>> OS privilege and therefore can only accesses the OS and the User
>>>>>>>> rings. The others are for hypervisor levels.
>>>>>>>>
>>>>>>>> The CPU interrupt state is modeled with a XiveNVT object which stores
>>>>>>>> the values of the different registers. The different TIMA views are
>>>>>>>> mapped at the same address for each CPU and 'current_cpu' is used to
>>>>>>>> retrieve the XiveNVT holding the ring registers.
>>>>>>>>
>>>>>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>>>>>>> ---
>>>>>>>>
>>>>>>>>  Changes since v2 :
>>>>>>>>
>>>>>>>>  - introduced the XiveFabric interface
>>>>>>>>
>>>>>>>>  hw/intc/spapr_xive.c        |  25 ++++
>>>>>>>>  hw/intc/xive.c              | 279 ++++++++++++++++++++++++++++++++++++++++++++
>>>>>>>>  include/hw/ppc/spapr_xive.h |   5 +
>>>>>>>>  include/hw/ppc/xive.h       |  31 +++++
>>>>>>>>  include/hw/ppc/xive_regs.h  |  84 +++++++++++++
>>>>>>>>  5 files changed, 424 insertions(+)
>>>>>>>>
>>>>>>>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
>>>>>>>> index 90cde8a4082d..f07832bf0a00 100644
>>>>>>>> --- a/hw/intc/spapr_xive.c
>>>>>>>> +++ b/hw/intc/spapr_xive.c
>>>>>>>> @@ -13,6 +13,7 @@
>>>>>>>>  #include "target/ppc/cpu.h"
>>>>>>>>  #include "sysemu/cpus.h"
>>>>>>>>  #include "monitor/monitor.h"
>>>>>>>> +#include "hw/ppc/spapr.h"
>>>>>>>>  #include "hw/ppc/spapr_xive.h"
>>>>>>>>  #include "hw/ppc/xive.h"
>>>>>>>>  #include "hw/ppc/xive_regs.h"
>>>>>>>> @@ -95,6 +96,22 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
>>>>>>>>  
>>>>>>>>      /* Allocate the Interrupt Virtualization Table */
>>>>>>>>      xive->ivt = g_new0(XiveIVE, xive->nr_irqs);
>>>>>>>> +
>>>>>>>> +    /* The Thread Interrupt Management Area has the same address for
>>>>>>>> +     * each chip. On sPAPR, we only need to expose the User and OS
>>>>>>>> +     * level views of the TIMA.
>>>>>>>> +     */
>>>>>>>> +    xive->tm_base = XIVE_TM_BASE;
>>>>>>>
>>>>>>> The constant should probably have PAPR in the name somewhere, since
>>>>>>> it's just for PAPR machines (same for the ESB mappings, actually).
>>>>>>
>>>>>> ok. 
>>>>>>
>>>>>> I have also made 'tm_base' a property, like 'vc_base' for ESBs, in 
>>>>>> case we want to change the value when the guest is instantiated. 
>>>>>> I doubt it but this is an address in the global address space, so 
>>>>>> letting the machine have control is better I think.
>>>>>
>>>>> I agree.
>>>>>
>>>>>>>> +
>>>>>>>> +    memory_region_init_io(&xive->tm_mmio_user, OBJECT(xive),
>>>>>>>> +                          &xive_tm_user_ops, xive, "xive.tima.user",
>>>>>>>> +                          1ull << TM_SHIFT);
>>>>>>>> +    sysbus_init_mmio(SYS_BUS_DEVICE(dev), &xive->tm_mmio_user);
>>>>>>>> +
>>>>>>>> +    memory_region_init_io(&xive->tm_mmio_os, OBJECT(xive),
>>>>>>>> +                          &xive_tm_os_ops, xive, "xive.tima.os",
>>>>>>>> +                          1ull << TM_SHIFT);
>>>>>>>> +    sysbus_init_mmio(SYS_BUS_DEVICE(dev), &xive->tm_mmio_os);
>>>>>>>>  }
>>>>>>>>  
>>>>>>>>  static XiveIVE *spapr_xive_get_ive(XiveFabric *xf, uint32_t lisn)
>>>>>>>> @@ -104,6 +121,13 @@ static XiveIVE *spapr_xive_get_ive(XiveFabric *xf, uint32_t lisn)
>>>>>>>>      return lisn < xive->nr_irqs ? &xive->ivt[lisn] : NULL;
>>>>>>>>  }
>>>>>>>>  
>>>>>>>> +static XiveNVT *spapr_xive_get_nvt(XiveFabric *xf, uint32_t server)
>>>>>>>> +{
>>>>>>>> +    PowerPCCPU *cpu = spapr_find_cpu(server);
>>>>>>>> +
>>>>>>>> +    return cpu ? XIVE_NVT(cpu->intc) : NULL;
>>>>>>>> +}
>>>>>>>
>>>>>>> So this is a bit of a tangent, but I've been thinking of implementing
>>>>>>> a scheme where there's an opaque pointer in the cpu structure for the
>>>>>>> use of the machine.  I'm planning for that to replace the intc pointer
>>>>>>> (which isn't really used directly by the cpu). That would allow us to
>>>>>>> have spapr put a structure there and have both xics and xive pointers
>>>>>>> which could be useful later on.
>>>>>>
>>>>>> ok. That should simplify the patchset at the end, in which we need to 
>>>>>> switch the 'intc' pointer. 
>>>>>>
>>>>>>> I think we'd need something similar to correctly handle migration of
>>>>>>> the VPA state, which is currently horribly broken.
>>>>>>>
>>>>>>>> +
>>>>>>>>  static const VMStateDescription vmstate_spapr_xive_ive = {
>>>>>>>>      .name = TYPE_SPAPR_XIVE "/ive",
>>>>>>>>      .version_id = 1,
>>>>>>>> @@ -143,6 +167,7 @@ static void spapr_xive_class_init(ObjectClass *klass, void *data)
>>>>>>>>      dc->vmsd = &vmstate_spapr_xive;
>>>>>>>>  
>>>>>>>>      xfc->get_ive = spapr_xive_get_ive;
>>>>>>>> +    xfc->get_nvt = spapr_xive_get_nvt;
>>>>>>>>  }
>>>>>>>>  
>>>>>>>>  static const TypeInfo spapr_xive_info = {
>>>>>>>> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
>>>>>>>> index dccad0318834..5691bb9474e4 100644
>>>>>>>> --- a/hw/intc/xive.c
>>>>>>>> +++ b/hw/intc/xive.c
>>>>>>>> @@ -14,7 +14,278 @@
>>>>>>>>  #include "sysemu/cpus.h"
>>>>>>>>  #include "sysemu/dma.h"
>>>>>>>>  #include "monitor/monitor.h"
>>>>>>>> +#include "hw/ppc/xics.h" /* for ICP_PROP_CPU */
>>>>>>>>  #include "hw/ppc/xive.h"
>>>>>>>> +#include "hw/ppc/xive_regs.h"
>>>>>>>> +
>>>>>>>> +/*
>>>>>>>> + * XIVE Interrupt Presenter
>>>>>>>> + */
>>>>>>>> +
>>>>>>>> +static uint64_t xive_nvt_accept(XiveNVT *nvt)
>>>>>>>> +{
>>>>>>>> +    return 0;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static void xive_nvt_set_cppr(XiveNVT *nvt, uint8_t cppr)
>>>>>>>> +{
>>>>>>>> +    if (cppr > XIVE_PRIORITY_MAX) {
>>>>>>>> +        cppr = 0xff;
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    nvt->ring_os[TM_CPPR] = cppr;
>>>>>>>
>>>>>>> Surely this needs to recheck if we should be interrupting the cpu?
>>>>>>
>>>>>> yes. In patch 9, when we introduce the nvt notify routine.
>>>>>
>>>>> Ok.
>>>>>
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +/*
>>>>>>>> + * OS Thread Interrupt Management Area MMIO
>>>>>>>> + */
>>>>>>>> +static uint64_t xive_tm_read_special(XiveNVT *nvt, hwaddr offset,
>>>>>>>> +                                           unsigned size)
>>>>>>>> +{
>>>>>>>> +    uint64_t ret = -1;
>>>>>>>> +
>>>>>>>> +    if (offset == TM_SPC_ACK_OS_REG && size == 2) {
>>>>>>>> +        ret = xive_nvt_accept(nvt);
>>>>>>>> +    } else {
>>>>>>>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid TIMA read @%"
>>>>>>>> +                      HWADDR_PRIx" size %d\n", offset, size);
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    return ret;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +#define TM_RING(offset) ((offset) & 0xf0)
>>>>>>>> +
>>>>>>>> +static uint64_t xive_tm_os_read(void *opaque, hwaddr offset,
>>>>>>>> +                                      unsigned size)
>>>>>>>> +{
>>>>>>>> +    PowerPCCPU *cpu = POWERPC_CPU(current_cpu);
>>>>>>>
>>>>>>> So, as I said on a previous version of this, we can actually correctly
>>>>>>> represent different mappings in different cpu spaces, by exploiting
>>>>>>> cpu->as and not just having them all point to &address_space_memory.
>>>>>>
>>>>>> Yes, you did and I haven't studied the question yet. For the next version.
>>>>>
>>>>> So, it's possible that using the cpu->as thing will be more trouble
>>>>> that it's worth. 
>>>>
>>>> One of the trouble is the number of memory regions to use, one per cpu, 
>>>
>>> Well, we're already going to have an NVT object for each cpu, yes?  So
>>> a memory region per-cpu doesn't seem like a big stretch.
>>>
>>>> and the KVM support.
>>>
>>> And I really don't see how the memory regions impacts KVM.
>>
>> The TIMA is setup when the KVM device is initialized using some specific 
>> ioctl to get an fd on a MMIO region from the host. It is then passed to 
>> the guest as a 'ram_device', same for the ESBs. 
> 
> Ah, good point.
> 
>> This is not a common region.
> 
> I'm not sure what you mean by that.

I meant by that 'out of the ordinary', 'unusual'. Specially when under KVM.

C.

 
>>>> Having a single region is much easier. 
>>>>
>>>>> I am a little concerned about using current_cpu though.  
>>>>> First, will it work with KVM with kernel_irqchip=off - the
>>>>> cpus are running truly concurrently,
>>>>
>>>> FWIW, I didn't see any issue yet while stressing. 
>>>
>>> Ok.
>>>
>>>>> but we still need to work out who's poking at the TIMA.  
>>>>
>>>> I understand. The registers are accessed by the current cpu to set the 
>>>> CPPR and to ack an interrupt. But when we route an event, we also access 
>>>> and modify the registers. Do you suggest some locking ? I am not sure
>>>> how are protected the TIMA region accesses vs. the routing, which is 
>>>> necessarily initiated by an ESB MMIO though.
>>>
>>> Locking isn't really the issue.  I mean, we do need locking, but the
>>> BQL should provide that.  The issue is what exactly does "current"
>>> mean in the context of multiple concurrently running cpus.  Does it
>>> always mean what we need it to mean in every context we might call
>>> this from.
>>
>> I would say so.
> 
> Ok.
>
diff mbox series

Patch

diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
index 90cde8a4082d..f07832bf0a00 100644
--- a/hw/intc/spapr_xive.c
+++ b/hw/intc/spapr_xive.c
@@ -13,6 +13,7 @@ 
 #include "target/ppc/cpu.h"
 #include "sysemu/cpus.h"
 #include "monitor/monitor.h"
+#include "hw/ppc/spapr.h"
 #include "hw/ppc/spapr_xive.h"
 #include "hw/ppc/xive.h"
 #include "hw/ppc/xive_regs.h"
@@ -95,6 +96,22 @@  static void spapr_xive_realize(DeviceState *dev, Error **errp)
 
     /* Allocate the Interrupt Virtualization Table */
     xive->ivt = g_new0(XiveIVE, xive->nr_irqs);
+
+    /* The Thread Interrupt Management Area has the same address for
+     * each chip. On sPAPR, we only need to expose the User and OS
+     * level views of the TIMA.
+     */
+    xive->tm_base = XIVE_TM_BASE;
+
+    memory_region_init_io(&xive->tm_mmio_user, OBJECT(xive),
+                          &xive_tm_user_ops, xive, "xive.tima.user",
+                          1ull << TM_SHIFT);
+    sysbus_init_mmio(SYS_BUS_DEVICE(dev), &xive->tm_mmio_user);
+
+    memory_region_init_io(&xive->tm_mmio_os, OBJECT(xive),
+                          &xive_tm_os_ops, xive, "xive.tima.os",
+                          1ull << TM_SHIFT);
+    sysbus_init_mmio(SYS_BUS_DEVICE(dev), &xive->tm_mmio_os);
 }
 
 static XiveIVE *spapr_xive_get_ive(XiveFabric *xf, uint32_t lisn)
@@ -104,6 +121,13 @@  static XiveIVE *spapr_xive_get_ive(XiveFabric *xf, uint32_t lisn)
     return lisn < xive->nr_irqs ? &xive->ivt[lisn] : NULL;
 }
 
+static XiveNVT *spapr_xive_get_nvt(XiveFabric *xf, uint32_t server)
+{
+    PowerPCCPU *cpu = spapr_find_cpu(server);
+
+    return cpu ? XIVE_NVT(cpu->intc) : NULL;
+}
+
 static const VMStateDescription vmstate_spapr_xive_ive = {
     .name = TYPE_SPAPR_XIVE "/ive",
     .version_id = 1,
@@ -143,6 +167,7 @@  static void spapr_xive_class_init(ObjectClass *klass, void *data)
     dc->vmsd = &vmstate_spapr_xive;
 
     xfc->get_ive = spapr_xive_get_ive;
+    xfc->get_nvt = spapr_xive_get_nvt;
 }
 
 static const TypeInfo spapr_xive_info = {
diff --git a/hw/intc/xive.c b/hw/intc/xive.c
index dccad0318834..5691bb9474e4 100644
--- a/hw/intc/xive.c
+++ b/hw/intc/xive.c
@@ -14,7 +14,278 @@ 
 #include "sysemu/cpus.h"
 #include "sysemu/dma.h"
 #include "monitor/monitor.h"
+#include "hw/ppc/xics.h" /* for ICP_PROP_CPU */
 #include "hw/ppc/xive.h"
+#include "hw/ppc/xive_regs.h"
+
+/*
+ * XIVE Interrupt Presenter
+ */
+
+static uint64_t xive_nvt_accept(XiveNVT *nvt)
+{
+    return 0;
+}
+
+static void xive_nvt_set_cppr(XiveNVT *nvt, uint8_t cppr)
+{
+    if (cppr > XIVE_PRIORITY_MAX) {
+        cppr = 0xff;
+    }
+
+    nvt->ring_os[TM_CPPR] = cppr;
+}
+
+/*
+ * OS Thread Interrupt Management Area MMIO
+ */
+static uint64_t xive_tm_read_special(XiveNVT *nvt, hwaddr offset,
+                                           unsigned size)
+{
+    uint64_t ret = -1;
+
+    if (offset == TM_SPC_ACK_OS_REG && size == 2) {
+        ret = xive_nvt_accept(nvt);
+    } else {
+        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid TIMA read @%"
+                      HWADDR_PRIx" size %d\n", offset, size);
+    }
+
+    return ret;
+}
+
+#define TM_RING(offset) ((offset) & 0xf0)
+
+static uint64_t xive_tm_os_read(void *opaque, hwaddr offset,
+                                      unsigned size)
+{
+    PowerPCCPU *cpu = POWERPC_CPU(current_cpu);
+    XiveNVT *nvt = XIVE_NVT(cpu->intc);
+    uint64_t ret = -1;
+    int i;
+
+    if (offset >= TM_SPC_ACK_EBB) {
+        return xive_tm_read_special(nvt, offset, size);
+    }
+
+    if (TM_RING(offset) != TM_QW1_OS) {
+        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid access to non-OS ring @%"
+                      HWADDR_PRIx"\n", offset);
+        return ret;
+    }
+
+    ret = 0;
+    for (i = 0; i < size; i++) {
+        ret |= (uint64_t) nvt->regs[offset + i] << (8 * (size - i - 1));
+    }
+
+    return ret;
+}
+
+static bool xive_tm_is_readonly(uint8_t offset)
+{
+    return offset != TM_QW1_OS + TM_CPPR;
+}
+
+static void xive_tm_write_special(XiveNVT *nvt, hwaddr offset,
+                                        uint64_t value, unsigned size)
+{
+    /* TODO: support TM_SPC_SET_OS_PENDING */
+
+    /* TODO: support TM_SPC_ACK_OS_EL */
+}
+
+static void xive_tm_os_write(void *opaque, hwaddr offset,
+                                   uint64_t value, unsigned size)
+{
+    PowerPCCPU *cpu = POWERPC_CPU(current_cpu);
+    XiveNVT *nvt = XIVE_NVT(cpu->intc);
+    int i;
+
+    if (offset >= TM_SPC_ACK_EBB) {
+        xive_tm_write_special(nvt, offset, value, size);
+        return;
+    }
+
+    if (TM_RING(offset) != TM_QW1_OS) {
+        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid access to non-OS ring @%"
+                      HWADDR_PRIx"\n", offset);
+        return;
+    }
+
+    switch (size) {
+    case 1:
+        if (offset == TM_QW1_OS + TM_CPPR) {
+            xive_nvt_set_cppr(nvt, value & 0xff);
+        }
+        break;
+    case 4:
+    case 8:
+        for (i = 0; i < size; i++) {
+            if (!xive_tm_is_readonly(offset + i)) {
+                nvt->regs[offset + i] = (value >> (8 * (size - i - 1))) & 0xff;
+            }
+        }
+        break;
+    default:
+        g_assert_not_reached();
+    }
+}
+
+const MemoryRegionOps xive_tm_os_ops = {
+    .read = xive_tm_os_read,
+    .write = xive_tm_os_write,
+    .endianness = DEVICE_BIG_ENDIAN,
+    .valid = {
+        .min_access_size = 1,
+        .max_access_size = 8,
+    },
+    .impl = {
+        .min_access_size = 1,
+        .max_access_size = 8,
+    },
+};
+
+/*
+ * User Thread Interrupt Management Area MMIO
+ */
+
+static uint64_t xive_tm_user_read(void *opaque, hwaddr offset,
+                                        unsigned size)
+{
+    qemu_log_mask(LOG_UNIMP, "XIVE: invalid access to User TIMA @%"
+                  HWADDR_PRIx"\n", offset);
+    return -1;
+}
+
+static void xive_tm_user_write(void *opaque, hwaddr offset,
+                                     uint64_t value, unsigned size)
+{
+    qemu_log_mask(LOG_UNIMP, "XIVE: invalid access to User TIMA @%"
+                  HWADDR_PRIx"\n", offset);
+}
+
+
+const MemoryRegionOps xive_tm_user_ops = {
+    .read = xive_tm_user_read,
+    .write = xive_tm_user_write,
+    .endianness = DEVICE_BIG_ENDIAN,
+    .valid = {
+        .min_access_size = 1,
+        .max_access_size = 8,
+    },
+    .impl = {
+        .min_access_size = 1,
+        .max_access_size = 8,
+    },
+};
+
+static char *xive_nvt_ring_print(uint8_t *ring)
+{
+    uint32_t w2 = be32_to_cpu(*((uint32_t *) &ring[TM_WORD2]));
+
+    return g_strdup_printf("%02x  %02x   %02x  %02x    %02x   "
+                   "%02x  %02x  %02x   %08x",
+                   ring[TM_NSR], ring[TM_CPPR], ring[TM_IPB], ring[TM_LSMFB],
+                   ring[TM_ACK_CNT], ring[TM_INC], ring[TM_AGE], ring[TM_PIPR],
+                   w2);
+}
+
+void xive_nvt_pic_print_info(XiveNVT *nvt, Monitor *mon)
+{
+    int cpu_index = nvt->cs ? nvt->cs->cpu_index : -1;
+    char *s;
+
+    monitor_printf(mon, "CPU[%04x]: QW    NSR CPPR IPB LSMFB ACK# INC AGE PIPR"
+                   " W2\n", cpu_index);
+
+    s = xive_nvt_ring_print(&nvt->regs[TM_QW1_OS]);
+    monitor_printf(mon, "CPU[%04x]: OS    %s\n", cpu_index, s);
+    g_free(s);
+    s = xive_nvt_ring_print(&nvt->regs[TM_QW0_USER]);
+    monitor_printf(mon, "CPU[%04x]: USER  %s\n", cpu_index, s);
+    g_free(s);
+}
+
+static void xive_nvt_reset(void *dev)
+{
+    XiveNVT *nvt = XIVE_NVT(dev);
+
+    memset(nvt->regs, 0, sizeof(nvt->regs));
+}
+
+static void xive_nvt_realize(DeviceState *dev, Error **errp)
+{
+    XiveNVT *nvt = XIVE_NVT(dev);
+    PowerPCCPU *cpu;
+    CPUPPCState *env;
+    Object *obj;
+    Error *err = NULL;
+
+    obj = object_property_get_link(OBJECT(dev), ICP_PROP_CPU, &err);
+    if (!obj) {
+        error_propagate(errp, err);
+        error_prepend(errp, "required link '" ICP_PROP_CPU "' not found: ");
+        return;
+    }
+
+    cpu = POWERPC_CPU(obj);
+    nvt->cs = CPU(obj);
+
+    env = &cpu->env;
+    switch (PPC_INPUT(env)) {
+    case PPC_FLAGS_INPUT_POWER7:
+        nvt->output = env->irq_inputs[POWER7_INPUT_INT];
+        break;
+
+    default:
+        error_setg(errp, "XIVE interrupt controller does not support "
+                   "this CPU bus model");
+        return;
+    }
+
+    qemu_register_reset(xive_nvt_reset, dev);
+}
+
+static void xive_nvt_unrealize(DeviceState *dev, Error **errp)
+{
+    qemu_unregister_reset(xive_nvt_reset, dev);
+}
+
+static void xive_nvt_init(Object *obj)
+{
+    XiveNVT *nvt = XIVE_NVT(obj);
+
+    nvt->ring_os = &nvt->regs[TM_QW1_OS];
+}
+
+static const VMStateDescription vmstate_xive_nvt = {
+    .name = TYPE_XIVE_NVT,
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_BUFFER(regs, XiveNVT),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
+static void xive_nvt_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    dc->realize = xive_nvt_realize;
+    dc->unrealize = xive_nvt_unrealize;
+    dc->desc = "XIVE Interrupt Presenter";
+    dc->vmsd = &vmstate_xive_nvt;
+}
+
+static const TypeInfo xive_nvt_info = {
+    .name          = TYPE_XIVE_NVT,
+    .parent        = TYPE_DEVICE,
+    .instance_size = sizeof(XiveNVT),
+    .instance_init = xive_nvt_init,
+    .class_init    = xive_nvt_class_init,
+};
 
 /*
  * XIVE Fabric
@@ -27,6 +298,13 @@  XiveIVE *xive_fabric_get_ive(XiveFabric *xf, uint32_t lisn)
     return xfc->get_ive(xf, lisn);
 }
 
+XiveNVT *xive_fabric_get_nvt(XiveFabric *xf, uint32_t server)
+{
+    XiveFabricClass *xfc = XIVE_FABRIC_GET_CLASS(xf);
+
+    return xfc->get_nvt(xf, server);
+}
+
 static void xive_fabric_route(XiveFabric *xf, int lisn)
 {
 
@@ -418,6 +696,7 @@  static void xive_register_types(void)
 {
     type_register_static(&xive_source_info);
     type_register_static(&xive_fabric_info);
+    type_register_static(&xive_nvt_info);
 }
 
 type_init(xive_register_types)
diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
index 4538c622b60a..25d78eec884d 100644
--- a/include/hw/ppc/spapr_xive.h
+++ b/include/hw/ppc/spapr_xive.h
@@ -25,6 +25,11 @@  typedef struct sPAPRXive {
     /* Routing table */
     XiveIVE      *ivt;
     uint32_t     nr_irqs;
+
+    /* TIMA memory regions */
+    hwaddr       tm_base;
+    MemoryRegion tm_mmio_user;
+    MemoryRegion tm_mmio_os;
 } sPAPRXive;
 
 bool spapr_xive_irq_enable(sPAPRXive *xive, uint32_t lisn, bool lsi);
diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
index 57295715a4a5..1a2da610d91c 100644
--- a/include/hw/ppc/xive.h
+++ b/include/hw/ppc/xive.h
@@ -20,6 +20,7 @@  typedef struct XiveFabric XiveFabric;
  */
 
 #define XIVE_VC_BASE   0x0006010000000000ull
+#define XIVE_TM_BASE   0x0006030203180000ull
 
 /*
  * XIVE Interrupt Source
@@ -155,6 +156,34 @@  static inline void xive_source_irq_set(XiveSource *xsrc, uint32_t srcno,
 }
 
 /*
+ * XIVE Interrupt Presenter
+ */
+
+#define TYPE_XIVE_NVT "xive-nvt"
+#define XIVE_NVT(obj) OBJECT_CHECK(XiveNVT, (obj), TYPE_XIVE_NVT)
+
+#define TM_RING_COUNT           4
+#define TM_RING_SIZE            0x10
+
+typedef struct XiveNVT {
+    DeviceState parent_obj;
+
+    CPUState  *cs;
+    qemu_irq  output;
+
+    /* Thread interrupt Management (TM) registers */
+    uint8_t   regs[TM_RING_COUNT * TM_RING_SIZE];
+
+    /* Shortcuts to rings */
+    uint8_t   *ring_os;
+} XiveNVT;
+
+extern const MemoryRegionOps xive_tm_user_ops;
+extern const MemoryRegionOps xive_tm_os_ops;
+
+void xive_nvt_pic_print_info(XiveNVT *nvt, Monitor *mon);
+
+/*
  * XIVE Fabric
  */
 
@@ -175,8 +204,10 @@  typedef struct XiveFabricClass {
     void (*notify)(XiveFabric *xf, uint32_t lisn);
 
     XiveIVE *(*get_ive)(XiveFabric *xf, uint32_t lisn);
+    XiveNVT *(*get_nvt)(XiveFabric *xf, uint32_t server);
 } XiveFabricClass;
 
 XiveIVE *xive_fabric_get_ive(XiveFabric *xf, uint32_t lisn);
+XiveNVT *xive_fabric_get_nvt(XiveFabric *xf, uint32_t server);
 
 #endif /* PPC_XIVE_H */
diff --git a/include/hw/ppc/xive_regs.h b/include/hw/ppc/xive_regs.h
index 5903f29eb789..f2e2a1ac8f6e 100644
--- a/include/hw/ppc/xive_regs.h
+++ b/include/hw/ppc/xive_regs.h
@@ -10,6 +10,88 @@ 
 #ifndef _PPC_XIVE_REGS_H
 #define _PPC_XIVE_REGS_H
 
+#define TM_SHIFT                16
+
+/* TM register offsets */
+#define TM_QW0_USER             0x000 /* All rings */
+#define TM_QW1_OS               0x010 /* Ring 0..2 */
+#define TM_QW2_HV_POOL          0x020 /* Ring 0..1 */
+#define TM_QW3_HV_PHYS          0x030 /* Ring 0..1 */
+
+/* Byte offsets inside a QW             QW0 QW1 QW2 QW3 */
+#define TM_NSR                  0x0  /*  +   +   -   +  */
+#define TM_CPPR                 0x1  /*  -   +   -   +  */
+#define TM_IPB                  0x2  /*  -   +   +   +  */
+#define TM_LSMFB                0x3  /*  -   +   +   +  */
+#define TM_ACK_CNT              0x4  /*  -   +   -   -  */
+#define TM_INC                  0x5  /*  -   +   -   +  */
+#define TM_AGE                  0x6  /*  -   +   -   +  */
+#define TM_PIPR                 0x7  /*  -   +   -   +  */
+
+#define TM_WORD0                0x0
+#define TM_WORD1                0x4
+
+/*
+ * QW word 2 contains the valid bit at the top and other fields
+ * depending on the QW.
+ */
+#define TM_WORD2                0x8
+#define   TM_QW0W2_VU           PPC_BIT32(0)
+#define   TM_QW0W2_LOGIC_SERV   PPC_BITMASK32(1, 31) /* XX 2,31 ? */
+#define   TM_QW1W2_VO           PPC_BIT32(0)
+#define   TM_QW1W2_OS_CAM       PPC_BITMASK32(8, 31)
+#define   TM_QW2W2_VP           PPC_BIT32(0)
+#define   TM_QW2W2_POOL_CAM     PPC_BITMASK32(8, 31)
+#define   TM_QW3W2_VT           PPC_BIT32(0)
+#define   TM_QW3W2_LP           PPC_BIT32(6)
+#define   TM_QW3W2_LE           PPC_BIT32(7)
+#define   TM_QW3W2_T            PPC_BIT32(31)
+
+/*
+ * In addition to normal loads to "peek" and writes (only when invalid)
+ * using 4 and 8 bytes accesses, the above registers support these
+ * "special" byte operations:
+ *
+ *   - Byte load from QW0[NSR] - User level NSR (EBB)
+ *   - Byte store to QW0[NSR] - User level NSR (EBB)
+ *   - Byte load/store to QW1[CPPR] and QW3[CPPR] - CPPR access
+ *   - Byte load from QW3[TM_WORD2] - Read VT||00000||LP||LE on thrd 0
+ *                                    otherwise VT||0000000
+ *   - Byte store to QW3[TM_WORD2] - Set VT bit (and LP/LE if present)
+ *
+ * Then we have all these "special" CI ops at these offset that trigger
+ * all sorts of side effects:
+ */
+#define TM_SPC_ACK_EBB          0x800   /* Load8 ack EBB to reg*/
+#define TM_SPC_ACK_OS_REG       0x810   /* Load16 ack OS irq to reg */
+#define TM_SPC_PUSH_USR_CTX     0x808   /* Store32 Push/Validate user context */
+#define TM_SPC_PULL_USR_CTX     0x808   /* Load32 Pull/Invalidate user
+                                         * context */
+#define TM_SPC_SET_OS_PENDING   0x812   /* Store8 Set OS irq pending bit */
+#define TM_SPC_PULL_OS_CTX      0x818   /* Load32/Load64 Pull/Invalidate OS
+                                         * context to reg */
+#define TM_SPC_PULL_POOL_CTX    0x828   /* Load32/Load64 Pull/Invalidate Pool
+                                         * context to reg*/
+#define TM_SPC_ACK_HV_REG       0x830   /* Load16 ack HV irq to reg */
+#define TM_SPC_PULL_USR_CTX_OL  0xc08   /* Store8 Pull/Inval usr ctx to odd
+                                         * line */
+#define TM_SPC_ACK_OS_EL        0xc10   /* Store8 ack OS irq to even line */
+#define TM_SPC_ACK_HV_POOL_EL   0xc20   /* Store8 ack HV evt pool to even
+                                         * line */
+#define TM_SPC_ACK_HV_EL        0xc30   /* Store8 ack HV irq to even line */
+/* XXX more... */
+
+/* NSR fields for the various QW ack types */
+#define TM_QW0_NSR_EB           PPC_BIT8(0)
+#define TM_QW1_NSR_EO           PPC_BIT8(0)
+#define TM_QW3_NSR_HE           PPC_BITMASK8(0, 1)
+#define  TM_QW3_NSR_HE_NONE     0
+#define  TM_QW3_NSR_HE_POOL     1
+#define  TM_QW3_NSR_HE_PHYS     2
+#define  TM_QW3_NSR_HE_LSI      3
+#define TM_QW3_NSR_I            PPC_BIT8(2)
+#define TM_QW3_NSR_GRP_LVL      PPC_BIT8(3, 7)
+
 /* IVE/EAS
  *
  * One per interrupt source. Targets that interrupt to a given EQ
@@ -30,4 +112,6 @@  typedef struct XiveIVE {
 #define IVE_EQ_DATA     PPC_BITMASK(33, 63)      /* Data written to the EQ */
 } XiveIVE;
 
+#define XIVE_PRIORITY_MAX  7
+
 #endif /* _INTC_XIVE_INTERNAL_H */