
[v3,07/35] spapr/xive: introduce the XIVE Event Queues

Message ID 20180419124331.3915-8-clg@kaod.org
State New
Series ppc: support for the XIVE interrupt controller (POWER9)

Commit Message

Cédric Le Goater April 19, 2018, 12:43 p.m. UTC
The Event Queue Descriptor (EQD) table is an internal table of the
XIVE routing sub-engine. It specifies on which Event Queue the event
data should be posted when an exception occurs (later on pulled by the
OS) and which Virtual Processor to notify. The Event Queue is a much
more complex structure but we start with a simple model for the sPAPR
machine.

There is one XiveEQ per priority and these are stored under the XIVE
virtualization presenter (sPAPRXiveNVT). EQs are simply indexed with :

       (server << 3) | (priority & 0x7)

This is not in the XIVE architecture but as the EQ index is never
exposed to the guest, in the hcalls nor in the device tree, we are
free to use what fits best the current model.
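
As a quick illustration of this encoding (a sketch, using the helpers
added to spapr_xive.h by this patch):

    uint32_t eq_idx = SPAPR_XIVE_EQ_INDEX(2, 5);    /* server 2, prio 5 -> 0x15 */
    uint32_t server = SPAPR_XIVE_EQ_SERVER(eq_idx); /* 2 */
    uint8_t  prio   = SPAPR_XIVE_EQ_PRIO(eq_idx);   /* 5 */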

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---

 Changes since v2 :

 - introduced the XiveFabric interface

 hw/intc/spapr_xive.c        | 31 +++++++++++++++++---
 hw/intc/xive.c              | 71 +++++++++++++++++++++++++++++++++++++++++++++
 include/hw/ppc/spapr_xive.h |  7 +++++
 include/hw/ppc/xive.h       |  8 +++++
 include/hw/ppc/xive_regs.h  | 48 ++++++++++++++++++++++++++++++
 5 files changed, 161 insertions(+), 4 deletions(-)

Comments

David Gibson April 26, 2018, 7:25 a.m. UTC | #1
On Thu, Apr 19, 2018 at 02:43:03PM +0200, Cédric Le Goater wrote:
> The Event Queue Descriptor (EQD) table is an internal table of the
> XIVE routing sub-engine. It specifies on which Event Queue the event
> data should be posted when an exception occurs (later on pulled by the
> OS) and which Virtual Processor to notify.

Uhhh.. I thought the IVT said which queue and vp to notify, and the
EQD gave metadata for event queues.

> The Event Queue is a much
> more complex structure but we start with a simple model for the sPAPR
> machine.
> 
> There is one XiveEQ per priority and these are stored under the XIVE
> virtualization presenter (sPAPRXiveNVT). EQs are simply indexed with :
> 
>        (server << 3) | (priority & 0x7)
> 
> This is not in the XIVE architecture but as the EQ index is never
> exposed to the guest, in the hcalls nor in the device tree, we are
> free to use what fits best the current model.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>

Is the EQD actually modifiable by a guest?  Or are the settings of the
EQs fixed by PAPR?

> ---
> 
>  Changes since v2 :
> 
>  - introduced the XiveFabric interface
> 
>  hw/intc/spapr_xive.c        | 31 +++++++++++++++++---
>  hw/intc/xive.c              | 71 +++++++++++++++++++++++++++++++++++++++++++++
>  include/hw/ppc/spapr_xive.h |  7 +++++
>  include/hw/ppc/xive.h       |  8 +++++
>  include/hw/ppc/xive_regs.h  | 48 ++++++++++++++++++++++++++++++
>  5 files changed, 161 insertions(+), 4 deletions(-)
> 
> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> index f07832bf0a00..d0d5a7d7f969 100644
> --- a/hw/intc/spapr_xive.c
> +++ b/hw/intc/spapr_xive.c
> @@ -27,15 +27,30 @@ void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon)
>      monitor_printf(mon, "IVE Table\n");
>      for (i = 0; i < xive->nr_irqs; i++) {
>          XiveIVE *ive = &xive->ivt[i];
> +        uint32_t eq_idx;
>  
>          if (!(ive->w & IVE_VALID)) {
>              continue;
>          }
>  
> -        monitor_printf(mon, "  %4x %s %08x %08x\n", i,
> -                       ive->w & IVE_MASKED ? "M" : " ",
> -                       (int) GETFIELD(IVE_EQ_INDEX, ive->w),
> -                       (int) GETFIELD(IVE_EQ_DATA, ive->w));
> +        eq_idx = GETFIELD(IVE_EQ_INDEX, ive->w);
> +
> +        monitor_printf(mon, "  %6x %s eqidx:%03d ", i,
> +                       ive->w & IVE_MASKED ? "M" : " ", eq_idx);
> +
> +        if (!(ive->w & IVE_MASKED)) {
> +            XiveEQ *eq;
> +
> +            eq = xive_fabric_get_eq(XIVE_FABRIC(xive), eq_idx);
> +            if (eq && (eq->w0 & EQ_W0_VALID)) {
> +                xive_eq_pic_print_info(eq, mon);
> +                monitor_printf(mon, " data:%08x",
> +                               (int) GETFIELD(IVE_EQ_DATA, ive->w));
> +            } else {
> +                monitor_printf(mon, "no eq ?!");
> +            }
> +        }
> +        monitor_printf(mon, "\n");
>      }
>  }
>  
> @@ -128,6 +143,13 @@ static XiveNVT *spapr_xive_get_nvt(XiveFabric *xf, uint32_t server)
>      return cpu ? XIVE_NVT(cpu->intc) : NULL;
>  }
>  
> +static XiveEQ *spapr_xive_get_eq(XiveFabric *xf, uint32_t eq_idx)
> +{
> +    XiveNVT *nvt = xive_fabric_get_nvt(xf, SPAPR_XIVE_EQ_SERVER(eq_idx));
> +
> +    return xive_nvt_eq_get(nvt, SPAPR_XIVE_EQ_PRIO(eq_idx));
> +}
> +
>  static const VMStateDescription vmstate_spapr_xive_ive = {
>      .name = TYPE_SPAPR_XIVE "/ive",
>      .version_id = 1,
> @@ -168,6 +190,7 @@ static void spapr_xive_class_init(ObjectClass *klass, void *data)
>  
>      xfc->get_ive = spapr_xive_get_ive;
>      xfc->get_nvt = spapr_xive_get_nvt;
> +    xfc->get_eq = spapr_xive_get_eq;
>  }
>  
>  static const TypeInfo spapr_xive_info = {
> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> index 5691bb9474e4..2ab37fde80e8 100644
> --- a/hw/intc/xive.c
> +++ b/hw/intc/xive.c
> @@ -19,6 +19,47 @@
>  #include "hw/ppc/xive_regs.h"
>  
>  /*
> + * XiveEQ helpers
> + */
> +
> +XiveEQ *xive_nvt_eq_get(XiveNVT *nvt, uint8_t priority)
> +{
> +    if (!nvt || priority > XIVE_PRIORITY_MAX) {
> +        return NULL;
> +    }
> +    return &nvt->eqt[priority];
> +}
> +
> +void xive_eq_reset(XiveEQ *eq)
> +{
> +    memset(eq, 0, sizeof(*eq));
> +
> +    /* switch off the escalation and notification ESBs */
> +    eq->w1 = EQ_W1_ESe_Q | EQ_W1_ESn_Q;
> +}
> +
> +void xive_eq_pic_print_info(XiveEQ *eq, Monitor *mon)
> +{
> +    uint64_t qaddr_base = (((uint64_t)(eq->w2 & 0x0fffffff)) << 32) | eq->w3;
> +    uint32_t qindex = GETFIELD(EQ_W1_PAGE_OFF, eq->w1);
> +    uint32_t qgen = GETFIELD(EQ_W1_GENERATION, eq->w1);
> +    uint32_t qsize = GETFIELD(EQ_W0_QSIZE, eq->w0);
> +    uint32_t qentries = 1 << (qsize + 10);
> +
> +    uint32_t server = GETFIELD(EQ_W6_NVT_INDEX, eq->w6);
> +    uint8_t priority = GETFIELD(EQ_W7_F0_PRIORITY, eq->w7);
> +
> +    monitor_printf(mon, "%c%c%c%c%c prio:%d server:%03d eq:@%08"PRIx64
> +                   "% 6d/%5d ^%d",
> +                   eq->w0 & EQ_W0_VALID ? 'v' : '-',
> +                   eq->w0 & EQ_W0_ENQUEUE ? 'q' : '-',
> +                   eq->w0 & EQ_W0_UCOND_NOTIFY ? 'n' : '-',
> +                   eq->w0 & EQ_W0_BACKLOG ? 'b' : '-',
> +                   eq->w0 & EQ_W0_ESCALATE_CTL ? 'e' : '-',
> +                   priority, server, qaddr_base, qindex, qentries, qgen);
> +}
> +
> +/*
>   * XIVE Interrupt Presenter
>   */
>  
> @@ -210,8 +251,12 @@ void xive_nvt_pic_print_info(XiveNVT *nvt, Monitor *mon)
>  static void xive_nvt_reset(void *dev)
>  {
>      XiveNVT *nvt = XIVE_NVT(dev);
> +    int i;
>  
>      memset(nvt->regs, 0, sizeof(nvt->regs));
> +    for (i = 0; i < ARRAY_SIZE(nvt->eqt); i++) {
> +        xive_eq_reset(&nvt->eqt[i]);
> +    }

Hrm.  Having the EQs "owned" by the NVT makes things simple for PAPR.
But won't that break down for the powernv case?

>  }
>  
>  static void xive_nvt_realize(DeviceState *dev, Error **errp)
> @@ -259,12 +304,31 @@ static void xive_nvt_init(Object *obj)
>      nvt->ring_os = &nvt->regs[TM_QW1_OS];
>  }
>  
> +static const VMStateDescription vmstate_xive_nvt_eq = {
> +    .name = TYPE_XIVE_NVT "/eq",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .fields = (VMStateField []) {
> +        VMSTATE_UINT32(w0, XiveEQ),
> +        VMSTATE_UINT32(w1, XiveEQ),
> +        VMSTATE_UINT32(w2, XiveEQ),
> +        VMSTATE_UINT32(w3, XiveEQ),
> +        VMSTATE_UINT32(w4, XiveEQ),
> +        VMSTATE_UINT32(w5, XiveEQ),
> +        VMSTATE_UINT32(w6, XiveEQ),
> +        VMSTATE_UINT32(w7, XiveEQ),
> +        VMSTATE_END_OF_LIST()
> +    },
> +};
> +
>  static const VMStateDescription vmstate_xive_nvt = {
>      .name = TYPE_XIVE_NVT,
>      .version_id = 1,
>      .minimum_version_id = 1,
>      .fields = (VMStateField[]) {
>          VMSTATE_BUFFER(regs, XiveNVT),
> +        VMSTATE_STRUCT_ARRAY(eqt, XiveNVT, (XIVE_PRIORITY_MAX + 1), 1,
> +                             vmstate_xive_nvt_eq, XiveEQ),
>          VMSTATE_END_OF_LIST()
>      },
>  };
> @@ -305,6 +369,13 @@ XiveNVT *xive_fabric_get_nvt(XiveFabric *xf, uint32_t server)
>      return xfc->get_nvt(xf, server);
>  }
>  
> +XiveEQ *xive_fabric_get_eq(XiveFabric *xf, uint32_t eq_idx)
> +{
> +   XiveFabricClass *xfc = XIVE_FABRIC_GET_CLASS(xf);
> +
> +   return xfc->get_eq(xf, eq_idx);
> +}
> +
>  static void xive_fabric_route(XiveFabric *xf, int lisn)
>  {
>  
> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
> index 25d78eec884d..7cb3561aa3d3 100644
> --- a/include/hw/ppc/spapr_xive.h
> +++ b/include/hw/ppc/spapr_xive.h
> @@ -36,4 +36,11 @@ bool spapr_xive_irq_enable(sPAPRXive *xive, uint32_t lisn, bool lsi);
>  bool spapr_xive_irq_disable(sPAPRXive *xive, uint32_t lisn);
>  void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon);
>  
> +/*
> + * sPAPR encoding of EQ indexes
> + */
> +#define SPAPR_XIVE_EQ_INDEX(server, prio)  (((server) << 3) | ((prio) & 0x7))
> +#define SPAPR_XIVE_EQ_SERVER(eq_idx) ((eq_idx) >> 3)
> +#define SPAPR_XIVE_EQ_PRIO(eq_idx)   ((eq_idx) & 0x7)
> +
>  #endif /* PPC_SPAPR_XIVE_H */
> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
> index 1a2da610d91c..6cc02638c677 100644
> --- a/include/hw/ppc/xive.h
> +++ b/include/hw/ppc/xive.h
> @@ -176,12 +176,18 @@ typedef struct XiveNVT {
>  
>      /* Shortcuts to rings */
>      uint8_t   *ring_os;
> +
> +    XiveEQ    eqt[XIVE_PRIORITY_MAX + 1];
>  } XiveNVT;
>  
>  extern const MemoryRegionOps xive_tm_user_ops;
>  extern const MemoryRegionOps xive_tm_os_ops;
>  
>  void xive_nvt_pic_print_info(XiveNVT *nvt, Monitor *mon);
> +XiveEQ *xive_nvt_eq_get(XiveNVT *nvt, uint8_t priority);
> +
> +void xive_eq_reset(XiveEQ *eq);
> +void xive_eq_pic_print_info(XiveEQ *eq, Monitor *mon);
>  
>  /*
>   * XIVE Fabric
> @@ -205,9 +211,11 @@ typedef struct XiveFabricClass {
>  
>      XiveIVE *(*get_ive)(XiveFabric *xf, uint32_t lisn);
>      XiveNVT *(*get_nvt)(XiveFabric *xf, uint32_t server);
> +    XiveEQ  *(*get_eq)(XiveFabric *xf, uint32_t eq_idx);
>  } XiveFabricClass;
>  
>  XiveIVE *xive_fabric_get_ive(XiveFabric *xf, uint32_t lisn);
>  XiveNVT *xive_fabric_get_nvt(XiveFabric *xf, uint32_t server);
> +XiveEQ  *xive_fabric_get_eq(XiveFabric *xf, uint32_t eq_idx);
>  
>  #endif /* PPC_XIVE_H */
> diff --git a/include/hw/ppc/xive_regs.h b/include/hw/ppc/xive_regs.h
> index f2e2a1ac8f6e..bcc44e766db9 100644
> --- a/include/hw/ppc/xive_regs.h
> +++ b/include/hw/ppc/xive_regs.h
> @@ -112,6 +112,54 @@ typedef struct XiveIVE {
>  #define IVE_EQ_DATA     PPC_BITMASK(33, 63)      /* Data written to the EQ */
>  } XiveIVE;
>  
> +/* EQ */
> +typedef struct XiveEQ {
> +        uint32_t        w0;
> +#define EQ_W0_VALID             PPC_BIT32(0) /* "v" bit */
> +#define EQ_W0_ENQUEUE           PPC_BIT32(1) /* "q" bit */
> +#define EQ_W0_UCOND_NOTIFY      PPC_BIT32(2) /* "n" bit */
> +#define EQ_W0_BACKLOG           PPC_BIT32(3) /* "b" bit */
> +#define EQ_W0_PRECL_ESC_CTL     PPC_BIT32(4) /* "p" bit */
> +#define EQ_W0_ESCALATE_CTL      PPC_BIT32(5) /* "e" bit */
> +#define EQ_W0_UNCOND_ESCALATE   PPC_BIT32(6) /* "u" bit - DD2.0 */
> +#define EQ_W0_SILENT_ESCALATE   PPC_BIT32(7) /* "s" bit - DD2.0 */
> +#define EQ_W0_QSIZE             PPC_BITMASK32(12, 15)
> +#define EQ_W0_SW0               PPC_BIT32(16)
> +#define EQ_W0_FIRMWARE          EQ_W0_SW0 /* Owned by FW */
> +#define EQ_QSIZE_4K             0
> +#define EQ_QSIZE_64K            4
> +#define EQ_W0_HWDEP             PPC_BITMASK32(24, 31)
> +        uint32_t        w1;
> +#define EQ_W1_ESn               PPC_BITMASK32(0, 1)
> +#define EQ_W1_ESn_P             PPC_BIT32(0)
> +#define EQ_W1_ESn_Q             PPC_BIT32(1)
> +#define EQ_W1_ESe               PPC_BITMASK32(2, 3)
> +#define EQ_W1_ESe_P             PPC_BIT32(2)
> +#define EQ_W1_ESe_Q             PPC_BIT32(3)
> +#define EQ_W1_GENERATION        PPC_BIT32(9)
> +#define EQ_W1_PAGE_OFF          PPC_BITMASK32(10, 31)
> +        uint32_t        w2;
> +#define EQ_W2_MIGRATION_REG     PPC_BITMASK32(0, 3)
> +#define EQ_W2_OP_DESC_HI        PPC_BITMASK32(4, 31)
> +        uint32_t        w3;
> +#define EQ_W3_OP_DESC_LO        PPC_BITMASK32(0, 31)
> +        uint32_t        w4;
> +#define EQ_W4_ESC_EQ_BLOCK      PPC_BITMASK32(4, 7)
> +#define EQ_W4_ESC_EQ_INDEX      PPC_BITMASK32(8, 31)
> +        uint32_t        w5;
> +#define EQ_W5_ESC_EQ_DATA       PPC_BITMASK32(1, 31)
> +        uint32_t        w6;
> +#define EQ_W6_FORMAT_BIT        PPC_BIT32(8)
> +#define EQ_W6_NVT_BLOCK         PPC_BITMASK32(9, 12)
> +#define EQ_W6_NVT_INDEX         PPC_BITMASK32(13, 31)
> +        uint32_t        w7;
> +#define EQ_W7_F0_IGNORE         PPC_BIT32(0)
> +#define EQ_W7_F0_BLK_GROUPING   PPC_BIT32(1)
> +#define EQ_W7_F0_PRIORITY       PPC_BITMASK32(8, 15)
> +#define EQ_W7_F1_WAKEZ          PPC_BIT32(0)
> +#define EQ_W7_F1_LOG_SERVER_ID  PPC_BITMASK32(1, 31)
> +} XiveEQ;
> +
>  #define XIVE_PRIORITY_MAX  7
>  
>  #endif /* _INTC_XIVE_INTERNAL_H */
Cédric Le Goater April 26, 2018, 9:48 a.m. UTC | #2
On 04/26/2018 09:25 AM, David Gibson wrote:
> On Thu, Apr 19, 2018 at 02:43:03PM +0200, Cédric Le Goater wrote:
>> The Event Queue Descriptor (EQD) table is an internal table of the
>> XIVE routing sub-engine. It specifies on which Event Queue the event
>> data should be posted when an exception occurs (later on pulled by the
>> OS) and which Virtual Processor to notify.
> 
> Uhhh.. I thought the IVT said which queue and vp to notify, and the
> EQD gave metadata for event queues.

Yes, the above is poorly written. The Event Queue Descriptor contains the
guest address of the event queue into which the data is written. I will
rephrase.

The IVT contains IVEs which indeed define, for each IRQ, which EQ to notify
and what data to push on the queue.
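
A minimal sketch of that routing step, composed from the helpers this
patch introduces (this is not the actual xive_fabric_route() code, just
how the pieces fit together):

    XiveIVE *ive = xive_fabric_get_ive(xf, lisn);

    if (ive && (ive->w & IVE_VALID) && !(ive->w & IVE_MASKED)) {
        uint32_t eq_idx  = GETFIELD(IVE_EQ_INDEX, ive->w);
        uint32_t eq_data = GETFIELD(IVE_EQ_DATA, ive->w);
        XiveEQ *eq = xive_fabric_get_eq(xf, eq_idx);

        /* 'eq_data' is what ends up in the OS event queue described by
         * 'eq'; the NVT to notify comes from the EQ_W6_NVT_* fields */
    }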
 
>> The Event Queue is a much
>> more complex structure but we start with a simple model for the sPAPR
>> machine.
>>
>> There is one XiveEQ per priority and these are stored under the XIVE
>> virtualization presenter (sPAPRXiveNVT). EQs are simply indexed with :
>>
>>        (server << 3) | (priority & 0x7)
>>
>> This is not in the XIVE architecture but as the EQ index is never
>> exposed to the guest, in the hcalls nor in the device tree, we are
>> free to use what fits best the current model.

This EQ indexing is important to note because it will also show up
in KVM when building the IVE from the KVM irq state.

  
>>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> 
> Is the EQD actually modifiable by a guest?  Or are the settings of the
> EQs fixed by PAPR?

The guest uses the H_INT_SET_QUEUE_CONFIG hcall to define the address
of the event queue for a given (server, priority) pair.
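
A hypothetical sketch of what the QEMU side of that hcall would do with
its (target, priority, qpage) arguments, reusing the index encoding and
the EQ word layout from this patch (the hcall handler itself is not part
of this patch):

    uint32_t eq_idx = SPAPR_XIVE_EQ_INDEX(target, priority);
    XiveEQ *eq = xive_fabric_get_eq(XIVE_FABRIC(xive), eq_idx);

    /* record the OS queue page in w2/w3, the layout decoded by
     * xive_eq_pic_print_info(), and validate the EQ; qsize handling
     * is omitted here */
    eq->w2 = (qpage >> 32) & 0x0fffffff;
    eq->w3 = qpage & 0xffffffff;
    eq->w0 |= EQ_W0_VALID | EQ_W0_ENQUEUE;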

>> ---
>>
>>  Changes since v2 :
>>
>>  - introduced the XiveFabric interface
>>
>>  hw/intc/spapr_xive.c        | 31 +++++++++++++++++---
>>  hw/intc/xive.c              | 71 +++++++++++++++++++++++++++++++++++++++++++++
>>  include/hw/ppc/spapr_xive.h |  7 +++++
>>  include/hw/ppc/xive.h       |  8 +++++
>>  include/hw/ppc/xive_regs.h  | 48 ++++++++++++++++++++++++++++++
>>  5 files changed, 161 insertions(+), 4 deletions(-)
>>
>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
>> index f07832bf0a00..d0d5a7d7f969 100644
>> --- a/hw/intc/spapr_xive.c
>> +++ b/hw/intc/spapr_xive.c
>> @@ -27,15 +27,30 @@ void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon)
>>      monitor_printf(mon, "IVE Table\n");
>>      for (i = 0; i < xive->nr_irqs; i++) {
>>          XiveIVE *ive = &xive->ivt[i];
>> +        uint32_t eq_idx;
>>  
>>          if (!(ive->w & IVE_VALID)) {
>>              continue;
>>          }
>>  
>> -        monitor_printf(mon, "  %4x %s %08x %08x\n", i,
>> -                       ive->w & IVE_MASKED ? "M" : " ",
>> -                       (int) GETFIELD(IVE_EQ_INDEX, ive->w),
>> -                       (int) GETFIELD(IVE_EQ_DATA, ive->w));
>> +        eq_idx = GETFIELD(IVE_EQ_INDEX, ive->w);
>> +
>> +        monitor_printf(mon, "  %6x %s eqidx:%03d ", i,
>> +                       ive->w & IVE_MASKED ? "M" : " ", eq_idx);
>> +
>> +        if (!(ive->w & IVE_MASKED)) {
>> +            XiveEQ *eq;
>> +
>> +            eq = xive_fabric_get_eq(XIVE_FABRIC(xive), eq_idx);
>> +            if (eq && (eq->w0 & EQ_W0_VALID)) {
>> +                xive_eq_pic_print_info(eq, mon);
>> +                monitor_printf(mon, " data:%08x",
>> +                               (int) GETFIELD(IVE_EQ_DATA, ive->w));
>> +            } else {
>> +                monitor_printf(mon, "no eq ?!");
>> +            }
>> +        }
>> +        monitor_printf(mon, "\n");
>>      }
>>  }
>>  
>> @@ -128,6 +143,13 @@ static XiveNVT *spapr_xive_get_nvt(XiveFabric *xf, uint32_t server)
>>      return cpu ? XIVE_NVT(cpu->intc) : NULL;
>>  }
>>  
>> +static XiveEQ *spapr_xive_get_eq(XiveFabric *xf, uint32_t eq_idx)
>> +{
>> +    XiveNVT *nvt = xive_fabric_get_nvt(xf, SPAPR_XIVE_EQ_SERVER(eq_idx));
>> +
>> +    return xive_nvt_eq_get(nvt, SPAPR_XIVE_EQ_PRIO(eq_idx));
>> +}
>> +
>>  static const VMStateDescription vmstate_spapr_xive_ive = {
>>      .name = TYPE_SPAPR_XIVE "/ive",
>>      .version_id = 1,
>> @@ -168,6 +190,7 @@ static void spapr_xive_class_init(ObjectClass *klass, void *data)
>>  
>>      xfc->get_ive = spapr_xive_get_ive;
>>      xfc->get_nvt = spapr_xive_get_nvt;
>> +    xfc->get_eq = spapr_xive_get_eq;
>>  }
>>  
>>  static const TypeInfo spapr_xive_info = {
>> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
>> index 5691bb9474e4..2ab37fde80e8 100644
>> --- a/hw/intc/xive.c
>> +++ b/hw/intc/xive.c
>> @@ -19,6 +19,47 @@
>>  #include "hw/ppc/xive_regs.h"
>>  
>>  /*
>> + * XiveEQ helpers
>> + */
>> +
>> +XiveEQ *xive_nvt_eq_get(XiveNVT *nvt, uint8_t priority)
>> +{
>> +    if (!nvt || priority > XIVE_PRIORITY_MAX) {
>> +        return NULL;
>> +    }
>> +    return &nvt->eqt[priority];
>> +}
>> +
>> +void xive_eq_reset(XiveEQ *eq)
>> +{
>> +    memset(eq, 0, sizeof(*eq));
>> +
>> +    /* switch off the escalation and notification ESBs */
>> +    eq->w1 = EQ_W1_ESe_Q | EQ_W1_ESn_Q;
>> +}
>> +
>> +void xive_eq_pic_print_info(XiveEQ *eq, Monitor *mon)
>> +{
>> +    uint64_t qaddr_base = (((uint64_t)(eq->w2 & 0x0fffffff)) << 32) | eq->w3;
>> +    uint32_t qindex = GETFIELD(EQ_W1_PAGE_OFF, eq->w1);
>> +    uint32_t qgen = GETFIELD(EQ_W1_GENERATION, eq->w1);
>> +    uint32_t qsize = GETFIELD(EQ_W0_QSIZE, eq->w0);
>> +    uint32_t qentries = 1 << (qsize + 10);
>> +
>> +    uint32_t server = GETFIELD(EQ_W6_NVT_INDEX, eq->w6);
>> +    uint8_t priority = GETFIELD(EQ_W7_F0_PRIORITY, eq->w7);
>> +
>> +    monitor_printf(mon, "%c%c%c%c%c prio:%d server:%03d eq:@%08"PRIx64
>> +                   "% 6d/%5d ^%d",
>> +                   eq->w0 & EQ_W0_VALID ? 'v' : '-',
>> +                   eq->w0 & EQ_W0_ENQUEUE ? 'q' : '-',
>> +                   eq->w0 & EQ_W0_UCOND_NOTIFY ? 'n' : '-',
>> +                   eq->w0 & EQ_W0_BACKLOG ? 'b' : '-',
>> +                   eq->w0 & EQ_W0_ESCALATE_CTL ? 'e' : '-',
>> +                   priority, server, qaddr_base, qindex, qentries, qgen);
>> +}
>> +
>> +/*
>>   * XIVE Interrupt Presenter
>>   */
>>  
>> @@ -210,8 +251,12 @@ void xive_nvt_pic_print_info(XiveNVT *nvt, Monitor *mon)
>>  static void xive_nvt_reset(void *dev)
>>  {
>>      XiveNVT *nvt = XIVE_NVT(dev);
>> +    int i;
>>  
>>      memset(nvt->regs, 0, sizeof(nvt->regs));
>> +    for (i = 0; i < ARRAY_SIZE(nvt->eqt); i++) {
>> +        xive_eq_reset(&nvt->eqt[i]);
>> +    }
> 
> Hrm.  Having the EQs "owned" by the NVT makes things simple for PAPR.
> But won't that break down for the powernv case?

powernv stores the EQs in the RAM of the machine and they are maintained
by skiboot using IC registers. To get/set an EQ from QEMU powernv, we need
to read/write that RAM, and the EQs kept under the XiveNVT become useless.

The model does not make much use of the skiboot VP table though, only to
get the valid bit; instead, it uses XiveNVT objects. In the future, we
might rely more on the VP table to be more precise. But we will nevertheless
need a XiveNVT object to store the interrupt management registers.

> 
>>  }
>>  
>>  static void xive_nvt_realize(DeviceState *dev, Error **errp)
>> @@ -259,12 +304,31 @@ static void xive_nvt_init(Object *obj)
>>      nvt->ring_os = &nvt->regs[TM_QW1_OS];
>>  }
>>  
>> +static const VMStateDescription vmstate_xive_nvt_eq = {
>> +    .name = TYPE_XIVE_NVT "/eq",
>> +    .version_id = 1,
>> +    .minimum_version_id = 1,
>> +    .fields = (VMStateField []) {
>> +        VMSTATE_UINT32(w0, XiveEQ),
>> +        VMSTATE_UINT32(w1, XiveEQ),
>> +        VMSTATE_UINT32(w2, XiveEQ),
>> +        VMSTATE_UINT32(w3, XiveEQ),
>> +        VMSTATE_UINT32(w4, XiveEQ),
>> +        VMSTATE_UINT32(w5, XiveEQ),
>> +        VMSTATE_UINT32(w6, XiveEQ),
>> +        VMSTATE_UINT32(w7, XiveEQ),
>> +        VMSTATE_END_OF_LIST()
>> +    },
>> +};
>> +
>>  static const VMStateDescription vmstate_xive_nvt = {
>>      .name = TYPE_XIVE_NVT,
>>      .version_id = 1,
>>      .minimum_version_id = 1,
>>      .fields = (VMStateField[]) {
>>          VMSTATE_BUFFER(regs, XiveNVT),
>> +        VMSTATE_STRUCT_ARRAY(eqt, XiveNVT, (XIVE_PRIORITY_MAX + 1), 1,
>> +                             vmstate_xive_nvt_eq, XiveEQ),
>>          VMSTATE_END_OF_LIST()
>>      },
>>  };
>> @@ -305,6 +369,13 @@ XiveNVT *xive_fabric_get_nvt(XiveFabric *xf, uint32_t server)
>>      return xfc->get_nvt(xf, server);
>>  }
>>  
>> +XiveEQ *xive_fabric_get_eq(XiveFabric *xf, uint32_t eq_idx)
>> +{
>> +   XiveFabricClass *xfc = XIVE_FABRIC_GET_CLASS(xf);
>> +
>> +   return xfc->get_eq(xf, eq_idx);
>> +}
>> +
>>  static void xive_fabric_route(XiveFabric *xf, int lisn)
>>  {
>>  
>> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
>> index 25d78eec884d..7cb3561aa3d3 100644
>> --- a/include/hw/ppc/spapr_xive.h
>> +++ b/include/hw/ppc/spapr_xive.h
>> @@ -36,4 +36,11 @@ bool spapr_xive_irq_enable(sPAPRXive *xive, uint32_t lisn, bool lsi);
>>  bool spapr_xive_irq_disable(sPAPRXive *xive, uint32_t lisn);
>>  void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon);
>>  
>> +/*
>> + * sPAPR encoding of EQ indexes
>> + */
>> +#define SPAPR_XIVE_EQ_INDEX(server, prio)  (((server) << 3) | ((prio) & 0x7))
>> +#define SPAPR_XIVE_EQ_SERVER(eq_idx) ((eq_idx) >> 3)
>> +#define SPAPR_XIVE_EQ_PRIO(eq_idx)   ((eq_idx) & 0x7)
>> +
>>  #endif /* PPC_SPAPR_XIVE_H */
>> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
>> index 1a2da610d91c..6cc02638c677 100644
>> --- a/include/hw/ppc/xive.h
>> +++ b/include/hw/ppc/xive.h
>> @@ -176,12 +176,18 @@ typedef struct XiveNVT {
>>  
>>      /* Shortcuts to rings */
>>      uint8_t   *ring_os;
>> +
>> +    XiveEQ    eqt[XIVE_PRIORITY_MAX + 1];
>>  } XiveNVT;
>>  
>>  extern const MemoryRegionOps xive_tm_user_ops;
>>  extern const MemoryRegionOps xive_tm_os_ops;
>>  
>>  void xive_nvt_pic_print_info(XiveNVT *nvt, Monitor *mon);
>> +XiveEQ *xive_nvt_eq_get(XiveNVT *nvt, uint8_t priority);
>> +
>> +void xive_eq_reset(XiveEQ *eq);
>> +void xive_eq_pic_print_info(XiveEQ *eq, Monitor *mon);
>>  
>>  /*
>>   * XIVE Fabric
>> @@ -205,9 +211,11 @@ typedef struct XiveFabricClass {
>>  
>>      XiveIVE *(*get_ive)(XiveFabric *xf, uint32_t lisn);
>>      XiveNVT *(*get_nvt)(XiveFabric *xf, uint32_t server);
>> +    XiveEQ  *(*get_eq)(XiveFabric *xf, uint32_t eq_idx);
>>  } XiveFabricClass;
>>  
>>  XiveIVE *xive_fabric_get_ive(XiveFabric *xf, uint32_t lisn);
>>  XiveNVT *xive_fabric_get_nvt(XiveFabric *xf, uint32_t server);
>> +XiveEQ  *xive_fabric_get_eq(XiveFabric *xf, uint32_t eq_idx);
>>  
>>  #endif /* PPC_XIVE_H */
>> diff --git a/include/hw/ppc/xive_regs.h b/include/hw/ppc/xive_regs.h
>> index f2e2a1ac8f6e..bcc44e766db9 100644
>> --- a/include/hw/ppc/xive_regs.h
>> +++ b/include/hw/ppc/xive_regs.h
>> @@ -112,6 +112,54 @@ typedef struct XiveIVE {
>>  #define IVE_EQ_DATA     PPC_BITMASK(33, 63)      /* Data written to the EQ */
>>  } XiveIVE;
>>  
>> +/* EQ */
>> +typedef struct XiveEQ {
>> +        uint32_t        w0;
>> +#define EQ_W0_VALID             PPC_BIT32(0) /* "v" bit */
>> +#define EQ_W0_ENQUEUE           PPC_BIT32(1) /* "q" bit */
>> +#define EQ_W0_UCOND_NOTIFY      PPC_BIT32(2) /* "n" bit */
>> +#define EQ_W0_BACKLOG           PPC_BIT32(3) /* "b" bit */
>> +#define EQ_W0_PRECL_ESC_CTL     PPC_BIT32(4) /* "p" bit */
>> +#define EQ_W0_ESCALATE_CTL      PPC_BIT32(5) /* "e" bit */
>> +#define EQ_W0_UNCOND_ESCALATE   PPC_BIT32(6) /* "u" bit - DD2.0 */
>> +#define EQ_W0_SILENT_ESCALATE   PPC_BIT32(7) /* "s" bit - DD2.0 */
>> +#define EQ_W0_QSIZE             PPC_BITMASK32(12, 15)
>> +#define EQ_W0_SW0               PPC_BIT32(16)
>> +#define EQ_W0_FIRMWARE          EQ_W0_SW0 /* Owned by FW */
>> +#define EQ_QSIZE_4K             0
>> +#define EQ_QSIZE_64K            4
>> +#define EQ_W0_HWDEP             PPC_BITMASK32(24, 31)
>> +        uint32_t        w1;
>> +#define EQ_W1_ESn               PPC_BITMASK32(0, 1)
>> +#define EQ_W1_ESn_P             PPC_BIT32(0)
>> +#define EQ_W1_ESn_Q             PPC_BIT32(1)
>> +#define EQ_W1_ESe               PPC_BITMASK32(2, 3)
>> +#define EQ_W1_ESe_P             PPC_BIT32(2)
>> +#define EQ_W1_ESe_Q             PPC_BIT32(3)
>> +#define EQ_W1_GENERATION        PPC_BIT32(9)
>> +#define EQ_W1_PAGE_OFF          PPC_BITMASK32(10, 31)
>> +        uint32_t        w2;
>> +#define EQ_W2_MIGRATION_REG     PPC_BITMASK32(0, 3)
>> +#define EQ_W2_OP_DESC_HI        PPC_BITMASK32(4, 31)
>> +        uint32_t        w3;
>> +#define EQ_W3_OP_DESC_LO        PPC_BITMASK32(0, 31)
>> +        uint32_t        w4;
>> +#define EQ_W4_ESC_EQ_BLOCK      PPC_BITMASK32(4, 7)
>> +#define EQ_W4_ESC_EQ_INDEX      PPC_BITMASK32(8, 31)
>> +        uint32_t        w5;
>> +#define EQ_W5_ESC_EQ_DATA       PPC_BITMASK32(1, 31)
>> +        uint32_t        w6;
>> +#define EQ_W6_FORMAT_BIT        PPC_BIT32(8)
>> +#define EQ_W6_NVT_BLOCK         PPC_BITMASK32(9, 12)
>> +#define EQ_W6_NVT_INDEX         PPC_BITMASK32(13, 31)
>> +        uint32_t        w7;
>> +#define EQ_W7_F0_IGNORE         PPC_BIT32(0)
>> +#define EQ_W7_F0_BLK_GROUPING   PPC_BIT32(1)
>> +#define EQ_W7_F0_PRIORITY       PPC_BITMASK32(8, 15)
>> +#define EQ_W7_F1_WAKEZ          PPC_BIT32(0)
>> +#define EQ_W7_F1_LOG_SERVER_ID  PPC_BITMASK32(1, 31)
>> +} XiveEQ;
>> +
>>  #define XIVE_PRIORITY_MAX  7
>>  
>>  #endif /* _INTC_XIVE_INTERNAL_H */
>
David Gibson May 3, 2018, 5:45 a.m. UTC | #3
On Thu, Apr 26, 2018 at 11:48:06AM +0200, Cédric Le Goater wrote:
> On 04/26/2018 09:25 AM, David Gibson wrote:
> > On Thu, Apr 19, 2018 at 02:43:03PM +0200, Cédric Le Goater wrote:
> >> The Event Queue Descriptor (EQD) table is an internal table of the
> >> XIVE routing sub-engine. It specifies on which Event Queue the event
> >> data should be posted when an exception occurs (later on pulled by the
> >> OS) and which Virtual Processor to notify.
> > 
> > Uhhh.. I thought the IVT said which queue and vp to notify, and the
> > EQD gave metadata for event queues.
> 
> yes. the above poorly written. The Event Queue Descriptor contains the
> guest address of the event queue in which the data is written. I will 
> rephrase.      
> 
> The IVT contains IVEs which indeed define for an IRQ which EQ to notify 
> and what data to push on the queue. 
>  
> >> The Event Queue is a much
> >> more complex structure but we start with a simple model for the sPAPR
> >> machine.
> >>
> >> There is one XiveEQ per priority and these are stored under the XIVE
> >> virtualization presenter (sPAPRXiveNVT). EQs are simply indexed with :
> >>
> >>        (server << 3) | (priority & 0x7)
> >>
> >> This is not in the XIVE architecture but as the EQ index is never
> >> exposed to the guest, in the hcalls nor in the device tree, we are
> >> free to use what fits best the current model.
> 
> This EQ indexing is important to notice because it will also show up 
> in KVM to build the IVE from the KVM irq state.

Ok, are you saying that while this combined EQ index will never appear
in guest <-> host interfaces, it might show up in qemu <-> KVM
interfaces?

> >> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> > 
> > Is the EQD actually modifiable by a guest?  Or are the settings of the
> > EQs fixed by PAPR?
> 
> The guest uses the H_INT_SET_QUEUE_CONFIG hcall to define the address
> of the event queue for a couple prio/server.

Ok, so the EQD can be modified by the guest.  In which case we need to
work out what object owns it, since it'll need to migrate it.
Cédric Le Goater May 3, 2018, 6:07 a.m. UTC | #4
On 05/03/2018 07:45 AM, David Gibson wrote:
> On Thu, Apr 26, 2018 at 11:48:06AM +0200, Cédric Le Goater wrote:
>> On 04/26/2018 09:25 AM, David Gibson wrote:
>>> On Thu, Apr 19, 2018 at 02:43:03PM +0200, Cédric Le Goater wrote:
>>>> The Event Queue Descriptor (EQD) table is an internal table of the
>>>> XIVE routing sub-engine. It specifies on which Event Queue the event
>>>> data should be posted when an exception occurs (later on pulled by the
>>>> OS) and which Virtual Processor to notify.
>>>
>>> Uhhh.. I thought the IVT said which queue and vp to notify, and the
>>> EQD gave metadata for event queues.
>>
>> yes. the above poorly written. The Event Queue Descriptor contains the
>> guest address of the event queue in which the data is written. I will 
>> rephrase.      
>>
>> The IVT contains IVEs which indeed define for an IRQ which EQ to notify 
>> and what data to push on the queue. 
>>  
>>>> The Event Queue is a much
>>>> more complex structure but we start with a simple model for the sPAPR
>>>> machine.
>>>>
>>>> There is one XiveEQ per priority and these are stored under the XIVE
>>>> virtualization presenter (sPAPRXiveNVT). EQs are simply indexed with :
>>>>
>>>>        (server << 3) | (priority & 0x7)
>>>>
>>>> This is not in the XIVE architecture but as the EQ index is never
>>>> exposed to the guest, in the hcalls nor in the device tree, we are
>>>> free to use what fits best the current model.
>>
>> This EQ indexing is important to notice because it will also show up 
>> in KVM to build the IVE from the KVM irq state.
> 
> Ok, are you saying that while this combined EQ index will never appear
> in guest <-> host interfaces, 

Indeed.

> it might show up in qemu <-> KVM interfaces?

Not directly, but it is part of the IVE as the IVE_EQ_INDEX field. When
dumped, it has to be built in a way that is compatible with the emulated
mode in QEMU.

>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>>
>>> Is the EQD actually modifiable by a guest?  Or are the settings of the
>>> EQs fixed by PAPR?
>>
>> The guest uses the H_INT_SET_QUEUE_CONFIG hcall to define the address
>> of the event queue for a couple prio/server.
> 
> Ok, so the EQD can be modified by the guest.  In which case we need to
> work out what object owns it, since it'll need to migrate it.

Indeed. The EQDs are CPU-related as there is one EQD per (cpu, priority)
pair. The KVM patchset dumps/restores the eight XiveEQ structs using
per-CPU ioctls. The EQ in the OS RAM is marked dirty at that stage.

C.
David Gibson May 3, 2018, 6:25 a.m. UTC | #5
On Thu, May 03, 2018 at 08:07:54AM +0200, Cédric Le Goater wrote:
> On 05/03/2018 07:45 AM, David Gibson wrote:
> > On Thu, Apr 26, 2018 at 11:48:06AM +0200, Cédric Le Goater wrote:
> >> On 04/26/2018 09:25 AM, David Gibson wrote:
> >>> On Thu, Apr 19, 2018 at 02:43:03PM +0200, Cédric Le Goater wrote:
> >>>> The Event Queue Descriptor (EQD) table is an internal table of the
> >>>> XIVE routing sub-engine. It specifies on which Event Queue the event
> >>>> data should be posted when an exception occurs (later on pulled by the
> >>>> OS) and which Virtual Processor to notify.
> >>>
> >>> Uhhh.. I thought the IVT said which queue and vp to notify, and the
> >>> EQD gave metadata for event queues.
> >>
> >> yes. the above poorly written. The Event Queue Descriptor contains the
> >> guest address of the event queue in which the data is written. I will 
> >> rephrase.      
> >>
> >> The IVT contains IVEs which indeed define for an IRQ which EQ to notify 
> >> and what data to push on the queue. 
> >>  
> >>>> The Event Queue is a much
> >>>> more complex structure but we start with a simple model for the sPAPR
> >>>> machine.
> >>>>
> >>>> There is one XiveEQ per priority and these are stored under the XIVE
> >>>> virtualization presenter (sPAPRXiveNVT). EQs are simply indexed with :
> >>>>
> >>>>        (server << 3) | (priority & 0x7)
> >>>>
> >>>> This is not in the XIVE architecture but as the EQ index is never
> >>>> exposed to the guest, in the hcalls nor in the device tree, we are
> >>>> free to use what fits best the current model.
> >>
> >> This EQ indexing is important to notice because it will also show up 
> >> in KVM to build the IVE from the KVM irq state.
> > 
> > Ok, are you saying that while this combined EQ index will never appear
> > in guest <-> host interfaces, 
> 
> Indeed.
> 
> > it might show up in qemu <-> KVM interfaces?
> 
> Not directly but it is part of the IVE as the IVE_EQ_INDEX field. When
> dumped, it has to be built in some ways, compatible with the emulated 
> mode in QEMU. 

Hrm.  But are the exact IVE contents visible to qemu (for a PAPR
guest)?  I would have thought the qemu <-> KVM interfaces would have
abstracted this the same way the guest <-> KVM interfaces do.  Or is
there a reason not to?

> >>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >>>
> >>> Is the EQD actually modifiable by a guest?  Or are the settings of the
> >>> EQs fixed by PAPR?
> >>
> >> The guest uses the H_INT_SET_QUEUE_CONFIG hcall to define the address
> >> of the event queue for a couple prio/server.
> > 
> > Ok, so the EQD can be modified by the guest.  In which case we need to
> > work out what object owns it, since it'll need to migrate it.
> 
> Indeed. The EQD are CPU related as there is one EQD per couple (cpu, 
> priority). The KVM patchset dumps/restores the eight XiveEQ struct 
> using per cpu ioctls. The EQ in the OS RAM is marked dirty at that
> stage.

To make sure I'm clear: for PAPR there's a strict relationship between
EQD and CPU (one EQD for each (cpu, priority) tuple).  But for powernv
that's not the case, right?  AIUI the mapping of EQs to cpus was
configurable, is that right?
Cédric Le Goater May 3, 2018, 2:37 p.m. UTC | #6
On 05/03/2018 08:25 AM, David Gibson wrote:
> On Thu, May 03, 2018 at 08:07:54AM +0200, Cédric Le Goater wrote:
>> On 05/03/2018 07:45 AM, David Gibson wrote:
>>> On Thu, Apr 26, 2018 at 11:48:06AM +0200, Cédric Le Goater wrote:
>>>> On 04/26/2018 09:25 AM, David Gibson wrote:
>>>>> On Thu, Apr 19, 2018 at 02:43:03PM +0200, Cédric Le Goater wrote:
>>>>>> The Event Queue Descriptor (EQD) table is an internal table of the
>>>>>> XIVE routing sub-engine. It specifies on which Event Queue the event
>>>>>> data should be posted when an exception occurs (later on pulled by the
>>>>>> OS) and which Virtual Processor to notify.
>>>>>
>>>>> Uhhh.. I thought the IVT said which queue and vp to notify, and the
>>>>> EQD gave metadata for event queues.
>>>>
>>>> yes. the above poorly written. The Event Queue Descriptor contains the
>>>> guest address of the event queue in which the data is written. I will 
>>>> rephrase.      
>>>>
>>>> The IVT contains IVEs which indeed define for an IRQ which EQ to notify 
>>>> and what data to push on the queue. 
>>>>  
>>>>>> The Event Queue is a much
>>>>>> more complex structure but we start with a simple model for the sPAPR
>>>>>> machine.
>>>>>>
>>>>>> There is one XiveEQ per priority and these are stored under the XIVE
>>>>>> virtualization presenter (sPAPRXiveNVT). EQs are simply indexed with :
>>>>>>
>>>>>>        (server << 3) | (priority & 0x7)
>>>>>>
>>>>>> This is not in the XIVE architecture but as the EQ index is never
>>>>>> exposed to the guest, in the hcalls nor in the device tree, we are
>>>>>> free to use what fits best the current model.
>>>>
>>>> This EQ indexing is important to notice because it will also show up 
>>>> in KVM to build the IVE from the KVM irq state.
>>>
>>> Ok, are you saying that while this combined EQ index will never appear
>>> in guest <-> host interfaces, 
>>
>> Indeed.
>>
>>> it might show up in qemu <-> KVM interfaces?
>>
>> Not directly but it is part of the IVE as the IVE_EQ_INDEX field. When
>> dumped, it has to be built in some ways, compatible with the emulated 
>> mode in QEMU. 
> 
> Hrm.  But is the exact IVE contents visible to qemu (for a PAPR
> guest)?  

The guest only uses hcalls whose arguments are:

	- cpu numbers,
	- priority numbers from defined ranges,
	- logical interrupt numbers,
	- the physical address of the EQ.

The parts of the IVE visible to the guest are the 'priority', the 'cpu',
and the 'eisn', which is the effective IRQ number the guest is assigning
to the source. The 'eisn' will be pushed in the EQ.

The IVE EQ index is not visible.
 
> I would have thought the qemu <-> KVM interfaces would have
> abstracted this the same way the guest <-> KVM interfaces do.
> Or is there a reason not to?

It is practical to dump 64-bit IVEs directly from KVM into the QEMU
internal structures because it fits the emulated mode without doing
any translation ... This might be seen as a shortcut. You will tell
me when you reach the KVM part.

>>>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>>>>
>>>>> Is the EQD actually modifiable by a guest?  Or are the settings of the
>>>>> EQs fixed by PAPR?
>>>>
>>>> The guest uses the H_INT_SET_QUEUE_CONFIG hcall to define the address
>>>> of the event queue for a couple prio/server.
>>>
>>> Ok, so the EQD can be modified by the guest.  In which case we need to
>>> work out what object owns it, since it'll need to migrate it.
>>
>> Indeed. The EQD are CPU related as there is one EQD per couple (cpu, 
>> priority). The KVM patchset dumps/restores the eight XiveEQ struct 
>> using per cpu ioctls. The EQ in the OS RAM is marked dirty at that
>> stage.
> 
> To make sure I'm clear: for PAPR there's a strict relationship between
> EQD and CPU (one EQD for each (cpu, priority) tuple).  

Yes.

> But for powernv that's not the case, right?  

It is.

> AIUI the mapping of EQs to cpus was configurable, is that right?

Each cpu has 8 EQDs. Same for virtual cpus.

I am not sure what you understood before? It is surely something
I wrote; my XIVE understanding is still making progress.


C.
David Gibson May 4, 2018, 5:19 a.m. UTC | #7
On Thu, May 03, 2018 at 04:37:29PM +0200, Cédric Le Goater wrote:
> On 05/03/2018 08:25 AM, David Gibson wrote:
> > On Thu, May 03, 2018 at 08:07:54AM +0200, Cédric Le Goater wrote:
> >> On 05/03/2018 07:45 AM, David Gibson wrote:
> >>> On Thu, Apr 26, 2018 at 11:48:06AM +0200, Cédric Le Goater wrote:
> >>>> On 04/26/2018 09:25 AM, David Gibson wrote:
> >>>>> On Thu, Apr 19, 2018 at 02:43:03PM +0200, Cédric Le Goater wrote:
> >>>>>> The Event Queue Descriptor (EQD) table is an internal table of the
> >>>>>> XIVE routing sub-engine. It specifies on which Event Queue the event
> >>>>>> data should be posted when an exception occurs (later on pulled by the
> >>>>>> OS) and which Virtual Processor to notify.
> >>>>>
> >>>>> Uhhh.. I thought the IVT said which queue and vp to notify, and the
> >>>>> EQD gave metadata for event queues.
> >>>>
> >>>> yes. the above poorly written. The Event Queue Descriptor contains the
> >>>> guest address of the event queue in which the data is written. I will 
> >>>> rephrase.      
> >>>>
> >>>> The IVT contains IVEs which indeed define for an IRQ which EQ to notify 
> >>>> and what data to push on the queue. 
> >>>>  
> >>>>>> The Event Queue is a much
> >>>>>> more complex structure but we start with a simple model for the sPAPR
> >>>>>> machine.
> >>>>>>
> >>>>>> There is one XiveEQ per priority and these are stored under the XIVE
> >>>>>> virtualization presenter (sPAPRXiveNVT). EQs are simply indexed with :
> >>>>>>
> >>>>>>        (server << 3) | (priority & 0x7)
> >>>>>>
> >>>>>> This is not in the XIVE architecture but as the EQ index is never
> >>>>>> exposed to the guest, in the hcalls nor in the device tree, we are
> >>>>>> free to use what fits best the current model.
> >>>>
> >>>> This EQ indexing is important to notice because it will also show up 
> >>>> in KVM to build the IVE from the KVM irq state.
> >>>
> >>> Ok, are you saying that while this combined EQ index will never appear
> >>> in guest <-> host interfaces, 
> >>
> >> Indeed.
> >>
> >>> it might show up in qemu <-> KVM interfaces?
> >>
> >> Not directly but it is part of the IVE as the IVE_EQ_INDEX field. When
> >> dumped, it has to be built in some ways, compatible with the emulated 
> >> mode in QEMU. 
> > 
> > Hrm.  But is the exact IVE contents visible to qemu (for a PAPR
> > guest)?  
> 
> The guest only uses hcalls which arguments are :
>  
> 	- cpu numbers,
> 	- priority numbers from defined ranges, 
> 	- logical interrupt numbers.  
> 	- physical address of the EQ 
> 
> The visible parts for the guest of the IVE are the 'priority', the 'cpu', 
> and the 'eisn', which is the effective IRQ number the guest is assigning 
> to the source. The 'eisn" will be pushed in the EQ.

Ok.

> The IVE EQ index is not visible.

Good.

> > I would have thought the qemu <-> KVM interfaces would have
> > abstracted this the same way the guest <-> KVM interfaces do.
> > Or is there a reason not to?
> 
> It is practical to dump 64bit IVEs directly from KVM into the QEMU 
> internal structures because it fits the emulated mode without doing 
> any translation ... This might be seen as a shortcut. You will tell 
> me when you reach the KVM part.   

Ugh.. exposing to qemu the raw IVEs sounds like a bad idea to me.
When we migrate, we're going to have to assign the guest (server,
priority) tuples to host EQ indices, and I think it makes more sense
to do that in KVM and hide the raw indices from qemu than to have qemu
mangle them explicitly on migration.

> >>>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >>>>>
> >>>>> Is the EQD actually modifiable by a guest?  Or are the settings of the
> >>>>> EQs fixed by PAPR?
> >>>>
> >>>> The guest uses the H_INT_SET_QUEUE_CONFIG hcall to define the address
> >>>> of the event queue for a couple prio/server.
> >>>
> >>> Ok, so the EQD can be modified by the guest.  In which case we need to
> >>> work out what object owns it, since it'll need to migrate it.
> >>
> >> Indeed. The EQD are CPU related as there is one EQD per couple (cpu, 
> >> priority). The KVM patchset dumps/restores the eight XiveEQ struct 
> >> using per cpu ioctls. The EQ in the OS RAM is marked dirty at that
> >> stage.
> > 
> > To make sure I'm clear: for PAPR there's a strict relationship between
> > EQD and CPU (one EQD for each (cpu, priority) tuple).  
> 
> Yes.
> 
> > But for powernv that's not the case, right?  
> 
> It is.

Uh.. I don't think either of us phrased that well, I'm still not sure
which way you're answering that.

> > AIUI the mapping of EQs to cpus was configurable, is that right?
> 
> Each cpu has 8 EQD. Same for virtual cpus.

Hmm.. but is that 8 EQD per cpu something built into the hardware, or
just a convention of how the host kernel and OPAL operate?

> 
> I am not sure what you understood before ? It is surely something
> I wrote, my XIVE understanding is still making progress.
> 
> 
> C.
>
Cédric Le Goater May 4, 2018, 1:29 p.m. UTC | #8
On 05/04/2018 07:19 AM, David Gibson wrote:
> On Thu, May 03, 2018 at 04:37:29PM +0200, Cédric Le Goater wrote:
>> On 05/03/2018 08:25 AM, David Gibson wrote:
>>> On Thu, May 03, 2018 at 08:07:54AM +0200, Cédric Le Goater wrote:
>>>> On 05/03/2018 07:45 AM, David Gibson wrote:
>>>>> On Thu, Apr 26, 2018 at 11:48:06AM +0200, Cédric Le Goater wrote:
>>>>>> On 04/26/2018 09:25 AM, David Gibson wrote:
>>>>>>> On Thu, Apr 19, 2018 at 02:43:03PM +0200, Cédric Le Goater wrote:
>>>>>>>> The Event Queue Descriptor (EQD) table is an internal table of the
>>>>>>>> XIVE routing sub-engine. It specifies on which Event Queue the event
>>>>>>>> data should be posted when an exception occurs (later on pulled by the
>>>>>>>> OS) and which Virtual Processor to notify.
>>>>>>>
>>>>>>> Uhhh.. I thought the IVT said which queue and vp to notify, and the
>>>>>>> EQD gave metadata for event queues.
>>>>>>
>>>>>> yes. the above poorly written. The Event Queue Descriptor contains the
>>>>>> guest address of the event queue in which the data is written. I will 
>>>>>> rephrase.      
>>>>>>
>>>>>> The IVT contains IVEs which indeed define for an IRQ which EQ to notify 
>>>>>> and what data to push on the queue. 
>>>>>>  
>>>>>>>> The Event Queue is a much
>>>>>>>> more complex structure but we start with a simple model for the sPAPR
>>>>>>>> machine.
>>>>>>>>
>>>>>>>> There is one XiveEQ per priority and these are stored under the XIVE
>>>>>>>> virtualization presenter (sPAPRXiveNVT). EQs are simply indexed with :
>>>>>>>>
>>>>>>>>        (server << 3) | (priority & 0x7)
>>>>>>>>
>>>>>>>> This is not in the XIVE architecture but as the EQ index is never
>>>>>>>> exposed to the guest, in the hcalls nor in the device tree, we are
>>>>>>>> free to use what fits best the current model.
>>>>>>
>>>>>> This EQ indexing is important to notice because it will also show up 
>>>>>> in KVM to build the IVE from the KVM irq state.
>>>>>
>>>>> Ok, are you saying that while this combined EQ index will never appear
>>>>> in guest <-> host interfaces, 
>>>>
>>>> Indeed.
>>>>
>>>>> it might show up in qemu <-> KVM interfaces?
>>>>
>>>> Not directly but it is part of the IVE as the IVE_EQ_INDEX field. When
>>>> dumped, it has to be built in some ways, compatible with the emulated 
>>>> mode in QEMU. 
>>>
>>> Hrm.  But is the exact IVE contents visible to qemu (for a PAPR
>>> guest)?  
>>
>> The guest only uses hcalls which arguments are :
>>  
>> 	- cpu numbers,
>> 	- priority numbers from defined ranges, 
>> 	- logical interrupt numbers.  
>> 	- physical address of the EQ 
>>
>> The visible parts for the guest of the IVE are the 'priority', the 'cpu', 
>> and the 'eisn', which is the effective IRQ number the guest is assigning 
>> to the source. The 'eisn" will be pushed in the EQ.
> 
> Ok.
> 
>> The IVE EQ index is not visible.
> 
> Good.
> 
>>> I would have thought the qemu <-> KVM interfaces would have
>>> abstracted this the same way the guest <-> KVM interfaces do.
>>> Or is there a reason not to?
>>
>> It is practical to dump 64bit IVEs directly from KVM into the QEMU 
>> internal structures because it fits the emulated mode without doing 
>> any translation ... This might be seen as a shortcut. You will tell 
>> me when you reach the KVM part.   
> 
> Ugh.. exposing to qemu the raw IVEs sounds like a bad idea to me.

You definitely need to in QEMU emulation mode. The whole routing
relies on it.

> When we migrate, we're going to have to assign the guest (server,
> priority) tuples to host EQ indicies, and I think it makes more sense
> to do that in KVM and hide the raw indices from qemu than to have qemu
> mangle them explicitly on migration.

We will need some mangling mechanism for the KVM ioctls that save and
restore state. This is very similar to XICS.
 
>>>>>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>>>>>>
>>>>>>> Is the EQD actually modifiable by a guest?  Or are the settings of the
>>>>>>> EQs fixed by PAPR?
>>>>>>
>>>>>> The guest uses the H_INT_SET_QUEUE_CONFIG hcall to define the address
>>>>>> of the event queue for a couple prio/server.
>>>>>
>>>>> Ok, so the EQD can be modified by the guest.  In which case we need to
>>>>> work out what object owns it, since it'll need to migrate it.
>>>>
>>>> Indeed. The EQD are CPU related as there is one EQD per couple (cpu, 
>>>> priority). The KVM patchset dumps/restores the eight XiveEQ struct 
>>>> using per cpu ioctls. The EQ in the OS RAM is marked dirty at that
>>>> stage.
>>>
>>> To make sure I'm clear: for PAPR there's a strict relationship between
>>> EQD and CPU (one EQD for each (cpu, priority) tuple).  
>>
>> Yes.
>>
>>> But for powernv that's not the case, right?  
>>
>> It is.
> 
> Uh.. I don't think either of us phrased that well, I'm still not sure
> which way you're answering that.

There's a strict relationship between EQD and CPU (one EQD for each (cpu, priority) tuple) in both sPAPR and powernv.

>>> AIUI the mapping of EQs to cpus was configurable, is that right?
>>
>> Each cpu has 8 EQD. Same for virtual cpus.
> 
> Hmm.. but is that 8 EQD per cpu something built into the hardware, or
> just a convention of how the host kernel and OPAL operate?

It's not in the HW; it is used by the HW to route the notification.
The EQD contains the EQ characteristics:

* functional bits:
  - valid bit
  - enqueue bit, to update OS in RAM EQ or not
  - unconditional notification
  - backlog
  - escalation
  - ...
* OS EQ fields 
  - physical address
  - entry index
  - toggle bit
* NVT fields
  - block/chip
  - index
* etc.

It's a big structure: 8 words.
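
For reference, the OS EQ fields above are the ones xive_eq_pic_print_info()
in this patch decodes from the XiveEQ words:

    uint64_t qaddr  = ((uint64_t)(eq->w2 & 0x0fffffff) << 32) | eq->w3; /* physical address */
    uint32_t qindex = GETFIELD(EQ_W1_PAGE_OFF, eq->w1);                 /* entry index */
    uint32_t qgen   = GETFIELD(EQ_W1_GENERATION, eq->w1);               /* toggle bit */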

The EQD table is allocated by OPAL/skiboot and fed to the HW for
its use. The powernv OS uses OPAL calls to configure the EQD with its
needs:

int64_t opal_xive_set_queue_info(uint64_t vp, uint32_t prio,
				 uint64_t qpage,
				 uint64_t qsize,
				 uint64_t qflags);


sPAPR uses an hcall:

static long plpar_int_set_queue_config(unsigned long flags,
				       unsigned long target,
				       unsigned long priority,
				       unsigned long qpage,
				       unsigned long qsize)


but it is translated into an OPAL call in KVM.

C.

 
>  
>>
>> I am not sure what you understood before ? It is surely something
>> I wrote, my XIVE understanding is still making progress.
>>
>>
>> C.
>>
>
David Gibson May 5, 2018, 4:29 a.m. UTC | #9
On Fri, May 04, 2018 at 03:29:02PM +0200, Cédric Le Goater wrote:
> On 05/04/2018 07:19 AM, David Gibson wrote:
> > On Thu, May 03, 2018 at 04:37:29PM +0200, Cédric Le Goater wrote:
> >> On 05/03/2018 08:25 AM, David Gibson wrote:
> >>> On Thu, May 03, 2018 at 08:07:54AM +0200, Cédric Le Goater wrote:
> >>>> On 05/03/2018 07:45 AM, David Gibson wrote:
> >>>>> On Thu, Apr 26, 2018 at 11:48:06AM +0200, Cédric Le Goater wrote:
> >>>>>> On 04/26/2018 09:25 AM, David Gibson wrote:
> >>>>>>> On Thu, Apr 19, 2018 at 02:43:03PM +0200, Cédric Le Goater wrote:
> >>>>>>>> The Event Queue Descriptor (EQD) table is an internal table of the
> >>>>>>>> XIVE routing sub-engine. It specifies on which Event Queue the event
> >>>>>>>> data should be posted when an exception occurs (later on pulled by the
> >>>>>>>> OS) and which Virtual Processor to notify.
> >>>>>>>
> >>>>>>> Uhhh.. I thought the IVT said which queue and vp to notify, and the
> >>>>>>> EQD gave metadata for event queues.
> >>>>>>
> >>>>>> yes. the above poorly written. The Event Queue Descriptor contains the
> >>>>>> guest address of the event queue in which the data is written. I will 
> >>>>>> rephrase.      
> >>>>>>
> >>>>>> The IVT contains IVEs which indeed define for an IRQ which EQ to notify 
> >>>>>> and what data to push on the queue. 
> >>>>>>  
> >>>>>>>> The Event Queue is a much
> >>>>>>>> more complex structure but we start with a simple model for the sPAPR
> >>>>>>>> machine.
> >>>>>>>>
> >>>>>>>> There is one XiveEQ per priority and these are stored under the XIVE
> >>>>>>>> virtualization presenter (sPAPRXiveNVT). EQs are simply indexed with :
> >>>>>>>>
> >>>>>>>>        (server << 3) | (priority & 0x7)
> >>>>>>>>
> >>>>>>>> This is not in the XIVE architecture but as the EQ index is never
> >>>>>>>> exposed to the guest, in the hcalls nor in the device tree, we are
> >>>>>>>> free to use what fits best the current model.
> >>>>>>
> >>>>>> This EQ indexing is important to notice because it will also show up 
> >>>>>> in KVM to build the IVE from the KVM irq state.
> >>>>>
> >>>>> Ok, are you saying that while this combined EQ index will never appear
> >>>>> in guest <-> host interfaces, 
> >>>>
> >>>> Indeed.
> >>>>
> >>>>> it might show up in qemu <-> KVM interfaces?
> >>>>
> >>>> Not directly but it is part of the IVE as the IVE_EQ_INDEX field. When
> >>>> dumped, it has to be built in some ways, compatible with the emulated 
> >>>> mode in QEMU. 
> >>>
> >>> Hrm.  But is the exact IVE contents visible to qemu (for a PAPR
> >>> guest)?  
> >>
> >> The guest only uses hcalls which arguments are :
> >>  
> >> 	- cpu numbers,
> >> 	- priority numbers from defined ranges, 
> >> 	- logical interrupt numbers.  
> >> 	- physical address of the EQ 
> >>
> >> The visible parts for the guest of the IVE are the 'priority', the 'cpu', 
> >> and the 'eisn', which is the effective IRQ number the guest is assigning 
> >> to the source. The 'eisn" will be pushed in the EQ.
> > 
> > Ok.
> > 
> >> The IVE EQ index is not visible.
> > 
> > Good.
> > 
> >>> I would have thought the qemu <-> KVM interfaces would have
> >>> abstracted this the same way the guest <-> KVM interfaces do.
> >>> Or is there a reason not to?
> >>
> >> It is practical to dump 64bit IVEs directly from KVM into the QEMU 
> >> internal structures because it fits the emulated mode without doing 
> >> any translation ... This might be seen as a shortcut. You will tell 
> >> me when you reach the KVM part.   
> > 
> > Ugh.. exposing to qemu the raw IVEs sounds like a bad idea to me.
> 
> You definitely need to in QEMU in emulation mode. The whole routing 
> relies on it. 

I'm not exactly sure what you mean by "emulation mode" here.  Above,
I'm talking specifically about a KVM HV, PAPR guest.

> > When we migrate, we're going to have to assign the guest (server,
> > priority) tuples to host EQ indicies, and I think it makes more sense
> > to do that in KVM and hide the raw indices from qemu than to have qemu
> > mangle them explicitly on migration.
> 
> We will need some mangling mechanism for the KVM ioctls saving and
> restoring state. This is very similar to XICS. 
>  
> >>>>>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >>>>>>>
> >>>>>>> Is the EQD actually modifiable by a guest?  Or are the settings of the
> >>>>>>> EQs fixed by PAPR?
> >>>>>>
> >>>>>> The guest uses the H_INT_SET_QUEUE_CONFIG hcall to define the address
> >>>>>> of the event queue for a couple prio/server.
> >>>>>
> >>>>> Ok, so the EQD can be modified by the guest.  In which case we need to
> >>>>> work out what object owns it, since it'll need to migrate it.
> >>>>
> >>>> Indeed. The EQD are CPU related as there is one EQD per couple (cpu, 
> >>>> priority). The KVM patchset dumps/restores the eight XiveEQ struct 
> >>>> using per cpu ioctls. The EQ in the OS RAM is marked dirty at that
> >>>> stage.
> >>>
> >>> To make sure I'm clear: for PAPR there's a strict relationship between
> >>> EQD and CPU (one EQD for each (cpu, priority) tuple).  
> >>
> >> Yes.
> >>
> >>> But for powernv that's not the case, right?  
> >>
> >> It is.
> > 
> > Uh.. I don't think either of us phrased that well, I'm still not sure
> > which way you're answering that.
> 
> there's a strict relationship between EQD and CPU (one EQD for each (cpu, priority) tuple) in spapr and in powernv.

For powernv that seems to be contradicted by what you say below.
AFAICT there might be a strict association at the host kernel or even
the OPAL level, but not at the hardware level.

> >>> AIUI the mapping of EQs to cpus was configurable, is that right?
> >>
> >> Each cpu has 8 EQD. Same for virtual cpus.
> > 
> > Hmm.. but is that 8 EQD per cpu something built into the hardware, or
> > just a convention of how the host kernel and OPAL operate?
> 
> It's not in the HW, it is used by the HW to route the notification. 
> The EQD contains the EQ characteristics :
> 
> * functional bits :
>   - valid bit
>   - enqueue bit, to update OS in RAM EQ or not
>   - unconditional notification
>   - backlog
>   - escalation
>   - ...
> * OS EQ fields 
>   - physical address
>   - entry index
>   - toggle bit
> * NVT fields
>   - block/chip
>   - index
> * etc.
> 
> It's a big structure : 8 words.

Ok.  So yeah, the cpu association of the EQ is there in the NVT
fields, not baked into the hardware.

> The EQD table is allocated by OPAL/skiboot and fed to the HW for
> its use. The OS powernv uses OPAL calls  configure the EQD with its 
> needs : 
> 
> int64_t opal_xive_set_queue_info(uint64_t vp, uint32_t prio,
> 				 uint64_t qpage,
> 				 uint64_t qsize,
> 				 uint64_t qflags);
> 
> 
> sPAPR uses an hcall :
> 
> static long plpar_int_set_queue_config(unsigned long flags,
> 				       unsigned long target,
> 				       unsigned long priority,
> 				       unsigned long qpage,
> 				       unsigned long qsize)
> 
> 
> but it is translated into an OPAL call in KVM.
> 
> C.
> 
>  
> >  
> >>
> >> I am not sure what you understood before ? It is surely something
> >> I wrote, my XIVE understanding is still making progress.
> >>
> >>
> >> C.
> >>
> > 
>
Cédric Le Goater May 9, 2018, 8:01 a.m. UTC | #10
On 05/05/2018 06:29 AM, David Gibson wrote:
> On Fri, May 04, 2018 at 03:29:02PM +0200, Cédric Le Goater wrote:
>> On 05/04/2018 07:19 AM, David Gibson wrote:
>>> On Thu, May 03, 2018 at 04:37:29PM +0200, Cédric Le Goater wrote:
>>>> On 05/03/2018 08:25 AM, David Gibson wrote:
>>>>> On Thu, May 03, 2018 at 08:07:54AM +0200, Cédric Le Goater wrote:
>>>>>> On 05/03/2018 07:45 AM, David Gibson wrote:
>>>>>>> On Thu, Apr 26, 2018 at 11:48:06AM +0200, Cédric Le Goater wrote:
>>>>>>>> On 04/26/2018 09:25 AM, David Gibson wrote:
>>>>>>>>> On Thu, Apr 19, 2018 at 02:43:03PM +0200, Cédric Le Goater wrote:
>>>>>>>>>> The Event Queue Descriptor (EQD) table is an internal table of the
>>>>>>>>>> XIVE routing sub-engine. It specifies on which Event Queue the event
>>>>>>>>>> data should be posted when an exception occurs (later on pulled by the
>>>>>>>>>> OS) and which Virtual Processor to notify.
>>>>>>>>>
>>>>>>>>> Uhhh.. I thought the IVT said which queue and vp to notify, and the
>>>>>>>>> EQD gave metadata for event queues.
>>>>>>>>
>>>>>>>> yes, the above is poorly written. The Event Queue Descriptor contains the
>>>>>>>> guest address of the event queue in which the data is written. I will
>>>>>>>> rephrase.
>>>>>>>>
>>>>>>>> The IVT contains IVEs which indeed define for an IRQ which EQ to notify 
>>>>>>>> and what data to push on the queue. 
>>>>>>>>  
>>>>>>>>>> The Event Queue is a much
>>>>>>>>>> more complex structure but we start with a simple model for the sPAPR
>>>>>>>>>> machine.
>>>>>>>>>>
>>>>>>>>>> There is one XiveEQ per priority and these are stored under the XIVE
>>>>>>>>>> virtualization presenter (sPAPRXiveNVT). EQs are simply indexed with :
>>>>>>>>>>
>>>>>>>>>>        (server << 3) | (priority & 0x7)
>>>>>>>>>>
>>>>>>>>>> This is not in the XIVE architecture but as the EQ index is never
>>>>>>>>>> exposed to the guest, in the hcalls nor in the device tree, we are
>>>>>>>>>> free to use what fits best the current model.
>>>>>>>>
>>>>>>>> This EQ indexing is important to notice because it will also show up 
>>>>>>>> in KVM to build the IVE from the KVM irq state.
>>>>>>>
>>>>>>> Ok, are you saying that while this combined EQ index will never appear
>>>>>>> in guest <-> host interfaces, 
>>>>>>
>>>>>> Indeed.
>>>>>>
>>>>>>> it might show up in qemu <-> KVM interfaces?
>>>>>>
>>>>>> Not directly, but it is part of the IVE as the IVE_EQ_INDEX field. When
>>>>>> dumped, it has to be built in some way compatible with the emulated
>>>>>> mode in QEMU.
>>>>>
>>>>> Hrm.  But is the exact IVE contents visible to qemu (for a PAPR
>>>>> guest)?  
>>>>
>>>> The guest only uses hcalls whose arguments are:
>>>>  
>>>> 	- cpu numbers,
>>>> 	- priority numbers from defined ranges, 
>>>> 	- logical interrupt numbers.  
>>>> 	- physical address of the EQ 
>>>>
>>>> The parts of the IVE visible to the guest are the 'priority', the 'cpu',
>>>> and the 'eisn', which is the effective IRQ number the guest is assigning
>>>> to the source. The 'eisn' will be pushed into the EQ.
>>>
>>> Ok.
>>>
>>>> The IVE EQ index is not visible.
>>>
>>> Good.
>>>
>>>>> I would have thought the qemu <-> KVM interfaces would have
>>>>> abstracted this the same way the guest <-> KVM interfaces do.  Or is there a reason not to?
>>>>
>>>> It is practical to dump 64bit IVEs directly from KVM into the QEMU 
>>>> internal structures because it fits the emulated mode without doing 
>>>> any translation ... This might be seen as a shortcut. You will tell 
>>>> me when you reach the KVM part.   
>>>
>>> Ugh.. exposing to qemu the raw IVEs sounds like a bad idea to me.
>>
>> You definitely need them in QEMU in emulation mode. The whole routing
>> relies on them.
> 
> I'm not exactly sure what you mean by "emulation mode" here.  Above,
> I'm talking specifically about a KVM HV, PAPR guest.

ah ok. I understand. 

KVM does not manipulate raw IVEs. Only OPAL manipulates the raw 
XIVE structures. But as the emulation mode under QEMU needs to 
also manipulate these structures, it seemed practical to use raw 
XIVE structures to transfer the state from KVM to QEMU. 

But it might not be such a great idea. I suppose we should define
a QEMU/KVM format for the exchanges with KVM and then, inside QEMU,
translate from that QEMU/KVM format to the XIVE format, the XIVE
format being the one used for migration.
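
Something along these lines is what I would sketch for the QEMU side of
that translation (all names below are hypothetical, and SETFIELD is assumed
to exist as the counterpart of the GETFIELD helper the patch already uses):

/* hypothetical QEMU/KVM exchange format for one EQ, per (server, prio) */
typedef struct KVMXiveEQState {
    uint32_t server;    /* guest CPU number */
    uint8_t  priority;  /* 0 .. XIVE_PRIORITY_MAX */
    uint64_t qpage;     /* guest physical address of the OS event queue */
    uint8_t  qsize;     /* EQ_QSIZE_* encoding of the queue size */
} KVMXiveEQState;

static void spapr_xive_eq_from_kvm(sPAPRXive *xive, const KVMXiveEQState *s)
{
    /* rebuild the emulated EQD; the EQ index never leaves QEMU */
    uint32_t eq_idx = SPAPR_XIVE_EQ_INDEX(s->server, s->priority);
    XiveEQ *eq = xive_fabric_get_eq(XIVE_FABRIC(xive), eq_idx);

    if (!eq) {
        return;
    }

    eq->w0 = EQ_W0_VALID | EQ_W0_ENQUEUE |
             SETFIELD(EQ_W0_QSIZE, 0, s->qsize);
    eq->w2 = (s->qpage >> 32) & 0x0fffffff;  /* same split as the print helper */
    eq->w3 = s->qpage & 0xffffffff;
}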


>>> When we migrate, we're going to have to assign the guest (server,
>>> priority) tuples to host EQ indicies, and I think it makes more sense
>>> to do that in KVM and hide the raw indices from qemu than to have qemu
>>> mangle them explicitly on migration.
>>
>> We will need some mangling mechanism for the KVM ioctls saving and
>> restoring state. This is very similar to XICS. 
>>  
>>>>>>>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>>>>>>>>
>>>>>>>>> Is the EQD actually modifiable by a guest?  Or are the settings of the
>>>>>>>>> EQs fixed by PAPR?
>>>>>>>>
>>>>>>>> The guest uses the H_INT_SET_QUEUE_CONFIG hcall to define the address
>>>>>>>> of the event queue for a prio/server pair.
>>>>>>>
>>>>>>> Ok, so the EQD can be modified by the guest.  In which case we need to
>>>>>>> work out what object owns it, since it'll need to migrate it.
>>>>>>
>>>>>> Indeed. The EQDs are CPU-related as there is one EQD per (cpu,
>>>>>> priority) pair. The KVM patchset dumps/restores the eight XiveEQ
>>>>>> structs using per-cpu ioctls. The EQ in the OS RAM is marked dirty
>>>>>> at that stage.
>>>>>
>>>>> To make sure I'm clear: for PAPR there's a strict relationship between
>>>>> EQD and CPU (one EQD for each (cpu, priority) tuple).  
>>>>
>>>> Yes.
>>>>
>>>>> But for powernv that's not the case, right?  
>>>>
>>>> It is.
>>>
>>> Uh.. I don't think either of us phrased that well, I'm still not sure
>>> which way you're answering that.
>>
>> there's a strict relationship between EQD and CPU (one EQD for each (cpu, priority) tuple) in spapr and in powernv.
> 
> For powernv that seems to be contradicted by what you say below.

ok. I see what you mean. There is a difference for the hypervisor when
guests are running. As QEMU PowerNV does not support guests (yet),
we can start the model with a strict relationship between EQD and
CPU.

But it's not the case when guests are running, because the EQD refers
to an NVT/VP, which can be a virtual processor or a group of them.

The current model takes a shortcut; the CPU list should be scanned
to find matching CAM lines (W2 in the TIMA). I need to take a closer
look for powernv, even if it is not strictly needed for the model
without guests.
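
Very roughly, such a scan could look like the following (the TM_WORD2
offset and the CAM line layout are assumptions of mine here, nothing this
patch defines):

/* rough sketch only: find the vCPU whose OS CAM line matches the EQ's NVT */
static XiveNVT *xive_nvt_find_cam(uint32_t cam)
{
    CPUState *cs;

    CPU_FOREACH(cs) {
        PowerPCCPU *cpu = POWERPC_CPU(cs);
        XiveNVT *nvt = XIVE_NVT(cpu->intc);
        /* W2 of the OS ring is assumed to hold the OS CAM line */
        uint32_t qw1w2 = ldl_be_p(&nvt->regs[TM_QW1_OS + TM_WORD2]);

        if (qw1w2 == cam) {
            return nvt;
        }
    }
    return NULL;
}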

> AFAICT there might be a strict association at the host kernel or even
> the OPAL level, but not at the hardware level.
> 
>>>>> AIUI the mapping of EQs to cpus was configurable, is that right?
>>>>
>>>> Each cpu has 8 EQD. Same for virtual cpus.
>>>
>>> Hmm.. but is that 8 EQD per cpu something built into the hardware, or
>>> just a convention of how the host kernel and OPAL operate?
>>
>> It's not baked into the HW; the EQD is used by the HW to route the
>> notification. The EQD contains the EQ characteristics:
>>
>> * functional bits :
>>   - valid bit
>>   - enqueue bit, to update the OS EQ in RAM or not
>>   - unconditional notification
>>   - backlog
>>   - escalation
>>   - ...
>> * OS EQ fields 
>>   - physical address
>>   - entry index
>>   - toggle bit
>> * NVT fields
>>   - block/chip
>>   - index
>> * etc.
>>
>> It's a big structure: 8 words.
> 
> Ok.  So yeah, the cpu association of the EQ is there in the NVT
> fields, not baked into the hardware.

yes.

C. 

>> The EQD table is allocated by OPAL/skiboot and fed to the HW for
>> its use. The powernv OS uses OPAL calls to configure the EQD with
>> its needs:
>>
>> int64_t opal_xive_set_queue_info(uint64_t vp, uint32_t prio,
>> 				 uint64_t qpage,
>> 				 uint64_t qsize,
>> 				 uint64_t qflags);
>>
>>
>> sPAPR uses an hcall :
>>
>> static long plpar_int_set_queue_config(unsigned long flags,
>> 				       unsigned long target,
>> 				       unsigned long priority,
>> 				       unsigned long qpage,
>> 				       unsigned long qsize)
>>
>>
>> but it is translated into an OPAL call in KVM.
>>
>> C.
>>
>>  
>>>  
>>>>
>>>> I am not sure what you understood before ? It is surely something
>>>> I wrote, my XIVE understanding is still making progress.
>>>>
>>>>
>>>> C.
>>>>
>>>
>>
>
diff mbox series

Patch

diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
index f07832bf0a00..d0d5a7d7f969 100644
--- a/hw/intc/spapr_xive.c
+++ b/hw/intc/spapr_xive.c
@@ -27,15 +27,30 @@  void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon)
     monitor_printf(mon, "IVE Table\n");
     for (i = 0; i < xive->nr_irqs; i++) {
         XiveIVE *ive = &xive->ivt[i];
+        uint32_t eq_idx;
 
         if (!(ive->w & IVE_VALID)) {
             continue;
         }
 
-        monitor_printf(mon, "  %4x %s %08x %08x\n", i,
-                       ive->w & IVE_MASKED ? "M" : " ",
-                       (int) GETFIELD(IVE_EQ_INDEX, ive->w),
-                       (int) GETFIELD(IVE_EQ_DATA, ive->w));
+        eq_idx = GETFIELD(IVE_EQ_INDEX, ive->w);
+
+        monitor_printf(mon, "  %6x %s eqidx:%03d ", i,
+                       ive->w & IVE_MASKED ? "M" : " ", eq_idx);
+
+        if (!(ive->w & IVE_MASKED)) {
+            XiveEQ *eq;
+
+            eq = xive_fabric_get_eq(XIVE_FABRIC(xive), eq_idx);
+            if (eq && (eq->w0 & EQ_W0_VALID)) {
+                xive_eq_pic_print_info(eq, mon);
+                monitor_printf(mon, " data:%08x",
+                               (int) GETFIELD(IVE_EQ_DATA, ive->w));
+            } else {
+                monitor_printf(mon, "no eq ?!");
+            }
+        }
+        monitor_printf(mon, "\n");
     }
 }
 
@@ -128,6 +143,13 @@  static XiveNVT *spapr_xive_get_nvt(XiveFabric *xf, uint32_t server)
     return cpu ? XIVE_NVT(cpu->intc) : NULL;
 }
 
+static XiveEQ *spapr_xive_get_eq(XiveFabric *xf, uint32_t eq_idx)
+{
+    XiveNVT *nvt = xive_fabric_get_nvt(xf, SPAPR_XIVE_EQ_SERVER(eq_idx));
+
+    return xive_nvt_eq_get(nvt, SPAPR_XIVE_EQ_PRIO(eq_idx));
+}
+
 static const VMStateDescription vmstate_spapr_xive_ive = {
     .name = TYPE_SPAPR_XIVE "/ive",
     .version_id = 1,
@@ -168,6 +190,7 @@  static void spapr_xive_class_init(ObjectClass *klass, void *data)
 
     xfc->get_ive = spapr_xive_get_ive;
     xfc->get_nvt = spapr_xive_get_nvt;
+    xfc->get_eq = spapr_xive_get_eq;
 }
 
 static const TypeInfo spapr_xive_info = {
diff --git a/hw/intc/xive.c b/hw/intc/xive.c
index 5691bb9474e4..2ab37fde80e8 100644
--- a/hw/intc/xive.c
+++ b/hw/intc/xive.c
@@ -19,6 +19,47 @@ 
 #include "hw/ppc/xive_regs.h"
 
 /*
+ * XiveEQ helpers
+ */
+
+XiveEQ *xive_nvt_eq_get(XiveNVT *nvt, uint8_t priority)
+{
+    if (!nvt || priority > XIVE_PRIORITY_MAX) {
+        return NULL;
+    }
+    return &nvt->eqt[priority];
+}
+
+void xive_eq_reset(XiveEQ *eq)
+{
+    memset(eq, 0, sizeof(*eq));
+
+    /* switch off the escalation and notification ESBs */
+    eq->w1 = EQ_W1_ESe_Q | EQ_W1_ESn_Q;
+}
+
+void xive_eq_pic_print_info(XiveEQ *eq, Monitor *mon)
+{
+    uint64_t qaddr_base = (((uint64_t)(eq->w2 & 0x0fffffff)) << 32) | eq->w3;
+    uint32_t qindex = GETFIELD(EQ_W1_PAGE_OFF, eq->w1);
+    uint32_t qgen = GETFIELD(EQ_W1_GENERATION, eq->w1);
+    uint32_t qsize = GETFIELD(EQ_W0_QSIZE, eq->w0);
+    uint32_t qentries = 1 << (qsize + 10);
+
+    uint32_t server = GETFIELD(EQ_W6_NVT_INDEX, eq->w6);
+    uint8_t priority = GETFIELD(EQ_W7_F0_PRIORITY, eq->w7);
+
+    monitor_printf(mon, "%c%c%c%c%c prio:%d server:%03d eq:@%08"PRIx64
+                   "% 6d/%5d ^%d",
+                   eq->w0 & EQ_W0_VALID ? 'v' : '-',
+                   eq->w0 & EQ_W0_ENQUEUE ? 'q' : '-',
+                   eq->w0 & EQ_W0_UCOND_NOTIFY ? 'n' : '-',
+                   eq->w0 & EQ_W0_BACKLOG ? 'b' : '-',
+                   eq->w0 & EQ_W0_ESCALATE_CTL ? 'e' : '-',
+                   priority, server, qaddr_base, qindex, qentries, qgen);
+}
+
+/*
  * XIVE Interrupt Presenter
  */
 
@@ -210,8 +251,12 @@  void xive_nvt_pic_print_info(XiveNVT *nvt, Monitor *mon)
 static void xive_nvt_reset(void *dev)
 {
     XiveNVT *nvt = XIVE_NVT(dev);
+    int i;
 
     memset(nvt->regs, 0, sizeof(nvt->regs));
+    for (i = 0; i < ARRAY_SIZE(nvt->eqt); i++) {
+        xive_eq_reset(&nvt->eqt[i]);
+    }
 }
 
 static void xive_nvt_realize(DeviceState *dev, Error **errp)
@@ -259,12 +304,31 @@  static void xive_nvt_init(Object *obj)
     nvt->ring_os = &nvt->regs[TM_QW1_OS];
 }
 
+static const VMStateDescription vmstate_xive_nvt_eq = {
+    .name = TYPE_XIVE_NVT "/eq",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .fields = (VMStateField []) {
+        VMSTATE_UINT32(w0, XiveEQ),
+        VMSTATE_UINT32(w1, XiveEQ),
+        VMSTATE_UINT32(w2, XiveEQ),
+        VMSTATE_UINT32(w3, XiveEQ),
+        VMSTATE_UINT32(w4, XiveEQ),
+        VMSTATE_UINT32(w5, XiveEQ),
+        VMSTATE_UINT32(w6, XiveEQ),
+        VMSTATE_UINT32(w7, XiveEQ),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
 static const VMStateDescription vmstate_xive_nvt = {
     .name = TYPE_XIVE_NVT,
     .version_id = 1,
     .minimum_version_id = 1,
     .fields = (VMStateField[]) {
         VMSTATE_BUFFER(regs, XiveNVT),
+        VMSTATE_STRUCT_ARRAY(eqt, XiveNVT, (XIVE_PRIORITY_MAX + 1), 1,
+                             vmstate_xive_nvt_eq, XiveEQ),
         VMSTATE_END_OF_LIST()
     },
 };
@@ -305,6 +369,13 @@  XiveNVT *xive_fabric_get_nvt(XiveFabric *xf, uint32_t server)
     return xfc->get_nvt(xf, server);
 }
 
+XiveEQ *xive_fabric_get_eq(XiveFabric *xf, uint32_t eq_idx)
+{
+    XiveFabricClass *xfc = XIVE_FABRIC_GET_CLASS(xf);
+
+    return xfc->get_eq(xf, eq_idx);
+}
+
 static void xive_fabric_route(XiveFabric *xf, int lisn)
 {
 
diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
index 25d78eec884d..7cb3561aa3d3 100644
--- a/include/hw/ppc/spapr_xive.h
+++ b/include/hw/ppc/spapr_xive.h
@@ -36,4 +36,11 @@  bool spapr_xive_irq_enable(sPAPRXive *xive, uint32_t lisn, bool lsi);
 bool spapr_xive_irq_disable(sPAPRXive *xive, uint32_t lisn);
 void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon);
 
+/*
+ * sPAPR encoding of EQ indexes
+ */
+#define SPAPR_XIVE_EQ_INDEX(server, prio)  (((server) << 3) | ((prio) & 0x7))
+#define SPAPR_XIVE_EQ_SERVER(eq_idx) ((eq_idx) >> 3)
+#define SPAPR_XIVE_EQ_PRIO(eq_idx)   ((eq_idx) & 0x7)
+
 #endif /* PPC_SPAPR_XIVE_H */
diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
index 1a2da610d91c..6cc02638c677 100644
--- a/include/hw/ppc/xive.h
+++ b/include/hw/ppc/xive.h
@@ -176,12 +176,18 @@  typedef struct XiveNVT {
 
     /* Shortcuts to rings */
     uint8_t   *ring_os;
+
+    XiveEQ    eqt[XIVE_PRIORITY_MAX + 1];
 } XiveNVT;
 
 extern const MemoryRegionOps xive_tm_user_ops;
 extern const MemoryRegionOps xive_tm_os_ops;
 
 void xive_nvt_pic_print_info(XiveNVT *nvt, Monitor *mon);
+XiveEQ *xive_nvt_eq_get(XiveNVT *nvt, uint8_t priority);
+
+void xive_eq_reset(XiveEQ *eq);
+void xive_eq_pic_print_info(XiveEQ *eq, Monitor *mon);
 
 /*
  * XIVE Fabric
@@ -205,9 +211,11 @@  typedef struct XiveFabricClass {
 
     XiveIVE *(*get_ive)(XiveFabric *xf, uint32_t lisn);
     XiveNVT *(*get_nvt)(XiveFabric *xf, uint32_t server);
+    XiveEQ  *(*get_eq)(XiveFabric *xf, uint32_t eq_idx);
 } XiveFabricClass;
 
 XiveIVE *xive_fabric_get_ive(XiveFabric *xf, uint32_t lisn);
 XiveNVT *xive_fabric_get_nvt(XiveFabric *xf, uint32_t server);
+XiveEQ  *xive_fabric_get_eq(XiveFabric *xf, uint32_t eq_idx);
 
 #endif /* PPC_XIVE_H */
diff --git a/include/hw/ppc/xive_regs.h b/include/hw/ppc/xive_regs.h
index f2e2a1ac8f6e..bcc44e766db9 100644
--- a/include/hw/ppc/xive_regs.h
+++ b/include/hw/ppc/xive_regs.h
@@ -112,6 +112,54 @@  typedef struct XiveIVE {
 #define IVE_EQ_DATA     PPC_BITMASK(33, 63)      /* Data written to the EQ */
 } XiveIVE;
 
+/* EQ */
+typedef struct XiveEQ {
+        uint32_t        w0;
+#define EQ_W0_VALID             PPC_BIT32(0) /* "v" bit */
+#define EQ_W0_ENQUEUE           PPC_BIT32(1) /* "q" bit */
+#define EQ_W0_UCOND_NOTIFY      PPC_BIT32(2) /* "n" bit */
+#define EQ_W0_BACKLOG           PPC_BIT32(3) /* "b" bit */
+#define EQ_W0_PRECL_ESC_CTL     PPC_BIT32(4) /* "p" bit */
+#define EQ_W0_ESCALATE_CTL      PPC_BIT32(5) /* "e" bit */
+#define EQ_W0_UNCOND_ESCALATE   PPC_BIT32(6) /* "u" bit - DD2.0 */
+#define EQ_W0_SILENT_ESCALATE   PPC_BIT32(7) /* "s" bit - DD2.0 */
+#define EQ_W0_QSIZE             PPC_BITMASK32(12, 15)
+#define EQ_W0_SW0               PPC_BIT32(16)
+#define EQ_W0_FIRMWARE          EQ_W0_SW0 /* Owned by FW */
+#define EQ_QSIZE_4K             0
+#define EQ_QSIZE_64K            4
+#define EQ_W0_HWDEP             PPC_BITMASK32(24, 31)
+        uint32_t        w1;
+#define EQ_W1_ESn               PPC_BITMASK32(0, 1)
+#define EQ_W1_ESn_P             PPC_BIT32(0)
+#define EQ_W1_ESn_Q             PPC_BIT32(1)
+#define EQ_W1_ESe               PPC_BITMASK32(2, 3)
+#define EQ_W1_ESe_P             PPC_BIT32(2)
+#define EQ_W1_ESe_Q             PPC_BIT32(3)
+#define EQ_W1_GENERATION        PPC_BIT32(9)
+#define EQ_W1_PAGE_OFF          PPC_BITMASK32(10, 31)
+        uint32_t        w2;
+#define EQ_W2_MIGRATION_REG     PPC_BITMASK32(0, 3)
+#define EQ_W2_OP_DESC_HI        PPC_BITMASK32(4, 31)
+        uint32_t        w3;
+#define EQ_W3_OP_DESC_LO        PPC_BITMASK32(0, 31)
+        uint32_t        w4;
+#define EQ_W4_ESC_EQ_BLOCK      PPC_BITMASK32(4, 7)
+#define EQ_W4_ESC_EQ_INDEX      PPC_BITMASK32(8, 31)
+        uint32_t        w5;
+#define EQ_W5_ESC_EQ_DATA       PPC_BITMASK32(1, 31)
+        uint32_t        w6;
+#define EQ_W6_FORMAT_BIT        PPC_BIT32(8)
+#define EQ_W6_NVT_BLOCK         PPC_BITMASK32(9, 12)
+#define EQ_W6_NVT_INDEX         PPC_BITMASK32(13, 31)
+        uint32_t        w7;
+#define EQ_W7_F0_IGNORE         PPC_BIT32(0)
+#define EQ_W7_F0_BLK_GROUPING   PPC_BIT32(1)
+#define EQ_W7_F0_PRIORITY       PPC_BITMASK32(8, 15)
+#define EQ_W7_F1_WAKEZ          PPC_BIT32(0)
+#define EQ_W7_F1_LOG_SERVER_ID  PPC_BITMASK32(1, 31)
+} XiveEQ;
+
 #define XIVE_PRIORITY_MAX  7
 
 #endif /* _INTC_XIVE_INTERNAL_H */