diff mbox series

[13/25] spapr: introduce the XIVE Event Queues

Message ID 20171123132955.1261-14-clg@kaod.org
State New
Headers show
Series spapr: Guest exploitation of the XIVE interrupt controller (POWER9) | expand

Commit Message

Cédric Le Goater Nov. 23, 2017, 1:29 p.m. UTC
The Event Queue Descriptor (EQD) table, also known as Event Notification
Descriptor (END), is one of the internal tables the XIVE interrupt
controller uses to redirect exceptions from event sources to CPU
threads.

The EQD specifies on which Event Queue the event data should be posted
when an exception occurs (later on pulled by the OS) and which server
(VPD in XIVE terminology) to notify. The Event Queue is a much more
complex structure but we start with a simple model for the sPAPR
machine.

There is one XiveEQ per priority and the model chooses to store them
under the Xive Interrupt presenter model. It will be retrieved, just
like for XICS, through the 'intc' object pointer of the CPU.

The EQ indexing follows a simple pattern:

       (server << 3) | (priority & 0x7)

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 hw/intc/spapr_xive.c    | 56 +++++++++++++++++++++++++++++++++++++++++++++++++
 hw/intc/xive-internal.h | 50 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 106 insertions(+)

Comments

Benjamin Herrenschmidt Nov. 23, 2017, 8:31 p.m. UTC | #1
On Thu, 2017-11-23 at 14:29 +0100, Cédric Le Goater wrote:
> The Event Queue Descriptor (EQD) table, also known as Event Notification
> Descriptor (END), is one of the internal tables the XIVE interrupt
> controller uses to redirect exception from event sources to CPU
> threads.

Keep in mind that we want to only expose to the guest priorities 0..6
as priority 7 will not be available with KVM on DD2.0 chips.

> The EQD specifies on which Event Queue the event data should be posted
> when an exception occurs (later on pulled by the OS) and which server
> (VPD in XIVE terminology) to notify. The Event Queue is a much more
> complex structure but we start with a simple model for the sPAPR
> machine.
> 
> There is one XiveEQ per priority and the model chooses to store them
> under the Xive Interrupt presenter model. It will be retrieved, just
> like for XICS, through the 'intc' object pointer of the CPU.
> 
> The EQ indexing follows a simple pattern:
> 
>        (server << 3) | (priority & 0x7)
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  hw/intc/spapr_xive.c    | 56 +++++++++++++++++++++++++++++++++++++++++++++++++
>  hw/intc/xive-internal.h | 50 +++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 106 insertions(+)
> 
> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> index 554b25e0884c..983317a6b3f6 100644
> --- a/hw/intc/spapr_xive.c
> +++ b/hw/intc/spapr_xive.c
> @@ -23,6 +23,7 @@
>  #include "sysemu/dma.h"
>  #include "monitor/monitor.h"
>  #include "hw/ppc/spapr_xive.h"
> +#include "hw/ppc/spapr.h"
>  #include "hw/ppc/xics.h"
>  
>  #include "xive-internal.h"
> @@ -34,6 +35,8 @@ struct sPAPRXiveICP {
>      uint8_t   tima[TM_RING_COUNT * 0x10];
>      uint8_t   *tima_os;
>      qemu_irq  output;
> +
> +    XiveEQ    eqt[XIVE_PRIORITY_MAX + 1];
>  };
>  
>  static uint64_t spapr_xive_icp_accept(sPAPRXiveICP *icp)
> @@ -183,6 +186,13 @@ static const MemoryRegionOps spapr_xive_tm_ops = {
>      },
>  };
>  
> +static sPAPRXiveICP *spapr_xive_icp_get(sPAPRXive *xive, int server)
> +{
> +    PowerPCCPU *cpu = spapr_find_cpu(server);
> +
> +    return cpu ? SPAPR_XIVE_ICP(cpu->intc) : NULL;
> +}
> +
>  static void spapr_xive_irq(sPAPRXive *xive, int lisn)
>  {
>  
> @@ -632,6 +642,8 @@ static void spapr_xive_icp_reset(void *dev)
>      sPAPRXiveICP *xicp = SPAPR_XIVE_ICP(dev);
>  
>      memset(xicp->tima, 0, sizeof(xicp->tima));
> +
> +    memset(xicp->eqt, 0, sizeof(xicp->eqt));
>  }
>  
>  static void spapr_xive_icp_realize(DeviceState *dev, Error **errp)
> @@ -683,6 +695,23 @@ static void spapr_xive_icp_init(Object *obj)
>      xicp->tima_os = &xicp->tima[TM_QW1_OS];
>  }
>  
> +static const VMStateDescription vmstate_spapr_xive_icp_eq = {
> +    .name = TYPE_SPAPR_XIVE_ICP "/eq",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .fields = (VMStateField []) {
> +        VMSTATE_UINT32(w0, XiveEQ),
> +        VMSTATE_UINT32(w1, XiveEQ),
> +        VMSTATE_UINT32(w2, XiveEQ),
> +        VMSTATE_UINT32(w3, XiveEQ),
> +        VMSTATE_UINT32(w4, XiveEQ),
> +        VMSTATE_UINT32(w5, XiveEQ),
> +        VMSTATE_UINT32(w6, XiveEQ),
> +        VMSTATE_UINT32(w7, XiveEQ),
> +        VMSTATE_END_OF_LIST()
> +    },
> +};
> +
>  static bool vmstate_spapr_xive_icp_needed(void *opaque)
>  {
>      /* TODO check machine XIVE support */
> @@ -696,6 +725,8 @@ static const VMStateDescription vmstate_spapr_xive_icp = {
>      .needed = vmstate_spapr_xive_icp_needed,
>      .fields = (VMStateField[]) {
>          VMSTATE_BUFFER(tima, sPAPRXiveICP),
> +        VMSTATE_STRUCT_ARRAY(eqt, sPAPRXiveICP, (XIVE_PRIORITY_MAX + 1), 1,
> +                             vmstate_spapr_xive_icp_eq, XiveEQ),
>          VMSTATE_END_OF_LIST()
>      },
>  };
> @@ -755,3 +786,28 @@ bool spapr_xive_irq_unset(sPAPRXive *xive, uint32_t lisn)
>      ive->w &= ~IVE_VALID;
>      return true;
>  }
> +
> +/*
> + * Use a simple indexing for the EQs.
> + */
> +XiveEQ *spapr_xive_get_eq(sPAPRXive *xive, uint32_t eq_idx)
> +{
> +    int priority = eq_idx & 0x7;
> +    sPAPRXiveICP *xicp = spapr_xive_icp_get(xive, eq_idx >> 3);
> +
> +    return xicp ? &xicp->eqt[priority] : NULL;
> +}
> +
> +bool spapr_xive_eq_for_server(sPAPRXive *xive, uint32_t server,
> +                              uint8_t priority, uint32_t *out_eq_idx)
> +{
> +    if (priority > XIVE_PRIORITY_MAX) {
> +        return false;
> +    }
> +
> +    if (out_eq_idx) {
> +        *out_eq_idx = (server << 3) | (priority & 0x7);
> +    }
> +
> +    return true;
> +}
> diff --git a/hw/intc/xive-internal.h b/hw/intc/xive-internal.h
> index 7d329f203a9b..c3949671aa03 100644
> --- a/hw/intc/xive-internal.h
> +++ b/hw/intc/xive-internal.h
> @@ -131,9 +131,59 @@ typedef struct XiveIVE {
>  #define IVE_EQ_DATA     PPC_BITMASK(33, 63)      /* Data written to the EQ */
>  } XiveIVE;
>  
> +/* EQ */
> +typedef struct XiveEQ {
> +        uint32_t        w0;
> +#define EQ_W0_VALID             PPC_BIT32(0)
> +#define EQ_W0_ENQUEUE           PPC_BIT32(1)
> +#define EQ_W0_UCOND_NOTIFY      PPC_BIT32(2)
> +#define EQ_W0_BACKLOG           PPC_BIT32(3)
> +#define EQ_W0_PRECL_ESC_CTL     PPC_BIT32(4)
> +#define EQ_W0_ESCALATE_CTL      PPC_BIT32(5)
> +#define EQ_W0_END_OF_INTR       PPC_BIT32(6)
> +#define EQ_W0_QSIZE             PPC_BITMASK32(12, 15)
> +#define EQ_W0_SW0               PPC_BIT32(16)
> +#define EQ_W0_FIRMWARE          EQ_W0_SW0 /* Owned by FW */
> +#define EQ_QSIZE_4K             0
> +#define EQ_QSIZE_64K            4
> +#define EQ_W0_HWDEP             PPC_BITMASK32(24, 31)
> +        uint32_t        w1;
> +#define EQ_W1_ESn               PPC_BITMASK32(0, 1)
> +#define EQ_W1_ESn_P             PPC_BIT32(0)
> +#define EQ_W1_ESn_Q             PPC_BIT32(1)
> +#define EQ_W1_ESe               PPC_BITMASK32(2, 3)
> +#define EQ_W1_ESe_P             PPC_BIT32(2)
> +#define EQ_W1_ESe_Q             PPC_BIT32(3)
> +#define EQ_W1_GENERATION        PPC_BIT32(9)
> +#define EQ_W1_PAGE_OFF          PPC_BITMASK32(10, 31)
> +        uint32_t        w2;
> +#define EQ_W2_MIGRATION_REG     PPC_BITMASK32(0, 3)
> +#define EQ_W2_OP_DESC_HI        PPC_BITMASK32(4, 31)
> +        uint32_t        w3;
> +#define EQ_W3_OP_DESC_LO        PPC_BITMASK32(0, 31)
> +        uint32_t        w4;
> +#define EQ_W4_ESC_EQ_BLOCK      PPC_BITMASK32(4, 7)
> +#define EQ_W4_ESC_EQ_INDEX      PPC_BITMASK32(8, 31)
> +        uint32_t        w5;
> +#define EQ_W5_ESC_EQ_DATA       PPC_BITMASK32(1, 31)
> +        uint32_t        w6;
> +#define EQ_W6_FORMAT_BIT        PPC_BIT32(8)
> +#define EQ_W6_NVT_BLOCK         PPC_BITMASK32(9, 12)
> +#define EQ_W6_NVT_INDEX         PPC_BITMASK32(13, 31)
> +        uint32_t        w7;
> +#define EQ_W7_F0_IGNORE         PPC_BIT32(0)
> +#define EQ_W7_F0_BLK_GROUPING   PPC_BIT32(1)
> +#define EQ_W7_F0_PRIORITY       PPC_BITMASK32(8, 15)
> +#define EQ_W7_F1_WAKEZ          PPC_BIT32(0)
> +#define EQ_W7_F1_LOG_SERVER_ID  PPC_BITMASK32(1, 31)
> +} XiveEQ;
> +
>  #define XIVE_PRIORITY_MAX  7
>  
>  void spapr_xive_reset(void *dev);
>  XiveIVE *spapr_xive_get_ive(sPAPRXive *xive, uint32_t lisn);
> +XiveEQ *spapr_xive_get_eq(sPAPRXive *xive, uint32_t idx);
> +bool spapr_xive_eq_for_server(sPAPRXive *xive, uint32_t server, uint8_t prio,
> +                              uint32_t *out_eq_idx);
>  
>  #endif /* _INTC_XIVE_INTERNAL_H */
Cédric Le Goater Nov. 24, 2017, 8:15 a.m. UTC | #2
On 11/23/2017 09:31 PM, Benjamin Herrenschmidt wrote:
> On Thu, 2017-11-23 at 14:29 +0100, Cédric Le Goater wrote:
>> The Event Queue Descriptor (EQD) table, also known as Event Notification
>> Descriptor (END), is one of the internal tables the XIVE interrupt
>> controller uses to redirect exception from event sources to CPU
>> threads.
> 
> Keep in mind tha we want to only expose to the guest priorities 0..6
> as priority 7 will not be available with KVM on DD2.0 chips.

The hcall patch 19 introduces the "ibm,plat-res-int-priorities" property.
The priority ranges reserved by the hypervisor are the following:

    static const uint32_t reserved_priorities[] = {
        7,    /* start */
        0xf8, /* count */
    };

So the Linux driver is expected to choose priority 6. The priority
validity is then checked in each hcall returning H_P4/H_P3 in case of 
failure.  

But it is true that we scale the arrays with :
 
    #define XIVE_PRIORITY_MAX  7

Do you want QEMU to completely remove prio 7 ? 

C. 

>> The EQD specifies on which Event Queue the event data should be posted
>> when an exception occurs (later on pulled by the OS) and which server
>> (VPD in XIVE terminology) to notify. The Event Queue is a much more
>> complex structure but we start with a simple model for the sPAPR
>> machine.
>>
>> There is one XiveEQ per priority and the model chooses to store them
>> under the Xive Interrupt presenter model. It will be retrieved, just
>> like for XICS, through the 'intc' object pointer of the CPU.
>>
>> The EQ indexing follows a simple pattern:
>>
>>        (server << 3) | (priority & 0x7)
>>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
>>  hw/intc/spapr_xive.c    | 56 +++++++++++++++++++++++++++++++++++++++++++++++++
>>  hw/intc/xive-internal.h | 50 +++++++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 106 insertions(+)
>>
>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
>> index 554b25e0884c..983317a6b3f6 100644
>> --- a/hw/intc/spapr_xive.c
>> +++ b/hw/intc/spapr_xive.c
>> @@ -23,6 +23,7 @@
>>  #include "sysemu/dma.h"
>>  #include "monitor/monitor.h"
>>  #include "hw/ppc/spapr_xive.h"
>> +#include "hw/ppc/spapr.h"
>>  #include "hw/ppc/xics.h"
>>  
>>  #include "xive-internal.h"
>> @@ -34,6 +35,8 @@ struct sPAPRXiveICP {
>>      uint8_t   tima[TM_RING_COUNT * 0x10];
>>      uint8_t   *tima_os;
>>      qemu_irq  output;
>> +
>> +    XiveEQ    eqt[XIVE_PRIORITY_MAX + 1];
>>  };
>>  
>>  static uint64_t spapr_xive_icp_accept(sPAPRXiveICP *icp)
>> @@ -183,6 +186,13 @@ static const MemoryRegionOps spapr_xive_tm_ops = {
>>      },
>>  };
>>  
>> +static sPAPRXiveICP *spapr_xive_icp_get(sPAPRXive *xive, int server)
>> +{
>> +    PowerPCCPU *cpu = spapr_find_cpu(server);
>> +
>> +    return cpu ? SPAPR_XIVE_ICP(cpu->intc) : NULL;
>> +}
>> +
>>  static void spapr_xive_irq(sPAPRXive *xive, int lisn)
>>  {
>>  
>> @@ -632,6 +642,8 @@ static void spapr_xive_icp_reset(void *dev)
>>      sPAPRXiveICP *xicp = SPAPR_XIVE_ICP(dev);
>>  
>>      memset(xicp->tima, 0, sizeof(xicp->tima));
>> +
>> +    memset(xicp->eqt, 0, sizeof(xicp->eqt));
>>  }
>>  
>>  static void spapr_xive_icp_realize(DeviceState *dev, Error **errp)
>> @@ -683,6 +695,23 @@ static void spapr_xive_icp_init(Object *obj)
>>      xicp->tima_os = &xicp->tima[TM_QW1_OS];
>>  }
>>  
>> +static const VMStateDescription vmstate_spapr_xive_icp_eq = {
>> +    .name = TYPE_SPAPR_XIVE_ICP "/eq",
>> +    .version_id = 1,
>> +    .minimum_version_id = 1,
>> +    .fields = (VMStateField []) {
>> +        VMSTATE_UINT32(w0, XiveEQ),
>> +        VMSTATE_UINT32(w1, XiveEQ),
>> +        VMSTATE_UINT32(w2, XiveEQ),
>> +        VMSTATE_UINT32(w3, XiveEQ),
>> +        VMSTATE_UINT32(w4, XiveEQ),
>> +        VMSTATE_UINT32(w5, XiveEQ),
>> +        VMSTATE_UINT32(w6, XiveEQ),
>> +        VMSTATE_UINT32(w7, XiveEQ),
>> +        VMSTATE_END_OF_LIST()
>> +    },
>> +};
>> +
>>  static bool vmstate_spapr_xive_icp_needed(void *opaque)
>>  {
>>      /* TODO check machine XIVE support */
>> @@ -696,6 +725,8 @@ static const VMStateDescription vmstate_spapr_xive_icp = {
>>      .needed = vmstate_spapr_xive_icp_needed,
>>      .fields = (VMStateField[]) {
>>          VMSTATE_BUFFER(tima, sPAPRXiveICP),
>> +        VMSTATE_STRUCT_ARRAY(eqt, sPAPRXiveICP, (XIVE_PRIORITY_MAX + 1), 1,
>> +                             vmstate_spapr_xive_icp_eq, XiveEQ),
>>          VMSTATE_END_OF_LIST()
>>      },
>>  };
>> @@ -755,3 +786,28 @@ bool spapr_xive_irq_unset(sPAPRXive *xive, uint32_t lisn)
>>      ive->w &= ~IVE_VALID;
>>      return true;
>>  }
>> +
>> +/*
>> + * Use a simple indexing for the EQs.
>> + */
>> +XiveEQ *spapr_xive_get_eq(sPAPRXive *xive, uint32_t eq_idx)
>> +{
>> +    int priority = eq_idx & 0x7;
>> +    sPAPRXiveICP *xicp = spapr_xive_icp_get(xive, eq_idx >> 3);
>> +
>> +    return xicp ? &xicp->eqt[priority] : NULL;
>> +}
>> +
>> +bool spapr_xive_eq_for_server(sPAPRXive *xive, uint32_t server,
>> +                              uint8_t priority, uint32_t *out_eq_idx)
>> +{
>> +    if (priority > XIVE_PRIORITY_MAX) {
>> +        return false;
>> +    }
>> +
>> +    if (out_eq_idx) {
>> +        *out_eq_idx = (server << 3) | (priority & 0x7);
>> +    }
>> +
>> +    return true;
>> +}
>> diff --git a/hw/intc/xive-internal.h b/hw/intc/xive-internal.h
>> index 7d329f203a9b..c3949671aa03 100644
>> --- a/hw/intc/xive-internal.h
>> +++ b/hw/intc/xive-internal.h
>> @@ -131,9 +131,59 @@ typedef struct XiveIVE {
>>  #define IVE_EQ_DATA     PPC_BITMASK(33, 63)      /* Data written to the EQ */
>>  } XiveIVE;
>>  
>> +/* EQ */
>> +typedef struct XiveEQ {
>> +        uint32_t        w0;
>> +#define EQ_W0_VALID             PPC_BIT32(0)
>> +#define EQ_W0_ENQUEUE           PPC_BIT32(1)
>> +#define EQ_W0_UCOND_NOTIFY      PPC_BIT32(2)
>> +#define EQ_W0_BACKLOG           PPC_BIT32(3)
>> +#define EQ_W0_PRECL_ESC_CTL     PPC_BIT32(4)
>> +#define EQ_W0_ESCALATE_CTL      PPC_BIT32(5)
>> +#define EQ_W0_END_OF_INTR       PPC_BIT32(6)
>> +#define EQ_W0_QSIZE             PPC_BITMASK32(12, 15)
>> +#define EQ_W0_SW0               PPC_BIT32(16)
>> +#define EQ_W0_FIRMWARE          EQ_W0_SW0 /* Owned by FW */
>> +#define EQ_QSIZE_4K             0
>> +#define EQ_QSIZE_64K            4
>> +#define EQ_W0_HWDEP             PPC_BITMASK32(24, 31)
>> +        uint32_t        w1;
>> +#define EQ_W1_ESn               PPC_BITMASK32(0, 1)
>> +#define EQ_W1_ESn_P             PPC_BIT32(0)
>> +#define EQ_W1_ESn_Q             PPC_BIT32(1)
>> +#define EQ_W1_ESe               PPC_BITMASK32(2, 3)
>> +#define EQ_W1_ESe_P             PPC_BIT32(2)
>> +#define EQ_W1_ESe_Q             PPC_BIT32(3)
>> +#define EQ_W1_GENERATION        PPC_BIT32(9)
>> +#define EQ_W1_PAGE_OFF          PPC_BITMASK32(10, 31)
>> +        uint32_t        w2;
>> +#define EQ_W2_MIGRATION_REG     PPC_BITMASK32(0, 3)
>> +#define EQ_W2_OP_DESC_HI        PPC_BITMASK32(4, 31)
>> +        uint32_t        w3;
>> +#define EQ_W3_OP_DESC_LO        PPC_BITMASK32(0, 31)
>> +        uint32_t        w4;
>> +#define EQ_W4_ESC_EQ_BLOCK      PPC_BITMASK32(4, 7)
>> +#define EQ_W4_ESC_EQ_INDEX      PPC_BITMASK32(8, 31)
>> +        uint32_t        w5;
>> +#define EQ_W5_ESC_EQ_DATA       PPC_BITMASK32(1, 31)
>> +        uint32_t        w6;
>> +#define EQ_W6_FORMAT_BIT        PPC_BIT32(8)
>> +#define EQ_W6_NVT_BLOCK         PPC_BITMASK32(9, 12)
>> +#define EQ_W6_NVT_INDEX         PPC_BITMASK32(13, 31)
>> +        uint32_t        w7;
>> +#define EQ_W7_F0_IGNORE         PPC_BIT32(0)
>> +#define EQ_W7_F0_BLK_GROUPING   PPC_BIT32(1)
>> +#define EQ_W7_F0_PRIORITY       PPC_BITMASK32(8, 15)
>> +#define EQ_W7_F1_WAKEZ          PPC_BIT32(0)
>> +#define EQ_W7_F1_LOG_SERVER_ID  PPC_BITMASK32(1, 31)
>> +} XiveEQ;
>> +
>>  #define XIVE_PRIORITY_MAX  7
>>  
>>  void spapr_xive_reset(void *dev);
>>  XiveIVE *spapr_xive_get_ive(sPAPRXive *xive, uint32_t lisn);
>> +XiveEQ *spapr_xive_get_eq(sPAPRXive *xive, uint32_t idx);
>> +bool spapr_xive_eq_for_server(sPAPRXive *xive, uint32_t server, uint8_t prio,
>> +                              uint32_t *out_eq_idx);
>>  
>>  #endif /* _INTC_XIVE_INTERNAL_H */
Benjamin Herrenschmidt Nov. 26, 2017, 9:52 p.m. UTC | #3
On Fri, 2017-11-24 at 09:15 +0100, Cédric Le Goater wrote:
> So The Linux driver is expected to choose priority 6. The priority
> validity is then checked in each hcall returning H_P4/H_P3 in case of 
> failure.  
> 
> But it is true that we scale the arrays with :
>  
>     #define XIVE_PRIORITY_MAX  7
> 
> Do you want QEMU to completely remove prio 7 ? 

I'd like qemu to be consistent, at least make sure it errors out if the
OS tries to configure prio 7 or route an irq to it.

Cheers,
Ben.
David Gibson Nov. 30, 2017, 4:38 a.m. UTC | #4
On Thu, Nov 23, 2017 at 02:29:43PM +0100, Cédric Le Goater wrote:
> The Event Queue Descriptor (EQD) table, also known as Event Notification
> Descriptor (END), is one of the internal tables the XIVE interrupt
> controller uses to redirect exception from event sources to CPU
> threads.
> 
> The EQD specifies on which Event Queue the event data should be posted
> when an exception occurs (later on pulled by the OS) and which server
> (VPD in XIVE terminology) to notify. The Event Queue is a much more
> complex structure but we start with a simple model for the sPAPR
> machine.

Just to clarify my understanding a server / VPD in XIVE would
typically correspond to a cpu - either real or virtual, yes?

> There is one XiveEQ per priority and the model chooses to store them
> under the Xive Interrupt presenter model. It will be retrieved, just
> like for XICS, through the 'intc' object pointer of the CPU.
> 
> The EQ indexing follows a simple pattern:
> 
>        (server << 3) | (priority & 0x7)
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  hw/intc/spapr_xive.c    | 56 +++++++++++++++++++++++++++++++++++++++++++++++++
>  hw/intc/xive-internal.h | 50 +++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 106 insertions(+)
> 
> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> index 554b25e0884c..983317a6b3f6 100644
> --- a/hw/intc/spapr_xive.c
> +++ b/hw/intc/spapr_xive.c
> @@ -23,6 +23,7 @@
>  #include "sysemu/dma.h"
>  #include "monitor/monitor.h"
>  #include "hw/ppc/spapr_xive.h"
> +#include "hw/ppc/spapr.h"
>  #include "hw/ppc/xics.h"
>  
>  #include "xive-internal.h"
> @@ -34,6 +35,8 @@ struct sPAPRXiveICP {
>      uint8_t   tima[TM_RING_COUNT * 0x10];
>      uint8_t   *tima_os;
>      qemu_irq  output;
> +
> +    XiveEQ    eqt[XIVE_PRIORITY_MAX + 1];
>  };
>  
>  static uint64_t spapr_xive_icp_accept(sPAPRXiveICP *icp)
> @@ -183,6 +186,13 @@ static const MemoryRegionOps spapr_xive_tm_ops = {
>      },
>  };
>  
> +static sPAPRXiveICP *spapr_xive_icp_get(sPAPRXive *xive, int server)
> +{
> +    PowerPCCPU *cpu = spapr_find_cpu(server);
> +
> +    return cpu ? SPAPR_XIVE_ICP(cpu->intc) : NULL;
> +}
> +
>  static void spapr_xive_irq(sPAPRXive *xive, int lisn)
>  {
>  
> @@ -632,6 +642,8 @@ static void spapr_xive_icp_reset(void *dev)
>      sPAPRXiveICP *xicp = SPAPR_XIVE_ICP(dev);
>  
>      memset(xicp->tima, 0, sizeof(xicp->tima));
> +
> +    memset(xicp->eqt, 0, sizeof(xicp->eqt));
>  }
>  
>  static void spapr_xive_icp_realize(DeviceState *dev, Error **errp)
> @@ -683,6 +695,23 @@ static void spapr_xive_icp_init(Object *obj)
>      xicp->tima_os = &xicp->tima[TM_QW1_OS];
>  }
>  
> +static const VMStateDescription vmstate_spapr_xive_icp_eq = {
> +    .name = TYPE_SPAPR_XIVE_ICP "/eq",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .fields = (VMStateField []) {
> +        VMSTATE_UINT32(w0, XiveEQ),
> +        VMSTATE_UINT32(w1, XiveEQ),
> +        VMSTATE_UINT32(w2, XiveEQ),
> +        VMSTATE_UINT32(w3, XiveEQ),
> +        VMSTATE_UINT32(w4, XiveEQ),
> +        VMSTATE_UINT32(w5, XiveEQ),
> +        VMSTATE_UINT32(w6, XiveEQ),
> +        VMSTATE_UINT32(w7, XiveEQ),

Wow.  Super descriptive field names there, but I guess that's not your fault.

> +        VMSTATE_END_OF_LIST()
> +    },
> +};
> +
>  static bool vmstate_spapr_xive_icp_needed(void *opaque)
>  {
>      /* TODO check machine XIVE support */
> @@ -696,6 +725,8 @@ static const VMStateDescription vmstate_spapr_xive_icp = {
>      .needed = vmstate_spapr_xive_icp_needed,
>      .fields = (VMStateField[]) {
>          VMSTATE_BUFFER(tima, sPAPRXiveICP),
> +        VMSTATE_STRUCT_ARRAY(eqt, sPAPRXiveICP, (XIVE_PRIORITY_MAX + 1), 1,
> +                             vmstate_spapr_xive_icp_eq, XiveEQ),
>          VMSTATE_END_OF_LIST()
>      },
>  };
> @@ -755,3 +786,28 @@ bool spapr_xive_irq_unset(sPAPRXive *xive, uint32_t lisn)
>      ive->w &= ~IVE_VALID;
>      return true;
>  }
> +
> +/*
> + * Use a simple indexing for the EQs.

Is this server+priority encoding architected anywhere?  Otherwise, why
not use separate parameters?

> + */
> +XiveEQ *spapr_xive_get_eq(sPAPRXive *xive, uint32_t eq_idx)
> +{
> +    int priority = eq_idx & 0x7;
> +    sPAPRXiveICP *xicp = spapr_xive_icp_get(xive, eq_idx >> 3);
> +
> +    return xicp ? &xicp->eqt[priority] : NULL;
> +}
> +
> +bool spapr_xive_eq_for_server(sPAPRXive *xive, uint32_t server,
> +                              uint8_t priority, uint32_t *out_eq_idx)
> +{
> +    if (priority > XIVE_PRIORITY_MAX) {
> +        return false;
> +    }
> +
> +    if (out_eq_idx) {
> +        *out_eq_idx = (server << 3) | (priority & 0x7);
> +    }
> +
> +    return true;
> +}
> diff --git a/hw/intc/xive-internal.h b/hw/intc/xive-internal.h
> index 7d329f203a9b..c3949671aa03 100644
> --- a/hw/intc/xive-internal.h
> +++ b/hw/intc/xive-internal.h
> @@ -131,9 +131,59 @@ typedef struct XiveIVE {
>  #define IVE_EQ_DATA     PPC_BITMASK(33, 63)      /* Data written to the EQ */
>  } XiveIVE;
>  
> +/* EQ */
> +typedef struct XiveEQ {
> +        uint32_t        w0;
> +#define EQ_W0_VALID             PPC_BIT32(0)
> +#define EQ_W0_ENQUEUE           PPC_BIT32(1)
> +#define EQ_W0_UCOND_NOTIFY      PPC_BIT32(2)
> +#define EQ_W0_BACKLOG           PPC_BIT32(3)
> +#define EQ_W0_PRECL_ESC_CTL     PPC_BIT32(4)
> +#define EQ_W0_ESCALATE_CTL      PPC_BIT32(5)
> +#define EQ_W0_END_OF_INTR       PPC_BIT32(6)
> +#define EQ_W0_QSIZE             PPC_BITMASK32(12, 15)
> +#define EQ_W0_SW0               PPC_BIT32(16)
> +#define EQ_W0_FIRMWARE          EQ_W0_SW0 /* Owned by FW */
> +#define EQ_QSIZE_4K             0
> +#define EQ_QSIZE_64K            4
> +#define EQ_W0_HWDEP             PPC_BITMASK32(24, 31)
> +        uint32_t        w1;
> +#define EQ_W1_ESn               PPC_BITMASK32(0, 1)
> +#define EQ_W1_ESn_P             PPC_BIT32(0)
> +#define EQ_W1_ESn_Q             PPC_BIT32(1)
> +#define EQ_W1_ESe               PPC_BITMASK32(2, 3)
> +#define EQ_W1_ESe_P             PPC_BIT32(2)
> +#define EQ_W1_ESe_Q             PPC_BIT32(3)
> +#define EQ_W1_GENERATION        PPC_BIT32(9)
> +#define EQ_W1_PAGE_OFF          PPC_BITMASK32(10, 31)
> +        uint32_t        w2;
> +#define EQ_W2_MIGRATION_REG     PPC_BITMASK32(0, 3)
> +#define EQ_W2_OP_DESC_HI        PPC_BITMASK32(4, 31)
> +        uint32_t        w3;
> +#define EQ_W3_OP_DESC_LO        PPC_BITMASK32(0, 31)
> +        uint32_t        w4;
> +#define EQ_W4_ESC_EQ_BLOCK      PPC_BITMASK32(4, 7)
> +#define EQ_W4_ESC_EQ_INDEX      PPC_BITMASK32(8, 31)
> +        uint32_t        w5;
> +#define EQ_W5_ESC_EQ_DATA       PPC_BITMASK32(1, 31)
> +        uint32_t        w6;
> +#define EQ_W6_FORMAT_BIT        PPC_BIT32(8)
> +#define EQ_W6_NVT_BLOCK         PPC_BITMASK32(9, 12)
> +#define EQ_W6_NVT_INDEX         PPC_BITMASK32(13, 31)
> +        uint32_t        w7;
> +#define EQ_W7_F0_IGNORE         PPC_BIT32(0)
> +#define EQ_W7_F0_BLK_GROUPING   PPC_BIT32(1)
> +#define EQ_W7_F0_PRIORITY       PPC_BITMASK32(8, 15)
> +#define EQ_W7_F1_WAKEZ          PPC_BIT32(0)
> +#define EQ_W7_F1_LOG_SERVER_ID  PPC_BITMASK32(1, 31)
> +} XiveEQ;
> +
>  #define XIVE_PRIORITY_MAX  7
>  
>  void spapr_xive_reset(void *dev);
>  XiveIVE *spapr_xive_get_ive(sPAPRXive *xive, uint32_t lisn);
> +XiveEQ *spapr_xive_get_eq(sPAPRXive *xive, uint32_t idx);
> +bool spapr_xive_eq_for_server(sPAPRXive *xive, uint32_t server, uint8_t prio,
> +                              uint32_t *out_eq_idx);
>  
>  #endif /* _INTC_XIVE_INTERNAL_H */
Cédric Le Goater Nov. 30, 2017, 2:06 p.m. UTC | #5
On 11/30/2017 04:38 AM, David Gibson wrote:
> On Thu, Nov 23, 2017 at 02:29:43PM +0100, Cédric Le Goater wrote:
>> The Event Queue Descriptor (EQD) table, also known as Event Notification
>> Descriptor (END), is one of the internal tables the XIVE interrupt
>> controller uses to redirect exception from event sources to CPU
>> threads.
>>
>> The EQD specifies on which Event Queue the event data should be posted
>> when an exception occurs (later on pulled by the OS) and which server
>> (VPD in XIVE terminology) to notify. The Event Queue is a much more
>> complex structure but we start with a simple model for the sPAPR
>> machine.
> 
> Just to clarify my understanding a server / VPD in XIVE would
> typically correspond to a cpu - either real or virtual, yes?

yes. VP for "virtual processor" and VPD for "virtual processor 
descriptor" which contains the XIVE interrupt state of the VP 
when not dispatched. It is still described in some documentation 
as an NVT : Notification Virtual Target.  

XIVE concepts were renamed at some point, but the old names persisted.
I am still struggling my way through all the names.


>> There is one XiveEQ per priority and the model chooses to store them
>> under the Xive Interrupt presenter model. It will be retrieved, just
>> like for XICS, through the 'intc' object pointer of the CPU.
>>
>> The EQ indexing follows a simple pattern:
>>
>>        (server << 3) | (priority & 0x7)
>>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
>>  hw/intc/spapr_xive.c    | 56 +++++++++++++++++++++++++++++++++++++++++++++++++
>>  hw/intc/xive-internal.h | 50 +++++++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 106 insertions(+)
>>
>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
>> index 554b25e0884c..983317a6b3f6 100644
>> --- a/hw/intc/spapr_xive.c
>> +++ b/hw/intc/spapr_xive.c
>> @@ -23,6 +23,7 @@
>>  #include "sysemu/dma.h"
>>  #include "monitor/monitor.h"
>>  #include "hw/ppc/spapr_xive.h"
>> +#include "hw/ppc/spapr.h"
>>  #include "hw/ppc/xics.h"
>>  
>>  #include "xive-internal.h"
>> @@ -34,6 +35,8 @@ struct sPAPRXiveICP {
>>      uint8_t   tima[TM_RING_COUNT * 0x10];
>>      uint8_t   *tima_os;
>>      qemu_irq  output;
>> +
>> +    XiveEQ    eqt[XIVE_PRIORITY_MAX + 1];
>>  };
>>  
>>  static uint64_t spapr_xive_icp_accept(sPAPRXiveICP *icp)
>> @@ -183,6 +186,13 @@ static const MemoryRegionOps spapr_xive_tm_ops = {
>>      },
>>  };
>>  
>> +static sPAPRXiveICP *spapr_xive_icp_get(sPAPRXive *xive, int server)
>> +{
>> +    PowerPCCPU *cpu = spapr_find_cpu(server);
>> +
>> +    return cpu ? SPAPR_XIVE_ICP(cpu->intc) : NULL;
>> +}
>> +
>>  static void spapr_xive_irq(sPAPRXive *xive, int lisn)
>>  {
>>  
>> @@ -632,6 +642,8 @@ static void spapr_xive_icp_reset(void *dev)
>>      sPAPRXiveICP *xicp = SPAPR_XIVE_ICP(dev);
>>  
>>      memset(xicp->tima, 0, sizeof(xicp->tima));
>> +
>> +    memset(xicp->eqt, 0, sizeof(xicp->eqt));
>>  }
>>  
>>  static void spapr_xive_icp_realize(DeviceState *dev, Error **errp)
>> @@ -683,6 +695,23 @@ static void spapr_xive_icp_init(Object *obj)
>>      xicp->tima_os = &xicp->tima[TM_QW1_OS];
>>  }
>>  
>> +static const VMStateDescription vmstate_spapr_xive_icp_eq = {
>> +    .name = TYPE_SPAPR_XIVE_ICP "/eq",
>> +    .version_id = 1,
>> +    .minimum_version_id = 1,
>> +    .fields = (VMStateField []) {
>> +        VMSTATE_UINT32(w0, XiveEQ),
>> +        VMSTATE_UINT32(w1, XiveEQ),
>> +        VMSTATE_UINT32(w2, XiveEQ),
>> +        VMSTATE_UINT32(w3, XiveEQ),
>> +        VMSTATE_UINT32(w4, XiveEQ),
>> +        VMSTATE_UINT32(w5, XiveEQ),
>> +        VMSTATE_UINT32(w6, XiveEQ),
>> +        VMSTATE_UINT32(w7, XiveEQ),
> 
> Wow.  Super descriptive field names there, but I guess that's not your fault.

The defines in the "xive-internal.h" give a better view ... 

>> +        VMSTATE_END_OF_LIST()
>> +    },
>> +};
>> +
>>  static bool vmstate_spapr_xive_icp_needed(void *opaque)
>>  {
>>      /* TODO check machine XIVE support */
>> @@ -696,6 +725,8 @@ static const VMStateDescription vmstate_spapr_xive_icp = {
>>      .needed = vmstate_spapr_xive_icp_needed,
>>      .fields = (VMStateField[]) {
>>          VMSTATE_BUFFER(tima, sPAPRXiveICP),
>> +        VMSTATE_STRUCT_ARRAY(eqt, sPAPRXiveICP, (XIVE_PRIORITY_MAX + 1), 1,
>> +                             vmstate_spapr_xive_icp_eq, XiveEQ),
>>          VMSTATE_END_OF_LIST()
>>      },
>>  };
>> @@ -755,3 +786,28 @@ bool spapr_xive_irq_unset(sPAPRXive *xive, uint32_t lisn)
>>      ive->w &= ~IVE_VALID;
>>      return true;
>>  }
>> +
>> +/*
>> + * Use a simple indexing for the EQs.
> 
> Is this server+priority encoding architected anywhere?  

no. This is a model shortcut.

> Otherwise, why not use separate parameters?

yes. spapr_xive_get_eq() could use separate parameters and it would
shorten the some of the hcalls.

The result is stored in a single field of the IVE, EQ_INDEX. So I will 
still need mangle/demangle routines but these could be simple macros.
I will look at it.

>> + */
>> +XiveEQ *spapr_xive_get_eq(sPAPRXive *xive, uint32_t eq_idx)
>> +{
>> +    int priority = eq_idx & 0x7;
>> +    sPAPRXiveICP *xicp = spapr_xive_icp_get(xive, eq_idx >> 3);
>> +
>> +    return xicp ? &xicp->eqt[priority] : NULL;
>> +}
>> +
>> +bool spapr_xive_eq_for_server(sPAPRXive *xive, uint32_t server,
>> +                              uint8_t priority, uint32_t *out_eq_idx)
>> +{
>> +    if (priority > XIVE_PRIORITY_MAX) {
>> +        return false;
>> +    }
>> +
>> +    if (out_eq_idx) {
>> +        *out_eq_idx = (server << 3) | (priority & 0x7);
>> +    }
>> +
>> +    return true;
>> +}
>> diff --git a/hw/intc/xive-internal.h b/hw/intc/xive-internal.h
>> index 7d329f203a9b..c3949671aa03 100644
>> --- a/hw/intc/xive-internal.h
>> +++ b/hw/intc/xive-internal.h
>> @@ -131,9 +131,59 @@ typedef struct XiveIVE {
>>  #define IVE_EQ_DATA     PPC_BITMASK(33, 63)      /* Data written to the EQ */
>>  } XiveIVE;
>>  
>> +/* EQ */
>> +typedef struct XiveEQ {
>> +        uint32_t        w0;
>> +#define EQ_W0_VALID             PPC_BIT32(0)
>> +#define EQ_W0_ENQUEUE           PPC_BIT32(1)
>> +#define EQ_W0_UCOND_NOTIFY      PPC_BIT32(2)
>> +#define EQ_W0_BACKLOG           PPC_BIT32(3)
>> +#define EQ_W0_PRECL_ESC_CTL     PPC_BIT32(4)
>> +#define EQ_W0_ESCALATE_CTL      PPC_BIT32(5)
>> +#define EQ_W0_END_OF_INTR       PPC_BIT32(6)
>> +#define EQ_W0_QSIZE             PPC_BITMASK32(12, 15)
>> +#define EQ_W0_SW0               PPC_BIT32(16)
>> +#define EQ_W0_FIRMWARE          EQ_W0_SW0 /* Owned by FW */
>> +#define EQ_QSIZE_4K             0
>> +#define EQ_QSIZE_64K            4
>> +#define EQ_W0_HWDEP             PPC_BITMASK32(24, 31)
>> +        uint32_t        w1;
>> +#define EQ_W1_ESn               PPC_BITMASK32(0, 1)
>> +#define EQ_W1_ESn_P             PPC_BIT32(0)
>> +#define EQ_W1_ESn_Q             PPC_BIT32(1)
>> +#define EQ_W1_ESe               PPC_BITMASK32(2, 3)
>> +#define EQ_W1_ESe_P             PPC_BIT32(2)
>> +#define EQ_W1_ESe_Q             PPC_BIT32(3)
>> +#define EQ_W1_GENERATION        PPC_BIT32(9)
>> +#define EQ_W1_PAGE_OFF          PPC_BITMASK32(10, 31)
>> +        uint32_t        w2;
>> +#define EQ_W2_MIGRATION_REG     PPC_BITMASK32(0, 3)
>> +#define EQ_W2_OP_DESC_HI        PPC_BITMASK32(4, 31)
>> +        uint32_t        w3;
>> +#define EQ_W3_OP_DESC_LO        PPC_BITMASK32(0, 31)
>> +        uint32_t        w4;
>> +#define EQ_W4_ESC_EQ_BLOCK      PPC_BITMASK32(4, 7)
>> +#define EQ_W4_ESC_EQ_INDEX      PPC_BITMASK32(8, 31)
>> +        uint32_t        w5;
>> +#define EQ_W5_ESC_EQ_DATA       PPC_BITMASK32(1, 31)
>> +        uint32_t        w6;
>> +#define EQ_W6_FORMAT_BIT        PPC_BIT32(8)
>> +#define EQ_W6_NVT_BLOCK         PPC_BITMASK32(9, 12)
>> +#define EQ_W6_NVT_INDEX         PPC_BITMASK32(13, 31)
>> +        uint32_t        w7;
>> +#define EQ_W7_F0_IGNORE         PPC_BIT32(0)
>> +#define EQ_W7_F0_BLK_GROUPING   PPC_BIT32(1)
>> +#define EQ_W7_F0_PRIORITY       PPC_BITMASK32(8, 15)
>> +#define EQ_W7_F1_WAKEZ          PPC_BIT32(0)
>> +#define EQ_W7_F1_LOG_SERVER_ID  PPC_BITMASK32(1, 31)
>> +} XiveEQ;
>> +
>>  #define XIVE_PRIORITY_MAX  7
>>  
>>  void spapr_xive_reset(void *dev);
>>  XiveIVE *spapr_xive_get_ive(sPAPRXive *xive, uint32_t lisn);
>> +XiveEQ *spapr_xive_get_eq(sPAPRXive *xive, uint32_t idx);
>> +bool spapr_xive_eq_for_server(sPAPRXive *xive, uint32_t server, uint8_t prio,
>> +                              uint32_t *out_eq_idx);
>>  
>>  #endif /* _INTC_XIVE_INTERNAL_H */
>
David Gibson Nov. 30, 2017, 11:35 p.m. UTC | #6
On Thu, Nov 30, 2017 at 02:06:27PM +0000, Cédric Le Goater wrote:
> On 11/30/2017 04:38 AM, David Gibson wrote:
> > On Thu, Nov 23, 2017 at 02:29:43PM +0100, Cédric Le Goater wrote:
> >> The Event Queue Descriptor (EQD) table, also known as Event Notification
> >> Descriptor (END), is one of the internal tables the XIVE interrupt
> >> controller uses to redirect exception from event sources to CPU
> >> threads.
> >>
> >> The EQD specifies on which Event Queue the event data should be posted
> >> when an exception occurs (later on pulled by the OS) and which server
> >> (VPD in XIVE terminology) to notify. The Event Queue is a much more
> >> complex structure but we start with a simple model for the sPAPR
> >> machine.
> > 
> > Just to clarify my understanding a server / VPD in XIVE would
> > typically correspond to a cpu - either real or virtual, yes?
> 
> yes. VP for "virtual processor" and VPD for "virtual processor 
> descriptor" which contains the XIVE interrupt state of the VP 
> when not dispatched. It is still described in some documentation 
> as an NVT : Notification Virtual Target.  
> 
> XIVE concepts were renamed at some point but the old names persisted.
> I am still struggling my way through all the names.
> 
> 
> >> There is one XiveEQ per priority and the model chooses to store them
> >> under the Xive Interrupt presenter model. It will be retrieved, just
> >> like for XICS, through the 'intc' object pointer of the CPU.
> >>
> >> The EQ indexing follows a simple pattern:
> >>
> >>        (server << 3) | (priority & 0x7)
> >>
> >> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >> ---
> >>  hw/intc/spapr_xive.c    | 56 +++++++++++++++++++++++++++++++++++++++++++++++++
> >>  hw/intc/xive-internal.h | 50 +++++++++++++++++++++++++++++++++++++++++++
> >>  2 files changed, 106 insertions(+)
> >>
> >> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> >> index 554b25e0884c..983317a6b3f6 100644
> >> --- a/hw/intc/spapr_xive.c
> >> +++ b/hw/intc/spapr_xive.c
> >> @@ -23,6 +23,7 @@
> >>  #include "sysemu/dma.h"
> >>  #include "monitor/monitor.h"
> >>  #include "hw/ppc/spapr_xive.h"
> >> +#include "hw/ppc/spapr.h"
> >>  #include "hw/ppc/xics.h"
> >>  
> >>  #include "xive-internal.h"
> >> @@ -34,6 +35,8 @@ struct sPAPRXiveICP {
> >>      uint8_t   tima[TM_RING_COUNT * 0x10];
> >>      uint8_t   *tima_os;
> >>      qemu_irq  output;
> >> +
> >> +    XiveEQ    eqt[XIVE_PRIORITY_MAX + 1];
> >>  };
> >>  
> >>  static uint64_t spapr_xive_icp_accept(sPAPRXiveICP *icp)
> >> @@ -183,6 +186,13 @@ static const MemoryRegionOps spapr_xive_tm_ops = {
> >>      },
> >>  };
> >>  
> >> +static sPAPRXiveICP *spapr_xive_icp_get(sPAPRXive *xive, int server)
> >> +{
> >> +    PowerPCCPU *cpu = spapr_find_cpu(server);
> >> +
> >> +    return cpu ? SPAPR_XIVE_ICP(cpu->intc) : NULL;
> >> +}
> >> +
> >>  static void spapr_xive_irq(sPAPRXive *xive, int lisn)
> >>  {
> >>  
> >> @@ -632,6 +642,8 @@ static void spapr_xive_icp_reset(void *dev)
> >>      sPAPRXiveICP *xicp = SPAPR_XIVE_ICP(dev);
> >>  
> >>      memset(xicp->tima, 0, sizeof(xicp->tima));
> >> +
> >> +    memset(xicp->eqt, 0, sizeof(xicp->eqt));
> >>  }
> >>  
> >>  static void spapr_xive_icp_realize(DeviceState *dev, Error **errp)
> >> @@ -683,6 +695,23 @@ static void spapr_xive_icp_init(Object *obj)
> >>      xicp->tima_os = &xicp->tima[TM_QW1_OS];
> >>  }
> >>  
> >> +static const VMStateDescription vmstate_spapr_xive_icp_eq = {
> >> +    .name = TYPE_SPAPR_XIVE_ICP "/eq",
> >> +    .version_id = 1,
> >> +    .minimum_version_id = 1,
> >> +    .fields = (VMStateField []) {
> >> +        VMSTATE_UINT32(w0, XiveEQ),
> >> +        VMSTATE_UINT32(w1, XiveEQ),
> >> +        VMSTATE_UINT32(w2, XiveEQ),
> >> +        VMSTATE_UINT32(w3, XiveEQ),
> >> +        VMSTATE_UINT32(w4, XiveEQ),
> >> +        VMSTATE_UINT32(w5, XiveEQ),
> >> +        VMSTATE_UINT32(w6, XiveEQ),
> >> +        VMSTATE_UINT32(w7, XiveEQ),
> > 
> > Wow.  Super descriptive field names there, but I guess that's not your fault.
> 
> The defines in the "xive-internal.h" give a better view ... 
> 
> >> +        VMSTATE_END_OF_LIST()
> >> +    },
> >> +};
> >> +
> >>  static bool vmstate_spapr_xive_icp_needed(void *opaque)
> >>  {
> >>      /* TODO check machine XIVE support */
> >> @@ -696,6 +725,8 @@ static const VMStateDescription vmstate_spapr_xive_icp = {
> >>      .needed = vmstate_spapr_xive_icp_needed,
> >>      .fields = (VMStateField[]) {
> >>          VMSTATE_BUFFER(tima, sPAPRXiveICP),
> >> +        VMSTATE_STRUCT_ARRAY(eqt, sPAPRXiveICP, (XIVE_PRIORITY_MAX + 1), 1,
> >> +                             vmstate_spapr_xive_icp_eq, XiveEQ),
> >>          VMSTATE_END_OF_LIST()
> >>      },
> >>  };
> >> @@ -755,3 +786,28 @@ bool spapr_xive_irq_unset(sPAPRXive *xive, uint32_t lisn)
> >>      ive->w &= ~IVE_VALID;
> >>      return true;
> >>  }
> >> +
> >> +/*
> >> + * Use a simple indexing for the EQs.
> > 
> > Is this server+priority encoding architected anywhere?  
> 
> no. This is a model shortcut.
> 
> > Otherwise, why not use separate parameters?
> 
> yes. spapr_xive_get_eq() could use separate parameters and it would
> shorten some of the hcalls.
> 
> The result is stored in a single field of the IVE, EQ_INDEX. So I will 
> still need mangle/demangle routines but these could be simple macros.
> I will look at it.

Hm, ok.  So it's architected in the sense that you're using the
encoding from the EQ_INDEX field throughout.  That could be a
reasonable choice, I can't really tell yet.

On the other hand, it might be easier to read if we use server and
priority as separate parameters until the point we actually encode
into the EQ_INDEX field.

> 
> >> + */
> >> +XiveEQ *spapr_xive_get_eq(sPAPRXive *xive, uint32_t eq_idx)
> >> +{
> >> +    int priority = eq_idx & 0x7;
> >> +    sPAPRXiveICP *xicp = spapr_xive_icp_get(xive, eq_idx >> 3);
> >> +
> >> +    return xicp ? &xicp->eqt[priority] : NULL;
> >> +}
> >> +
> >> +bool spapr_xive_eq_for_server(sPAPRXive *xive, uint32_t server,
> >> +                              uint8_t priority, uint32_t *out_eq_idx)
> >> +{
> >> +    if (priority > XIVE_PRIORITY_MAX) {
> >> +        return false;
> >> +    }
> >> +
> >> +    if (out_eq_idx) {
> >> +        *out_eq_idx = (server << 3) | (priority & 0x7);
> >> +    }
> >> +
> >> +    return true;
> >> +}
> >> diff --git a/hw/intc/xive-internal.h b/hw/intc/xive-internal.h
> >> index 7d329f203a9b..c3949671aa03 100644
> >> --- a/hw/intc/xive-internal.h
> >> +++ b/hw/intc/xive-internal.h
> >> @@ -131,9 +131,59 @@ typedef struct XiveIVE {
> >>  #define IVE_EQ_DATA     PPC_BITMASK(33, 63)      /* Data written to the EQ */
> >>  } XiveIVE;
> >>  
> >> +/* EQ */
> >> +typedef struct XiveEQ {
> >> +        uint32_t        w0;
> >> +#define EQ_W0_VALID             PPC_BIT32(0)
> >> +#define EQ_W0_ENQUEUE           PPC_BIT32(1)
> >> +#define EQ_W0_UCOND_NOTIFY      PPC_BIT32(2)
> >> +#define EQ_W0_BACKLOG           PPC_BIT32(3)
> >> +#define EQ_W0_PRECL_ESC_CTL     PPC_BIT32(4)
> >> +#define EQ_W0_ESCALATE_CTL      PPC_BIT32(5)
> >> +#define EQ_W0_END_OF_INTR       PPC_BIT32(6)
> >> +#define EQ_W0_QSIZE             PPC_BITMASK32(12, 15)
> >> +#define EQ_W0_SW0               PPC_BIT32(16)
> >> +#define EQ_W0_FIRMWARE          EQ_W0_SW0 /* Owned by FW */
> >> +#define EQ_QSIZE_4K             0
> >> +#define EQ_QSIZE_64K            4
> >> +#define EQ_W0_HWDEP             PPC_BITMASK32(24, 31)
> >> +        uint32_t        w1;
> >> +#define EQ_W1_ESn               PPC_BITMASK32(0, 1)
> >> +#define EQ_W1_ESn_P             PPC_BIT32(0)
> >> +#define EQ_W1_ESn_Q             PPC_BIT32(1)
> >> +#define EQ_W1_ESe               PPC_BITMASK32(2, 3)
> >> +#define EQ_W1_ESe_P             PPC_BIT32(2)
> >> +#define EQ_W1_ESe_Q             PPC_BIT32(3)
> >> +#define EQ_W1_GENERATION        PPC_BIT32(9)
> >> +#define EQ_W1_PAGE_OFF          PPC_BITMASK32(10, 31)
> >> +        uint32_t        w2;
> >> +#define EQ_W2_MIGRATION_REG     PPC_BITMASK32(0, 3)
> >> +#define EQ_W2_OP_DESC_HI        PPC_BITMASK32(4, 31)
> >> +        uint32_t        w3;
> >> +#define EQ_W3_OP_DESC_LO        PPC_BITMASK32(0, 31)
> >> +        uint32_t        w4;
> >> +#define EQ_W4_ESC_EQ_BLOCK      PPC_BITMASK32(4, 7)
> >> +#define EQ_W4_ESC_EQ_INDEX      PPC_BITMASK32(8, 31)
> >> +        uint32_t        w5;
> >> +#define EQ_W5_ESC_EQ_DATA       PPC_BITMASK32(1, 31)
> >> +        uint32_t        w6;
> >> +#define EQ_W6_FORMAT_BIT        PPC_BIT32(8)
> >> +#define EQ_W6_NVT_BLOCK         PPC_BITMASK32(9, 12)
> >> +#define EQ_W6_NVT_INDEX         PPC_BITMASK32(13, 31)
> >> +        uint32_t        w7;
> >> +#define EQ_W7_F0_IGNORE         PPC_BIT32(0)
> >> +#define EQ_W7_F0_BLK_GROUPING   PPC_BIT32(1)
> >> +#define EQ_W7_F0_PRIORITY       PPC_BITMASK32(8, 15)
> >> +#define EQ_W7_F1_WAKEZ          PPC_BIT32(0)
> >> +#define EQ_W7_F1_LOG_SERVER_ID  PPC_BITMASK32(1, 31)
> >> +} XiveEQ;
> >> +
> >>  #define XIVE_PRIORITY_MAX  7
> >>  
> >>  void spapr_xive_reset(void *dev);
> >>  XiveIVE *spapr_xive_get_ive(sPAPRXive *xive, uint32_t lisn);
> >> +XiveEQ *spapr_xive_get_eq(sPAPRXive *xive, uint32_t idx);
> >> +bool spapr_xive_eq_for_server(sPAPRXive *xive, uint32_t server, uint8_t prio,
> >> +                              uint32_t *out_eq_idx);
> >>  
> >>  #endif /* _INTC_XIVE_INTERNAL_H */
> > 
>
Cédric Le Goater Dec. 1, 2017, 4:36 p.m. UTC | #7
On 12/01/2017 12:35 AM, David Gibson wrote:
> On Thu, Nov 30, 2017 at 02:06:27PM +0000, Cédric Le Goater wrote:
>> On 11/30/2017 04:38 AM, David Gibson wrote:
>>> On Thu, Nov 23, 2017 at 02:29:43PM +0100, Cédric Le Goater wrote:
>>>> The Event Queue Descriptor (EQD) table, also known as Event Notification
>>>> Descriptor (END), is one of the internal tables the XIVE interrupt
>>>> controller uses to redirect exception from event sources to CPU
>>>> threads.
>>>>
>>>> The EQD specifies on which Event Queue the event data should be posted
>>>> when an exception occurs (later on pulled by the OS) and which server
>>>> (VPD in XIVE terminology) to notify. The Event Queue is a much more
>>>> complex structure but we start with a simple model for the sPAPR
>>>> machine.
>>>
>>> Just to clarify my understanding a server / VPD in XIVE would
>>> typically correspond to a cpu - either real or virtual, yes?
>>
>> yes. VP for "virtual processor" and VPD for "virtual processor 
>> descriptor" which contains the XIVE interrupt state of the VP 
>> when not dispatched. It is still described in some documentation 
>> as an NVT : Notification Virtual Target.  
>>
>> XIVE concepts were renamed at some time but the old name perdured.
>> I am still struggling my way through all the names.
>>
>>
>>>> There is one XiveEQ per priority and the model chooses to store them
>>>> under the Xive Interrupt presenter model. It will be retrieved, just
>>>> like for XICS, through the 'intc' object pointer of the CPU.
>>>>
>>>> The EQ indexing follows a simple pattern:
>>>>
>>>>        (server << 3) | (priority & 0x7)
>>>>
>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>>> ---
>>>>  hw/intc/spapr_xive.c    | 56 +++++++++++++++++++++++++++++++++++++++++++++++++
>>>>  hw/intc/xive-internal.h | 50 +++++++++++++++++++++++++++++++++++++++++++
>>>>  2 files changed, 106 insertions(+)
>>>>
>>>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
>>>> index 554b25e0884c..983317a6b3f6 100644
>>>> --- a/hw/intc/spapr_xive.c
>>>> +++ b/hw/intc/spapr_xive.c
>>>> @@ -23,6 +23,7 @@
>>>>  #include "sysemu/dma.h"
>>>>  #include "monitor/monitor.h"
>>>>  #include "hw/ppc/spapr_xive.h"
>>>> +#include "hw/ppc/spapr.h"
>>>>  #include "hw/ppc/xics.h"
>>>>  
>>>>  #include "xive-internal.h"
>>>> @@ -34,6 +35,8 @@ struct sPAPRXiveICP {
>>>>      uint8_t   tima[TM_RING_COUNT * 0x10];
>>>>      uint8_t   *tima_os;
>>>>      qemu_irq  output;
>>>> +
>>>> +    XiveEQ    eqt[XIVE_PRIORITY_MAX + 1];
>>>>  };
>>>>  
>>>>  static uint64_t spapr_xive_icp_accept(sPAPRXiveICP *icp)
>>>> @@ -183,6 +186,13 @@ static const MemoryRegionOps spapr_xive_tm_ops = {
>>>>      },
>>>>  };
>>>>  
>>>> +static sPAPRXiveICP *spapr_xive_icp_get(sPAPRXive *xive, int server)
>>>> +{
>>>> +    PowerPCCPU *cpu = spapr_find_cpu(server);
>>>> +
>>>> +    return cpu ? SPAPR_XIVE_ICP(cpu->intc) : NULL;
>>>> +}
>>>> +
>>>>  static void spapr_xive_irq(sPAPRXive *xive, int lisn)
>>>>  {
>>>>  
>>>> @@ -632,6 +642,8 @@ static void spapr_xive_icp_reset(void *dev)
>>>>      sPAPRXiveICP *xicp = SPAPR_XIVE_ICP(dev);
>>>>  
>>>>      memset(xicp->tima, 0, sizeof(xicp->tima));
>>>> +
>>>> +    memset(xicp->eqt, 0, sizeof(xicp->eqt));
>>>>  }
>>>>  
>>>>  static void spapr_xive_icp_realize(DeviceState *dev, Error **errp)
>>>> @@ -683,6 +695,23 @@ static void spapr_xive_icp_init(Object *obj)
>>>>      xicp->tima_os = &xicp->tima[TM_QW1_OS];
>>>>  }
>>>>  
>>>> +static const VMStateDescription vmstate_spapr_xive_icp_eq = {
>>>> +    .name = TYPE_SPAPR_XIVE_ICP "/eq",
>>>> +    .version_id = 1,
>>>> +    .minimum_version_id = 1,
>>>> +    .fields = (VMStateField []) {
>>>> +        VMSTATE_UINT32(w0, XiveEQ),
>>>> +        VMSTATE_UINT32(w1, XiveEQ),
>>>> +        VMSTATE_UINT32(w2, XiveEQ),
>>>> +        VMSTATE_UINT32(w3, XiveEQ),
>>>> +        VMSTATE_UINT32(w4, XiveEQ),
>>>> +        VMSTATE_UINT32(w5, XiveEQ),
>>>> +        VMSTATE_UINT32(w6, XiveEQ),
>>>> +        VMSTATE_UINT32(w7, XiveEQ),
>>>
>>> Wow.  Super descriptive field names there, but I guess that's not your fault.
>>
>> The defines in the "xive-internal.h" give a better view ... 
>>
>>>> +        VMSTATE_END_OF_LIST()
>>>> +    },
>>>> +};
>>>> +
>>>>  static bool vmstate_spapr_xive_icp_needed(void *opaque)
>>>>  {
>>>>      /* TODO check machine XIVE support */
>>>> @@ -696,6 +725,8 @@ static const VMStateDescription vmstate_spapr_xive_icp = {
>>>>      .needed = vmstate_spapr_xive_icp_needed,
>>>>      .fields = (VMStateField[]) {
>>>>          VMSTATE_BUFFER(tima, sPAPRXiveICP),
>>>> +        VMSTATE_STRUCT_ARRAY(eqt, sPAPRXiveICP, (XIVE_PRIORITY_MAX + 1), 1,
>>>> +                             vmstate_spapr_xive_icp_eq, XiveEQ),
>>>>          VMSTATE_END_OF_LIST()
>>>>      },
>>>>  };
>>>> @@ -755,3 +786,28 @@ bool spapr_xive_irq_unset(sPAPRXive *xive, uint32_t lisn)
>>>>      ive->w &= ~IVE_VALID;
>>>>      return true;
>>>>  }
>>>> +
>>>> +/*
>>>> + * Use a simple indexing for the EQs.
>>>
>>> Is this server+priority encoding architected anywhere?  
>>
>> no. This is a model shortcut.
>>
>>> Otherwise, why not use separate parameters?
>>
>> yes. spapr_xive_get_eq() could use separate parameters and it would
>> shorten the some of the hcalls.
>>
>> The result is stored in a single field of the IVE, EQ_INDEX. So I will 
>> still need mangle/demangle routines but these could be simple macros.
>> I will look at it.
> 
> Hm, ok.  So it's architected in the sense that you're using the
> encoding from the EQ_INDEX field throughout.  That's could be a
> reasonable choice, I can't really tell yet.
> 
> On the other hand, it might be easier to read if we use server and
> priority as separate parameters until the point we actually encode
> into the EQ_INDEX field.

In the architecture, the EQ_INDEX field contains an index to an 
Event Queue Descriptor and the Event Queue Descriptor has a 
EQ_W6_NVT_INDEX field pointing to an Notification Virtual Target.
So there are two extra tables for the EQs and for the NVTs
used by the HW.

In the sPAPR model, an EQ array is stored under the sPAPRXiveNVT 
object which is stored under the ->intc pointer of the CPUState 
object

So the EQ_INDEX field is really taking a shortcut, encoding 
the cpu number and the priority to find an EQ, and the 
EQ_W6_NVT_INDEX field holds a value which is the cpu number.
> But in the end, we save two tables. 

C.


> 
>>
>>>> + */
>>>> +XiveEQ *spapr_xive_get_eq(sPAPRXive *xive, uint32_t eq_idx)
>>>> +{
>>>> +    int priority = eq_idx & 0x7;
>>>> +    sPAPRXiveICP *xicp = spapr_xive_icp_get(xive, eq_idx >> 3);
>>>> +
>>>> +    return xicp ? &xicp->eqt[priority] : NULL;
>>>> +}
>>>> +
>>>> +bool spapr_xive_eq_for_server(sPAPRXive *xive, uint32_t server,
>>>> +                              uint8_t priority, uint32_t *out_eq_idx)
>>>> +{
>>>> +    if (priority > XIVE_PRIORITY_MAX) {
>>>> +        return false;
>>>> +    }
>>>> +
>>>> +    if (out_eq_idx) {
>>>> +        *out_eq_idx = (server << 3) | (priority & 0x7);
>>>> +    }
>>>> +
>>>> +    return true;
>>>> +}
>>>> diff --git a/hw/intc/xive-internal.h b/hw/intc/xive-internal.h
>>>> index 7d329f203a9b..c3949671aa03 100644
>>>> --- a/hw/intc/xive-internal.h
>>>> +++ b/hw/intc/xive-internal.h
>>>> @@ -131,9 +131,59 @@ typedef struct XiveIVE {
>>>>  #define IVE_EQ_DATA     PPC_BITMASK(33, 63)      /* Data written to the EQ */
>>>>  } XiveIVE;
>>>>  
>>>> +/* EQ */
>>>> +typedef struct XiveEQ {
>>>> +        uint32_t        w0;
>>>> +#define EQ_W0_VALID             PPC_BIT32(0)
>>>> +#define EQ_W0_ENQUEUE           PPC_BIT32(1)
>>>> +#define EQ_W0_UCOND_NOTIFY      PPC_BIT32(2)
>>>> +#define EQ_W0_BACKLOG           PPC_BIT32(3)
>>>> +#define EQ_W0_PRECL_ESC_CTL     PPC_BIT32(4)
>>>> +#define EQ_W0_ESCALATE_CTL      PPC_BIT32(5)
>>>> +#define EQ_W0_END_OF_INTR       PPC_BIT32(6)
>>>> +#define EQ_W0_QSIZE             PPC_BITMASK32(12, 15)
>>>> +#define EQ_W0_SW0               PPC_BIT32(16)
>>>> +#define EQ_W0_FIRMWARE          EQ_W0_SW0 /* Owned by FW */
>>>> +#define EQ_QSIZE_4K             0
>>>> +#define EQ_QSIZE_64K            4
>>>> +#define EQ_W0_HWDEP             PPC_BITMASK32(24, 31)
>>>> +        uint32_t        w1;
>>>> +#define EQ_W1_ESn               PPC_BITMASK32(0, 1)
>>>> +#define EQ_W1_ESn_P             PPC_BIT32(0)
>>>> +#define EQ_W1_ESn_Q             PPC_BIT32(1)
>>>> +#define EQ_W1_ESe               PPC_BITMASK32(2, 3)
>>>> +#define EQ_W1_ESe_P             PPC_BIT32(2)
>>>> +#define EQ_W1_ESe_Q             PPC_BIT32(3)
>>>> +#define EQ_W1_GENERATION        PPC_BIT32(9)
>>>> +#define EQ_W1_PAGE_OFF          PPC_BITMASK32(10, 31)
>>>> +        uint32_t        w2;
>>>> +#define EQ_W2_MIGRATION_REG     PPC_BITMASK32(0, 3)
>>>> +#define EQ_W2_OP_DESC_HI        PPC_BITMASK32(4, 31)
>>>> +        uint32_t        w3;
>>>> +#define EQ_W3_OP_DESC_LO        PPC_BITMASK32(0, 31)
>>>> +        uint32_t        w4;
>>>> +#define EQ_W4_ESC_EQ_BLOCK      PPC_BITMASK32(4, 7)
>>>> +#define EQ_W4_ESC_EQ_INDEX      PPC_BITMASK32(8, 31)
>>>> +        uint32_t        w5;
>>>> +#define EQ_W5_ESC_EQ_DATA       PPC_BITMASK32(1, 31)
>>>> +        uint32_t        w6;
>>>> +#define EQ_W6_FORMAT_BIT        PPC_BIT32(8)
>>>> +#define EQ_W6_NVT_BLOCK         PPC_BITMASK32(9, 12)
>>>> +#define EQ_W6_NVT_INDEX         PPC_BITMASK32(13, 31)
>>>> +        uint32_t        w7;
>>>> +#define EQ_W7_F0_IGNORE         PPC_BIT32(0)
>>>> +#define EQ_W7_F0_BLK_GROUPING   PPC_BIT32(1)
>>>> +#define EQ_W7_F0_PRIORITY       PPC_BITMASK32(8, 15)
>>>> +#define EQ_W7_F1_WAKEZ          PPC_BIT32(0)
>>>> +#define EQ_W7_F1_LOG_SERVER_ID  PPC_BITMASK32(1, 31)
>>>> +} XiveEQ;
>>>> +
>>>>  #define XIVE_PRIORITY_MAX  7
>>>>  
>>>>  void spapr_xive_reset(void *dev);
>>>>  XiveIVE *spapr_xive_get_ive(sPAPRXive *xive, uint32_t lisn);
>>>> +XiveEQ *spapr_xive_get_eq(sPAPRXive *xive, uint32_t idx);
>>>> +bool spapr_xive_eq_for_server(sPAPRXive *xive, uint32_t server, uint8_t prio,
>>>> +                              uint32_t *out_eq_idx);
>>>>  
>>>>  #endif /* _INTC_XIVE_INTERNAL_H */
>>>
>>
>
Benjamin Herrenschmidt Dec. 2, 2017, 2:39 p.m. UTC | #8
On Thu, 2017-11-30 at 15:38 +1100, David Gibson wrote:
> On Thu, Nov 23, 2017 at 02:29:43PM +0100, Cédric Le Goater wrote:
> > The Event Queue Descriptor (EQD) table, also known as Event Notification
> > Descriptor (END), is one of the internal tables the XIVE interrupt
> > controller uses to redirect exception from event sources to CPU
> > threads.
> > 
> > The EQD specifies on which Event Queue the event data should be posted
> > when an exception occurs (later on pulled by the OS) and which server
> > (VPD in XIVE terminology) to notify. The Event Queue is a much more
> > complex structure but we start with a simple model for the sPAPR
> > machine.
> 
> Just to clarify my understanding a server / VPD in XIVE would
> typically correspond to a cpu - either real or virtual, yes?

The IVEs and EQs are managed by the virtualization controller. The VPs
(aka ENDs) are managed by the presentation controller. There's a VP per
real and virtual CPU.

You can think of the XIVE as having 3 main component types:


 - Source controller(s). There are some in the PHBs, one generic in the
XIVE itself, and one in the PSI bridge. Those contain the PQ bits and
thus the trigger & coalescing logic. They effectively shoot an MMIO to
the virtualization controller on events.

 - Virtualization controller (one per chip). This receives the above
MMIOs from the sources, manages the IVEs to get the target queue and
remap the number, and manages the queues. When a queue is enabled for
notification (or escalation) and such an event occurs, an MMIO goes to
the corresponding presentation controller.

 - Presentation controller (one per chip). This receives the above
notifications and sets as a result the IPB bits for one of the 8
priorities. Basically this guy tracks a single pending bit per priority
for each VP indicating whether there's something in the queue for that
priority and delivers interrupts to the core accordingly.


Now this is a simplified view. The PC supports groups but we don't
handle that yet, there are escalation interrupts, there are
redistribution mechanisms etc... but for now you get the basic idea.

Cheers,
Ben.
Benjamin Herrenschmidt Dec. 2, 2017, 2:41 p.m. UTC | #9
On Sat, 2017-12-02 at 08:39 -0600, Benjamin Herrenschmidt wrote:
> The IVEs and EQs are managed by the virtualization controller. The VPs
> (aka ENDs) 

typo. aka NVTs

> are managed by the presentation controller. There's a VP per
> real and virtual CPU.
David Gibson Dec. 4, 2017, 1:09 a.m. UTC | #10
On Fri, Dec 01, 2017 at 05:36:39PM +0100, Cédric Le Goater wrote:
> On 12/01/2017 12:35 AM, David Gibson wrote:
> > On Thu, Nov 30, 2017 at 02:06:27PM +0000, Cédric Le Goater wrote:
> >> On 11/30/2017 04:38 AM, David Gibson wrote:
> >>> On Thu, Nov 23, 2017 at 02:29:43PM +0100, Cédric Le Goater wrote:
> >>>> The Event Queue Descriptor (EQD) table, also known as Event Notification
> >>>> Descriptor (END), is one of the internal tables the XIVE interrupt
> >>>> controller uses to redirect exception from event sources to CPU
> >>>> threads.
> >>>>
> >>>> The EQD specifies on which Event Queue the event data should be posted
> >>>> when an exception occurs (later on pulled by the OS) and which server
> >>>> (VPD in XIVE terminology) to notify. The Event Queue is a much more
> >>>> complex structure but we start with a simple model for the sPAPR
> >>>> machine.
> >>>
> >>> Just to clarify my understanding a server / VPD in XIVE would
> >>> typically correspond to a cpu - either real or virtual, yes?
> >>
> >> yes. VP for "virtual processor" and VPD for "virtual processor 
> >> descriptor" which contains the XIVE interrupt state of the VP 
> >> when not dispatched. It is still described in some documentation 
> >> as an NVT : Notification Virtual Target.  
> >>
> >> XIVE concepts were renamed at some time but the old name perdured.
> >> I am still struggling my way through all the names.
> >>
> >>
> >>>> There is one XiveEQ per priority and the model chooses to store them
> >>>> under the Xive Interrupt presenter model. It will be retrieved, just
> >>>> like for XICS, through the 'intc' object pointer of the CPU.
> >>>>
> >>>> The EQ indexing follows a simple pattern:
> >>>>
> >>>>        (server << 3) | (priority & 0x7)
> >>>>
> >>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >>>> ---
> >>>>  hw/intc/spapr_xive.c    | 56 +++++++++++++++++++++++++++++++++++++++++++++++++
> >>>>  hw/intc/xive-internal.h | 50 +++++++++++++++++++++++++++++++++++++++++++
> >>>>  2 files changed, 106 insertions(+)
> >>>>
> >>>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> >>>> index 554b25e0884c..983317a6b3f6 100644
> >>>> --- a/hw/intc/spapr_xive.c
> >>>> +++ b/hw/intc/spapr_xive.c
> >>>> @@ -23,6 +23,7 @@
> >>>>  #include "sysemu/dma.h"
> >>>>  #include "monitor/monitor.h"
> >>>>  #include "hw/ppc/spapr_xive.h"
> >>>> +#include "hw/ppc/spapr.h"
> >>>>  #include "hw/ppc/xics.h"
> >>>>  
> >>>>  #include "xive-internal.h"
> >>>> @@ -34,6 +35,8 @@ struct sPAPRXiveICP {
> >>>>      uint8_t   tima[TM_RING_COUNT * 0x10];
> >>>>      uint8_t   *tima_os;
> >>>>      qemu_irq  output;
> >>>> +
> >>>> +    XiveEQ    eqt[XIVE_PRIORITY_MAX + 1];
> >>>>  };
> >>>>  
> >>>>  static uint64_t spapr_xive_icp_accept(sPAPRXiveICP *icp)
> >>>> @@ -183,6 +186,13 @@ static const MemoryRegionOps spapr_xive_tm_ops = {
> >>>>      },
> >>>>  };
> >>>>  
> >>>> +static sPAPRXiveICP *spapr_xive_icp_get(sPAPRXive *xive, int server)
> >>>> +{
> >>>> +    PowerPCCPU *cpu = spapr_find_cpu(server);
> >>>> +
> >>>> +    return cpu ? SPAPR_XIVE_ICP(cpu->intc) : NULL;
> >>>> +}
> >>>> +
> >>>>  static void spapr_xive_irq(sPAPRXive *xive, int lisn)
> >>>>  {
> >>>>  
> >>>> @@ -632,6 +642,8 @@ static void spapr_xive_icp_reset(void *dev)
> >>>>      sPAPRXiveICP *xicp = SPAPR_XIVE_ICP(dev);
> >>>>  
> >>>>      memset(xicp->tima, 0, sizeof(xicp->tima));
> >>>> +
> >>>> +    memset(xicp->eqt, 0, sizeof(xicp->eqt));
> >>>>  }
> >>>>  
> >>>>  static void spapr_xive_icp_realize(DeviceState *dev, Error **errp)
> >>>> @@ -683,6 +695,23 @@ static void spapr_xive_icp_init(Object *obj)
> >>>>      xicp->tima_os = &xicp->tima[TM_QW1_OS];
> >>>>  }
> >>>>  
> >>>> +static const VMStateDescription vmstate_spapr_xive_icp_eq = {
> >>>> +    .name = TYPE_SPAPR_XIVE_ICP "/eq",
> >>>> +    .version_id = 1,
> >>>> +    .minimum_version_id = 1,
> >>>> +    .fields = (VMStateField []) {
> >>>> +        VMSTATE_UINT32(w0, XiveEQ),
> >>>> +        VMSTATE_UINT32(w1, XiveEQ),
> >>>> +        VMSTATE_UINT32(w2, XiveEQ),
> >>>> +        VMSTATE_UINT32(w3, XiveEQ),
> >>>> +        VMSTATE_UINT32(w4, XiveEQ),
> >>>> +        VMSTATE_UINT32(w5, XiveEQ),
> >>>> +        VMSTATE_UINT32(w6, XiveEQ),
> >>>> +        VMSTATE_UINT32(w7, XiveEQ),
> >>>
> >>> Wow.  Super descriptive field names there, but I guess that's not your fault.
> >>
> >> The defines in the "xive-internal.h" give a better view ... 
> >>
> >>>> +        VMSTATE_END_OF_LIST()
> >>>> +    },
> >>>> +};
> >>>> +
> >>>>  static bool vmstate_spapr_xive_icp_needed(void *opaque)
> >>>>  {
> >>>>      /* TODO check machine XIVE support */
> >>>> @@ -696,6 +725,8 @@ static const VMStateDescription vmstate_spapr_xive_icp = {
> >>>>      .needed = vmstate_spapr_xive_icp_needed,
> >>>>      .fields = (VMStateField[]) {
> >>>>          VMSTATE_BUFFER(tima, sPAPRXiveICP),
> >>>> +        VMSTATE_STRUCT_ARRAY(eqt, sPAPRXiveICP, (XIVE_PRIORITY_MAX + 1), 1,
> >>>> +                             vmstate_spapr_xive_icp_eq, XiveEQ),
> >>>>          VMSTATE_END_OF_LIST()
> >>>>      },
> >>>>  };
> >>>> @@ -755,3 +786,28 @@ bool spapr_xive_irq_unset(sPAPRXive *xive, uint32_t lisn)
> >>>>      ive->w &= ~IVE_VALID;
> >>>>      return true;
> >>>>  }
> >>>> +
> >>>> +/*
> >>>> + * Use a simple indexing for the EQs.
> >>>
> >>> Is this server+priority encoding architected anywhere?  
> >>
> >> no. This is a model shortcut.
> >>
> >>> Otherwise, why not use separate parameters?
> >>
> >> yes. spapr_xive_get_eq() could use separate parameters and it would
> >> shorten the some of the hcalls.
> >>
> >> The result is stored in a single field of the IVE, EQ_INDEX. So I will 
> >> still need mangle/demangle routines but these could be simple macros.
> >> I will look at it.
> > 
> > Hm, ok.  So it's architected in the sense that you're using the
> > encoding from the EQ_INDEX field throughout.  That's could be a
> > reasonable choice, I can't really tell yet.
> > 
> > On the other hand, it might be easier to read if we use server and
> > priority as separate parameters until the point we actually encode
> > into the EQ_INDEX field.
> 
> In the architecture, the EQ_INDEX field contains an index to an 
> Event Queue Descriptor and the Event Queue Descriptor has a 
> EQ_W6_NVT_INDEX field pointing to an Notification Virtual Target.
> So there are two extra tables for the EQs and for the NVTs
> used by the HW.

Ok.  In the PAPR interface is the EQ_INDEX ever exposed to the guest?
Or does it just supply target/priority numbers and the hypervisor
manages the mapping to queues internally?

> In the sPAPR model, an EQ array is stored under the sPAPRXiveNVT 
> object which is stored under the ->intc pointer of the CPUState 
> object
> 
> So the EQ_INDEX field is really taking a shortcut, encoding 
> the cpu number and the priority to find an EQ, and the 
> EQ_W6_NVT_INDEX field holds a value which is the cpu number.
> But at the end, we save two tables. 
> 
> C.
> 
> 
> > 
> >>
> >>>> + */
> >>>> +XiveEQ *spapr_xive_get_eq(sPAPRXive *xive, uint32_t eq_idx)
> >>>> +{
> >>>> +    int priority = eq_idx & 0x7;
> >>>> +    sPAPRXiveICP *xicp = spapr_xive_icp_get(xive, eq_idx >> 3);
> >>>> +
> >>>> +    return xicp ? &xicp->eqt[priority] : NULL;
> >>>> +}
> >>>> +
> >>>> +bool spapr_xive_eq_for_server(sPAPRXive *xive, uint32_t server,
> >>>> +                              uint8_t priority, uint32_t *out_eq_idx)
> >>>> +{
> >>>> +    if (priority > XIVE_PRIORITY_MAX) {
> >>>> +        return false;
> >>>> +    }
> >>>> +
> >>>> +    if (out_eq_idx) {
> >>>> +        *out_eq_idx = (server << 3) | (priority & 0x7);
> >>>> +    }
> >>>> +
> >>>> +    return true;
> >>>> +}
> >>>> diff --git a/hw/intc/xive-internal.h b/hw/intc/xive-internal.h
> >>>> index 7d329f203a9b..c3949671aa03 100644
> >>>> --- a/hw/intc/xive-internal.h
> >>>> +++ b/hw/intc/xive-internal.h
> >>>> @@ -131,9 +131,59 @@ typedef struct XiveIVE {
> >>>>  #define IVE_EQ_DATA     PPC_BITMASK(33, 63)      /* Data written to the EQ */
> >>>>  } XiveIVE;
> >>>>  
> >>>> +/* EQ */
> >>>> +typedef struct XiveEQ {
> >>>> +        uint32_t        w0;
> >>>> +#define EQ_W0_VALID             PPC_BIT32(0)
> >>>> +#define EQ_W0_ENQUEUE           PPC_BIT32(1)
> >>>> +#define EQ_W0_UCOND_NOTIFY      PPC_BIT32(2)
> >>>> +#define EQ_W0_BACKLOG           PPC_BIT32(3)
> >>>> +#define EQ_W0_PRECL_ESC_CTL     PPC_BIT32(4)
> >>>> +#define EQ_W0_ESCALATE_CTL      PPC_BIT32(5)
> >>>> +#define EQ_W0_END_OF_INTR       PPC_BIT32(6)
> >>>> +#define EQ_W0_QSIZE             PPC_BITMASK32(12, 15)
> >>>> +#define EQ_W0_SW0               PPC_BIT32(16)
> >>>> +#define EQ_W0_FIRMWARE          EQ_W0_SW0 /* Owned by FW */
> >>>> +#define EQ_QSIZE_4K             0
> >>>> +#define EQ_QSIZE_64K            4
> >>>> +#define EQ_W0_HWDEP             PPC_BITMASK32(24, 31)
> >>>> +        uint32_t        w1;
> >>>> +#define EQ_W1_ESn               PPC_BITMASK32(0, 1)
> >>>> +#define EQ_W1_ESn_P             PPC_BIT32(0)
> >>>> +#define EQ_W1_ESn_Q             PPC_BIT32(1)
> >>>> +#define EQ_W1_ESe               PPC_BITMASK32(2, 3)
> >>>> +#define EQ_W1_ESe_P             PPC_BIT32(2)
> >>>> +#define EQ_W1_ESe_Q             PPC_BIT32(3)
> >>>> +#define EQ_W1_GENERATION        PPC_BIT32(9)
> >>>> +#define EQ_W1_PAGE_OFF          PPC_BITMASK32(10, 31)
> >>>> +        uint32_t        w2;
> >>>> +#define EQ_W2_MIGRATION_REG     PPC_BITMASK32(0, 3)
> >>>> +#define EQ_W2_OP_DESC_HI        PPC_BITMASK32(4, 31)
> >>>> +        uint32_t        w3;
> >>>> +#define EQ_W3_OP_DESC_LO        PPC_BITMASK32(0, 31)
> >>>> +        uint32_t        w4;
> >>>> +#define EQ_W4_ESC_EQ_BLOCK      PPC_BITMASK32(4, 7)
> >>>> +#define EQ_W4_ESC_EQ_INDEX      PPC_BITMASK32(8, 31)
> >>>> +        uint32_t        w5;
> >>>> +#define EQ_W5_ESC_EQ_DATA       PPC_BITMASK32(1, 31)
> >>>> +        uint32_t        w6;
> >>>> +#define EQ_W6_FORMAT_BIT        PPC_BIT32(8)
> >>>> +#define EQ_W6_NVT_BLOCK         PPC_BITMASK32(9, 12)
> >>>> +#define EQ_W6_NVT_INDEX         PPC_BITMASK32(13, 31)
> >>>> +        uint32_t        w7;
> >>>> +#define EQ_W7_F0_IGNORE         PPC_BIT32(0)
> >>>> +#define EQ_W7_F0_BLK_GROUPING   PPC_BIT32(1)
> >>>> +#define EQ_W7_F0_PRIORITY       PPC_BITMASK32(8, 15)
> >>>> +#define EQ_W7_F1_WAKEZ          PPC_BIT32(0)
> >>>> +#define EQ_W7_F1_LOG_SERVER_ID  PPC_BITMASK32(1, 31)
> >>>> +} XiveEQ;
> >>>> +
> >>>>  #define XIVE_PRIORITY_MAX  7
> >>>>  
> >>>>  void spapr_xive_reset(void *dev);
> >>>>  XiveIVE *spapr_xive_get_ive(sPAPRXive *xive, uint32_t lisn);
> >>>> +XiveEQ *spapr_xive_get_eq(sPAPRXive *xive, uint32_t idx);
> >>>> +bool spapr_xive_eq_for_server(sPAPRXive *xive, uint32_t server, uint8_t prio,
> >>>> +                              uint32_t *out_eq_idx);
> >>>>  
> >>>>  #endif /* _INTC_XIVE_INTERNAL_H */
> >>>
> >>
> > 
>
Cédric Le Goater Dec. 4, 2017, 4:31 p.m. UTC | #11
On 12/04/2017 02:09 AM, David Gibson wrote:
> On Fri, Dec 01, 2017 at 05:36:39PM +0100, Cédric Le Goater wrote:
>> On 12/01/2017 12:35 AM, David Gibson wrote:
>>> On Thu, Nov 30, 2017 at 02:06:27PM +0000, Cédric Le Goater wrote:
>>>> On 11/30/2017 04:38 AM, David Gibson wrote:
>>>>> On Thu, Nov 23, 2017 at 02:29:43PM +0100, Cédric Le Goater wrote:
>>>>>> The Event Queue Descriptor (EQD) table, also known as Event Notification
>>>>>> Descriptor (END), is one of the internal tables the XIVE interrupt
>>>>>> controller uses to redirect exception from event sources to CPU
>>>>>> threads.
>>>>>>
>>>>>> The EQD specifies on which Event Queue the event data should be posted
>>>>>> when an exception occurs (later on pulled by the OS) and which server
>>>>>> (VPD in XIVE terminology) to notify. The Event Queue is a much more
>>>>>> complex structure but we start with a simple model for the sPAPR
>>>>>> machine.
>>>>>
>>>>> Just to clarify my understanding a server / VPD in XIVE would
>>>>> typically correspond to a cpu - either real or virtual, yes?
>>>>
>>>> yes. VP for "virtual processor" and VPD for "virtual processor 
>>>> descriptor" which contains the XIVE interrupt state of the VP 
>>>> when not dispatched. It is still described in some documentation 
>>>> as an NVT : Notification Virtual Target.  
>>>>
>>>> XIVE concepts were renamed at some time but the old name persisted.
>>>> I am still struggling my way through all the names.
>>>>
>>>>
>>>>>> There is one XiveEQ per priority and the model chooses to store them
>>>>>> under the Xive Interrupt presenter model. It will be retrieved, just
>>>>>> like for XICS, through the 'intc' object pointer of the CPU.
>>>>>>
>>>>>> The EQ indexing follows a simple pattern:
>>>>>>
>>>>>>        (server << 3) | (priority & 0x7)
>>>>>>
>>>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>>>>> ---
>>>>>>  hw/intc/spapr_xive.c    | 56 +++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>>  hw/intc/xive-internal.h | 50 +++++++++++++++++++++++++++++++++++++++++++
>>>>>>  2 files changed, 106 insertions(+)
>>>>>>
>>>>>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
>>>>>> index 554b25e0884c..983317a6b3f6 100644
>>>>>> --- a/hw/intc/spapr_xive.c
>>>>>> +++ b/hw/intc/spapr_xive.c
>>>>>> @@ -23,6 +23,7 @@
>>>>>>  #include "sysemu/dma.h"
>>>>>>  #include "monitor/monitor.h"
>>>>>>  #include "hw/ppc/spapr_xive.h"
>>>>>> +#include "hw/ppc/spapr.h"
>>>>>>  #include "hw/ppc/xics.h"
>>>>>>  
>>>>>>  #include "xive-internal.h"
>>>>>> @@ -34,6 +35,8 @@ struct sPAPRXiveICP {
>>>>>>      uint8_t   tima[TM_RING_COUNT * 0x10];
>>>>>>      uint8_t   *tima_os;
>>>>>>      qemu_irq  output;
>>>>>> +
>>>>>> +    XiveEQ    eqt[XIVE_PRIORITY_MAX + 1];
>>>>>>  };
>>>>>>  
>>>>>>  static uint64_t spapr_xive_icp_accept(sPAPRXiveICP *icp)
>>>>>> @@ -183,6 +186,13 @@ static const MemoryRegionOps spapr_xive_tm_ops = {
>>>>>>      },
>>>>>>  };
>>>>>>  
>>>>>> +static sPAPRXiveICP *spapr_xive_icp_get(sPAPRXive *xive, int server)
>>>>>> +{
>>>>>> +    PowerPCCPU *cpu = spapr_find_cpu(server);
>>>>>> +
>>>>>> +    return cpu ? SPAPR_XIVE_ICP(cpu->intc) : NULL;
>>>>>> +}
>>>>>> +
>>>>>>  static void spapr_xive_irq(sPAPRXive *xive, int lisn)
>>>>>>  {
>>>>>>  
>>>>>> @@ -632,6 +642,8 @@ static void spapr_xive_icp_reset(void *dev)
>>>>>>      sPAPRXiveICP *xicp = SPAPR_XIVE_ICP(dev);
>>>>>>  
>>>>>>      memset(xicp->tima, 0, sizeof(xicp->tima));
>>>>>> +
>>>>>> +    memset(xicp->eqt, 0, sizeof(xicp->eqt));
>>>>>>  }
>>>>>>  
>>>>>>  static void spapr_xive_icp_realize(DeviceState *dev, Error **errp)
>>>>>> @@ -683,6 +695,23 @@ static void spapr_xive_icp_init(Object *obj)
>>>>>>      xicp->tima_os = &xicp->tima[TM_QW1_OS];
>>>>>>  }
>>>>>>  
>>>>>> +static const VMStateDescription vmstate_spapr_xive_icp_eq = {
>>>>>> +    .name = TYPE_SPAPR_XIVE_ICP "/eq",
>>>>>> +    .version_id = 1,
>>>>>> +    .minimum_version_id = 1,
>>>>>> +    .fields = (VMStateField []) {
>>>>>> +        VMSTATE_UINT32(w0, XiveEQ),
>>>>>> +        VMSTATE_UINT32(w1, XiveEQ),
>>>>>> +        VMSTATE_UINT32(w2, XiveEQ),
>>>>>> +        VMSTATE_UINT32(w3, XiveEQ),
>>>>>> +        VMSTATE_UINT32(w4, XiveEQ),
>>>>>> +        VMSTATE_UINT32(w5, XiveEQ),
>>>>>> +        VMSTATE_UINT32(w6, XiveEQ),
>>>>>> +        VMSTATE_UINT32(w7, XiveEQ),
>>>>>
>>>>> Wow.  Super descriptive field names there, but I guess that's not your fault.
>>>>
>>>> The defines in the "xive-internal.h" give a better view ... 
>>>>
>>>>>> +        VMSTATE_END_OF_LIST()
>>>>>> +    },
>>>>>> +};
>>>>>> +
>>>>>>  static bool vmstate_spapr_xive_icp_needed(void *opaque)
>>>>>>  {
>>>>>>      /* TODO check machine XIVE support */
>>>>>> @@ -696,6 +725,8 @@ static const VMStateDescription vmstate_spapr_xive_icp = {
>>>>>>      .needed = vmstate_spapr_xive_icp_needed,
>>>>>>      .fields = (VMStateField[]) {
>>>>>>          VMSTATE_BUFFER(tima, sPAPRXiveICP),
>>>>>> +        VMSTATE_STRUCT_ARRAY(eqt, sPAPRXiveICP, (XIVE_PRIORITY_MAX + 1), 1,
>>>>>> +                             vmstate_spapr_xive_icp_eq, XiveEQ),
>>>>>>          VMSTATE_END_OF_LIST()
>>>>>>      },
>>>>>>  };
>>>>>> @@ -755,3 +786,28 @@ bool spapr_xive_irq_unset(sPAPRXive *xive, uint32_t lisn)
>>>>>>      ive->w &= ~IVE_VALID;
>>>>>>      return true;
>>>>>>  }
>>>>>> +
>>>>>> +/*
>>>>>> + * Use a simple indexing for the EQs.
>>>>>
>>>>> Is this server+priority encoding architected anywhere?  
>>>>
>>>> no. This is a model shortcut.
>>>>
>>>>> Otherwise, why not use separate parameters?
>>>>
>>>> yes. spapr_xive_get_eq() could use separate parameters and it would
>>>> shorten some of the hcalls.
>>>>
>>>> The result is stored in a single field of the IVE, EQ_INDEX. So I will 
>>>> still need mangle/demangle routines but these could be simple macros.
>>>> I will look at it.
>>>
>>> Hm, ok.  So it's architected in the sense that you're using the
>>> encoding from the EQ_INDEX field throughout.  That could be a
>>> reasonable choice, I can't really tell yet.
>>>
>>> On the other hand, it might be easier to read if we use server and
>>> priority as separate parameters until the point we actually encode
>>> into the EQ_INDEX field.
>>
>> In the architecture, the EQ_INDEX field contains an index to an 
>> Event Queue Descriptor and the Event Queue Descriptor has a 
>> EQ_W6_NVT_INDEX field pointing to an Notification Virtual Target.
>> So there are two extra tables for the EQs and for the NVTs
>> used by the HW.
> 
> Ok.  In the PAPR interface is the EQ_INDEX ever exposed to the guest?

never. 

> Or does it just supply target/priority numbers and the hypervisor
> manages the mapping to queues internally?

Yes. target/priority numbers are the interface used by the hcalls. 

Same for baremetal. OPAL handles the EQ indexing because it creates 
the EQ table and registers it in the controller.

C.
diff mbox series

Patch

diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
index 554b25e0884c..983317a6b3f6 100644
--- a/hw/intc/spapr_xive.c
+++ b/hw/intc/spapr_xive.c
@@ -23,6 +23,7 @@ 
 #include "sysemu/dma.h"
 #include "monitor/monitor.h"
 #include "hw/ppc/spapr_xive.h"
+#include "hw/ppc/spapr.h"
 #include "hw/ppc/xics.h"
 
 #include "xive-internal.h"
@@ -34,6 +35,8 @@  struct sPAPRXiveICP {
     uint8_t   tima[TM_RING_COUNT * 0x10];
     uint8_t   *tima_os;
     qemu_irq  output;
+
+    XiveEQ    eqt[XIVE_PRIORITY_MAX + 1];
 };
 
 static uint64_t spapr_xive_icp_accept(sPAPRXiveICP *icp)
@@ -183,6 +186,13 @@  static const MemoryRegionOps spapr_xive_tm_ops = {
     },
 };
 
+static sPAPRXiveICP *spapr_xive_icp_get(sPAPRXive *xive, int server)
+{
+    PowerPCCPU *cpu = spapr_find_cpu(server);
+
+    return cpu ? SPAPR_XIVE_ICP(cpu->intc) : NULL;
+}
+
 static void spapr_xive_irq(sPAPRXive *xive, int lisn)
 {
 
@@ -632,6 +642,8 @@  static void spapr_xive_icp_reset(void *dev)
     sPAPRXiveICP *xicp = SPAPR_XIVE_ICP(dev);
 
     memset(xicp->tima, 0, sizeof(xicp->tima));
+
+    memset(xicp->eqt, 0, sizeof(xicp->eqt));
 }
 
 static void spapr_xive_icp_realize(DeviceState *dev, Error **errp)
@@ -683,6 +695,23 @@  static void spapr_xive_icp_init(Object *obj)
     xicp->tima_os = &xicp->tima[TM_QW1_OS];
 }
 
+static const VMStateDescription vmstate_spapr_xive_icp_eq = {
+    .name = TYPE_SPAPR_XIVE_ICP "/eq",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .fields = (VMStateField []) {
+        VMSTATE_UINT32(w0, XiveEQ),
+        VMSTATE_UINT32(w1, XiveEQ),
+        VMSTATE_UINT32(w2, XiveEQ),
+        VMSTATE_UINT32(w3, XiveEQ),
+        VMSTATE_UINT32(w4, XiveEQ),
+        VMSTATE_UINT32(w5, XiveEQ),
+        VMSTATE_UINT32(w6, XiveEQ),
+        VMSTATE_UINT32(w7, XiveEQ),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
 static bool vmstate_spapr_xive_icp_needed(void *opaque)
 {
     /* TODO check machine XIVE support */
@@ -696,6 +725,8 @@  static const VMStateDescription vmstate_spapr_xive_icp = {
     .needed = vmstate_spapr_xive_icp_needed,
     .fields = (VMStateField[]) {
         VMSTATE_BUFFER(tima, sPAPRXiveICP),
+        VMSTATE_STRUCT_ARRAY(eqt, sPAPRXiveICP, (XIVE_PRIORITY_MAX + 1), 1,
+                             vmstate_spapr_xive_icp_eq, XiveEQ),
         VMSTATE_END_OF_LIST()
     },
 };
@@ -755,3 +786,28 @@  bool spapr_xive_irq_unset(sPAPRXive *xive, uint32_t lisn)
     ive->w &= ~IVE_VALID;
     return true;
 }
+
+/*
+ * Use a simple indexing for the EQs.
+ */
+XiveEQ *spapr_xive_get_eq(sPAPRXive *xive, uint32_t eq_idx)
+{
+    int priority = eq_idx & 0x7;
+    sPAPRXiveICP *xicp = spapr_xive_icp_get(xive, eq_idx >> 3);
+
+    return xicp ? &xicp->eqt[priority] : NULL;
+}
+
+bool spapr_xive_eq_for_server(sPAPRXive *xive, uint32_t server,
+                              uint8_t priority, uint32_t *out_eq_idx)
+{
+    if (priority > XIVE_PRIORITY_MAX) {
+        return false;
+    }
+
+    if (out_eq_idx) {
+        *out_eq_idx = (server << 3) | (priority & 0x7);
+    }
+
+    return true;
+}
diff --git a/hw/intc/xive-internal.h b/hw/intc/xive-internal.h
index 7d329f203a9b..c3949671aa03 100644
--- a/hw/intc/xive-internal.h
+++ b/hw/intc/xive-internal.h
@@ -131,9 +131,59 @@  typedef struct XiveIVE {
 #define IVE_EQ_DATA     PPC_BITMASK(33, 63)      /* Data written to the EQ */
 } XiveIVE;
 
+/* EQ */
+typedef struct XiveEQ {
+        uint32_t        w0;
+#define EQ_W0_VALID             PPC_BIT32(0)
+#define EQ_W0_ENQUEUE           PPC_BIT32(1)
+#define EQ_W0_UCOND_NOTIFY      PPC_BIT32(2)
+#define EQ_W0_BACKLOG           PPC_BIT32(3)
+#define EQ_W0_PRECL_ESC_CTL     PPC_BIT32(4)
+#define EQ_W0_ESCALATE_CTL      PPC_BIT32(5)
+#define EQ_W0_END_OF_INTR       PPC_BIT32(6)
+#define EQ_W0_QSIZE             PPC_BITMASK32(12, 15)
+#define EQ_W0_SW0               PPC_BIT32(16)
+#define EQ_W0_FIRMWARE          EQ_W0_SW0 /* Owned by FW */
+#define EQ_QSIZE_4K             0
+#define EQ_QSIZE_64K            4
+#define EQ_W0_HWDEP             PPC_BITMASK32(24, 31)
+        uint32_t        w1;
+#define EQ_W1_ESn               PPC_BITMASK32(0, 1)
+#define EQ_W1_ESn_P             PPC_BIT32(0)
+#define EQ_W1_ESn_Q             PPC_BIT32(1)
+#define EQ_W1_ESe               PPC_BITMASK32(2, 3)
+#define EQ_W1_ESe_P             PPC_BIT32(2)
+#define EQ_W1_ESe_Q             PPC_BIT32(3)
+#define EQ_W1_GENERATION        PPC_BIT32(9)
+#define EQ_W1_PAGE_OFF          PPC_BITMASK32(10, 31)
+        uint32_t        w2;
+#define EQ_W2_MIGRATION_REG     PPC_BITMASK32(0, 3)
+#define EQ_W2_OP_DESC_HI        PPC_BITMASK32(4, 31)
+        uint32_t        w3;
+#define EQ_W3_OP_DESC_LO        PPC_BITMASK32(0, 31)
+        uint32_t        w4;
+#define EQ_W4_ESC_EQ_BLOCK      PPC_BITMASK32(4, 7)
+#define EQ_W4_ESC_EQ_INDEX      PPC_BITMASK32(8, 31)
+        uint32_t        w5;
+#define EQ_W5_ESC_EQ_DATA       PPC_BITMASK32(1, 31)
+        uint32_t        w6;
+#define EQ_W6_FORMAT_BIT        PPC_BIT32(8)
+#define EQ_W6_NVT_BLOCK         PPC_BITMASK32(9, 12)
+#define EQ_W6_NVT_INDEX         PPC_BITMASK32(13, 31)
+        uint32_t        w7;
+#define EQ_W7_F0_IGNORE         PPC_BIT32(0)
+#define EQ_W7_F0_BLK_GROUPING   PPC_BIT32(1)
+#define EQ_W7_F0_PRIORITY       PPC_BITMASK32(8, 15)
+#define EQ_W7_F1_WAKEZ          PPC_BIT32(0)
+#define EQ_W7_F1_LOG_SERVER_ID  PPC_BITMASK32(1, 31)
+} XiveEQ;
+
 #define XIVE_PRIORITY_MAX  7
 
 void spapr_xive_reset(void *dev);
 XiveIVE *spapr_xive_get_ive(sPAPRXive *xive, uint32_t lisn);
+XiveEQ *spapr_xive_get_eq(sPAPRXive *xive, uint32_t idx);
+bool spapr_xive_eq_for_server(sPAPRXive *xive, uint32_t server, uint8_t prio,
+                              uint32_t *out_eq_idx);
 
 #endif /* _INTC_XIVE_INTERNAL_H */